diff --git a/.gitmodules b/.gitmodules index 605fac63cc4..ecd69af403b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,3 @@ -[submodule "cpp/nvgraph/cpp/thirdparty/cnmem"] - path = cpp/nvgraph/cpp/thirdparty/cnmem - url = https://github.com/NVIDIA/cnmem.git - branch = master [submodule "cpp/nvgraph/cpp/thirdparty/cub"] path = cpp/nvgraph/cpp/thirdparty/cub url = https://github.com/NVlabs/cub.git diff --git a/CHANGELOG.md b/CHANGELOG.md index 05bb075c7b7..c5d81c232ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,19 @@ +# cuGraph 0.8.0 (Date TBD) + +## New Features +- PR #287 SNMG power iteration step1 +- PR #297 SNMG degree calculation + +## Improvements +- PR #291 nvGraph is updated to use RMM instead of directly invoking cnmem functions. +- PR #286 Reorganized cugraph source directory + + +## Bug Fixes +- PR #283 Automerge fix +- PR #291 Fixed a RMM memory allocation failure due to duplicate copies of cnmem.o +- PR #291 Fixed a cub CsrMV call error when RMM pool allocator is used. + # cuGraph 0.7.0 (Date TBD) ## New Features @@ -54,6 +70,7 @@ - PR #262 Removed networkx conda dependency for both build and runtime - PR #271 Removed nvgraph conda dependency - PR #276 Removed libgdf_cffi import from bindings +- PR #288 Add boost as a conda dependency # cuGraph 0.6.0 (22 Mar 2019) diff --git a/README.md b/README.md index 85183d35f27..d2f699ab7b6 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ These limitations are being addressed and will be fixed future versions. ## Getting cuGraph ### Intro -There are 4 ways to get cuGraph : +There are 3 ways to get cuGraph : 1. [Quick start with Docker Demo Repo](#quick) 1. [Conda Installation](#conda) 1. [Build from Source](#source) @@ -133,5 +133,4 @@ The RAPIDS suite of open source software libraries aim to enable execution of en ### Apache Arrow on GPU -The GPU version of [Apache Arrow](https://arrow.apache.org/) is a common API that enables efficient interchange of tabular data between processes running on the GPU. End-to-end computation on the GPU avoids unnecessary copying and converting of data off the GPU, reducing compute time and cost for high-performance analytics common in artificial intelligence workloads. As the name implies, cuDF uses the Apache Arrow columnar data format on the GPU. Currently, a subset of the features in Apache Arrow are supported. - +The GPU version of [Apache Arrow](https://arrow.apache.org/) is a common API that enables efficient interchange of tabular data between processes running on the GPU. End-to-end computation on the GPU avoids unnecessary copying and converting of data off the GPU, reducing compute time and cost for high-performance analytics common in artificial intelligence workloads. As the name implies, cuDF uses the Apache Arrow columnar data format on the GPU. Currently, a subset of the features in Apache Arrow are supported. 
\ No newline at end of file diff --git a/conda/recipes/cugraph/meta.yaml b/conda/recipes/cugraph/meta.yaml index 25a7d426bd2..156db3777a3 100644 --- a/conda/recipes/cugraph/meta.yaml +++ b/conda/recipes/cugraph/meta.yaml @@ -26,11 +26,11 @@ requirements: build: - python x.x - libcugraph={{ version }} - - cudf=0.7* + - cudf=0.8* run: - python x.x - libcugraph={{ version }} - - cudf=0.7* + - cudf=0.8* #test: # commands: diff --git a/conda/recipes/libcugraph/meta.yaml b/conda/recipes/libcugraph/meta.yaml index 0111fef13c8..18bb757eaf9 100644 --- a/conda/recipes/libcugraph/meta.yaml +++ b/conda/recipes/libcugraph/meta.yaml @@ -25,15 +25,15 @@ build: requirements: build: - cmake>=3.12.4 - - libcudf=0.7* + - libcudf=0.8* - cython - cudatoolkit {{ cuda_version }}.* - - boost-cpp + - boost run: - - libcudf=0.7* + - libcudf=0.8* - cython - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} - - boost-cpp + - boost #test: # commands: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 464fed08342..f37b0024489 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -16,7 +16,7 @@ #============================================================================= cmake_minimum_required(VERSION 3.12 FATAL_ERROR) -project(cuGraph VERSION 0.6.0 LANGUAGES C CXX CUDA) +project(cuGraph VERSION 0.8.0 LANGUAGES C CXX CUDA) ################################################################################################### # - cmake modules --------------------------------------------------------------------------------- @@ -136,7 +136,7 @@ include(ConfigureNvgraph) ################################################################################################### # - Find and add different modules and supporting repos ------------------------------------------- -find_package(Boost 1.45.0 COMPONENTS system) +find_package(Boost REQUIRED) find_package(OpenMP) if (OPENMP_FOUND) @@ -229,14 +229,21 @@ link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT ################################################################################################### # - library targets ------------------------------------------------------------------------------- add_library(cugraph SHARED - src/grmat.cu - src/cugraph.cu - src/pagerank.cu - src/bfs.cu - src/jaccard.cu - src/overlap.cu - src/nvgraph_gdf.cu - src/two_hop_neighbors.cu + src/utilities/grmat.cu + src/utilities/degree.cu + src/structure/cugraph.cu + src/link_analysis/pagerank.cu + src/traversal/bfs.cu + src/link_prediction/jaccard.cu + src/link_prediction/overlap.cu + src/converters/nvgraph.cu + src/converters/renumber.cu + src/community/nvgraph_gdf.cu + src/traversal/nvgraph_sssp.cu + src/traversal/two_hop_neighbors.cu + src/snmg/blas/spmv.cu + src/snmg/degree/degree.cu + src/snmg/utils.cu ${CMAKE_CURRENT_BINARY_DIR}/gunrock/gunrock/util/test_utils.cu ${CMAKE_CURRENT_BINARY_DIR}/gunrock/gunrock/util/error_utils.cu ${CMAKE_CURRENT_BINARY_DIR}/gunrock/gunrock/util/misc_utils.cu diff --git a/cpp/include/algorithms.h b/cpp/include/algorithms.h index 77fbc6525f5..277f2599512 100644 --- a/cpp/include/algorithms.h +++ b/cpp/include/algorithms.h @@ -170,3 +170,19 @@ gdf_error gdf_louvain(gdf_graph *graph, void *final_modularity, void *num_level, gdf_column *louvain_parts); + +/** + * Computes the in-degree, out-degree, or the sum of both (determined by x) for the given graph. This is + * a multi-gpu operation operating on a partitioned graph. 
+ * @param x 0 for in+out, 1 for in, 2 for out + * @param part_offsets Contains the start/end of each partitions vertex id range + * @param off The local partition offsets + * @param ind The local partition indices + * @param x_cols The results (located on each GPU) + * @return Error code + */ +gdf_error gdf_snmg_degree(int x, + size_t* part_offsets, + gdf_column* off, + gdf_column* ind, + gdf_column** x_cols); diff --git a/cpp/include/rmm_utils.h b/cpp/include/rmm_utils.h index 12b1b988fb6..d5ca2b3c346 100755 --- a/cpp/include/rmm_utils.h +++ b/cpp/include/rmm_utils.h @@ -14,7 +14,6 @@ * limitations under the License. */ #pragma once -///#define DEBUG_NO_RMM #include #include @@ -27,58 +26,10 @@ throw std::runtime_error(ss.str()); \ } -#ifdef DEBUG_NO_RMM - -#include -#include -#include -#include - -template -//using rmm_allocator = thrust::device_malloc_allocator; -class rmm_allocator : public thrust::device_malloc_allocator -{ - public: - using value_type = T; - - rmm_allocator(cudaStream_t stream = 0) : stream(stream) {} - ~rmm_allocator() {} - -private: - cudaStream_t stream; -}; - -using rmm_temp_allocator = rmm_allocator; // Use this alias for thrust::cuda::par(allocator).on(stream) - -#define ALLOC_TRY(ptr, sz, stream){ \ - if (stream == nullptr) ; \ - cudaMalloc((ptr), (sz)); \ -} - -#define ALLOC_MANAGED_TRY(ptr, sz, stream){ \ - if (stream == nullptr) ; \ - cudaMallocManaged((ptr), (sz)); \ -} - - //#define REALLOC_TRY(ptr, new_sz, stream) - -#define ALLOC_FREE_TRY(ptr, stream){ \ - if (stream == nullptr) ; \ - cudaFree( (ptr) ); \ -} -#else - #include #include -using rmm_temp_allocator = rmm_allocator; - -#define ALLOC_TRY( ptr, sz, stream ){ \ - RMM_TRY_THROW( RMM_ALLOC((ptr), (sz), (stream)) ) \ - } - -//TODO: change this when RMM alloc managed will be available !!!!! 
-#define ALLOC_MANAGED_TRY(ptr, sz, stream){ \ +#define ALLOC_TRY( ptr, sz, stream ){ \ RMM_TRY_THROW( RMM_ALLOC((ptr), (sz), (stream)) ) \ } @@ -86,9 +37,6 @@ using rmm_temp_allocator = rmm_allocator; RMM_TRY_THROW( RMM_REALLOC((ptr), (sz), (stream)) ) \ } -#define ALLOC_FREE_TRY(ptr, stream){ \ +#define ALLOC_FREE_TRY(ptr, stream){ \ RMM_TRY_THROW( RMM_FREE( (ptr), (stream) ) ) \ } - -#endif - diff --git a/cpp/nvgraph/cpp/CMakeLists.txt b/cpp/nvgraph/cpp/CMakeLists.txt index 42d365400e6..27342aeea70 100644 --- a/cpp/nvgraph/cpp/CMakeLists.txt +++ b/cpp/nvgraph/cpp/CMakeLists.txt @@ -20,12 +20,12 @@ project(NV_GRAPH VERSION 0.4.0 LANGUAGES C CXX CUDA) ################################################################################################### # - compiler options ------------------------------------------------------------------------------ -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 14) set(CMAKE_C_COMPILER $ENV{CC}) set(CMAKE_CXX_COMPILER $ENV{CXX}) set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_CUDA_STANDARD 11) +set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_STANDARD_REQUIRED ON) if(CMAKE_COMPILER_IS_GNUCXX) @@ -47,7 +47,7 @@ option(BUILD_TESTS "Configure CMake to build tests" if(CMAKE_COMPILER_IS_GNUCXX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") - option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" OFF) + option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" ON) if(CMAKE_CXX11_ABI) message(STATUS "nvGraph: Enabling the GLIBCXX11 ABI") else() @@ -67,6 +67,25 @@ include(FeatureSummary) include(CheckIncludeFiles) include(CheckLibraryExists) +################################################################################################### +# - add rmm -------------------------------------------------------------------------------------- +find_path(RMM_INCLUDE "rmm" + HINTS "$ENV{RMM_ROOT}/include" + "$ENV{CONDA_PREFIX}/include/rmm" + "$ENV{CONDA_PREFIX}/include") + +find_library(RMM_LIBRARY "rmm" + HINTS "$ENV{RMM_ROOT}/lib" + "$ENV{CONDA_PREFIX}/lib") + +message(STATUS "RMM: RMM_LIBRARY set to ${RMM_LIBRARY}") +message(STATUS "RMM: RMM_INCLUDE set to ${RMM_INCLUDE}") + +add_library(rmm SHARED IMPORTED ${RMM_LIBRARY}) +if (RMM_INCLUDE AND RMM_LIBRARY) + set_target_properties(rmm PROPERTIES IMPORTED_LOCATION ${RMM_LIBRARY}) +endif (RMM_INCLUDE AND RMM_LIBRARY) + ################################################################################################### # - add gtest ------------------------------------------------------------------------------------- @@ -90,9 +109,9 @@ include_directories( "${CMAKE_BINARY_DIR}/include" "${CMAKE_SOURCE_DIR}/include" "${CMAKE_SOURCE_DIR}/thirdparty/cub" - "${CMAKE_SOURCE_DIR}/thirdparty/cnmem/include" "${CMAKE_SOURCE_DIR}/../external" "${CMAKE_SOURCE_DIR}/../external/cusp" + "${RMM_INCLUDE}" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" ) @@ -101,13 +120,13 @@ include_directories( link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link directories for nvcc "${CMAKE_BINARY_DIR}/lib" - "${GTEST_LIBRARY_DIR}") + "${GTEST_LIBRARY_DIR}" + "${RMM_LIBRARY}") ################################################################################################### # - library targets ------------------------------------------------------------------------------- if(NVGRAPH_LIGHT MATCHES True) add_library(nvgraph_rapids SHARED - thirdparty/cnmem/src/cnmem.cpp src/arnoldi.cu src/bfs.cu src/bfs2d.cu @@ -141,7 +160,6 @@ if(NVGRAPH_LIGHT MATCHES True) ) 
else(NVGRAPH_LIGHT MATCHES True) add_library(nvgraph_rapids SHARED - thirdparty/cnmem/src/cnmem.cpp src/arnoldi.cu src/bfs.cu src/bfs2d.cu @@ -203,7 +221,7 @@ endif(NVGRAPH_LIGHT MATCHES True) ################################################################################################### # - link libraries -------------------------------------------------------------------------------- -target_link_libraries(nvgraph_rapids cublas cusparse curand cusolver cudart ) +target_link_libraries(nvgraph_rapids cublas cusparse curand cusolver rmm cudart cuda) ################################################################################################### # - install targets ------------------------------------------------------------------------------- diff --git a/cpp/nvgraph/cpp/include/2d_partitioning.h b/cpp/nvgraph/cpp/include/2d_partitioning.h index c344990db12..ca2be7a8b1f 100644 --- a/cpp/nvgraph/cpp/include/2d_partitioning.h +++ b/cpp/nvgraph/cpp/include/2d_partitioning.h @@ -42,1335 +42,1344 @@ namespace nvgraph { - template - struct CSR_Result_Weighted { - int64_t size; - int64_t nnz; - T* rowOffsets; - T* colIndices; - W* edgeWeights; - - CSR_Result_Weighted() : - size(0), nnz(0), rowOffsets(NULL), colIndices(NULL), edgeWeights(NULL) { - } - - void Destroy() { - if (rowOffsets) - cudaFree(rowOffsets); - if (colIndices) - cudaFree(colIndices); - if (edgeWeights) - cudaFree(edgeWeights); - } - }; - - // Define kernel for copying run length encoded values into offset slots. - template - __global__ void offsetsKernel(T runCounts, T* unique, T* counts, T* offsets) { - for (int32_t idx = blockDim.x * blockIdx.x + threadIdx.x; - idx < runCounts; - idx += gridDim.x * blockDim.x) { - offsets[unique[idx]] = counts[idx]; - } - } - - /** - * Method for converting COO to CSR format - * @param sources The array of source indices - * @param destinations The array of destination indices - * @param edgeWeights The array of edge weights - * @param nnz The number of non zero values - * @param maxId The largest id contained in the matrix - * @param result The result is stored here. 
- */ - template - void ConvertCOOtoCSR_weighted(T* sources, - T* destinations, - W* edgeWeights, - int64_t nnz, - T maxId, - CSR_Result_Weighted& result) { - // Sort source and destination columns by source - // Allocate local memory for operating on - T* srcs, *dests; - W* weights = NULL; - cudaMalloc(&srcs, sizeof(T) * nnz); - cudaMalloc(&dests, sizeof(T) * nnz); - if (edgeWeights) - cudaMalloc(&weights, sizeof(W) * nnz); - cudaMemcpy(srcs, sources, sizeof(T) * nnz, cudaMemcpyDefault); - cudaMemcpy(dests, destinations, sizeof(T) * nnz, cudaMemcpyDefault); - if (edgeWeights) - cudaMemcpy(weights, edgeWeights, sizeof(W) * nnz, cudaMemcpyDefault); - - // Call Thrust::sort_by_key to sort the arrays with srcs as keys: - if (edgeWeights) - thrust::sort_by_key(thrust::device, - srcs, - srcs + nnz, - thrust::make_zip_iterator(thrust::make_tuple(dests, weights))); - else - thrust::sort_by_key(thrust::device, srcs, srcs + nnz, dests); - - result.size = maxId + 1; - - // Allocate offsets array - cudaMalloc(&result.rowOffsets, (maxId + 2) * sizeof(T)); - - // Set all values in offsets array to zeros - cudaMemset(result.rowOffsets, 0, (maxId + 2) * sizeof(T)); - - // Allocate temporary arrays same size as sources array, and single value to get run counts - T* unique, *counts, *runCount; - cudaMalloc(&unique, (maxId + 1) * sizeof(T)); - cudaMalloc(&counts, (maxId + 1) * sizeof(T)); - cudaMalloc(&runCount, sizeof(T)); - - // Use CUB run length encoding to get unique values and run lengths - void *tmpStorage = NULL; - size_t tmpBytes = 0; - cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); - cudaMalloc(&tmpStorage, tmpBytes); - cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); - cudaFree(tmpStorage); - - // Set offsets to run sizes for each index - T runCount_h; - cudaMemcpy(&runCount_h, runCount, sizeof(T), cudaMemcpyDefault); - int threadsPerBlock = 1024; - int numBlocks = min(65535, (runCount_h + threadsPerBlock - 1) / threadsPerBlock); - offsetsKernel<<>>(runCount_h, unique, counts, result.rowOffsets); - - // Scan offsets to get final offsets - thrust::exclusive_scan(thrust::device, - result.rowOffsets, - result.rowOffsets + maxId + 2, - result.rowOffsets); - - // Clean up temporary allocations - result.nnz = nnz; - result.colIndices = dests; - result.edgeWeights = weights; - cudaFree(srcs); - cudaFree(unique); - cudaFree(counts); - cudaFree(runCount); - } - - /** - * Describes the 2D decomposition of a partitioned matrix. - */ - template - class MatrixDecompositionDescription { - protected: - GlobalType numRows; // Global number of rows in matrix - GlobalType numCols; // Global number of columns in matrix - GlobalType nnz; // Global number of non-zeroes in matrix - GlobalType blockRows; // Number of rows of blocks in the decomposition - GlobalType blockCols; // Number of columns of rows in the decomposition - LocalType offset; - // Offsets-like arrays for rows and columns defining the start/end of the - // sections of the global id space belonging to each row and column. 
- std::vector rowOffsets; - std::vector colOffsets; - // Array of integers one for each block, defining the device it is assigned to - std::vector deviceAssignments; - std::vector blockStreams; - public: - - MatrixDecompositionDescription() : - numRows(0), numCols(0), nnz(0), blockRows(0), blockCols(0) { - rowOffsets.push_back(0); - colOffsets.push_back(0); - deviceAssignments.push_back(0); - } - - // Basic constructor, just takes in the values of its members. - MatrixDecompositionDescription(GlobalType numRows, - GlobalType numCols, - GlobalType nnz, - GlobalType blockRows, - GlobalType blockCols, - std::vector rowOffsets, - std::vector colOffsets, - std::vector deviceAssignments) : - numRows(numRows), numCols(numCols), nnz(nnz), blockRows(blockRows), - blockCols(blockCols), rowOffsets(rowOffsets), colOffsets(colOffsets), - deviceAssignments(deviceAssignments) { - } - - // Constructs a MatrixDecompositionDescription for a square matrix given the - // number of rows in the matrix and number of rows of blocks. - MatrixDecompositionDescription(GlobalType numRows, - GlobalType numBlockRows, - GlobalType nnz, - std::vector devices) : - numRows(numRows), - numCols(numRows), - blockRows(numBlockRows), - blockCols(numBlockRows), - nnz(nnz) { - // Tracking the current set device to change back - int currentDevice; - cudaGetDevice(¤tDevice); - - // Setting up the row and col offsets into equally sized chunks - GlobalType remainder = numRows % blockRows; - if (remainder != 0) - offset = (numRows + blockRows - remainder) / blockRows; - else - offset = numRows / blockRows; - - rowOffsets.resize(blockRows + 1); - colOffsets.resize(blockRows + 1); - for (int i = 0; i < blockRows; i++) { - rowOffsets[i] = i * offset; - colOffsets[i] = i * offset; - } - rowOffsets.back() = blockRows * offset; - colOffsets.back() = blockCols * offset; - - // Setting up the device assignments using the given device ids and also - // setting up the stream associated with each block. - deviceAssignments.resize(getNumBlocks()); - blockStreams.resize(getNumBlocks()); - for (int i = 0; i < getNumBlocks(); i++) { - int device = devices[i % devices.size()]; - deviceAssignments[i] = device; - cudaSetDevice(device); - cudaStream_t stream; - cudaStreamCreate(&stream); - blockStreams[i] = stream; - } - - // Restoring to current device when called - cudaSetDevice(currentDevice); - } - - // Gets the row id for the block containing the given global row id - int32_t getRowId(GlobalType val) const { - return std::upper_bound(rowOffsets.begin(), rowOffsets.end(), val) - rowOffsets.begin() - 1; - } - - // Gets the column id for the block containing the given global column id - int32_t getColId(GlobalType val) const { - return std::upper_bound(colOffsets.begin(), colOffsets.end(), val) - colOffsets.begin() - 1; - } - - // Gets the number of blocks in the decomposition: - int32_t getNumBlocks() const { - return blockRows * blockCols; - } - - // Getter for offset - LocalType getOffset() const { - return offset; - } - - // Getter for deviceAssignments - const std::vector& getDeviceAssignments() const { - return deviceAssignments; - } - - /** - * Getter for vector of streams for each block. 
- * @return Reference to vector of streams for each block - */ - const std::vector& getBlockStreams() const { - return blockStreams; - } - - /** - * Getter for nnz - * @return The global number of non-zero elements - */ - GlobalType getNnz() const { - return nnz; - } - - /** - * Getter method for numRows - * @return The number of global rows in the matrix - */ - GlobalType getNumRows() const { - return numRows; - } - - /** - * Getter for BlockRows - * @return The number of blocks in a row in the decomposition. - */ - GlobalType getBlockRows() const { - return blockRows; - } - - /** - * Getter for BlockCols - * @return The number of blocks in a column in the decomposition. - */ - GlobalType getBlockCols() const { - return blockCols; - } - - /** - * Given a block id, returns the row which that block is in. - * @param bId The block ID - * @return The row number - */ - int32_t getBlockRow(int32_t bId) const { - return bId / blockCols; - } - - /** - * Given a block id, returns the column which that block is in. - * @param bId The block ID - * @return The column number - */ - int32_t getBlockCol(int32_t bId) const { - return bId % blockCols; - } - - /** - * Takes a COO global row and produces the COO local row and the block to which it belongs. - * @param globalRow The global row ID - * @param globalCol The global column ID - * @param localRow The block local row ID (return) - * @param localCol The block local column ID (return) - * @param blockId The block ID (return) - */ - void convertGlobaltoLocalRow(GlobalType globalRow, - GlobalType globalCol, - LocalType& localRow, - LocalType& localCol, - int32_t& blockId) const { - int32_t rowId = getRowId(globalRow); - int32_t colId = getColId(globalCol); - blockId = rowId * blockCols + colId; - localRow = globalRow - rowOffsets[rowId]; - localCol = globalCol - colOffsets[colId]; - } - - /** - * Takes in a row ID and column ID and returns the corresponding block ID - * @param rowId The row ID - * @param colId The column ID - * @return The ID of the corresponding block - */ - int32_t getBlockId(int32_t rowId, int32_t colId) const { - return rowId * blockCols + colId; - } - - /** - * Helper method to synchronize all streams after operations are issued. - */ - void syncAllStreams() const { - int32_t numBlocks = getNumBlocks(); - int32_t current_device; - cudaGetDevice(¤t_device); - for (int32_t i = 0; i < numBlocks; i++) { - cudaSetDevice(deviceAssignments[i]); - cudaStreamSynchronize(blockStreams[i]); - } - cudaSetDevice(current_device); - } - - /** - * This method is only for testing and debugging use. - * @return A human readable string representation of the object - */ - std::string toString() const { - std::stringstream ss; - ss << "Global Info:\n\tnumRows: " << numRows << ", numCols: " << numCols << ", nnz: " - << nnz; - ss << "\n"; - ss << "Block Info:\n\tblockRows: " << blockRows << ", blockCols: " << blockCols; - ss << "\n"; - ss << "rowOffsets: ["; - for (int i = 0; i < (int) rowOffsets.size(); i++) - ss << rowOffsets[i] << (i == (int) rowOffsets.size() - 1 ? "]\n" : ", "); - ss << "colOffsets: ["; - for (int i = 0; i < (int) colOffsets.size(); i++) - ss << colOffsets[i] << (i == (int) colOffsets.size() - 1 ? "]\n" : ", "); - ss << "deviceAssignments: ["; - for (int i = 0; i < (int) deviceAssignments.size(); i++) - ss << deviceAssignments[i] << (i == (int) deviceAssignments.size() - 1 ? 
"]\n" : ", "); - return ss.str(); - } - }; - - template - class Matrix2d { - protected: - // Description of the matrix decomposition - MatrixDecompositionDescription description; - - // Array of block matrices forming the decomposition - std::vector*> blocks; - public: - Matrix2d() { - } - Matrix2d(MatrixDecompositionDescription descr, - std::vector*> blocks) : - description(descr), blocks(blocks) { - } - - const MatrixDecompositionDescription& getMatrixDecompositionDescription() { - return description; - } - - MultiValuedCsrGraph* getBlockMatrix(int32_t bId) { - return blocks[bId]; - } - - std::string toString() { - std::stringstream ss; - ss << "MatrixDecompositionDescription:\n" << description.toString(); - for (int i = 0; i < (int) blocks.size(); i++) { - ss << "Block " << i << ":\n"; - size_t numVerts = blocks[i]->get_num_vertices(); - size_t numEdges = blocks[i]->get_num_edges(); - size_t numValues = blocks[i]->getNumValues(); - ss << "numVerts: " << numVerts << ", numEdges: " << numEdges << "\n"; - LocalType* rowOffsets = (LocalType*) malloc((numVerts + 1) * sizeof(LocalType)); - LocalType* colIndices = (LocalType*) malloc(numEdges * sizeof(LocalType)); - ValueType* values = NULL; - if (numValues > 0) - values = (ValueType*) malloc(numEdges * sizeof(ValueType)); - cudaMemcpy(rowOffsets, - blocks[i]->get_raw_row_offsets(), - (numVerts + 1) * sizeof(LocalType), - cudaMemcpyDefault); - cudaMemcpy(colIndices, - blocks[i]->get_raw_column_indices(), - numEdges * sizeof(LocalType), - cudaMemcpyDefault); - if (values) - cudaMemcpy(values, - blocks[i]->get_raw_edge_dim(0), - numEdges * sizeof(ValueType), - cudaMemcpyDefault); - int idxCount = numEdges >= (numVerts + 1) ? numEdges : (numVerts + 1); - ss << "Idx\tOffset\tColInd\tValue\n"; - for (int j = 0; j < idxCount; j++) { - if (j < (int) numVerts + 1 && j < (int) numEdges) - ss << j << ":\t" << rowOffsets[j] << "\t" << colIndices[j] << "\t" - << (values ? values[j] : 0) - << "\n"; - else if (j < (int) numVerts + 1 && j >= (int) numEdges) - ss << j << ":\t" << rowOffsets[j] << "\n"; - else if (j >= (int) numVerts + 1 && j < (int) numEdges) - ss << j << ":\t" << "\t" << colIndices[j] << "\t" << (values ? values[j] : 0) - << "\n"; - } - free(rowOffsets); - free(colIndices); - free(values); - } - return ss.str(); - } - }; - - template - class VertexData2D { - const MatrixDecompositionDescription* description; - int32_t n; - std::vector > values; - public: - /** - * Creates a VertexData2D object given a pointer to a MatrixDecompositionDescription - * object which describes the matrix the data is attached to. Data buffers are - * allocated for each block using the offset from the description to size the - * buffers, and to locate the buffers on the same GPU as the matrix block. 
- */ - VertexData2D(const MatrixDecompositionDescription* descr) : - description(descr) { - // Resize the values array to be the same size as number of blocks - values.resize(descr->getNumBlocks()); - - // Grab the current device id to switch back after allocations are done - int current_device; - cudaGetDevice(¤t_device); - LocalType allocSize = descr->getOffset(); - n = allocSize; - // Allocate the data for each block - for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { - int device = descr->getDeviceAssignments()[i]; - cudaSetDevice(device); - ValueType* d_current, *d_alternate; - cudaMalloc(&d_current, sizeof(ValueType) * n); - cudaMalloc(&d_alternate, sizeof(ValueType) * n); - values[i].d_buffers[0] = d_current; - values[i].d_buffers[1] = d_alternate; - } - - // Set the device back to what it was initially - cudaSetDevice(current_device); - } - - /** - * Creates a VertexData2D object given a pointer to a MatrixDecompositionDescription - * object, which describes the matrix the data is attached to, and an integer which indicates - * how many data elements should be allocated for each block. Data buffers are allocated - * for each block using the offset from the description to size the buffers, and to locate - * the buffers on the same GPU as the matrix block. - */ - VertexData2D(const MatrixDecompositionDescription* descr, size_t _n) : - description(descr) { - // Resize the values array to be the same size as number of blocks - values.resize(descr->getNumBlocks()); - - // Grab the current device id to switch back after allocations are done - int current_device; - cudaGetDevice(¤t_device); - LocalType allocSize = _n; - n = allocSize; - // Allocate the data for each block - for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { - int device = descr->getDeviceAssignments()[i]; - cudaSetDevice(device); - ValueType* d_current, *d_alternate; - cudaMalloc(&d_current, sizeof(ValueType) * n); - cudaMalloc(&d_alternate, sizeof(ValueType) * n); - values[i].d_buffers[0] = d_current; - values[i].d_buffers[1] = d_alternate; - } - - // Set the device back to what it was initially - cudaSetDevice(current_device); - } - - ~VertexData2D() { - for (size_t i = 0; i < values.size(); i++) { - if (values[i].Current()) - cudaFree(values[i].Current()); - if (values[i].Alternate()) - cudaFree(values[i].Alternate()); - } - } - - /** - * Getter for n the size of each block's allocation in elements. - * @return The value of n - */ - int32_t getN() { - return n; - } - - /** - * Getter for the MatrixDecompositionDescription associated with this VertexData2D - * @return Pointer to the MatrixDecompositionDescription for this VertexData2D - */ - const MatrixDecompositionDescription* getDescription() { - return description; - } - - /** - * Gets the current buffer corresponding to the given block ID - */ - ValueType* getCurrent(int bId) { - return values[bId].Current(); - } - - /** - * Gets the alternate buffer corresponding to the given block ID - */ - ValueType* getAlternate(int bId) { - return values[bId].Alternate(); - } - - /** - * Swaps the current and alternate buffers for all block IDs - */ - void swapBuffers() { - for (size_t i = 0; i < values.size(); i++) - values[i].selector ^= 1; - } - - /** - * Sets an element in the global array, assuming that the data is currently - * valid and in the diagonal blocks. After calling this method either columnScatter - * or rowScatter should be called to propagate the change to all blocks. 
- */ - void setElement(GlobalType globalIndex, ValueType val) { - LocalType blockId = globalIndex / n; - LocalType blockOffset = globalIndex % n; - int32_t bId = description->getBlockId(blockId, blockId); - ValueType* copyTo = values[bId].Current() + blockOffset; - cudaMemcpy(copyTo, &val, sizeof(ValueType), cudaMemcpyDefault); - } - - /** - * Sets the elements of the global array, using the provided array of values. The values - * are set in the blocks of the diagonal, columnScatter or rowScatter should be called - * to propogate to all blocks. - * @param vals Pointer to an array with the values to be set. - */ - void setElements(ValueType* vals) { - LocalType offset = description->getOffset(); - int32_t numRows = description->getBlockRows(); - for (int i = 0; i < numRows; i++) { - int32_t id = description->getBlockId(i, i); - cudaStream_t stream = description->getBlockStreams()[id]; - ValueType* copyFrom = vals + i * n; - ValueType* copyTo = values[id].Current(); - cudaMemcpyAsync(copyTo, copyFrom, sizeof(ValueType) * n, cudaMemcpyDefault, stream); - } - description->syncAllStreams(); - } - - /** - * Fills the elements of the data array with the given value. - * The elements on the diagonal are filled with the given value. After filling, - * either rowScatter or columnScatter will copy the values across the blocks in - * either the rows or columns depending on the use. - * @param val The value to fill the array with - */ - void fillElements(ValueType val) { - int current_device; - cudaGetDevice(¤t_device); - int32_t numRows = description->getBlockRows(); - for (int32_t i = 0; i < numRows; i++) { - int32_t blockId = description->getBlockId(i, i); - ValueType* vals = getCurrent(blockId); - int deviceId = description->getDeviceAssignments()[blockId]; - cudaStream_t stream = description->getBlockStreams()[blockId]; - cudaSetDevice(deviceId); - thrust::fill(thrust::cuda::par.on(stream), vals, vals + n, val); - } - description->syncAllStreams(); - cudaSetDevice(current_device); - } - - /** - * Copies the values of the diagonal blocks in this VertexData2D into the - * VertexData2D specified. - * @param other Pointer to the VertexData2D to copy into - */ - void copyTo(VertexData2D* other) { - const MatrixDecompositionDescription* otherDescr = - other->getDescription(); - // Do a quick check that the sizes of both block arrays are the same. - if (description->getBlockRows() == otherDescr->getBlockRows() && n == other->getN()) { - // Issue asynchronous copies for each block's data - for (int i = 0; i < description->getBlockRows(); i++) { - int32_t bId = description->getBlockId(i, i); - ValueType* copyFrom = getCurrent(bId); - ValueType* copyTo = other->getCurrent(bId); - cudaStream_t stream = description->getBlockStreams()[bId]; - cudaMemcpyAsync(copyTo, copyFrom, n * sizeof(ValueType), cudaMemcpyDefault, stream); - } - // Synchronize the streams after the copies are done - for (int i = 0; i < description->getBlockRows(); i++) { - int32_t bId = description->getBlockId(i, i); - cudaStream_t stream = description->getBlockStreams()[bId]; - cudaStreamSynchronize(stream); - } - } - } - - /** - * This method implements a row-wise reduction of each blocks data into a - * single array for each row. The block on the diagonal will have the result. 
- */ - template - void rowReduce() { - int current_device; - cudaGetDevice(¤t_device); - Operator op; - - // For each row in the decomposition: - int32_t numRows = description->getBlockRows(); - std::vector blockIds; - for (int32_t i = 0; i < numRows; i++) { - // Put all the block ids for the row into a vector, with the ID of the diagonal block - // at index 0. - std::vector blockIds; - blockIds.push_back(-1); - for (int32_t j = 0; j < numRows; j++) { - if (i == j) { - blockIds[0] = description->getBlockId(i, j); - } - else { - blockIds.push_back(description->getBlockId(i, j)); - } - } - - // Do a binary tree reduction. At each step the primary buffer of the sender is - // copied into the secondary buffer of the receiver. After the copy is done - // each receiver performs the reduction operator and stores the result in it's - // primary buffer. - for (int32_t j = 2; (j / 2) < numRows; j *= 2) { - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id] is the receiver - int32_t receiverId = blockIds[id]; - - // blockIds[id + j/2] is the sender - int32_t senderId = blockIds[id + j / 2]; - - // Get the stream associated with the receiver's block id - cudaStream_t stream = description->getBlockStreams()[receiverId]; - - // Copy from the sender to the receiver (use stream associated with receiver) - cudaMemcpyAsync(values[receiverId].Alternate(), - values[senderId].Current(), - sizeof(ValueType) * n, - cudaMemcpyDefault, - stream); - - // Invoke the reduction operator on the receiver's GPU and values arrays. - cudaSetDevice(description->getDeviceAssignments()[receiverId]); - ValueType* input1 = values[receiverId].Alternate(); - ValueType* input2 = values[receiverId].Current(); - thrust::transform(thrust::cuda::par.on(stream), - input1, - input1 + n, - input2, - input2, - op); - } - } - // Sync all active streams before next step - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id] is the receiver - int32_t receiverId = blockIds[id]; - - // Set the device to the receiver and sync the stream - cudaSetDevice(description->getDeviceAssignments()[receiverId]); - cudaStreamSynchronize(description->getBlockStreams()[receiverId]); - } - } - } - } - - cudaSetDevice(current_device); - } - - /** - * This method implements a column-wise reduction of each blocks data into a - * single array for each column. The block on the diagonal will have the result. - */ - template - void columnReduce() { - int current_device; - cudaGetDevice(¤t_device); - Operator op; - - // For each column in the decomposition: - int32_t numRows = description->getBlockRows(); - std::vector blockIds; - for (int32_t i = 0; i < numRows; i++) { - // Put all the block ids for the row into a vector, with the ID of the diagonal block - // at index 0. - std::vector blockIds; - blockIds.push_back(-1); - for (int32_t j = 0; j < numRows; j++) { - if (i == j) { - blockIds[0] = description->getBlockId(j, i); - } - else { - blockIds.push_back(description->getBlockId(j, i)); - } - } - - // Do a binary tree reduction. At each step the primary buffer of the sender is - // copied into the secondary buffer of the receiver. After the copy is done - // each receiver performs the reduction operator and stores the result in it's - // primary buffer. 
- for (int32_t j = 2; (j / 2) < numRows; j *= 2) { - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id] is the receiver - int32_t receiverId = blockIds[id]; - - // blockIds[id + j/2] is the sender - int32_t senderId = blockIds[id + j / 2]; - - // Get the stream associated with the receiver's block id - cudaStream_t stream = description->getBlockStreams()[receiverId]; - - // Copy from the sender to the receiver (use stream associated with receiver) - cudaMemcpyAsync(values[receiverId].Alternate(), - values[senderId].Current(), - sizeof(ValueType) * n, - cudaMemcpyDefault, - stream); - - // Invoke the reduction operator on the receiver's GPU and values arrays. - cudaSetDevice(description->getDeviceAssignments()[receiverId]); - ValueType* input1 = values[receiverId].Alternate(); - ValueType* input2 = values[receiverId].Current(); - thrust::transform(thrust::cuda::par.on(stream), - input1, - input1 + n, - input2, - input2, - op); - } - } - // Sync all active streams before next step - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id] is the receiver - int32_t receiverId = blockIds[id]; - - // Set the device to the receiver and sync the stream - cudaSetDevice(description->getDeviceAssignments()[receiverId]); - cudaStreamSynchronize(description->getBlockStreams()[receiverId]); - } - } - } - } - - cudaSetDevice(current_device); - } - - /** - * This implements a column-wise scatter of the global data from the corresponding - * row. i.e. The data reduced from row 1 is broadcast to all blocks in - * column 1. It is assumed that the data to broadcast is located in the block on - * the diagonal. - */ - void columnScatter() { - int current_device; - cudaGetDevice(¤t_device); - - // For each column in the decomposition: - int32_t numRows = description->getBlockRows(); - std::vector blockIds; - for (int32_t i = 0; i < numRows; i++) { - // Put all the block ids for the column into a vector, with the ID of the diagonal block - // at index 0. - std::vector blockIds; - blockIds.push_back(-1); - for (int32_t j = 0; j < numRows; j++) { - if (i == j) { - blockIds[0] = description->getBlockId(j, i); - } - else { - blockIds.push_back(description->getBlockId(j, i)); - } - } - - // Do a binary tree scatter. At each step the primary buffer of the sender is - // copied into the primary buffer of the receiver. - int32_t max2pow = 2; - while (max2pow < numRows) { - max2pow *= 2; - } - for (int32_t j = max2pow; j >= 2; j /= 2) { - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id] is the sender - int32_t senderId = blockIds[id]; - - // blockIds[id + j/2] is the sender - int32_t receiverId = blockIds[id + j / 2]; - - // Get the stream associated with the receiver's block id - cudaStream_t stream = description->getBlockStreams()[receiverId]; - - // Copy from the sender to the receiver (use stream associated with receiver) - cudaMemcpyAsync(values[receiverId].Current(), - values[senderId].Current(), - sizeof(ValueType) * n, - cudaMemcpyDefault, - stream); - } - } - // Synchronize all the active streams before next step. 
- for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id + j/2] is the sender - int32_t receiverId = blockIds[id + j / 2]; - - // Set device and sync receiver's stream - cudaSetDevice(description->getDeviceAssignments()[receiverId]); - cudaStreamSynchronize(description->getBlockStreams()[receiverId]); - } - } - } - } - - cudaSetDevice(current_device); - } - - /** - * This implements a row-wise scatter of the global data from the corresponding - * column. i.e. The data reduced from column 1 is broadcast to all blocks in - * row 1. It is assumed that the data to broadcast is located in the block on - * the diagonal. - */ - void rowScatter() { - int current_device; - cudaGetDevice(¤t_device); - - // For each row in the decomposition: - int32_t numRows = description->getBlockRows(); - std::vector blockIds; - for (int32_t i = 0; i < numRows; i++) { - // Put all the block ids for the column into a vector, with the ID of the diagonal block - // at index 0. - std::vector blockIds; - blockIds.push_back(-1); - for (int32_t j = 0; j < numRows; j++) { - if (i == j) { - blockIds[0] = description->getBlockId(i, j); - } - else { - blockIds.push_back(description->getBlockId(i, j)); - } - } - - // Do a binary tree scatter. At each step the primary buffer of the sender is - // copied into the primary buffer of the receiver. - int32_t max2pow = 2; - while (max2pow < numRows) { - max2pow *= 2; - } - for (int32_t j = max2pow; j >= 2; j /= 2) { - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id] is the sender - int32_t senderId = blockIds[id]; - - // blockIds[id + j/2] is the receiver - int32_t receiverId = blockIds[id + j / 2]; - - // Get the stream associated with the receiver's block id - cudaStream_t stream = description->getBlockStreams()[receiverId]; - - // Copy from the sender to the receiver (use stream associated with receiver) - cudaMemcpyAsync(values[receiverId].Current(), - values[senderId].Current(), - sizeof(ValueType) * n, - cudaMemcpyDefault, - stream); - } - } - // Sync all the active streams before next step - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id + j/2] is the receiver - int32_t receiverId = blockIds[id + j / 2]; - - // Set device and sync receiver's stream - cudaSetDevice(description->getDeviceAssignments()[receiverId]); - cudaStreamSynchronize(description->getBlockStreams()[receiverId]); - } - } - } - } - - cudaSetDevice(current_device); - } - - /** - * Outputs a human readable string representation of this Vertex2d object. This is only - * intended to be used for de-bugging. 
- * @return Human readable string representation - */ - std::string toString() { - std::stringstream ss; - ValueType* c = (ValueType*) malloc(sizeof(ValueType) * n); - ValueType* a = (ValueType*) malloc(sizeof(ValueType) * n); - - int32_t numBlocks = description->getNumBlocks(); - - ss << "Vertex2d:\n"; - for (int32_t i = 0; i < numBlocks; i++) { - ss << "Block " << i << ":\n"; - ss << "Idx\tCur\tAlt\n"; - cudaMemcpy(c, values[i].Current(), sizeof(ValueType) * n, cudaMemcpyDefault); - cudaMemcpy(a, values[i].Alternate(), sizeof(ValueType) * n, cudaMemcpyDefault); - for (int32_t j = 0; j < n; j++) { - ss << j << ":\t" << c[j] << "\t" << a[j] << "\n"; - } - } - - free(c); - free(a); - - return ss.str(); - } - }; - - template - class VertexData2D_Unbuffered { - const MatrixDecompositionDescription* description; - int32_t n; - std::vector values; - - public: - /** - * Sets up a VertexData2D_Unbuffered object with an element allocated for each vertex - * in each block. - * @param descr Pointer to a MatrixDecompositionDescription object describing the layout - * of the 2D blocks. - */ - VertexData2D_Unbuffered(const MatrixDecompositionDescription* descr) : - description(descr) { - // Resize the values array to be the same size as number of blocks - values.resize(descr->getNumBlocks()); - - // Grab the current device id to switch back after allocations are done - int current_device; - cudaGetDevice(¤t_device); - LocalType allocSize = descr->getOffset(); - n = allocSize; - // Allocate the data for each block - for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { - int device = descr->getDeviceAssignments()[i]; - cudaSetDevice(device); - cudaMalloc(&(values[i]), sizeof(ValueType) * n); - } - - // Set the device back to what it was initially - cudaSetDevice(current_device); - } - - /** - * Sets up a VertexData2D_Unbuffered object with _n elements allocated per block. - * @param descr Pointer to a MatrixDecompositionDescription object describing the layout - * of the 2D blocks. - * @param _n The number of elements to allocate per block. - */ - VertexData2D_Unbuffered(const MatrixDecompositionDescription* descr, - size_t _n) : - description(descr), n(_n) { - // Resize the values array to be the same size as number of blocks - values.resize(descr->getNumBlocks()); - - // Grab the current device id to switch back after allocations are done - int current_device; - cudaGetDevice(¤t_device); - // Allocate the data for each block - for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { - int device = descr->getDeviceAssignments()[i]; - cudaSetDevice(device); - cudaMalloc(&(values[i]), sizeof(ValueType) * n); - } - - // Set the device back to what it was initially - cudaSetDevice(current_device); - } - - /** - * Destructor. Frees all allocated memory. - */ - ~VertexData2D_Unbuffered() { - for (size_t i = 0; i < values.size(); i++) { - if (values[i]) { - cudaFree(values[i]); - } - } - } - - /** - * Fills the elements of the data array with the given value. - * The elements on the diagonal are filled with the given value. After filling, - * either rowScatter or columnScatter will copy the values across the blocks in - * either the rows or columns depending on the use. 
- * @param val The value to fill the array with - */ - void fillElements(ValueType val) { - int current_device; - cudaGetDevice(¤t_device); - int32_t numRows = description->getBlockRows(); - for (int32_t i = 0; i < numRows; i++) { - int32_t blockId = description->getBlockId(i, i); - ValueType* vals = get(blockId); - int deviceId = description->getDeviceAssignments()[blockId]; - cudaStream_t stream = description->getBlockStreams()[blockId]; - cudaSetDevice(deviceId); - thrust::fill(thrust::cuda::par.on(stream), vals, vals + n, val); - } - description->syncAllStreams(); - cudaSetDevice(current_device); - } - - /** - * This implements a column-wise scatter of the global data from the corresponding - * row. i.e. The data reduced from row 1 is broadcast to all blocks in - * column 1. It is assumed that the data to broadcast is located in the block on - * the diagonal. - */ - void columnScatter() { - int current_device; - cudaGetDevice(¤t_device); - - // For each column in the decomposition: - int32_t numRows = description->getBlockRows(); - std::vector blockIds; - for (int32_t i = 0; i < numRows; i++) { - // Put all the block ids for the column into a vector, with the ID of the diagonal block - // at index 0. - std::vector blockIds; - blockIds.push_back(-1); - for (int32_t j = 0; j < numRows; j++) { - if (i == j) { - blockIds[0] = description->getBlockId(j, i); - } - else { - blockIds.push_back(description->getBlockId(j, i)); - } - } - - // Do a binary tree scatter. At each step the primary buffer of the sender is - // copied into the primary buffer of the receiver. - int32_t max2pow = 2; - while (max2pow < numRows) { - max2pow *= 2; - } - for (int32_t j = max2pow; j >= 2; j /= 2) { - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id] is the sender - int32_t senderId = blockIds[id]; - - // blockIds[id + j/2] is the sender - int32_t receiverId = blockIds[id + j / 2]; - - // Get the stream associated with the receiver's block id - cudaStream_t stream = description->getBlockStreams()[receiverId]; - - // Copy from the sender to the receiver (use stream associated with receiver) - cudaMemcpyAsync(values[receiverId], - values[senderId], - sizeof(ValueType) * n, - cudaMemcpyDefault, - stream); - } - } - // Synchronize all the active streams before next step. - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id + j/2] is the sender - int32_t receiverId = blockIds[id + j / 2]; - - // Set device and sync receiver's stream - cudaSetDevice(description->getDeviceAssignments()[receiverId]); - cudaStreamSynchronize(description->getBlockStreams()[receiverId]); - } - } - } - } - - cudaSetDevice(current_device); - } - - /** - * This implements a row-wise scatter of the global data from the corresponding - * column. i.e. The data reduced from column 1 is broadcast to all blocks in - * row 1. It is assumed that the data to broadcast is located in the block on - * the diagonal. - */ - void rowScatter() { - int current_device; - cudaGetDevice(¤t_device); - - // For each row in the decomposition: - int32_t numRows = description->getBlockRows(); - std::vector blockIds; - for (int32_t i = 0; i < numRows; i++) { - // Put all the block ids for the column into a vector, with the ID of the diagonal block - // at index 0. 
- std::vector blockIds; - blockIds.push_back(-1); - for (int32_t j = 0; j < numRows; j++) { - if (i == j) { - blockIds[0] = description->getBlockId(i, j); - } - else { - blockIds.push_back(description->getBlockId(i, j)); - } - } - - // Do a binary tree scatter. At each step the primary buffer of the sender is - // copied into the primary buffer of the receiver. - int32_t max2pow = 2; - while (max2pow < numRows) { - max2pow *= 2; - } - for (int32_t j = max2pow; j >= 2; j /= 2) { - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id] is the sender - int32_t senderId = blockIds[id]; - - // blockIds[id + j/2] is the receiver - int32_t receiverId = blockIds[id + j / 2]; - - // Get the stream associated with the receiver's block id - cudaStream_t stream = description->getBlockStreams()[receiverId]; - - // Copy from the sender to the receiver (use stream associated with receiver) - cudaMemcpyAsync(values[receiverId], - values[senderId], - sizeof(ValueType) * n, - cudaMemcpyDefault, - stream); - } - } - // Sync all the active streams before next step - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id + j/2] is the receiver - int32_t receiverId = blockIds[id + j / 2]; - - // Set device and sync receiver's stream - cudaSetDevice(description->getDeviceAssignments()[receiverId]); - cudaStreamSynchronize(description->getBlockStreams()[receiverId]); - } - } - } - } - - cudaSetDevice(current_device); - } - - /** - * Getter for n - * @return The value of n - */ - int32_t getN() { - return n; - } - - /** - * Gets the pointer to the allocated memory for a specified block. - * @param bId The block id to get the memory for. - * @return A pointer to the allocated memory for the given block. - */ - ValueType* get(int32_t bId) { - return values[bId]; - } - }; - - /** - * This method takes in COO format matrix data and a MatrixDecompositionDescription and - * returns a Matrix2d object containing the given data. 
- */ - template - Matrix2d COOto2d(MatrixDecompositionDescription descr, - GlobalType* rowIds, - GlobalType* colIds, - ValueType* values) { - // Grab the current device id to switch back after allocations are done - int current_device; - cudaGetDevice(¤t_device); - - int32_t blockCount = descr.getNumBlocks(); - - // Allocate array of size global nnz to hold the block labels - int32_t* blockLabels = (int32_t*) malloc(descr.getNnz() * sizeof(int32_t)); - - // Allocate array to contain row counts for each block and initialize to zero - // Allocate array to contain position offsets for writing each blocks data - LocalType* blockCounts = (LocalType*) malloc(blockCount * sizeof(LocalType)); - LocalType* blockPos = (LocalType*) malloc(blockCount * sizeof(LocalType)); - for (int i = 0; i < blockCount; i++) { - blockCounts[i] = 0; - blockPos[i] = 0; - } - - // For each edge mark in the array the id of the block to which it will belong - int32_t blockId; - LocalType localRow; - LocalType localCol; - for (int i = 0; i < descr.getNnz(); i++) { - descr.convertGlobaltoLocalRow(rowIds[i], colIds[i], localRow, localCol, blockId); - blockLabels[i] = blockId; - blockCounts[blockId]++; - } - - // Allocate arrays for putting each blocks data into - LocalType** blockRowIds = (LocalType**) malloc(blockCount * sizeof(LocalType*)); - LocalType** blockColIds = (LocalType**) malloc(blockCount * sizeof(LocalType*)); - ValueType** blockValues = NULL; - if (values) - blockValues = (ValueType**) malloc(blockCount * sizeof(ValueType*)); - for (int i = 0; i < blockCount; i++) { - blockRowIds[i] = (LocalType*) malloc(blockCounts[i] * sizeof(LocalType)); - blockColIds[i] = (LocalType*) malloc(blockCounts[i] * sizeof(LocalType)); - if (values) - blockValues[i] = (ValueType*) malloc(blockCounts[i] * sizeof(ValueType)); - } - - // Convert each blocks global rows to local ids and copy into block arrays - for (int i = 0; i < descr.getNnz(); i++) { - descr.convertGlobaltoLocalRow(rowIds[i], colIds[i], localRow, localCol, blockId); - blockRowIds[blockId][blockPos[blockId]] = localRow; - blockColIds[blockId][blockPos[blockId]] = localCol; - if (values) - blockValues[blockId][blockPos[blockId]] = values[i]; - blockPos[blockId]++; - } - - // Allocate the result blocks vector - std::vector*> blockVector(blockCount); - - // Convert each blocks COO rows into CSR and create it's graph object. - for (int i = 0; i < blockCount; i++) { - // Set the device as indicated so the data ends up on the right GPU - cudaSetDevice(descr.getDeviceAssignments()[i]); - cudaStream_t stream = descr.getBlockStreams()[i]; - - if (blockCounts[i] > 0) { - CSR_Result_Weighted result; - ConvertCOOtoCSR_weighted(blockRowIds[i], - blockColIds[i], - values ? 
blockValues[i] : NULL, - (int64_t) blockCounts[i], - (descr.getOffset() - 1), - result); - MultiValuedCsrGraph* csrGraph = new MultiValuedCsrGraph((size_t) result.size, (size_t) result.nnz, stream); - if (values) - csrGraph->allocateEdgeData(1, NULL); - cudaMemcpy(csrGraph->get_raw_row_offsets(), - result.rowOffsets, - (result.size + 1) * sizeof(LocalType), - cudaMemcpyDefault); - cudaMemcpy(csrGraph->get_raw_column_indices(), - result.colIndices, - result.nnz * sizeof(LocalType), - cudaMemcpyDefault); - if (values) - cudaMemcpy(csrGraph->get_raw_edge_dim(0), - result.edgeWeights, - result.nnz * sizeof(LocalType), - cudaMemcpyDefault); - blockVector[i] = csrGraph; - result.Destroy(); - } - else { - MultiValuedCsrGraph* csrGraph = new MultiValuedCsrGraph((size_t) descr.getOffset(), (size_t) 0, stream); - cudaMemset( csrGraph->get_raw_row_offsets(), - 0, - sizeof(LocalType) * (descr.getOffset() + 1)); - blockVector[i] = csrGraph; - } - } - - // Free temporary memory - for (int i = 0; i < blockCount; i++) { - free(blockRowIds[i]); - free(blockColIds[i]); - if (values) - free(blockValues[i]); - } - free(blockRowIds); - free(blockColIds); - if (values) - free(blockValues); - - cudaSetDevice(current_device); - - // Put it all together into a Matrix2d object for return - return Matrix2d(descr, blockVector); - } + template + struct CSR_Result_Weighted { + int64_t size; + int64_t nnz; + T* rowOffsets; + T* colIndices; + W* edgeWeights; + + CSR_Result_Weighted() : + size(0), nnz(0), rowOffsets(NULL), colIndices(NULL), edgeWeights(NULL) { + } + + void Destroy() { + cudaStream_t stream{nullptr}; + if (rowOffsets) + RMM_FREE(rowOffsets, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + if (colIndices) + RMM_FREE(colIndices, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + if (edgeWeights) + RMM_FREE(edgeWeights, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + } + }; + + // Define kernel for copying run length encoded values into offset slots. + template + __global__ void offsetsKernel(T runCounts, T* unique, T* counts, T* offsets) { + for (int32_t idx = blockDim.x * blockIdx.x + threadIdx.x; + idx < runCounts; + idx += gridDim.x * blockDim.x) { + offsets[unique[idx]] = counts[idx]; + } + } + + /** + * Method for converting COO to CSR format + * @param sources The array of source indices + * @param destinations The array of destination indices + * @param edgeWeights The array of edge weights + * @param nnz The number of non zero values + * @param maxId The largest id contained in the matrix + * @param result The result is stored here. + */ + template + void ConvertCOOtoCSR_weighted(T* sources, + T* destinations, + W* edgeWeights, + int64_t nnz, + T maxId, + CSR_Result_Weighted& result) { + // Sort source and destination columns by source + // Allocate local memory for operating on + T* srcs, *dests; + W* weights = NULL; + cudaStream_t stream{nullptr}; + + RMM_ALLOC(&srcs, sizeof(T) * nnz, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. 
+ RMM_ALLOC(&dests, sizeof(T) * nnz, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + if (edgeWeights) + RMM_ALLOC(&weights, sizeof(W) * nnz, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + cudaMemcpy(srcs, sources, sizeof(T) * nnz, cudaMemcpyDefault); + cudaMemcpy(dests, destinations, sizeof(T) * nnz, cudaMemcpyDefault); + if (edgeWeights) + cudaMemcpy(weights, edgeWeights, sizeof(W) * nnz, cudaMemcpyDefault); + + // Call Thrust::sort_by_key to sort the arrays with srcs as keys: + if (edgeWeights) + thrust::sort_by_key(thrust::device, + srcs, + srcs + nnz, + thrust::make_zip_iterator(thrust::make_tuple(dests, weights))); + else + thrust::sort_by_key(thrust::device, srcs, srcs + nnz, dests); + + result.size = maxId + 1; + + // Allocate offsets array + RMM_ALLOC(&result.rowOffsets, (maxId + 2) * sizeof(T), stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + + // Set all values in offsets array to zeros + cudaMemset(result.rowOffsets, 0, (maxId + 2) * sizeof(T)); + + // Allocate temporary arrays same size as sources array, and single value to get run counts + T* unique, *counts, *runCount; + RMM_ALLOC(&unique, (maxId + 1) * sizeof(T), stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_ALLOC(&counts, (maxId + 1) * sizeof(T), stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_ALLOC(&runCount, sizeof(T), stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + + // Use CUB run length encoding to get unique values and run lengths + void *tmpStorage = NULL; + size_t tmpBytes = 0; + cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); + RMM_ALLOC(&tmpStorage, tmpBytes, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); + RMM_FREE(tmpStorage, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + + // Set offsets to run sizes for each index + T runCount_h; + cudaMemcpy(&runCount_h, runCount, sizeof(T), cudaMemcpyDefault); + int threadsPerBlock = 1024; + int numBlocks = min(65535, (runCount_h + threadsPerBlock - 1) / threadsPerBlock); + offsetsKernel<<>>(runCount_h, unique, counts, result.rowOffsets); + + // Scan offsets to get final offsets + thrust::exclusive_scan(thrust::device, + result.rowOffsets, + result.rowOffsets + maxId + 2, + result.rowOffsets); + + // Clean up temporary allocations + result.nnz = nnz; + result.colIndices = dests; + result.edgeWeights = weights; + RMM_FREE(srcs, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. 
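As a concrete trace of the conversion above (sort by source, run-length encode, scatter the run counts, exclusive scan), the short host-side sketch below reproduces the row-offset logic with std::vector. It is illustrative only and not part of the patch; the device path gets the same numbers from cub::DeviceRunLengthEncode::Encode, offsetsKernel and thrust::exclusive_scan.

#include <numeric>
#include <vector>

int main() {
  // Sorted COO sources for a 4-vertex graph (maxId = 3), nnz = 5.
  std::vector<int> srcs = {0, 0, 2, 2, 2};
  std::vector<int> rowOffsets(3 + 2, 0);   // maxId + 2 entries, zero-initialised

  // Run-length encode srcs and scatter each run length to rowOffsets[value]
  // (the role of cub::DeviceRunLengthEncode::Encode plus offsetsKernel).
  for (std::size_t i = 0; i < srcs.size();) {
    std::size_t run = 1;
    while (i + run < srcs.size() && srcs[i + run] == srcs[i]) ++run;
    rowOffsets[srcs[i]] = static_cast<int>(run);
    i += run;
  }
  // rowOffsets is now {2, 0, 3, 0, 0}

  // Exclusive scan turns per-row counts into CSR row offsets.
  std::exclusive_scan(rowOffsets.begin(), rowOffsets.end(), rowOffsets.begin(), 0);
  // rowOffsets is now {0, 2, 2, 5, 5}: row i's entries live in
  // dests[rowOffsets[i]] .. dests[rowOffsets[i + 1]).
  return 0;
}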
+ RMM_FREE(unique, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(counts, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(runCount, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + } + + /** + * Describes the 2D decomposition of a partitioned matrix. + */ + template + class MatrixDecompositionDescription { + protected: + GlobalType numRows; // Global number of rows in matrix + GlobalType numCols; // Global number of columns in matrix + GlobalType nnz; // Global number of non-zeroes in matrix + GlobalType blockRows; // Number of rows of blocks in the decomposition + GlobalType blockCols; // Number of columns of rows in the decomposition + LocalType offset; + // Offsets-like arrays for rows and columns defining the start/end of the + // sections of the global id space belonging to each row and column. + std::vector rowOffsets; + std::vector colOffsets; + // Array of integers one for each block, defining the device it is assigned to + std::vector deviceAssignments; + std::vector blockStreams; + public: + + MatrixDecompositionDescription() : + numRows(0), numCols(0), nnz(0), blockRows(0), blockCols(0) { + rowOffsets.push_back(0); + colOffsets.push_back(0); + deviceAssignments.push_back(0); + } + + // Basic constructor, just takes in the values of its members. + MatrixDecompositionDescription(GlobalType numRows, + GlobalType numCols, + GlobalType nnz, + GlobalType blockRows, + GlobalType blockCols, + std::vector rowOffsets, + std::vector colOffsets, + std::vector deviceAssignments) : + numRows(numRows), numCols(numCols), nnz(nnz), blockRows(blockRows), + blockCols(blockCols), rowOffsets(rowOffsets), colOffsets(colOffsets), + deviceAssignments(deviceAssignments) { + } + + // Constructs a MatrixDecompositionDescription for a square matrix given the + // number of rows in the matrix and number of rows of blocks. + MatrixDecompositionDescription(GlobalType numRows, + GlobalType numBlockRows, + GlobalType nnz, + std::vector devices) : + numRows(numRows), + numCols(numRows), + blockRows(numBlockRows), + blockCols(numBlockRows), + nnz(nnz) { + // Tracking the current set device to change back + int currentDevice; + cudaGetDevice(¤tDevice); + + // Setting up the row and col offsets into equally sized chunks + GlobalType remainder = numRows % blockRows; + if (remainder != 0) + offset = (numRows + blockRows - remainder) / blockRows; + else + offset = numRows / blockRows; + + rowOffsets.resize(blockRows + 1); + colOffsets.resize(blockRows + 1); + for (int i = 0; i < blockRows; i++) { + rowOffsets[i] = i * offset; + colOffsets[i] = i * offset; + } + rowOffsets.back() = blockRows * offset; + colOffsets.back() = blockCols * offset; + + // Setting up the device assignments using the given device ids and also + // setting up the stream associated with each block. 
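// For illustration only: with numRows = 10 and blockRows = 3, remainder = 1,
// so offset = (10 + 3 - 1) / 3 = 4 and rowOffsets = colOffsets = {0, 4, 8, 12};
// the last row/column of blocks covers the padded id range [8, 12) even though
// only global ids 8 and 9 exist. With devices = {0, 1} (hypothetical), the
// nine blocks below are assigned round-robin: 0, 1, 0, 1, 0, 1, 0, 1, 0.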
+ deviceAssignments.resize(getNumBlocks()); + blockStreams.resize(getNumBlocks()); + for (int i = 0; i < getNumBlocks(); i++) { + int device = devices[i % devices.size()]; + deviceAssignments[i] = device; + cudaSetDevice(device); + cudaStream_t stream; + cudaStreamCreate(&stream); + blockStreams[i] = stream; + } + + // Restoring to current device when called + cudaSetDevice(currentDevice); + } + + // Gets the row id for the block containing the given global row id + int32_t getRowId(GlobalType val) const { + return std::upper_bound(rowOffsets.begin(), rowOffsets.end(), val) - rowOffsets.begin() - 1; + } + + // Gets the column id for the block containing the given global column id + int32_t getColId(GlobalType val) const { + return std::upper_bound(colOffsets.begin(), colOffsets.end(), val) - colOffsets.begin() - 1; + } + + // Gets the number of blocks in the decomposition: + int32_t getNumBlocks() const { + return blockRows * blockCols; + } + + // Getter for offset + LocalType getOffset() const { + return offset; + } + + // Getter for deviceAssignments + const std::vector& getDeviceAssignments() const { + return deviceAssignments; + } + + /** + * Getter for vector of streams for each block. + * @return Reference to vector of streams for each block + */ + const std::vector& getBlockStreams() const { + return blockStreams; + } + + /** + * Getter for nnz + * @return The global number of non-zero elements + */ + GlobalType getNnz() const { + return nnz; + } + + /** + * Getter method for numRows + * @return The number of global rows in the matrix + */ + GlobalType getNumRows() const { + return numRows; + } + + /** + * Getter for BlockRows + * @return The number of blocks in a row in the decomposition. + */ + GlobalType getBlockRows() const { + return blockRows; + } + + /** + * Getter for BlockCols + * @return The number of blocks in a column in the decomposition. + */ + GlobalType getBlockCols() const { + return blockCols; + } + + /** + * Given a block id, returns the row which that block is in. + * @param bId The block ID + * @return The row number + */ + int32_t getBlockRow(int32_t bId) const { + return bId / blockCols; + } + + /** + * Given a block id, returns the column which that block is in. + * @param bId The block ID + * @return The column number + */ + int32_t getBlockCol(int32_t bId) const { + return bId % blockCols; + } + + /** + * Takes a COO global row and produces the COO local row and the block to which it belongs. + * @param globalRow The global row ID + * @param globalCol The global column ID + * @param localRow The block local row ID (return) + * @param localCol The block local column ID (return) + * @param blockId The block ID (return) + */ + void convertGlobaltoLocalRow(GlobalType globalRow, + GlobalType globalCol, + LocalType& localRow, + LocalType& localCol, + int32_t& blockId) const { + int32_t rowId = getRowId(globalRow); + int32_t colId = getColId(globalCol); + blockId = rowId * blockCols + colId; + localRow = globalRow - rowOffsets[rowId]; + localCol = globalCol - colOffsets[colId]; + } + + /** + * Takes in a row ID and column ID and returns the corresponding block ID + * @param rowId The row ID + * @param colId The column ID + * @return The ID of the corresponding block + */ + int32_t getBlockId(int32_t rowId, int32_t colId) const { + return rowId * blockCols + colId; + } + + /** + * Helper method to synchronize all streams after operations are issued. 
+ */ + void syncAllStreams() const { + int32_t numBlocks = getNumBlocks(); + int32_t current_device; + cudaGetDevice(¤t_device); + for (int32_t i = 0; i < numBlocks; i++) { + cudaSetDevice(deviceAssignments[i]); + cudaStreamSynchronize(blockStreams[i]); + } + cudaSetDevice(current_device); + } + + /** + * This method is only for testing and debugging use. + * @return A human readable string representation of the object + */ + std::string toString() const { + std::stringstream ss; + ss << "Global Info:\n\tnumRows: " << numRows << ", numCols: " << numCols << ", nnz: " + << nnz; + ss << "\n"; + ss << "Block Info:\n\tblockRows: " << blockRows << ", blockCols: " << blockCols; + ss << "\n"; + ss << "rowOffsets: ["; + for (int i = 0; i < (int) rowOffsets.size(); i++) + ss << rowOffsets[i] << (i == (int) rowOffsets.size() - 1 ? "]\n" : ", "); + ss << "colOffsets: ["; + for (int i = 0; i < (int) colOffsets.size(); i++) + ss << colOffsets[i] << (i == (int) colOffsets.size() - 1 ? "]\n" : ", "); + ss << "deviceAssignments: ["; + for (int i = 0; i < (int) deviceAssignments.size(); i++) + ss << deviceAssignments[i] << (i == (int) deviceAssignments.size() - 1 ? "]\n" : ", "); + return ss.str(); + } + }; + + template + class Matrix2d { + protected: + // Description of the matrix decomposition + MatrixDecompositionDescription description; + + // Array of block matrices forming the decomposition + std::vector*> blocks; + public: + Matrix2d() { + } + Matrix2d(MatrixDecompositionDescription descr, + std::vector*> blocks) : + description(descr), blocks(blocks) { + } + + const MatrixDecompositionDescription& getMatrixDecompositionDescription() { + return description; + } + + MultiValuedCsrGraph* getBlockMatrix(int32_t bId) { + return blocks[bId]; + } + + std::string toString() { + std::stringstream ss; + ss << "MatrixDecompositionDescription:\n" << description.toString(); + for (int i = 0; i < (int) blocks.size(); i++) { + ss << "Block " << i << ":\n"; + size_t numVerts = blocks[i]->get_num_vertices(); + size_t numEdges = blocks[i]->get_num_edges(); + size_t numValues = blocks[i]->getNumValues(); + ss << "numVerts: " << numVerts << ", numEdges: " << numEdges << "\n"; + LocalType* rowOffsets = (LocalType*) malloc((numVerts + 1) * sizeof(LocalType)); + LocalType* colIndices = (LocalType*) malloc(numEdges * sizeof(LocalType)); + ValueType* values = NULL; + if (numValues > 0) + values = (ValueType*) malloc(numEdges * sizeof(ValueType)); + cudaMemcpy(rowOffsets, + blocks[i]->get_raw_row_offsets(), + (numVerts + 1) * sizeof(LocalType), + cudaMemcpyDefault); + cudaMemcpy(colIndices, + blocks[i]->get_raw_column_indices(), + numEdges * sizeof(LocalType), + cudaMemcpyDefault); + if (values) + cudaMemcpy(values, + blocks[i]->get_raw_edge_dim(0), + numEdges * sizeof(ValueType), + cudaMemcpyDefault); + int idxCount = numEdges >= (numVerts + 1) ? numEdges : (numVerts + 1); + ss << "Idx\tOffset\tColInd\tValue\n"; + for (int j = 0; j < idxCount; j++) { + if (j < (int) numVerts + 1 && j < (int) numEdges) + ss << j << ":\t" << rowOffsets[j] << "\t" << colIndices[j] << "\t" + << (values ? values[j] : 0) + << "\n"; + else if (j < (int) numVerts + 1 && j >= (int) numEdges) + ss << j << ":\t" << rowOffsets[j] << "\n"; + else if (j >= (int) numVerts + 1 && j < (int) numEdges) + ss << j << ":\t" << "\t" << colIndices[j] << "\t" << (values ? 
values[j] : 0) + << "\n"; + } + free(rowOffsets); + free(colIndices); + free(values); + } + return ss.str(); + } + }; + + template + class VertexData2D { + const MatrixDecompositionDescription* description; + int32_t n; + std::vector > values; + public: + /** + * Creates a VertexData2D object given a pointer to a MatrixDecompositionDescription + * object which describes the matrix the data is attached to. Data buffers are + * allocated for each block using the offset from the description to size the + * buffers, and to locate the buffers on the same GPU as the matrix block. + */ + VertexData2D(const MatrixDecompositionDescription* descr) : + description(descr) { + // Resize the values array to be the same size as number of blocks + values.resize(descr->getNumBlocks()); + + // Grab the current device id to switch back after allocations are done + int current_device; + cudaGetDevice(¤t_device); + LocalType allocSize = descr->getOffset(); + n = allocSize; + // Allocate the data for each block + cudaStream_t stream{nullptr}; + for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { + int device = descr->getDeviceAssignments()[i]; + cudaSetDevice(device); + ValueType* d_current, *d_alternate; + RMM_ALLOC(&d_current, sizeof(ValueType) * n, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_ALLOC(&d_alternate, sizeof(ValueType) * n, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + values[i].d_buffers[0] = d_current; + values[i].d_buffers[1] = d_alternate; + } + + // Set the device back to what it was initially + cudaSetDevice(current_device); + } + + /** + * Creates a VertexData2D object given a pointer to a MatrixDecompositionDescription + * object, which describes the matrix the data is attached to, and an integer which indicates + * how many data elements should be allocated for each block. Data buffers are allocated + * for each block using the offset from the description to size the buffers, and to locate + * the buffers on the same GPU as the matrix block. + */ + VertexData2D(const MatrixDecompositionDescription* descr, size_t _n) : + description(descr) { + // Resize the values array to be the same size as number of blocks + values.resize(descr->getNumBlocks()); + + // Grab the current device id to switch back after allocations are done + int current_device; + cudaGetDevice(¤t_device); + LocalType allocSize = _n; + n = allocSize; + // Allocate the data for each block + cudaStream_t stream{nullptr}; + for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { + int device = descr->getDeviceAssignments()[i]; + cudaSetDevice(device); + ValueType* d_current, *d_alternate; + RMM_ALLOC(&d_current, sizeof(ValueType) * n, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_ALLOC(&d_alternate, sizeof(ValueType) * n, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. 
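// For illustration only: each values[i] entry behaves like a cub::DoubleBuffer,
// i.e. it holds d_buffers[2] plus an integer selector, and Current() /
// Alternate() return d_buffers[selector] and d_buffers[selector ^ 1]. The
// swapBuffers() method further down therefore just flips the selector rather
// than copying data between the two per-block allocations.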
+ values[i].d_buffers[0] = d_current; + values[i].d_buffers[1] = d_alternate; + } + + // Set the device back to what it was initially + cudaSetDevice(current_device); + } + + ~VertexData2D() { + cudaStream_t stream{nullptr}; + for (size_t i = 0; i < values.size(); i++) { + if (values[i].Current()) + RMM_FREE(values[i].Current(), stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + if (values[i].Alternate()) + RMM_FREE(values[i].Alternate(), stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + } + } + + /** + * Getter for n the size of each block's allocation in elements. + * @return The value of n + */ + int32_t getN() { + return n; + } + + /** + * Getter for the MatrixDecompositionDescription associated with this VertexData2D + * @return Pointer to the MatrixDecompositionDescription for this VertexData2D + */ + const MatrixDecompositionDescription* getDescription() { + return description; + } + + /** + * Gets the current buffer corresponding to the given block ID + */ + ValueType* getCurrent(int bId) { + return values[bId].Current(); + } + + /** + * Gets the alternate buffer corresponding to the given block ID + */ + ValueType* getAlternate(int bId) { + return values[bId].Alternate(); + } + + /** + * Swaps the current and alternate buffers for all block IDs + */ + void swapBuffers() { + for (size_t i = 0; i < values.size(); i++) + values[i].selector ^= 1; + } + + /** + * Sets an element in the global array, assuming that the data is currently + * valid and in the diagonal blocks. After calling this method either columnScatter + * or rowScatter should be called to propagate the change to all blocks. + */ + void setElement(GlobalType globalIndex, ValueType val) { + LocalType blockId = globalIndex / n; + LocalType blockOffset = globalIndex % n; + int32_t bId = description->getBlockId(blockId, blockId); + ValueType* copyTo = values[bId].Current() + blockOffset; + cudaMemcpy(copyTo, &val, sizeof(ValueType), cudaMemcpyDefault); + } + + /** + * Sets the elements of the global array, using the provided array of values. The values + * are set in the blocks of the diagonal, columnScatter or rowScatter should be called + * to propogate to all blocks. + * @param vals Pointer to an array with the values to be set. + */ + void setElements(ValueType* vals) { + LocalType offset = description->getOffset(); + int32_t numRows = description->getBlockRows(); + for (int i = 0; i < numRows; i++) { + int32_t id = description->getBlockId(i, i); + cudaStream_t stream = description->getBlockStreams()[id]; + ValueType* copyFrom = vals + i * n; + ValueType* copyTo = values[id].Current(); + cudaMemcpyAsync(copyTo, copyFrom, sizeof(ValueType) * n, cudaMemcpyDefault, stream); + } + description->syncAllStreams(); + } + + /** + * Fills the elements of the data array with the given value. + * The elements on the diagonal are filled with the given value. After filling, + * either rowScatter or columnScatter will copy the values across the blocks in + * either the rows or columns depending on the use. 
+ * @param val The value to fill the array with + */ + void fillElements(ValueType val) { + int current_device; + cudaGetDevice(¤t_device); + int32_t numRows = description->getBlockRows(); + for (int32_t i = 0; i < numRows; i++) { + int32_t blockId = description->getBlockId(i, i); + ValueType* vals = getCurrent(blockId); + int deviceId = description->getDeviceAssignments()[blockId]; + cudaStream_t stream = description->getBlockStreams()[blockId]; + cudaSetDevice(deviceId); + thrust::fill(thrust::cuda::par.on(stream), vals, vals + n, val); + } + description->syncAllStreams(); + cudaSetDevice(current_device); + } + + /** + * Copies the values of the diagonal blocks in this VertexData2D into the + * VertexData2D specified. + * @param other Pointer to the VertexData2D to copy into + */ + void copyTo(VertexData2D* other) { + const MatrixDecompositionDescription* otherDescr = + other->getDescription(); + // Do a quick check that the sizes of both block arrays are the same. + if (description->getBlockRows() == otherDescr->getBlockRows() && n == other->getN()) { + // Issue asynchronous copies for each block's data + for (int i = 0; i < description->getBlockRows(); i++) { + int32_t bId = description->getBlockId(i, i); + ValueType* copyFrom = getCurrent(bId); + ValueType* copyTo = other->getCurrent(bId); + cudaStream_t stream = description->getBlockStreams()[bId]; + cudaMemcpyAsync(copyTo, copyFrom, n * sizeof(ValueType), cudaMemcpyDefault, stream); + } + // Synchronize the streams after the copies are done + for (int i = 0; i < description->getBlockRows(); i++) { + int32_t bId = description->getBlockId(i, i); + cudaStream_t stream = description->getBlockStreams()[bId]; + cudaStreamSynchronize(stream); + } + } + } + + /** + * This method implements a row-wise reduction of each blocks data into a + * single array for each row. The block on the diagonal will have the result. + */ + template + void rowReduce() { + int current_device; + cudaGetDevice(¤t_device); + Operator op; + + // For each row in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the row into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(i, j); + } + else { + blockIds.push_back(description->getBlockId(i, j)); + } + } + + // Do a binary tree reduction. At each step the primary buffer of the sender is + // copied into the secondary buffer of the receiver. After the copy is done + // each receiver performs the reduction operator and stores the result in it's + // primary buffer. + for (int32_t j = 2; (j / 2) < numRows; j *= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the receiver + int32_t receiverId = blockIds[id]; + + // blockIds[id + j/2] is the sender + int32_t senderId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId].Alternate(), + values[senderId].Current(), + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + + // Invoke the reduction operator on the receiver's GPU and values arrays. 
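// For illustration only, assuming a row of four blocks and
// Operator = thrust::plus<ValueType>, the tree runs two rounds:
//   j = 2 : blockIds[1] -> blockIds[0],  blockIds[3] -> blockIds[2]
//   j = 4 : blockIds[2] -> blockIds[0]
// Each arrow is the cudaMemcpyAsync above into the receiver's Alternate()
// buffer; the thrust::transform below then combines Alternate() with Current()
// element-wise, so the diagonal block (blockIds[0]) ends up holding the full
// row reduction in its Current() buffer.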
+ cudaSetDevice(description->getDeviceAssignments()[receiverId]); + ValueType* input1 = values[receiverId].Alternate(); + ValueType* input2 = values[receiverId].Current(); + thrust::transform(thrust::cuda::par.on(stream), + input1, + input1 + n, + input2, + input2, + op); + } + } + // Sync all active streams before next step + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the receiver + int32_t receiverId = blockIds[id]; + + // Set the device to the receiver and sync the stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * This method implements a column-wise reduction of each blocks data into a + * single array for each column. The block on the diagonal will have the result. + */ + template + void columnReduce() { + int current_device; + cudaGetDevice(¤t_device); + Operator op; + + // For each column in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the row into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(j, i); + } + else { + blockIds.push_back(description->getBlockId(j, i)); + } + } + + // Do a binary tree reduction. At each step the primary buffer of the sender is + // copied into the secondary buffer of the receiver. After the copy is done + // each receiver performs the reduction operator and stores the result in it's + // primary buffer. + for (int32_t j = 2; (j / 2) < numRows; j *= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the receiver + int32_t receiverId = blockIds[id]; + + // blockIds[id + j/2] is the sender + int32_t senderId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId].Alternate(), + values[senderId].Current(), + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + + // Invoke the reduction operator on the receiver's GPU and values arrays. + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + ValueType* input1 = values[receiverId].Alternate(); + ValueType* input2 = values[receiverId].Current(); + thrust::transform(thrust::cuda::par.on(stream), + input1, + input1 + n, + input2, + input2, + op); + } + } + // Sync all active streams before next step + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the receiver + int32_t receiverId = blockIds[id]; + + // Set the device to the receiver and sync the stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * This implements a column-wise scatter of the global data from the corresponding + * row. i.e. The data reduced from row 1 is broadcast to all blocks in + * column 1. It is assumed that the data to broadcast is located in the block on + * the diagonal. 
+ */ + void columnScatter() { + int current_device; + cudaGetDevice(¤t_device); + + // For each column in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the column into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(j, i); + } + else { + blockIds.push_back(description->getBlockId(j, i)); + } + } + + // Do a binary tree scatter. At each step the primary buffer of the sender is + // copied into the primary buffer of the receiver. + int32_t max2pow = 2; + while (max2pow < numRows) { + max2pow *= 2; + } + for (int32_t j = max2pow; j >= 2; j /= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the sender + int32_t senderId = blockIds[id]; + + // blockIds[id + j/2] is the sender + int32_t receiverId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId].Current(), + values[senderId].Current(), + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + } + } + // Synchronize all the active streams before next step. + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id + j/2] is the sender + int32_t receiverId = blockIds[id + j / 2]; + + // Set device and sync receiver's stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * This implements a row-wise scatter of the global data from the corresponding + * column. i.e. The data reduced from column 1 is broadcast to all blocks in + * row 1. It is assumed that the data to broadcast is located in the block on + * the diagonal. + */ + void rowScatter() { + int current_device; + cudaGetDevice(¤t_device); + + // For each row in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the column into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(i, j); + } + else { + blockIds.push_back(description->getBlockId(i, j)); + } + } + + // Do a binary tree scatter. At each step the primary buffer of the sender is + // copied into the primary buffer of the receiver. 
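// For illustration only, with four blocks in the row: max2pow becomes 4, so
// the broadcast runs the reduction tree in reverse order:
//   j = 4 : blockIds[0] -> blockIds[2]
//   j = 2 : blockIds[0] -> blockIds[1],  blockIds[2] -> blockIds[3]
// i.e. the diagonal block's Current() buffer fans out to every block in the
// row in log2(blockRows) rounds of cudaMemcpyAsync copies.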
+ int32_t max2pow = 2; + while (max2pow < numRows) { + max2pow *= 2; + } + for (int32_t j = max2pow; j >= 2; j /= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the sender + int32_t senderId = blockIds[id]; + + // blockIds[id + j/2] is the receiver + int32_t receiverId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId].Current(), + values[senderId].Current(), + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + } + } + // Sync all the active streams before next step + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id + j/2] is the receiver + int32_t receiverId = blockIds[id + j / 2]; + + // Set device and sync receiver's stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * Outputs a human readable string representation of this Vertex2d object. This is only + * intended to be used for de-bugging. + * @return Human readable string representation + */ + std::string toString() { + std::stringstream ss; + ValueType* c = (ValueType*) malloc(sizeof(ValueType) * n); + ValueType* a = (ValueType*) malloc(sizeof(ValueType) * n); + + int32_t numBlocks = description->getNumBlocks(); + + ss << "Vertex2d:\n"; + for (int32_t i = 0; i < numBlocks; i++) { + ss << "Block " << i << ":\n"; + ss << "Idx\tCur\tAlt\n"; + cudaMemcpy(c, values[i].Current(), sizeof(ValueType) * n, cudaMemcpyDefault); + cudaMemcpy(a, values[i].Alternate(), sizeof(ValueType) * n, cudaMemcpyDefault); + for (int32_t j = 0; j < n; j++) { + ss << j << ":\t" << c[j] << "\t" << a[j] << "\n"; + } + } + + free(c); + free(a); + + return ss.str(); + } + }; + + template + class VertexData2D_Unbuffered { + const MatrixDecompositionDescription* description; + int32_t n; + std::vector values; + + public: + /** + * Sets up a VertexData2D_Unbuffered object with an element allocated for each vertex + * in each block. + * @param descr Pointer to a MatrixDecompositionDescription object describing the layout + * of the 2D blocks. + */ + VertexData2D_Unbuffered(const MatrixDecompositionDescription* descr) : + description(descr) { + // Resize the values array to be the same size as number of blocks + values.resize(descr->getNumBlocks()); + + // Grab the current device id to switch back after allocations are done + int current_device; + cudaGetDevice(¤t_device); + LocalType allocSize = descr->getOffset(); + n = allocSize; + // Allocate the data for each block + cudaStream_t stream{nullptr}; + for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { + int device = descr->getDeviceAssignments()[i]; + cudaSetDevice(device); + RMM_ALLOC(&(values[i]), sizeof(ValueType) * n, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + } + + // Set the device back to what it was initially + cudaSetDevice(current_device); + } + + /** + * Sets up a VertexData2D_Unbuffered object with _n elements allocated per block. + * @param descr Pointer to a MatrixDecompositionDescription object describing the layout + * of the 2D blocks. 
+ * @param _n The number of elements to allocate per block. + */ + VertexData2D_Unbuffered(const MatrixDecompositionDescription* descr, + size_t _n) : + description(descr), n(_n) { + // Resize the values array to be the same size as number of blocks + values.resize(descr->getNumBlocks()); + + // Grab the current device id to switch back after allocations are done + int current_device; + cudaGetDevice(¤t_device); + // Allocate the data for each block + cudaStream_t stream{nullptr}; + for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { + int device = descr->getDeviceAssignments()[i]; + cudaSetDevice(device); + RMM_ALLOC(&(values[i]), sizeof(ValueType) * n, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + } + + // Set the device back to what it was initially + cudaSetDevice(current_device); + } + + /** + * Destructor. Frees all allocated memory. + */ + ~VertexData2D_Unbuffered() { + cudaStream_t stream{nullptr}; + for (size_t i = 0; i < values.size(); i++) { + if (values[i]) { + RMM_FREE(values[i], stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + } + } + } + + /** + * Fills the elements of the data array with the given value. + * The elements on the diagonal are filled with the given value. After filling, + * either rowScatter or columnScatter will copy the values across the blocks in + * either the rows or columns depending on the use. + * @param val The value to fill the array with + */ + void fillElements(ValueType val) { + int current_device; + cudaGetDevice(¤t_device); + int32_t numRows = description->getBlockRows(); + for (int32_t i = 0; i < numRows; i++) { + int32_t blockId = description->getBlockId(i, i); + ValueType* vals = get(blockId); + int deviceId = description->getDeviceAssignments()[blockId]; + cudaStream_t stream = description->getBlockStreams()[blockId]; + cudaSetDevice(deviceId); + thrust::fill(thrust::cuda::par.on(stream), vals, vals + n, val); + } + description->syncAllStreams(); + cudaSetDevice(current_device); + } + + /** + * This implements a column-wise scatter of the global data from the corresponding + * row. i.e. The data reduced from row 1 is broadcast to all blocks in + * column 1. It is assumed that the data to broadcast is located in the block on + * the diagonal. + */ + void columnScatter() { + int current_device; + cudaGetDevice(¤t_device); + + // For each column in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the column into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(j, i); + } + else { + blockIds.push_back(description->getBlockId(j, i)); + } + } + + // Do a binary tree scatter. At each step the primary buffer of the sender is + // copied into the primary buffer of the receiver. 
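// Same broadcast schedule as VertexData2D::columnScatter above; the only
// difference is that values[senderId] and values[receiverId] are plain
// per-block ValueType* allocations here, since this unbuffered variant keeps a
// single buffer per block instead of a Current()/Alternate() pair.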
+ int32_t max2pow = 2; + while (max2pow < numRows) { + max2pow *= 2; + } + for (int32_t j = max2pow; j >= 2; j /= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the sender + int32_t senderId = blockIds[id]; + + // blockIds[id + j/2] is the sender + int32_t receiverId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId], + values[senderId], + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + } + } + // Synchronize all the active streams before next step. + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id + j/2] is the sender + int32_t receiverId = blockIds[id + j / 2]; + + // Set device and sync receiver's stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * This implements a row-wise scatter of the global data from the corresponding + * column. i.e. The data reduced from column 1 is broadcast to all blocks in + * row 1. It is assumed that the data to broadcast is located in the block on + * the diagonal. + */ + void rowScatter() { + int current_device; + cudaGetDevice(¤t_device); + + // For each row in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the column into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(i, j); + } + else { + blockIds.push_back(description->getBlockId(i, j)); + } + } + + // Do a binary tree scatter. At each step the primary buffer of the sender is + // copied into the primary buffer of the receiver. + int32_t max2pow = 2; + while (max2pow < numRows) { + max2pow *= 2; + } + for (int32_t j = max2pow; j >= 2; j /= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the sender + int32_t senderId = blockIds[id]; + + // blockIds[id + j/2] is the receiver + int32_t receiverId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId], + values[senderId], + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + } + } + // Sync all the active streams before next step + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id + j/2] is the receiver + int32_t receiverId = blockIds[id + j / 2]; + + // Set device and sync receiver's stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * Getter for n + * @return The value of n + */ + int32_t getN() { + return n; + } + + /** + * Gets the pointer to the allocated memory for a specified block. + * @param bId The block id to get the memory for. 
+ * @return A pointer to the allocated memory for the given block. + */ + ValueType* get(int32_t bId) { + return values[bId]; + } + }; + + /** + * This method takes in COO format matrix data and a MatrixDecompositionDescription and + * returns a Matrix2d object containing the given data. + */ + template + Matrix2d COOto2d(MatrixDecompositionDescription descr, + GlobalType* rowIds, + GlobalType* colIds, + ValueType* values) { + // Grab the current device id to switch back after allocations are done + int current_device; + cudaGetDevice(¤t_device); + + int32_t blockCount = descr.getNumBlocks(); + + // Allocate array of size global nnz to hold the block labels + int32_t* blockLabels = (int32_t*) malloc(descr.getNnz() * sizeof(int32_t)); + + // Allocate array to contain row counts for each block and initialize to zero + // Allocate array to contain position offsets for writing each blocks data + LocalType* blockCounts = (LocalType*) malloc(blockCount * sizeof(LocalType)); + LocalType* blockPos = (LocalType*) malloc(blockCount * sizeof(LocalType)); + for (int i = 0; i < blockCount; i++) { + blockCounts[i] = 0; + blockPos[i] = 0; + } + + // For each edge mark in the array the id of the block to which it will belong + int32_t blockId; + LocalType localRow; + LocalType localCol; + for (int i = 0; i < descr.getNnz(); i++) { + descr.convertGlobaltoLocalRow(rowIds[i], colIds[i], localRow, localCol, blockId); + blockLabels[i] = blockId; + blockCounts[blockId]++; + } + + // Allocate arrays for putting each blocks data into + LocalType** blockRowIds = (LocalType**) malloc(blockCount * sizeof(LocalType*)); + LocalType** blockColIds = (LocalType**) malloc(blockCount * sizeof(LocalType*)); + ValueType** blockValues = NULL; + if (values) + blockValues = (ValueType**) malloc(blockCount * sizeof(ValueType*)); + for (int i = 0; i < blockCount; i++) { + blockRowIds[i] = (LocalType*) malloc(blockCounts[i] * sizeof(LocalType)); + blockColIds[i] = (LocalType*) malloc(blockCounts[i] * sizeof(LocalType)); + if (values) + blockValues[i] = (ValueType*) malloc(blockCounts[i] * sizeof(ValueType)); + } + + // Convert each blocks global rows to local ids and copy into block arrays + for (int i = 0; i < descr.getNnz(); i++) { + descr.convertGlobaltoLocalRow(rowIds[i], colIds[i], localRow, localCol, blockId); + blockRowIds[blockId][blockPos[blockId]] = localRow; + blockColIds[blockId][blockPos[blockId]] = localCol; + if (values) + blockValues[blockId][blockPos[blockId]] = values[i]; + blockPos[blockId]++; + } + + // Allocate the result blocks vector + std::vector*> blockVector(blockCount); + + // Convert each blocks COO rows into CSR and create it's graph object. + for (int i = 0; i < blockCount; i++) { + // Set the device as indicated so the data ends up on the right GPU + cudaSetDevice(descr.getDeviceAssignments()[i]); + cudaStream_t stream = descr.getBlockStreams()[i]; + + if (blockCounts[i] > 0) { + CSR_Result_Weighted result; + ConvertCOOtoCSR_weighted(blockRowIds[i], + blockColIds[i], + values ? 
blockValues[i] : NULL, + (int64_t) blockCounts[i], + (descr.getOffset() - 1), + result); + MultiValuedCsrGraph* csrGraph = new MultiValuedCsrGraph((size_t) result.size, (size_t) result.nnz, stream); + if (values) + csrGraph->allocateEdgeData(1, NULL); + cudaMemcpy(csrGraph->get_raw_row_offsets(), + result.rowOffsets, + (result.size + 1) * sizeof(LocalType), + cudaMemcpyDefault); + cudaMemcpy(csrGraph->get_raw_column_indices(), + result.colIndices, + result.nnz * sizeof(LocalType), + cudaMemcpyDefault); + if (values) + cudaMemcpy(csrGraph->get_raw_edge_dim(0), + result.edgeWeights, + result.nnz * sizeof(LocalType), + cudaMemcpyDefault); + blockVector[i] = csrGraph; + result.Destroy(); + } + else { + MultiValuedCsrGraph* csrGraph = new MultiValuedCsrGraph((size_t) descr.getOffset(), (size_t) 0, stream); + cudaMemset( csrGraph->get_raw_row_offsets(), + 0, + sizeof(LocalType) * (descr.getOffset() + 1)); + blockVector[i] = csrGraph; + } + } + + // Free temporary memory + for (int i = 0; i < blockCount; i++) { + free(blockRowIds[i]); + free(blockColIds[i]); + if (values) + free(blockValues[i]); + } + free(blockRowIds); + free(blockColIds); + if (values) + free(blockValues); + + cudaSetDevice(current_device); + + // Put it all together into a Matrix2d object for return + return Matrix2d(descr, blockVector); + } } diff --git a/cpp/nvgraph/cpp/include/app/nvlouvain_app.cu b/cpp/nvgraph/cpp/include/app_to_be_removed/nvlouvain_app.cu similarity index 100% rename from cpp/nvgraph/cpp/include/app/nvlouvain_app.cu rename to cpp/nvgraph/cpp/include/app_to_be_removed/nvlouvain_app.cu diff --git a/cpp/nvgraph/cpp/include/app/nvlouvain_app_hierarchy.cu b/cpp/nvgraph/cpp/include/app_to_be_removed/nvlouvain_app_hierarchy.cu similarity index 100% rename from cpp/nvgraph/cpp/include/app/nvlouvain_app_hierarchy.cu rename to cpp/nvgraph/cpp/include/app_to_be_removed/nvlouvain_app_hierarchy.cu diff --git a/cpp/nvgraph/cpp/include/app/nvlouvain_sample.cu b/cpp/nvgraph/cpp/include/app_to_be_removed/nvlouvain_sample.cu similarity index 100% rename from cpp/nvgraph/cpp/include/app/nvlouvain_sample.cu rename to cpp/nvgraph/cpp/include/app_to_be_removed/nvlouvain_sample.cu diff --git a/cpp/nvgraph/cpp/include/app/nvlouvain_sample_hierarchy.cu b/cpp/nvgraph/cpp/include/app_to_be_removed/nvlouvain_sample_hierarchy.cu similarity index 100% rename from cpp/nvgraph/cpp/include/app/nvlouvain_sample_hierarchy.cu rename to cpp/nvgraph/cpp/include/app_to_be_removed/nvlouvain_sample_hierarchy.cu diff --git a/cpp/nvgraph/cpp/include/csr_graph.hxx b/cpp/nvgraph/cpp/include/csr_graph.hxx index db77baed371..3abd3adc71b 100644 --- a/cpp/nvgraph/cpp/include/csr_graph.hxx +++ b/cpp/nvgraph/cpp/include/csr_graph.hxx @@ -17,7 +17,7 @@ #pragma once #include "graph.hxx" -#include // interface with CuMem (memory pool lib) for shared ptr +#include "rmm_shared_ptr.hxx" namespace nvgraph { @@ -41,11 +41,11 @@ protected: /*! Storage for the row offsets of the CSR data structure. Also called the "row pointer" array. */ - SHARED_PREFIX::shared_ptr row_offsets; + std::shared_ptr row_offsets; /*! Storage for the column indices of the CSR data structure. 
*/ - SHARED_PREFIX::shared_ptr column_indices; + std::shared_ptr column_indices; public: @@ -109,8 +109,30 @@ public: } inline IndexType* get_raw_row_offsets() { return row_offsets.get(); } inline IndexType* get_raw_column_indices() { return column_indices.get(); } - inline void set_raw_row_offsets(IndexType* ptr) { row_offsets = attachDevicePtr(ptr, stream_); } - inline void set_raw_column_indices(IndexType* ptr) {column_indices = attachDevicePtr(ptr, stream_); } + + inline void set_raw_row_offsets(IndexType* ptr) { + // This abuses std::shared_ptr. In this context, row_offsets does not + // participate in ownership (attachDevicePtr returns std::shared_ptr + // with a dummy deleter). row_offsets just work as a raw pointer, and + // this can be very misleading. However, to properly fix this, we need + // to modify gdf_column and gdf_graph as well, and we do not know yet + // how cudf people will modify gdf_column to address currently broken + // memory ownership model. So, we may leave this as is, but htis needs + // to be revisited, later. + row_offsets = attachDevicePtr(ptr, stream_); + } + + inline void set_raw_column_indices(IndexType* ptr) { + // This abuses std::shared_ptr. In this context, column_indices does not + // participate in ownership (attachDevicePtr returns std::shared_ptr + // with a dummy deleter). column_indices just work as a raw pointer, and + // this can be very misleading. However, to properly fix this, we need + // to modify gdf_column and gdf_graph as well, and we do not know yet + // how cudf people will modify gdf_column to address currently broken + // memory ownership model. So, we may leave this as is, but htis needs + column_indices = attachDevicePtr(ptr, stream_); + } + inline const IndexType* get_raw_row_offsets() const { return row_offsets.get(); } inline const IndexType* get_raw_column_indices() const { return column_indices.get(); } inline cudaStream_t get_stream() const { return stream_; } diff --git a/cpp/nvgraph/cpp/include/delta_modularity.cuh b/cpp/nvgraph/cpp/include/delta_modularity.cuh index b396757b30b..e7ad9466dd2 100644 --- a/cpp/nvgraph/cpp/include/delta_modularity.cuh +++ b/cpp/nvgraph/cpp/include/delta_modularity.cuh @@ -22,14 +22,16 @@ #include #include #include +#include + +#include +#include #include "util.cuh" #include "graph_utils.cuh" #include "functor.cuh" //#include "block_delta_modularity.cuh" -#include - namespace nvlouvain{ @@ -371,11 +373,11 @@ max_delta_modularity_vec(const int n_vertex, // Not used template void build_delta_modularity_vector_old(const int n_vertex, const int c_size, ValType m2, bool updated, - thrust::device_vector& csr_ptr_d, thrust::device_vector& csr_ind_d, thrust::device_vector& csr_val_d, - thrust::device_vector& cluster_d, + rmm::device_vector& csr_ptr_d, rmm::device_vector& csr_ind_d, rmm::device_vector& csr_val_d, + rmm::device_vector& cluster_d, IdxType* cluster_inv_ptr_ptr, IdxType* cluster_inv_ind_ptr, // precompute cluster inverse ValType* k_vec_ptr, // precompute ki's - thrust::device_vector& temp_vec, // temp global memory with size n_vertex + rmm::device_vector& temp_vec, // temp global memory with size n_vertex ValType* cluster_sum_vec_ptr, ValType* delta_Q_arr_ptr){ @@ -425,8 +427,8 @@ void build_delta_modularity_vector_old(const int n_vertex, const int c_size, Val // template void build_delta_modularity_vector(cusparseHandle_t cusp_handle, const int n_vertex, const int c_size, ValType m2, bool updated, - thrust::device_vector& csr_ptr_d, thrust::device_vector& csr_ind_d, 
thrust::device_vector& csr_val_d, - thrust::device_vector& cluster_d, + rmm::device_vector& csr_ptr_d, rmm::device_vector& csr_ind_d, rmm::device_vector& csr_val_d, + rmm::device_vector& cluster_d, IdxType* cluster_inv_ptr_ptr, IdxType* cluster_inv_ind_ptr, // precompute cluster inverse ValType* k_vec_ptr, // precompute ki's ValType* cluster_sum_vec_ptr, @@ -449,7 +451,7 @@ void build_delta_modularity_vector(cusparseHandle_t cusp_handle, const int n_ver IdxType *cluster_ptr = thrust::raw_pointer_cast(cluster_d.data()); // pre compute coo row indices using cusparse - thrust::device_vector coo_row_ind(n_edges); + rmm::device_vector coo_row_ind(n_edges); IdxType* coo_row_ind_ptr = thrust::raw_pointer_cast(coo_row_ind.data()); cusparseXcsr2coo(cusp_handle, csr_ptr_ptr, n_edges, n_vertex, coo_row_ind_ptr, diff --git a/cpp/nvgraph/cpp/include/graph_concrete_visitors.hxx b/cpp/nvgraph/cpp/include/graph_concrete_visitors.hxx index 4b2222422fe..cb51ff8b9de 100644 --- a/cpp/nvgraph/cpp/include/graph_concrete_visitors.hxx +++ b/cpp/nvgraph/cpp/include/graph_concrete_visitors.hxx @@ -1420,8 +1420,8 @@ namespace nvgraph CsrGraph* extract_from_vertex_subset(CsrGraph& graph, IndexT* pV, size_t n, cudaStream_t stream) { - typedef thrust::device_vector VectorI; - typedef thrust::device_vector VectorV; + typedef rmm::device_vector VectorI; + typedef rmm::device_vector VectorV; VectorI vSub(pV, pV+n); validate_input(vSub, graph.get_num_vertices()); @@ -1435,8 +1435,8 @@ namespace nvgraph CsrGraph* extract_from_edge_subset(CsrGraph& graph, IndexT* pV, size_t n, cudaStream_t stream) { - typedef thrust::device_vector VectorI; - typedef thrust::device_vector VectorV; + typedef rmm::device_vector VectorI; + typedef rmm::device_vector VectorV; VectorI vSub(pV, pV+n); validate_input(vSub, graph.get_num_edges()); diff --git a/cpp/nvgraph/cpp/include/graph_contracting_structs.hxx b/cpp/nvgraph/cpp/include/graph_contracting_structs.hxx index 36d3fced642..cacd7746e03 100644 --- a/cpp/nvgraph/cpp/include/graph_contracting_structs.hxx +++ b/cpp/nvgraph/cpp/include/graph_contracting_structs.hxx @@ -1692,13 +1692,13 @@ namespace nvgraph // The size of the GMEM buffers (number of elements). size_t m_gmem_size; // The status: OK if count_non_zeroes succeeded, FAILED otherwise. - SHARED_PREFIX::shared_ptr m_status; + std::shared_ptr m_status; // The work queue for dynamic load balancing in the kernels. - SHARED_PREFIX::shared_ptr m_work_queue; + std::shared_ptr m_work_queue; // The buffer to store keys in GMEM. - SHARED_PREFIX::shared_ptr m_keys; + std::shared_ptr m_keys; // The buffer to store values in GMEM. - SHARED_PREFIX::shared_ptr m_vals; + std::shared_ptr m_vals; public: // Create a workspace. @@ -2198,8 +2198,8 @@ namespace nvgraph //AMGX uses pool allocator thrust::global_thread_handle::cudaMallocHost(), here... // - SHARED_PREFIX::shared_ptr h_status(new IndexT); - SHARED_PREFIX::shared_ptr h_work_offset(new IndexT); + std::shared_ptr h_status(new IndexT); + std::shared_ptr h_work_offset(new IndexT); cudaStream_t stream = 0; // for now... diff --git a/cpp/nvgraph/cpp/include/graph_contracting_visitor.hxx b/cpp/nvgraph/cpp/include/graph_contracting_visitor.hxx index e958a27ed0c..3b06c1cd567 100644 --- a/cpp/nvgraph/cpp/include/graph_contracting_visitor.hxx +++ b/cpp/nvgraph/cpp/include/graph_contracting_visitor.hxx @@ -42,6 +42,9 @@ #include #include // +#include +#include + //debugging only: #include @@ -1624,8 +1627,8 @@ namespace{ //unnamed.. 
const SemiRingFunctorTypes& eCombine, const SemiRingFunctorTypes& eReduce) { - typedef thrust::device_vector VectorI; - typedef thrust::device_vector VectorV; + typedef rmm::device_vector VectorI; + typedef rmm::device_vector VectorV; VectorI aggregates(p_aggregates, p_aggregates+n); @@ -1664,8 +1667,8 @@ namespace{ //unnamed.. const SemiRingFunctorTypes& eCombine, const SemiRingFunctorTypes& eReduce) { - typedef thrust::device_vector VectorI; - typedef thrust::device_vector VectorV; + typedef rmm::device_vector VectorI; + typedef rmm::device_vector VectorV; VectorI aggregates(p_aggregates, p_aggregates+n); diff --git a/cpp/nvgraph/cpp/include/graph_utils.cuh b/cpp/nvgraph/cpp/include/graph_utils.cuh index 29350213dcf..f57d0322fcb 100644 --- a/cpp/nvgraph/cpp/include/graph_utils.cuh +++ b/cpp/nvgraph/cpp/include/graph_utils.cuh @@ -31,6 +31,9 @@ #include #include +#include +#include + #define USE_CG 1 #define DEBUG 1 @@ -59,6 +62,20 @@ namespace nvlouvain #define WHERE "" #endif +// This is a gap filler, and should be replaced with a RAPIDS-wise error handling mechanism. +#undef rmmCheckError +#ifdef DEBUG + #define WHERE " at: " << __FILE__ << ':' << __LINE__ + #define rmmCheckError(e) { \ + if(e != RMM_SUCCESS) { \ + std::cerr << "RMM failure: " << WHERE << std::endl; \ + } \ + } +#else + #define rmmCheckError(e) + #define WHERE "" +#endif + template static __device__ __forceinline__ T shfl_up(T r, int offset, int bound = 32, int mask = DEFAULT_MASK) { @@ -279,7 +296,7 @@ flag_leafs ( const IndexType n, IndexType *degree, ValueType *bookmark) { //notice that in the transposed matrix/csc a dangling node is a node without incomming edges template void google_matrix ( const IndexType n, const IndexType e, const IndexType *cooColInd, ValueType *cooVal, ValueType *bookmark) { - thrust::device_vector degree(n,0); + rmm::device_vector degree(n,0); dim3 nthreads, nblocks; nthreads.x = min(e,CUDA_MAX_KERNEL_THREADS); nthreads.y = 1; diff --git a/cpp/nvgraph/cpp/include/modularity.cuh b/cpp/nvgraph/cpp/include/modularity.cuh index 49917ce30d7..cc58771d04b 100644 --- a/cpp/nvgraph/cpp/include/modularity.cuh +++ b/cpp/nvgraph/cpp/include/modularity.cuh @@ -24,6 +24,9 @@ #include #include +#include +#include + #include "util.cuh" #include "graph_utils.cuh" #include "functor.cuh" @@ -226,8 +229,8 @@ template void generate_cluster_inv(const int n_vertex, const int c_size, IdxIter cluster_iter, - thrust::device_vector& cluster_inv_ptr, - thrust::device_vector& cluster_inv_ind){ + rmm::device_vector& cluster_inv_ptr, + rmm::device_vector& cluster_inv_ind){ int nthreads = min(n_vertex,CUDA_MAX_KERNEL_THREADS); int nblocks = min((n_vertex + nthreads - 1)/nthreads,CUDA_MAX_BLOCKS); diff --git a/cpp/nvgraph/cpp/include/multi_valued_csr_graph.hxx b/cpp/nvgraph/cpp/include/multi_valued_csr_graph.hxx index 2af20f252af..55a63c1295b 100644 --- a/cpp/nvgraph/cpp/include/multi_valued_csr_graph.hxx +++ b/cpp/nvgraph/cpp/include/multi_valued_csr_graph.hxx @@ -38,8 +38,8 @@ protected: //std::vector *> values_dim; //std::vector *> vertex_dim; - std::vector > > values_dim; - std::vector > > vertex_dim; + std::vector > > values_dim; + std::vector > > vertex_dim; public: /*! 
Storage for the nonzero entries of the Multi-CSR data structure.*/ @@ -78,28 +78,28 @@ public: { vertex_dim.resize(v_dim); for (size_t i = 0; i < vertex_dim.size(); ++i) - vertex_dim[i] = SHARED_PREFIX::shared_ptr >(new Vector(this->num_vertices, stream)); + vertex_dim[i] = std::shared_ptr >(new Vector(this->num_vertices, stream)); } inline void allocateEdgeData(size_t edges_dim, cudaStream_t stream) { values_dim.resize(edges_dim); for (size_t i = 0; i < values_dim.size(); ++i) - values_dim[i] = SHARED_PREFIX::shared_ptr >(new Vector(this->num_edges, stream)); + values_dim[i] = std::shared_ptr >(new Vector(this->num_edges, stream)); } inline void attachVertexData(size_t i, ValueType* data, cudaStream_t stream) { if (vertex_dim.size() <= i) vertex_dim.resize(i+1); - vertex_dim[i] = SHARED_PREFIX::shared_ptr >(new Vector(this->num_vertices, data, stream)); + vertex_dim[i] = std::shared_ptr >(new Vector(this->num_vertices, data, stream)); } inline void attachEdgeData(size_t i, ValueType* data, cudaStream_t stream) { if (values_dim.size() <= i) values_dim.resize(i+1); - values_dim[i] = SHARED_PREFIX::shared_ptr >(new Vector(this->num_edges, data, stream)); + values_dim[i] = std::shared_ptr >(new Vector(this->num_edges, data, stream)); } inline size_t getNumValues() { @@ -124,7 +124,7 @@ public: //ValuedCsrGraph *v = new ValuedCsrGraph(static_cast >(*this), *values_dim[dim_index]); //return *v; - //SHARED_PREFIX::shared_ptr > svcsr = SHARED_PREFIX::shared_ptr >(new ValuedCsrGraph(static_cast >(*this), *values_dim[dim_index])); + //std::shared_ptr > svcsr = std::shared_ptr >(new ValuedCsrGraph(static_cast >(*this), *values_dim[dim_index])); //return svcsr; //segfaults ///return ValuedCsrGraph(static_cast >(*this), *values_dim[dim_index]);//segfaults diff --git a/cpp/nvgraph/cpp/include/nvgraph.h b/cpp/nvgraph/cpp/include/nvgraph.h index f51daf68b0a..479c3faa51d 100644 --- a/cpp/nvgraph/cpp/include/nvgraph.h +++ b/cpp/nvgraph/cpp/include/nvgraph.h @@ -17,8 +17,10 @@ #ifndef _NVGRAPH_H_ #define _NVGRAPH_H_ -#include "stddef.h" -#include "stdint.h" +#include +#include + +#include #include "library_types.h" @@ -26,7 +28,13 @@ #define NVG_CUDA_TRY(T) {\ if (T != cudaSuccess)\ return NVGRAPH_STATUS_ALLOC_FAILED;\ - } + } + +// This is a gap filler, and should be replaced with a RAPIDS-wise error handling mechanism. 
+#define NVG_RMM_TRY(T) {\ + if (T != RMM_SUCCESS)\ + return NVGRAPH_STATUS_ALLOC_FAILED;\ + } #ifndef NVGRAPH_API #ifdef _WIN32 @@ -40,478 +48,477 @@ extern "C" { #endif - /* nvGRAPH status type returns */ - typedef enum - { - NVGRAPH_STATUS_SUCCESS = 0, - NVGRAPH_STATUS_NOT_INITIALIZED = 1, - NVGRAPH_STATUS_ALLOC_FAILED = 2, - NVGRAPH_STATUS_INVALID_VALUE = 3, - NVGRAPH_STATUS_ARCH_MISMATCH = 4, - NVGRAPH_STATUS_MAPPING_ERROR = 5, - NVGRAPH_STATUS_EXECUTION_FAILED = 6, - NVGRAPH_STATUS_INTERNAL_ERROR = 7, - NVGRAPH_STATUS_TYPE_NOT_SUPPORTED = 8, - NVGRAPH_STATUS_NOT_CONVERGED = 9, - NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED = 10 - - } nvgraphStatus_t; - - const char* nvgraphStatusGetString(nvgraphStatus_t status); - - /* Opaque structure holding nvGRAPH library context */ - struct nvgraphContext; - typedef struct nvgraphContext *nvgraphHandle_t; - - /* Opaque structure holding the graph descriptor */ - struct nvgraphGraphDescr; - typedef struct nvgraphGraphDescr *nvgraphGraphDescr_t; - - /* Semi-ring types */ - typedef enum - { - NVGRAPH_PLUS_TIMES_SR = 0, - NVGRAPH_MIN_PLUS_SR = 1, - NVGRAPH_MAX_MIN_SR = 2, - NVGRAPH_OR_AND_SR = 3, - } nvgraphSemiring_t; - - /* Topology types */ - typedef enum - { - NVGRAPH_CSR_32 = 0, - NVGRAPH_CSC_32 = 1, - NVGRAPH_COO_32 = 2, - NVGRAPH_2D_32I_32I = 3, - NVGRAPH_2D_64I_32I = 4 - } nvgraphTopologyType_t; - - typedef enum - { - NVGRAPH_DEFAULT = 0, // Default is unsorted. - NVGRAPH_UNSORTED = 1, // - NVGRAPH_SORTED_BY_SOURCE = 2, // CSR - NVGRAPH_SORTED_BY_DESTINATION = 3 // CSC - } nvgraphTag_t; - - typedef enum - { - NVGRAPH_MULTIPLY = 0, - NVGRAPH_SUM = 1, - NVGRAPH_MIN = 2, - NVGRAPH_MAX = 3 - } nvgraphSemiringOps_t; - - typedef enum - { - NVGRAPH_MODULARITY_MAXIMIZATION = 0, //maximize modularity with Lanczos solver - NVGRAPH_BALANCED_CUT_LANCZOS = 1, //minimize balanced cut with Lanczos solver - NVGRAPH_BALANCED_CUT_LOBPCG = 2 //minimize balanced cut with LOPCG solver - } nvgraphSpectralClusteringType_t; - - struct SpectralClusteringParameter { - int n_clusters; //number of clusters - int n_eig_vects; // //number of eigenvectors - nvgraphSpectralClusteringType_t algorithm; // algorithm to use - float evs_tolerance; // tolerance of the eigensolver - int evs_max_iter; // maximum number of iterations of the eigensolver - float kmean_tolerance; // tolerance of kmeans - int kmean_max_iter; // maximum number of iterations of kemeans - void * opt; // optional parameter that can be used for preconditioning in the future - }; - - typedef enum - { - NVGRAPH_MODULARITY, // clustering score telling how good the clustering is compared to random assignment. - NVGRAPH_EDGE_CUT, // total number of edges between clusters. 
- NVGRAPH_RATIO_CUT // sum for all clusters of the number of edges going outside of the cluster divided by the number of vertex inside the cluster - } nvgraphClusteringMetric_t; - - struct nvgraphCSRTopology32I_st { - int nvertices; // n+1 - int nedges; // nnz - int *source_offsets; // rowPtr - int *destination_indices; // colInd - }; - typedef struct nvgraphCSRTopology32I_st *nvgraphCSRTopology32I_t; - - struct nvgraphCSCTopology32I_st { - int nvertices; // n+1 - int nedges; // nnz - int *destination_offsets; // colPtr - int *source_indices; // rowInd - }; - typedef struct nvgraphCSCTopology32I_st *nvgraphCSCTopology32I_t; - - struct nvgraphCOOTopology32I_st { - int nvertices; // n+1 - int nedges; // nnz - int *source_indices; // rowInd - int *destination_indices; // colInd - nvgraphTag_t tag; - }; - typedef struct nvgraphCOOTopology32I_st *nvgraphCOOTopology32I_t; - - struct nvgraph2dCOOTopology32I_st { - int nvertices; - int nedges; - int *source_indices; // Row Indices - int *destination_indices; // Column Indices - cudaDataType_t valueType; // The type of values being given. - void *values; // Pointer to array of values. - int numDevices; // Gives the number of devices to be used. - int *devices; // Array of device IDs to use. - int blockN; // Specifies the value of n for an n x n matrix decomposition. - nvgraphTag_t tag; - }; - typedef struct nvgraph2dCOOTopology32I_st *nvgraph2dCOOTopology32I_t; - - /* Return properties values for the nvGraph library, such as library version */ - nvgraphStatus_t NVGRAPH_API nvgraphGetProperty(libraryPropertyType type, int *value); - - /* Open the library and create the handle */ - nvgraphStatus_t NVGRAPH_API nvgraphCreate(nvgraphHandle_t *handle); - nvgraphStatus_t NVGRAPH_API nvgraphCreateMulti( nvgraphHandle_t *handle, - int numDevices, - int* devices); - - /* Close the library and destroy the handle */ - nvgraphStatus_t NVGRAPH_API nvgraphDestroy(nvgraphHandle_t handle); - - /* Create an empty graph descriptor */ - nvgraphStatus_t NVGRAPH_API nvgraphCreateGraphDescr( nvgraphHandle_t handle, - nvgraphGraphDescr_t *descrG); - - /* Destroy a graph descriptor */ - nvgraphStatus_t NVGRAPH_API nvgraphDestroyGraphDescr( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG); - - /* Set size, topology data in the graph descriptor */ - nvgraphStatus_t NVGRAPH_API nvgraphSetGraphStructure( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t TType); - - /* Query size and topology information from the graph descriptor */ - nvgraphStatus_t NVGRAPH_API nvgraphGetGraphStructure( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t* TType); - - /* Allocate numsets vectors of size V representing Vertex Data and attached them the graph. - * settypes[i] is the type of vector #i, currently all Vertex and Edge data should have the same type */ - nvgraphStatus_t NVGRAPH_API nvgraphAllocateVertexData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t numsets, - cudaDataType_t *settypes); - - /* Allocate numsets vectors of size E representing Edge Data and attached them the graph. 
- * settypes[i] is the type of vector #i, currently all Vertex and Edge data should have the same type */ - nvgraphStatus_t NVGRAPH_API nvgraphAllocateEdgeData( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t numsets, - cudaDataType_t *settypes); - - /* Update the vertex set #setnum with the data in *vertexData, sets have 0-based index - * Conversions are not supported so nvgraphTopologyType_t should match the graph structure */ - nvgraphStatus_t NVGRAPH_API nvgraphSetVertexData( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *vertexData, - size_t setnum); - - /* Copy the edge set #setnum in *edgeData, sets have 0-based index - * Conversions are not supported so nvgraphTopologyType_t should match the graph structure */ - nvgraphStatus_t NVGRAPH_API nvgraphGetVertexData( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *vertexData, - size_t setnum); - - /* Convert the edge data to another topology - */ - nvgraphStatus_t NVGRAPH_API nvgraphConvertTopology(nvgraphHandle_t handle, - nvgraphTopologyType_t srcTType, - void *srcTopology, - void *srcEdgeData, - cudaDataType_t *dataType, - nvgraphTopologyType_t dstTType, - void *dstTopology, - void *dstEdgeData); - - /* Convert graph to another structure - */ - nvgraphStatus_t NVGRAPH_API nvgraphConvertGraph(nvgraphHandle_t handle, - nvgraphGraphDescr_t srcDescrG, - nvgraphGraphDescr_t dstDescrG, - nvgraphTopologyType_t dstTType); - - /* Update the edge set #setnum with the data in *edgeData, sets have 0-based index - */ - nvgraphStatus_t NVGRAPH_API nvgraphSetEdgeData( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *edgeData, - size_t setnum); - - /* Copy the edge set #setnum in *edgeData, sets have 0-based index - */ - nvgraphStatus_t NVGRAPH_API nvgraphGetEdgeData( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *edgeData, - size_t setnum); - - /* create a new graph by extracting a subgraph given a list of vertices - */ - nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByVertex( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t subdescrG, - int *subvertices, - size_t numvertices); - /* create a new graph by extracting a subgraph given a list of edges - */ - nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByEdge(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t subdescrG, - int *subedges, - size_t numedges); - - /* nvGRAPH Semi-ring sparse matrix vector multiplication - */ - nvgraphStatus_t NVGRAPH_API nvgraphSrSpmv(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const void *alpha, - const size_t x_index, - const void *beta, - const size_t y_index, - const nvgraphSemiring_t SR); - - /* Helper struct for Traversal parameters - */ - typedef struct { - size_t pad[128]; - } nvgraphTraversalParameter_t; - - /* Initializes traversal parameters with default values - */ - nvgraphStatus_t NVGRAPH_API nvgraphTraversalParameterInit(nvgraphTraversalParameter_t *param); - - /* Stores/retrieves index of a vertex data where target distances will be stored - */ - nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetDistancesIndex( nvgraphTraversalParameter_t *param, - const size_t value); - - nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetDistancesIndex( const nvgraphTraversalParameter_t param, - size_t *value); - - /* Stores/retrieves index of a vertex data where path predecessors will be stored - */ - nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetPredecessorsIndex( nvgraphTraversalParameter_t *param, 
- const size_t value); - - nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetPredecessorsIndex( const nvgraphTraversalParameter_t param, - size_t *value); - - /* Stores/retrieves index of an edge data which tells traversal algorithm whether path can go through an edge or not - */ - nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetEdgeMaskIndex( nvgraphTraversalParameter_t *param, - const size_t value); - - nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetEdgeMaskIndex( const nvgraphTraversalParameter_t param, - size_t *value); - - /* Stores/retrieves flag that tells an algorithm whether the graph is directed or not - */ - nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetUndirectedFlag( nvgraphTraversalParameter_t *param, - const size_t value); - - nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetUndirectedFlag( const nvgraphTraversalParameter_t param, - size_t *value); - - /* Stores/retrieves 'alpha' and 'beta' parameters for BFS traversal algorithm - */ - nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetAlpha( nvgraphTraversalParameter_t *param, - const size_t value); - - nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetAlpha( const nvgraphTraversalParameter_t param, - size_t *value); - - nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetBeta( nvgraphTraversalParameter_t *param, - const size_t value); - - nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetBeta( const nvgraphTraversalParameter_t param, - size_t *value); + /* nvGRAPH status type returns */ + typedef enum + { + NVGRAPH_STATUS_SUCCESS = 0, + NVGRAPH_STATUS_NOT_INITIALIZED = 1, + NVGRAPH_STATUS_ALLOC_FAILED = 2, + NVGRAPH_STATUS_INVALID_VALUE = 3, + NVGRAPH_STATUS_ARCH_MISMATCH = 4, + NVGRAPH_STATUS_MAPPING_ERROR = 5, + NVGRAPH_STATUS_EXECUTION_FAILED = 6, + NVGRAPH_STATUS_INTERNAL_ERROR = 7, + NVGRAPH_STATUS_TYPE_NOT_SUPPORTED = 8, + NVGRAPH_STATUS_NOT_CONVERGED = 9, + NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED = 10 + + } nvgraphStatus_t; + + const char* nvgraphStatusGetString(nvgraphStatus_t status); + + /* Opaque structure holding nvGRAPH library context */ + struct nvgraphContext; + typedef struct nvgraphContext *nvgraphHandle_t; + + /* Opaque structure holding the graph descriptor */ + struct nvgraphGraphDescr; + typedef struct nvgraphGraphDescr *nvgraphGraphDescr_t; + + /* Semi-ring types */ + typedef enum + { + NVGRAPH_PLUS_TIMES_SR = 0, + NVGRAPH_MIN_PLUS_SR = 1, + NVGRAPH_MAX_MIN_SR = 2, + NVGRAPH_OR_AND_SR = 3, + } nvgraphSemiring_t; + + /* Topology types */ + typedef enum + { + NVGRAPH_CSR_32 = 0, + NVGRAPH_CSC_32 = 1, + NVGRAPH_COO_32 = 2, + NVGRAPH_2D_32I_32I = 3, + NVGRAPH_2D_64I_32I = 4 + } nvgraphTopologyType_t; + + typedef enum + { + NVGRAPH_DEFAULT = 0, // Default is unsorted. 
+ NVGRAPH_UNSORTED = 1, // + NVGRAPH_SORTED_BY_SOURCE = 2, // CSR + NVGRAPH_SORTED_BY_DESTINATION = 3 // CSC + } nvgraphTag_t; + + typedef enum + { + NVGRAPH_MULTIPLY = 0, + NVGRAPH_SUM = 1, + NVGRAPH_MIN = 2, + NVGRAPH_MAX = 3 + } nvgraphSemiringOps_t; + + typedef enum + { + NVGRAPH_MODULARITY_MAXIMIZATION = 0, //maximize modularity with Lanczos solver + NVGRAPH_BALANCED_CUT_LANCZOS = 1, //minimize balanced cut with Lanczos solver + NVGRAPH_BALANCED_CUT_LOBPCG = 2 //minimize balanced cut with LOPCG solver + } nvgraphSpectralClusteringType_t; + + struct SpectralClusteringParameter { + int n_clusters; //number of clusters + int n_eig_vects; // //number of eigenvectors + nvgraphSpectralClusteringType_t algorithm; // algorithm to use + float evs_tolerance; // tolerance of the eigensolver + int evs_max_iter; // maximum number of iterations of the eigensolver + float kmean_tolerance; // tolerance of kmeans + int kmean_max_iter; // maximum number of iterations of kemeans + void * opt; // optional parameter that can be used for preconditioning in the future + }; + + typedef enum + { + NVGRAPH_MODULARITY, // clustering score telling how good the clustering is compared to random assignment. + NVGRAPH_EDGE_CUT, // total number of edges between clusters. + NVGRAPH_RATIO_CUT // sum for all clusters of the number of edges going outside of the cluster divided by the number of vertex inside the cluster + } nvgraphClusteringMetric_t; + + struct nvgraphCSRTopology32I_st { + int nvertices; // n+1 + int nedges; // nnz + int *source_offsets; // rowPtr + int *destination_indices; // colInd + }; + typedef struct nvgraphCSRTopology32I_st *nvgraphCSRTopology32I_t; + + struct nvgraphCSCTopology32I_st { + int nvertices; // n+1 + int nedges; // nnz + int *destination_offsets; // colPtr + int *source_indices; // rowInd + }; + typedef struct nvgraphCSCTopology32I_st *nvgraphCSCTopology32I_t; + + struct nvgraphCOOTopology32I_st { + int nvertices; // n+1 + int nedges; // nnz + int *source_indices; // rowInd + int *destination_indices; // colInd + nvgraphTag_t tag; + }; + typedef struct nvgraphCOOTopology32I_st *nvgraphCOOTopology32I_t; + + struct nvgraph2dCOOTopology32I_st { + int nvertices; + int nedges; + int *source_indices; // Row Indices + int *destination_indices; // Column Indices + cudaDataType_t valueType; // The type of values being given. + void *values; // Pointer to array of values. + int numDevices; // Gives the number of devices to be used. + int *devices; // Array of device IDs to use. + int blockN; // Specifies the value of n for an n x n matrix decomposition. 
+ nvgraphTag_t tag; + }; + typedef struct nvgraph2dCOOTopology32I_st *nvgraph2dCOOTopology32I_t; + + /* Return properties values for the nvGraph library, such as library version */ + nvgraphStatus_t NVGRAPH_API nvgraphGetProperty(libraryPropertyType type, int *value); + + /* Open the library and create the handle */ + nvgraphStatus_t NVGRAPH_API nvgraphCreate(nvgraphHandle_t *handle); + nvgraphStatus_t NVGRAPH_API nvgraphCreateMulti(nvgraphHandle_t *handle, + int numDevices, + int* devices); + + /* Close the library and destroy the handle */ + nvgraphStatus_t NVGRAPH_API nvgraphDestroy(nvgraphHandle_t handle); + + /* Create an empty graph descriptor */ + nvgraphStatus_t NVGRAPH_API nvgraphCreateGraphDescr(nvgraphHandle_t handle, + nvgraphGraphDescr_t *descrG); + + /* Destroy a graph descriptor */ + nvgraphStatus_t NVGRAPH_API nvgraphDestroyGraphDescr(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG); + + /* Set size, topology data in the graph descriptor */ + nvgraphStatus_t NVGRAPH_API nvgraphSetGraphStructure(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t TType); + + /* Query size and topology information from the graph descriptor */ + nvgraphStatus_t NVGRAPH_API nvgraphGetGraphStructure(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t* TType); + + /* Allocate numsets vectors of size V representing Vertex Data and attached them the graph. + * settypes[i] is the type of vector #i, currently all Vertex and Edge data should have the same type */ + nvgraphStatus_t NVGRAPH_API nvgraphAllocateVertexData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes); + + /* Allocate numsets vectors of size E representing Edge Data and attached them the graph. 
+ * settypes[i] is the type of vector #i, currently all Vertex and Edge data should have the same type */ + nvgraphStatus_t NVGRAPH_API nvgraphAllocateEdgeData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes); + + /* Update the vertex set #setnum with the data in *vertexData, sets have 0-based index + * Conversions are not supported so nvgraphTopologyType_t should match the graph structure */ + nvgraphStatus_t NVGRAPH_API nvgraphSetVertexData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum); + + /* Copy the edge set #setnum in *edgeData, sets have 0-based index + * Conversions are not supported so nvgraphTopologyType_t should match the graph structure */ + nvgraphStatus_t NVGRAPH_API nvgraphGetVertexData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum); + + /* Convert the edge data to another topology + */ + nvgraphStatus_t NVGRAPH_API nvgraphConvertTopology(nvgraphHandle_t handle, + nvgraphTopologyType_t srcTType, + void *srcTopology, + void *srcEdgeData, + cudaDataType_t *dataType, + nvgraphTopologyType_t dstTType, + void *dstTopology, + void *dstEdgeData); + + /* Convert graph to another structure + */ + nvgraphStatus_t NVGRAPH_API nvgraphConvertGraph(nvgraphHandle_t handle, + nvgraphGraphDescr_t srcDescrG, + nvgraphGraphDescr_t dstDescrG, + nvgraphTopologyType_t dstTType); + + /* Update the edge set #setnum with the data in *edgeData, sets have 0-based index + */ + nvgraphStatus_t NVGRAPH_API nvgraphSetEdgeData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum); + + /* Copy the edge set #setnum in *edgeData, sets have 0-based index + */ + nvgraphStatus_t NVGRAPH_API nvgraphGetEdgeData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum); + + /* create a new graph by extracting a subgraph given a list of vertices + */ + nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByVertex(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subvertices, + size_t numvertices); + /* create a new graph by extracting a subgraph given a list of edges + */ + nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByEdge(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subedges, + size_t numedges); + + /* nvGRAPH Semi-ring sparse matrix vector multiplication + */ + nvgraphStatus_t NVGRAPH_API nvgraphSrSpmv(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t x_index, + const void *beta, + const size_t y_index, + const nvgraphSemiring_t SR); + + /* Helper struct for Traversal parameters + */ + typedef struct { + size_t pad[128]; + } nvgraphTraversalParameter_t; + + /* Initializes traversal parameters with default values + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalParameterInit(nvgraphTraversalParameter_t *param); + + /* Stores/retrieves index of a vertex data where target distances will be stored + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetDistancesIndex(nvgraphTraversalParameter_t *param, + const size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetDistancesIndex(const nvgraphTraversalParameter_t param, + size_t *value); + + /* Stores/retrieves index of a vertex data where path predecessors will be stored + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetPredecessorsIndex(nvgraphTraversalParameter_t *param, + const 
size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetPredecessorsIndex(const nvgraphTraversalParameter_t param, + size_t *value); + + /* Stores/retrieves index of an edge data which tells traversal algorithm whether path can go through an edge or not + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetEdgeMaskIndex(nvgraphTraversalParameter_t *param, + const size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetEdgeMaskIndex(const nvgraphTraversalParameter_t param, + size_t *value); + + /* Stores/retrieves flag that tells an algorithm whether the graph is directed or not + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetUndirectedFlag(nvgraphTraversalParameter_t *param, + const size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetUndirectedFlag(const nvgraphTraversalParameter_t param, + size_t *value); + + /* Stores/retrieves 'alpha' and 'beta' parameters for BFS traversal algorithm + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetAlpha(nvgraphTraversalParameter_t *param, + const size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetAlpha(const nvgraphTraversalParameter_t param, + size_t *value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetBeta(nvgraphTraversalParameter_t *param, + const size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetBeta(const nvgraphTraversalParameter_t param, + size_t *value); //Traversal available - typedef enum { - NVGRAPH_TRAVERSAL_BFS = 0 - } nvgraphTraversal_t; - - /* nvGRAPH Traversal API - * Compute a traversal of the graph from a single vertex using algorithm specified by traversalT parameter - */ - nvgraphStatus_t NVGRAPH_API nvgraphTraversal(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const nvgraphTraversal_t traversalT, - const int *source_vert, - const nvgraphTraversalParameter_t params); - - /** - * CAPI Method for calling 2d BFS algorithm. - * @param handle Nvgraph context handle. - * @param descrG Graph handle (must be 2D partitioned) - * @param source_vert The source vertex ID - * @param distances Pointer to memory allocated to store the distances. - * @param predecessors Pointer to memory allocated to store the predecessors - * @return Status code. - */ - nvgraphStatus_t NVGRAPH_API nvgraph2dBfs( nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const int32_t source_vert, - int32_t* distances, - int32_t* predecessors); - - /* nvGRAPH Single Source Shortest Path (SSSP) - * Calculate the shortest path distance from a single vertex in the graph to all other vertices. - */ - nvgraphStatus_t NVGRAPH_API nvgraphSssp(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int *source_vert, - const size_t sssp_index); - - /* nvGRAPH WidestPath - * Find widest path potential from source_index to every other vertices. - */ - nvgraphStatus_t NVGRAPH_API nvgraphWidestPath(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int *source_vert, - const size_t widest_path_index); - - /* nvGRAPH PageRank - * Find PageRank for each vertex of a graph with a given transition probabilities, a bookmark vector of dangling vertices, and the damping factor. 
- */ - nvgraphStatus_t NVGRAPH_API nvgraphPagerank(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const void *alpha, - const size_t bookmark_index, - const int has_guess, - const size_t pagerank_index, - const float tolerance, - const int max_iter); - - /* nvGRAPH contraction - * given array of agregates contract graph with - * given (Combine, Reduce) operators for Vertex Set - * and Edge Set; - */ - nvgraphStatus_t NVGRAPH_API nvgraphContractGraph(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t contrdescrG, - int *aggregates, - size_t numaggregates, - nvgraphSemiringOps_t VertexCombineOp, - nvgraphSemiringOps_t VertexReduceOp, - nvgraphSemiringOps_t EdgeCombineOp, - nvgraphSemiringOps_t EdgeReduceOp, - int flag); - - /* nvGRAPH spectral clustering - * given a graph and solver parameters of struct SpectralClusteringParameter, - * assign vertices to groups such as - * intra-group connections are strong and/or inter-groups connections are weak - * using spectral technique. - */ - nvgraphStatus_t NVGRAPH_API nvgraphSpectralClustering(nvgraphHandle_t handle, - const nvgraphGraphDescr_t graph_descr, - const size_t weight_index, - const struct SpectralClusteringParameter *params, - int* clustering, - void* eig_vals, - void* eig_vects); - - /* nvGRAPH analyze clustering - * Given a graph, a clustering, and a metric - * compute the score that measures the clustering quality according to the metric. - */ - nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeClustering(nvgraphHandle_t handle, - const nvgraphGraphDescr_t graph_descr, - const size_t weight_index, - const int n_clusters, - const int* clustering, - nvgraphClusteringMetric_t metric, - float * score); - - /* nvGRAPH Triangles counting - * count number of triangles (cycles of size 3) formed by graph edges - */ - nvgraphStatus_t NVGRAPH_API nvgraphTriangleCount(nvgraphHandle_t handle, - const nvgraphGraphDescr_t graph_descr, - uint64_t* result); - - /* nvGRAPH Louvain implementation - */ - nvgraphStatus_t NVGRAPH_API nvgraphLouvain ( cudaDataType_t index_type, - cudaDataType_t val_type, - const size_t num_vertex, - const size_t num_edges, - void* csr_ptr, - void* csr_ind, - void* csr_val, - int weighted, - int has_init_cluster, - void* init_cluster, - void* final_modularity, - void* best_cluster_vec, - void* num_level); - - - /* nvGRAPH Jaccard implementation - */ - nvgraphStatus_t NVGRAPH_API nvgraphJaccard ( cudaDataType_t index_type, - cudaDataType_t val_type, - const size_t n, - const size_t e, - void* csr_ptr, - void *csr_ind, - void* csr_val, - int weighted, - void* v, - void* gamma, - void* weight_j); - - /* nvGRAPH attach structure - * Warp external device data into a nvgraphGraphDescr_t - * Warning : this data remain owned by the user - */ - nvgraphStatus_t NVGRAPH_API nvgraphAttachGraphStructure(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t TT); - - /* nvGRAPH attach Vertex Data - * Warp external device data into a vertex dim - * Warning : this data remain owned by the user - */ - nvgraphStatus_t NVGRAPH_API nvgraphAttachVertexData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t setnum, - cudaDataType_t settype, - void *vertexData); - - /* nvGRAPH attach Edge Data - * Warp external device data into an edge dim - * Warning : this data remain owned by the user - */ - nvgraphStatus_t NVGRAPH_API nvgraphAttachEdgeData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t setnum, - cudaDataType_t 
settype, - void *edgeData); + typedef enum { + NVGRAPH_TRAVERSAL_BFS = 0 + } nvgraphTraversal_t; + + /* nvGRAPH Traversal API + * Compute a traversal of the graph from a single vertex using algorithm specified by traversalT parameter + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversal(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const nvgraphTraversal_t traversalT, + const int *source_vert, + const nvgraphTraversalParameter_t params); + + /** + * CAPI Method for calling 2d BFS algorithm. + * @param handle Nvgraph context handle. + * @param descrG Graph handle (must be 2D partitioned) + * @param source_vert The source vertex ID + * @param distances Pointer to memory allocated to store the distances. + * @param predecessors Pointer to memory allocated to store the predecessors + * @return Status code. + */ + nvgraphStatus_t NVGRAPH_API nvgraph2dBfs(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const int32_t source_vert, + int32_t* distances, + int32_t* predecessors); + + /* nvGRAPH Single Source Shortest Path (SSSP) + * Calculate the shortest path distance from a single vertex in the graph to all other vertices. + */ + nvgraphStatus_t NVGRAPH_API nvgraphSssp(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t sssp_index); + + /* nvGRAPH WidestPath + * Find widest path potential from source_index to every other vertices. + */ + nvgraphStatus_t NVGRAPH_API nvgraphWidestPath(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t widest_path_index); + + /* nvGRAPH PageRank + * Find PageRank for each vertex of a graph with a given transition probabilities, a bookmark vector of dangling vertices, and the damping factor. + */ + nvgraphStatus_t NVGRAPH_API nvgraphPagerank(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t bookmark_index, + const int has_guess, + const size_t pagerank_index, + const float tolerance, + const int max_iter); + + /* nvGRAPH contraction + * given array of agregates contract graph with + * given (Combine, Reduce) operators for Vertex Set + * and Edge Set; + */ + nvgraphStatus_t NVGRAPH_API nvgraphContractGraph(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t contrdescrG, + int *aggregates, + size_t numaggregates, + nvgraphSemiringOps_t VertexCombineOp, + nvgraphSemiringOps_t VertexReduceOp, + nvgraphSemiringOps_t EdgeCombineOp, + nvgraphSemiringOps_t EdgeReduceOp, + int flag); + + /* nvGRAPH spectral clustering + * given a graph and solver parameters of struct SpectralClusteringParameter, + * assign vertices to groups such as + * intra-group connections are strong and/or inter-groups connections are weak + * using spectral technique. + */ + nvgraphStatus_t NVGRAPH_API nvgraphSpectralClustering(nvgraphHandle_t handle, + const nvgraphGraphDescr_t graph_descr, + const size_t weight_index, + const struct SpectralClusteringParameter *params, + int* clustering, + void* eig_vals, + void* eig_vects); + + /* nvGRAPH analyze clustering + * Given a graph, a clustering, and a metric + * compute the score that measures the clustering quality according to the metric. 
+ */ + nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeClustering(nvgraphHandle_t handle, + const nvgraphGraphDescr_t graph_descr, + const size_t weight_index, + const int n_clusters, + const int* clustering, + nvgraphClusteringMetric_t metric, + float * score); + + /* nvGRAPH Triangles counting + * count number of triangles (cycles of size 3) formed by graph edges + */ + nvgraphStatus_t NVGRAPH_API nvgraphTriangleCount(nvgraphHandle_t handle, + const nvgraphGraphDescr_t graph_descr, + uint64_t* result); + + /* nvGRAPH Louvain implementation + */ + nvgraphStatus_t NVGRAPH_API nvgraphLouvain(cudaDataType_t index_type, + cudaDataType_t val_type, + const size_t num_vertex, + const size_t num_edges, + void* csr_ptr, + void* csr_ind, + void* csr_val, + int weighted, + int has_init_cluster, + void* init_cluster, + void* final_modularity, + void* best_cluster_vec, + void* num_level); + + + /* nvGRAPH Jaccard implementation + */ + nvgraphStatus_t NVGRAPH_API nvgraphJaccard(cudaDataType_t index_type, + cudaDataType_t val_type, + const size_t n, + const size_t e, + void* csr_ptr, + void *csr_ind, + void* csr_val, + int weighted, + void* v, + void* gamma, + void* weight_j); + + /* nvGRAPH attach structure + * Warp external device data into a nvgraphGraphDescr_t + * Warning : this data remain owned by the user + */ + nvgraphStatus_t NVGRAPH_API nvgraphAttachGraphStructure(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t TT); + + /* nvGRAPH attach Vertex Data + * Warp external device data into a vertex dim + * Warning : this data remain owned by the user + */ + nvgraphStatus_t NVGRAPH_API nvgraphAttachVertexData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *vertexData); + + /* nvGRAPH attach Edge Data + * Warp external device data into an edge dim + * Warning : this data remain owned by the user + */ + nvgraphStatus_t NVGRAPH_API nvgraphAttachEdgeData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *edgeData); #if defined(__cplusplus) } /* extern "C" */ #endif #endif /* _NVGRAPH_H_ */ - diff --git a/cpp/nvgraph/cpp/include/nvgraphP.h b/cpp/nvgraph/cpp/include/nvgraphP.h index 8e6080e874d..5ca1c369b1b 100644 --- a/cpp/nvgraph/cpp/include/nvgraphP.h +++ b/cpp/nvgraph/cpp/include/nvgraphP.h @@ -24,7 +24,7 @@ #pragma once #include "nvgraph.h" -#include "cnmem.h" +#include "rmm/rmm.h" #if defined(__cplusplus) extern "C" { @@ -41,7 +41,6 @@ typedef enum struct nvgraphContext { cudaStream_t stream; - cnmemDevice_t cnmem_device; int nvgraphIsInitialized; }; diff --git a/cpp/nvgraph/cpp/include/nvgraph_convert.hxx b/cpp/nvgraph/cpp/include/nvgraph_convert.hxx index f0c5620e7e7..0cd29195470 100644 --- a/cpp/nvgraph/cpp/include/nvgraph_convert.hxx +++ b/cpp/nvgraph/cpp/include/nvgraph_convert.hxx @@ -17,7 +17,6 @@ #include #include -#include namespace nvgraph{ void csr2coo( const int *csrSortedRowPtr, diff --git a/cpp/nvgraph/cpp/include/nvgraph_error.hxx b/cpp/nvgraph/cpp/include/nvgraph_error.hxx index 14815c83acd..a8fe364ebff 100644 --- a/cpp/nvgraph/cpp/include/nvgraph_error.hxx +++ b/cpp/nvgraph/cpp/include/nvgraph_error.hxx @@ -136,6 +136,25 @@ int NVGRAPH_GetErrorString( NVGRAPH_ERROR error, char* buffer, int buf_len); } #endif +// This is a gap filler, and should be replaced with a RAPIDS-wise error handling mechanism. 
+#undef rmmCheckError +#if defined(DEBUG) || defined(VERBOSE_DIAG) +#define rmmCheckError(e) { \ + if (e != RMM_SUCCESS) { \ + std::stringstream _error; \ + _error << "RMM failure."; \ + FatalError(_error.str(), NVGRAPH_ERR_CUDA_FAILURE); \ + } \ +} +#else // NO DEBUG +#define rmmCheckError(e) \ + { \ + if (e != RMM_SUCCESS) { \ + FatalError("", NVGRAPH_ERR_CUDA_FAILURE); \ + } \ + } +#endif + #define CHECK_CUDA(call) \ { \ cudaError_t _e = (call); \ diff --git a/cpp/nvgraph/cpp/include/nvgraph_vector.hxx b/cpp/nvgraph/cpp/include/nvgraph_vector.hxx index 33a69e9c1a1..5e03ccbde73 100644 --- a/cpp/nvgraph/cpp/include/nvgraph_vector.hxx +++ b/cpp/nvgraph/cpp/include/nvgraph_vector.hxx @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include "nvgraph_error.hxx" #include "nvgraph_vector_kernels.hxx" @@ -36,7 +36,7 @@ public: protected: /*! Storage for the values. */ - SHARED_PREFIX::shared_ptr values; + std::shared_ptr values; /*! Size of the array */ diff --git a/cpp/nvgraph/cpp/include/nvlouvain.cuh b/cpp/nvgraph/cpp/include/nvlouvain.cuh index 9644a17d40d..cabc923575f 100644 --- a/cpp/nvgraph/cpp/include/nvlouvain.cuh +++ b/cpp/nvgraph/cpp/include/nvlouvain.cuh @@ -30,6 +30,9 @@ #include #include +#include +#include + #include "graph_utils.cuh" #include "modularity.cuh" #include "delta_modularity.cuh" @@ -66,12 +69,12 @@ NVLOUVAIN_STATUS louvain(IdxType* csr_ptr, IdxType* csr_ind, ValType* csr_val, int n_edges = num_edges; int n_vertex = num_vertex; - thrust::device_vector csr_ptr_d(csr_ptr, csr_ptr + n_vertex + 1); - thrust::device_vector csr_ind_d(csr_ind, csr_ind + n_edges); - thrust::device_vector csr_val_d(csr_val, csr_val + n_edges); + rmm::device_vector csr_ptr_d(csr_ptr, csr_ptr + n_vertex + 1); + rmm::device_vector csr_ind_d(csr_ind, csr_ind + n_edges); + rmm::device_vector csr_val_d(csr_val, csr_val + n_edges); //std::vector clustering(n_vertex); - thrust::device_vector clustering(n_vertex); + rmm::device_vector clustering(n_vertex); int upper_bound = 100; HighResClock hr_clock; @@ -87,18 +90,18 @@ NVLOUVAIN_STATUS louvain(IdxType* csr_ptr, IdxType* csr_ind, ValType* csr_val, ValType best_modularity = -1; - thrust::device_vector new_csr_ptr(n_vertex, 0); - thrust::device_vector new_csr_ind(n_edges, 0); - thrust::device_vector new_csr_val(n_edges, 0); - - thrust::device_vector cluster_d(n_vertex); - thrust::device_vector aggregates_tmp_d(n_vertex, 0); - thrust::device_vector cluster_inv_ptr(c_size + 1, 0); - thrust::device_vector cluster_inv_ind(n_vertex, 0); - thrust::device_vector k_vec(n_vertex, 0); - thrust::device_vector Q_arr(n_vertex, 0); - thrust::device_vector delta_Q_arr(n_edges, 0); - thrust::device_vector cluster_sum_vec(c_size, 0); + rmm::device_vector new_csr_ptr(n_vertex, 0); + rmm::device_vector new_csr_ind(n_edges, 0); + rmm::device_vector new_csr_val(n_edges, 0); + + rmm::device_vector cluster_d(n_vertex); + rmm::device_vector aggregates_tmp_d(n_vertex, 0); + rmm::device_vector cluster_inv_ptr(c_size + 1, 0); + rmm::device_vector cluster_inv_ind(n_vertex, 0); + rmm::device_vector k_vec(n_vertex, 0); + rmm::device_vector Q_arr(n_vertex, 0); + rmm::device_vector delta_Q_arr(n_edges, 0); + rmm::device_vector cluster_sum_vec(c_size, 0); thrust::host_vector best_cluster_h(n_vertex, 0); Vector aggregates((int) current_n_vertex, 0); @@ -454,9 +457,9 @@ NVLOUVAIN_STATUS louvain(IdxType* csr_ptr, IdxType* csr_ind, ValType* csr_val, int n_edges = num_edges; int n_vertex = num_vertex; - thrust::device_vector csr_ptr_d(csr_ptr, csr_ptr + n_vertex + 1); - 
thrust::device_vector csr_ind_d(csr_ind, csr_ind + n_edges); - thrust::device_vector csr_val_d(csr_val, csr_val + n_edges); + rmm::device_vector csr_ptr_d(csr_ptr, csr_ptr + n_vertex + 1); + rmm::device_vector csr_ind_d(csr_ind, csr_ind + n_edges); + rmm::device_vector csr_val_d(csr_val, csr_val + n_edges); int upper_bound = 100; @@ -472,18 +475,18 @@ NVLOUVAIN_STATUS louvain(IdxType* csr_ptr, IdxType* csr_ind, ValType* csr_val, ValType best_modularity = -1; - thrust::device_vector new_csr_ptr(n_vertex, 0); - thrust::device_vector new_csr_ind(n_edges, 0); - thrust::device_vector new_csr_val(n_edges, 0); - - thrust::device_vector cluster_d(n_vertex); - thrust::device_vector aggregates_tmp_d(n_vertex, 0); - thrust::device_vector cluster_inv_ptr(c_size + 1, 0); - thrust::device_vector cluster_inv_ind(n_vertex, 0); - thrust::device_vector k_vec(n_vertex, 0); - thrust::device_vector Q_arr(n_vertex, 0); - thrust::device_vector delta_Q_arr(n_edges, 0); - thrust::device_vector cluster_sum_vec(c_size, 0); + rmm::device_vector new_csr_ptr(n_vertex, 0); + rmm::device_vector new_csr_ind(n_edges, 0); + rmm::device_vector new_csr_val(n_edges, 0); + + rmm::device_vector cluster_d(n_vertex); + rmm::device_vector aggregates_tmp_d(n_vertex, 0); + rmm::device_vector cluster_inv_ptr(c_size + 1, 0); + rmm::device_vector cluster_inv_ind(n_vertex, 0); + rmm::device_vector k_vec(n_vertex, 0); + rmm::device_vector Q_arr(n_vertex, 0); + rmm::device_vector delta_Q_arr(n_edges, 0); + rmm::device_vector cluster_sum_vec(c_size, 0); std::vector best_cluster_h(n_vertex, 0); Vector aggregates(current_n_vertex, 0); diff --git a/cpp/nvgraph/cpp/include/cnmem_shared_ptr.hxx b/cpp/nvgraph/cpp/include/rmm_shared_ptr.hxx similarity index 60% rename from cpp/nvgraph/cpp/include/cnmem_shared_ptr.hxx rename to cpp/nvgraph/cpp/include/rmm_shared_ptr.hxx index 2143ec8e4ac..da777bfdd86 100644 --- a/cpp/nvgraph/cpp/include/cnmem_shared_ptr.hxx +++ b/cpp/nvgraph/cpp/include/rmm_shared_ptr.hxx @@ -13,42 +13,30 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - + #pragma once -#include #include - - -// - -#if __cplusplus > 199711L +#include #include -#define SHARED_PREFIX std -#else -#include -#define SHARED_PREFIX boost +#include "rmm/rmm.h" -#endif - -#include #include "nvgraph_error.hxx" namespace nvgraph { template< typename T > -class DeviceDeleter +class DeviceDeleter { cudaStream_t mStream; public: DeviceDeleter(cudaStream_t stream) : mStream(stream) {} - void operator()(T *ptr) + void operator()(T *ptr) { - cnmemStatus_t status = cnmemFree(ptr, mStream); - if( status != CNMEM_STATUS_SUCCESS ) - { + auto status = RMM_FREE(ptr, mStream); + if (status != RMM_SUCCESS) { FatalError("Memory manager internal error (free)", NVGRAPH_ERR_UNKNOWN); } } @@ -56,38 +44,36 @@ public: template< typename T > -inline SHARED_PREFIX::shared_ptr allocateDevice(size_t n, cudaStream_t stream) +inline std::shared_ptr allocateDevice(size_t n, cudaStream_t stream) { T *ptr = NULL; - cnmemStatus_t status = cnmemMalloc((void**) &ptr, n*sizeof(T), stream); - if( status == CNMEM_STATUS_OUT_OF_MEMORY) - { + auto status = RMM_ALLOC(&ptr, n * sizeof(T), stream); + if (status == RMM_ERROR_OUT_OF_MEMORY) { FatalError("Not enough memory", NVGRAPH_ERR_NO_MEMORY); } - else if (status != CNMEM_STATUS_SUCCESS) - { - FatalError("Memory manager internal error (alloc)", NVGRAPH_ERR_UNKNOWN); + else if (status != RMM_SUCCESS) { + FatalError("Memory manager internal error (alloc)", NVGRAPH_ERR_UNKNOWN); } - return SHARED_PREFIX::shared_ptr(ptr, DeviceDeleter(stream)); + return std::shared_ptr(ptr, DeviceDeleter(stream)); } template< typename T > -class DeviceReleaser +class DeviceReleaser { cudaStream_t mStream; public: DeviceReleaser(cudaStream_t stream) : mStream(stream) {} - void operator()(T *ptr) + void operator()(T *ptr) { } }; template< typename T > -inline SHARED_PREFIX::shared_ptr attachDevicePtr(T * ptr_in, cudaStream_t stream) +inline std::shared_ptr attachDevicePtr(T * ptr_in, cudaStream_t stream) { T *ptr = ptr_in; - return SHARED_PREFIX::shared_ptr(ptr, DeviceReleaser(stream)); + return std::shared_ptr(ptr, DeviceReleaser(stream)); } diff --git a/cpp/nvgraph/cpp/include/test/cluster_inv.cuh b/cpp/nvgraph/cpp/include/test_to_be_removed/cluster_inv.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/cluster_inv.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/cluster_inv.cuh diff --git a/cpp/nvgraph/cpp/include/test/delta_modularity_test.cuh b/cpp/nvgraph/cpp/include/test_to_be_removed/delta_modularity_test.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/delta_modularity_test.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/delta_modularity_test.cuh diff --git a/cpp/nvgraph/cpp/include/test/k_compute_test.cuh b/cpp/nvgraph/cpp/include/test_to_be_removed/k_compute_test.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/k_compute_test.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/k_compute_test.cuh diff --git a/cpp/nvgraph/cpp/include/test/k_in_test.cuh b/cpp/nvgraph/cpp/include/test_to_be_removed/k_in_test.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/k_in_test.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/k_in_test.cuh diff --git a/cpp/nvgraph/cpp/include/test/mem_test.cuh b/cpp/nvgraph/cpp/include/test_to_be_removed/mem_test.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/mem_test.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/mem_test.cuh diff --git a/cpp/nvgraph/cpp/include/test/modularity_test.cuh 
b/cpp/nvgraph/cpp/include/test_to_be_removed/modularity_test.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/modularity_test.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/modularity_test.cuh diff --git a/cpp/nvgraph/cpp/include/test/phase_1_color_test.cuh b/cpp/nvgraph/cpp/include/test_to_be_removed/phase_1_color_test.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/phase_1_color_test.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/phase_1_color_test.cuh diff --git a/cpp/nvgraph/cpp/include/test/phase_1_test.cuh b/cpp/nvgraph/cpp/include/test_to_be_removed/phase_1_test.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/phase_1_test.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/phase_1_test.cuh diff --git a/cpp/nvgraph/cpp/include/test/thrust_test.cuh b/cpp/nvgraph/cpp/include/test_to_be_removed/thrust_test.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/thrust_test.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/thrust_test.cuh diff --git a/cpp/nvgraph/cpp/include/thrust_coarse_generator.cuh b/cpp/nvgraph/cpp/include/thrust_coarse_generator.cuh index 7faec5ee85d..1a017d80c80 100644 --- a/cpp/nvgraph/cpp/include/thrust_coarse_generator.cuh +++ b/cpp/nvgraph/cpp/include/thrust_coarse_generator.cuh @@ -20,9 +20,12 @@ #include #include #include -#include "util.cuh" + +#include +#include + #include "graph_utils.cuh" -//#include //indices_to_offsets +#include "util.cuh" template void indices_to_offsets(const thrust::execution_policy &exec, @@ -161,21 +164,21 @@ void jToJKernel(const IndexType *column_indices, const IndexType *aggregates, In // Method to compute Ac on DEVICE using csr format template void generate_superverticies_graph(const int n_vertex, const int num_aggregates, - thrust::device_vector &csr_ptr_d, - thrust::device_vector &csr_ind_d, - thrust::device_vector &csr_val_d, - thrust::device_vector &new_csr_ptr_d, - thrust::device_vector &new_csr_ind_d, - thrust::device_vector &new_csr_val_d, - const thrust::device_vector &aggregates + rmm::device_vector &csr_ptr_d, + rmm::device_vector &csr_ind_d, + rmm::device_vector &csr_val_d, + rmm::device_vector &new_csr_ptr_d, + rmm::device_vector &new_csr_ind_d, + rmm::device_vector &new_csr_val_d, + const rmm::device_vector &aggregates ){ const int n_edges = csr_ptr_d[n_vertex]; - thrust::device_vector I(n_edges,-1); - thrust::device_vector J(n_edges,-1); - thrust::device_vector V(n_edges,-1); + rmm::device_vector I(n_edges,-1); + rmm::device_vector J(n_edges,-1); + rmm::device_vector V(n_edges,-1); const int block_size_I = 128; const int block_size_J = 256; @@ -229,7 +232,7 @@ void generate_superverticies_graph(const int n_vertex, const int num_aggregates, // Reduce by key to fill in Ac.column_indices and Ac.values - thrust::device_vector new_row_indices(NNZ,0); + rmm::device_vector new_row_indices(NNZ,0); thrust::reduce_by_key(thrust::make_zip_iterator(thrust::make_tuple(I.begin(), J.begin())), diff --git a/cpp/nvgraph/cpp/include/thrust_traits.hxx b/cpp/nvgraph/cpp/include/thrust_traits.hxx index 922d680474d..89a026d8c53 100644 --- a/cpp/nvgraph/cpp/include/thrust_traits.hxx +++ b/cpp/nvgraph/cpp/include/thrust_traits.hxx @@ -14,65 +14,35 @@ * limitations under the License. 
*/ - - #ifndef THRUST_TRAITS_HXX - #define THRUST_TRAITS_HXX - - -#include - +#include #include - +#include +#include namespace nvgraph - { - //generic Vector Ptr Type facade: - - // - template - struct VectorPtrT; - - //partial specialization for device_vector: - - // - template - - struct VectorPtrT > - + struct VectorPtrT> { - typedef thrust::device_ptr PtrT; }; - - //partial specialization for host_vector: - - // - template - - struct VectorPtrT > - + struct VectorPtrT> { - typedef typename thrust::host_vector::value_type* PtrT; - }; - } #endif - diff --git a/cpp/nvgraph/cpp/include/valued_csr_graph.cuh b/cpp/nvgraph/cpp/include/valued_csr_graph.cuh index 81e0e517f06..cf000da24a9 100644 --- a/cpp/nvgraph/cpp/include/valued_csr_graph.cuh +++ b/cpp/nvgraph/cpp/include/valued_csr_graph.cuh @@ -16,27 +16,30 @@ #pragma once +#include +#include + namespace nvlouvain{ template -class Vector: public thrust::device_vector{ +class Vector: public rmm::device_vector{ public: - Vector(): thrust::device_vector(){} - Vector(int size): thrust::device_vector(size){} + Vector(): rmm::device_vector(){} + Vector(int size): rmm::device_vector(size){} template - Vector(Iter begin, Iter end): thrust::device_vector(begin, end){} + Vector(Iter begin, Iter end): rmm::device_vector(begin, end){} inline void fill(const ValType val){ thrust::fill(thrust::cuda::par, this->begin(), this->end(), val); } - inline thrust::device_vector& to_device_vector(){ - return static_cast> (*this); + inline rmm::device_vector& to_device_vector(){ + return static_cast> (*this); } inline ValType* raw(){ - return (ValType*)thrust::raw_pointer_cast( thrust::device_vector::data() ); + return (ValType*)thrust::raw_pointer_cast( rmm::device_vector::data() ); } inline int get_size(){ @@ -49,7 +52,7 @@ template class CsrGraph{ public: - CsrGraph( thrust::device_vector& csr_ptr_d, thrust::device_vector& csr_ind_d, thrust::device_vector& csr_val_d, IndexType v, IndexType e, bool _w=false): + CsrGraph( rmm::device_vector& csr_ptr_d, rmm::device_vector& csr_ind_d, rmm::device_vector& csr_val_d, IndexType v, IndexType e, bool _w=false): _n_vertices(v), _n_edges(e), csr_ptr(csr_ptr_d.begin(), csr_ptr_d.end()), csr_ind(csr_ind_d.begin(), csr_ind_d.end()), csr_val(csr_val_d.begin(), csr_val_d.end()), weighted(_w){ } @@ -93,32 +96,32 @@ class CsrGraph{ return csr_val; } - inline void update_csr_ptr(thrust::device_vector & d_v){ + inline void update_csr_ptr(rmm::device_vector & d_v){ thrust::copy(thrust::cuda::par, d_v.begin(), d_v.end(), csr_ptr.begin()); } - inline void update_csr_ptr_n(thrust::device_vector & d_v,unsigned size){ + inline void update_csr_ptr_n(rmm::device_vector & d_v,unsigned size){ csr_ptr.resize(size); thrust::copy_n(thrust::cuda::par, d_v.begin(), size, csr_ptr.begin()); } - inline void update_csr_ind(thrust::device_vector & d_v){ + inline void update_csr_ind(rmm::device_vector & d_v){ thrust::copy(thrust::cuda::par, d_v.begin(), d_v.end(), csr_ind.begin()); } - inline void update_csr_ind_n(thrust::device_vector & d_v,unsigned size){ + inline void update_csr_ind_n(rmm::device_vector & d_v,unsigned size){ csr_ind.resize(size); thrust::copy_n(thrust::cuda::par, d_v.begin(), size, csr_ind.begin()); } - inline void update_csr_val(thrust::device_vector & d_v){ + inline void update_csr_val(rmm::device_vector & d_v){ thrust::copy(thrust::cuda::par, d_v.begin(), d_v.end(), csr_val.begin()); } - inline void update_csr_val_n(thrust::device_vector & d_v,unsigned size){ + inline void update_csr_val_n(rmm::device_vector & d_v,unsigned 
size){ csr_val.resize(size); thrust::copy_n(thrust::cuda::par, d_v.begin(), size, csr_val.begin()); } - inline void update_graph(size_t n_v, size_t n_e, thrust::device_vector & ptr, thrust::device_vector & ind, thrust::device_vector & val, bool w){ + inline void update_graph(size_t n_v, size_t n_e, rmm::device_vector & ptr, rmm::device_vector & ind, rmm::device_vector & val, bool w){ _n_vertices = n_v; _n_edges = n_e; #ifdef DEBUG diff --git a/cpp/nvgraph/cpp/include/valued_csr_graph.hxx b/cpp/nvgraph/cpp/include/valued_csr_graph.hxx index 5fe1986c449..0469eabf2fa 100644 --- a/cpp/nvgraph/cpp/include/valued_csr_graph.hxx +++ b/cpp/nvgraph/cpp/include/valued_csr_graph.hxx @@ -38,7 +38,7 @@ private: protected: /*! Storage for the nonzero entries of the CSR data structure. */ - SHARED_PREFIX::shared_ptr values; + std::shared_ptr values; public: diff --git a/cpp/nvgraph/cpp/src/arnoldi.cu b/cpp/nvgraph/cpp/src/arnoldi.cu index 8975b985f83..617adb893ad 100644 --- a/cpp/nvgraph/cpp/src/arnoldi.cu +++ b/cpp/nvgraph/cpp/src/arnoldi.cu @@ -31,13 +31,6 @@ #include "nvgraph_csrmv.hxx" #include "matrix.hxx" - -#include "debug_macros.h" -#ifdef DEBUG -#define IRAM_VERBOSE -// #define IRAM_DEBUG -#endif - namespace nvgraph { @@ -88,19 +81,6 @@ NVGRAPH_ERROR ImplicitArnoldi::solve(const int restart_i const int nested_subspaces_freq) { //try { - #ifdef IRAM_VERBOSE - std::stringstream ss; - ss.str(std::string()); - size_t used_mem, free_mem, total_mem; - ss <<" ------------------ImplicitArnoldi------------------"<< std::endl; - ss <<" --------------------------------------------"<< std::endl; - ss << std::setw(10) << "Iteration" << std::setw(20) << " Mem Usage (MB)" << std::setw(15) << "Residual" << std::endl; - ss <<" --------------------------------------------"<< std::endl; - COUT()<::solve(const int restart_i bool converged = false; int i = 0; // we can print stats after setup to have the initial residual - #ifdef IRAM_VERBOSE - ss.str(std::string()); - cnmemMemGetInfo(&free_mem, &total_mem, NULL); - used_mem=total_mem-free_mem; - ss << std::setw(10) << i ; - ss.precision(3); - ss << std::setw(20) << std::fixed << used_mem/1024.0/1024.0; - ss << std::setw(15) << std::scientific << m_residual; - if (m_miramns) ss << " (Krylov size: " << m_select << ")"; - ss << std::endl; - COUT()<::solve(const int restart_i compute_eigenvectors(); cudaMemcpyAsync(eigVals.raw(), &m_ritz_eigenvalues[0], (size_t)(m_nr_eigenvalues*sizeof(m_ritz_eigenvalues[0])), cudaMemcpyHostToDevice); cudaCheckError(); - #ifdef IRAM_VERBOSE - COUT() <<" --------------------------------------------"<< std::endl; - //stop timer - COUT() <<" Total Time : "<< timer.stop() << "ms"<::solve_arnoldi(int lower_bound, int } } - #ifdef IRAM_DEBUG - COUT() - <<"---------------------------------------------"<::solve_it() template void ImplicitArnoldi::select_subspace() { - #ifdef IRAM_DEBUG - COUT() < 199711L typename std::vector::iterator it = std::min_element(std::begin(m_mns_residuals), std::end(m_mns_residuals)); #else @@ -682,25 +604,10 @@ void ImplicitArnoldi::compute_residual(int subspace_size lam = std::abs(m_ritz_eigenvalues[i]); tmp_residual = residual_norm / lam; - //tmp_residual = residual_norm ; - //COUT() << "last_ritz_vector : "<::implicit_restart() // optim: avoid the cpy here if (!m_miramns) std::copy(m_H.begin(), m_H.end(), m_H_select.begin()); select_shifts(m_dirty_bit); - #ifdef IRAM_DEBUG - for(int i = 0; i::select_shifts(bool dirty_bit) std::copy(m_H_select.begin(), m_H_select.end(), m_H_tmp.begin()); 
//Lapack::geev(&m_H_tmp[0], &m_ritz_eigenvalues[0], &m_ritz_eigenvectors[0], m_select , m_select, m_select); Lapack::geev(&m_H_tmp[0], &m_ritz_eigenvalues[0],&m_ritz_eigenvalues_i[0], &m_ritz_eigenvectors[0], NULL, m_select , m_select, m_select); - // #ifdef IRAM_DEBUG - // COUT() << "m_ritz_eigenvalues : "<::qr_step() //for (int j = 0; j < m_select; j++) // m_Q[j*m_select+j] = 1.0; - #ifdef IRAM_DEBUG - COUT() << "m_ritz_eigenvalues : "<= m_n_eigenvalues) { @@ -1144,17 +995,6 @@ void ImplicitArnoldi::compute_eigenvectors() //nrm 1 for pagerank if(m_markov) Cublas::scal(n, (ValueType_)1.0/m_eigenvectors.nrm1(), m_eigenvectors.raw(), 1); - - #ifdef IRAM_DEBUG - COUT()< @@ -1200,15 +1040,9 @@ void ImplicitArnoldi::cleanup_subspace(std::vector void ImplicitArnoldi::shift(std::vector& H, int ld, int m, ValueType mu) { - #ifdef IRAM_DEBUG - dump_host_dense_mat(H,ld); - #endif int start = ld-m; for (int i = start; i < ld; i++) H[i*ld+i-start] -= mu; - #ifdef IRAM_DEBUG - dump_host_dense_mat(H,ld); - #endif } template diff --git a/cpp/nvgraph/cpp/src/bfs.cu b/cpp/nvgraph/cpp/src/bfs.cu index 218f01a87ac..8c4934ca442 100644 --- a/cpp/nvgraph/cpp/src/bfs.cu +++ b/cpp/nvgraph/cpp/src/bfs.cu @@ -19,542 +19,540 @@ #include "bfs.hxx" #include +#include + #include "nvgraph_error.hxx" #include "bfs_kernels.cu" using namespace bfs_kernels; namespace nvgraph { - enum BFS_ALGO_STATE { - TOPDOWN, BOTTOMUP - }; - - template - NVGRAPH_ERROR Bfs::setup() { - - // Determinism flag, false by default - deterministic = false; - //Working data - //Each vertex can be in the frontier at most once - cudaMalloc(&frontier, n * sizeof(IndexType)); - cudaCheckError() - ; - - //We will update frontier during the execution - //We need the orig to reset frontier, or cudaFree - original_frontier = frontier; - - //size of bitmaps for vertices - vertices_bmap_size = (n / (8 * sizeof(int)) + 1); - //ith bit of visited_bmap is set <=> ith vertex is visited - cudaMalloc(&visited_bmap, sizeof(int) * vertices_bmap_size); - cudaCheckError() - ; - - //ith bit of isolated_bmap is set <=> degree of ith vertex = 0 - cudaMalloc(&isolated_bmap, sizeof(int) * vertices_bmap_size); - cudaCheckError() - ; - - //vertices_degree[i] = degree of vertex i - cudaMalloc(&vertex_degree, sizeof(IndexType) * n); - cudaCheckError() - ; - - //Cub working data - cub_exclusive_sum_alloc(n + 1, d_cub_exclusive_sum_storage, cub_exclusive_sum_storage_bytes); - - //We will need (n+1) ints buffer for two differents things (bottom up or top down) - sharing it since those uses are mutually exclusive - cudaMalloc(&buffer_np1_1, (n + 1) * sizeof(IndexType)); - cudaCheckError() - ; - cudaMalloc(&buffer_np1_2, (n + 1) * sizeof(IndexType)); - cudaCheckError() - ; - - //Using buffers : top down - - //frontier_vertex_degree[i] is the degree of vertex frontier[i] - frontier_vertex_degree = buffer_np1_1; - //exclusive sum of frontier_vertex_degree - exclusive_sum_frontier_vertex_degree = buffer_np1_2; - - //Using buffers : bottom up - - //contains list of unvisited vertices - unvisited_queue = buffer_np1_1; - //size of the "last" unvisited queue : size_last_unvisited_queue - //refers to the size of unvisited_queue - //which may not be up to date (the queue may contains vertices that are now visited) - - //We may leave vertices unvisited after bottom up main kernels - storing them here - left_unvisited_queue = buffer_np1_2; - - //We use buckets of edges (32 edges per bucket for now, see exact macro in bfs_kernels). 
frontier_vertex_degree_buckets_offsets[i] is the index k such as frontier[k] is the source of the first edge of the bucket - //See top down kernels for more details - cudaMalloc( &exclusive_sum_frontier_vertex_buckets_offsets, - ((nnz / TOP_DOWN_EXPAND_DIMX + 1) * NBUCKETS_PER_BLOCK + 2) * sizeof(IndexType)); - cudaCheckError() - ; - - //Init device-side counters - //Those counters must be/can be reset at each bfs iteration - //Keeping them adjacent in memory allow use call only one cudaMemset - launch latency is the current bottleneck - cudaMalloc(&d_counters_pad, 4 * sizeof(IndexType)); - cudaCheckError() - ; - - d_new_frontier_cnt = &d_counters_pad[0]; - d_mu = &d_counters_pad[1]; - d_unvisited_cnt = &d_counters_pad[2]; - d_left_unvisited_cnt = &d_counters_pad[3]; - - //Lets use this int* for the next 3 lines - //Its dereferenced value is not initialized - so we dont care about what we put in it - IndexType * d_nisolated = d_new_frontier_cnt; - cudaMemsetAsync(d_nisolated, 0, sizeof(IndexType), stream); - cudaCheckError() - ; - - //Computing isolated_bmap - //Only dependent on graph - not source vertex - done once - flag_isolated_vertices(n, isolated_bmap, row_offsets, vertex_degree, d_nisolated, stream); - cudaMemcpyAsync(&nisolated, d_nisolated, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - cudaCheckError() - ; - - //We need nisolated to be ready to use - cudaStreamSynchronize(stream); - cudaCheckError() - ; - - return NVGRAPH_OK; - } - - template - NVGRAPH_ERROR Bfs::configure( IndexType *_distances, - IndexType *_predecessors, - int *_edge_mask) - { - distances = _distances; - predecessors = _predecessors; - edge_mask = _edge_mask; - - useEdgeMask = (edge_mask != NULL); - computeDistances = (distances != NULL); - computePredecessors = (predecessors != NULL); - - //We need distances to use bottom up - if (directed && !computeDistances) - cudaMalloc(&distances, n * sizeof(IndexType)); - - cudaCheckError() - ; - - return NVGRAPH_OK; - } - - template - NVGRAPH_ERROR Bfs::traverse(IndexType source_vertex) { - - //Init visited_bmap - //If the graph is undirected, we not that - //we will never discover isolated vertices (in degree = out degree = 0) - //we avoid a lot of work by flagging them now - //in g500 graphs they represent ~25% of total vertices - //more than that for wiki and twitter graphs - - if (directed) { - cudaMemsetAsync(visited_bmap, 0, vertices_bmap_size * sizeof(int), stream); - } else { - cudaMemcpyAsync( visited_bmap, - isolated_bmap, - vertices_bmap_size * sizeof(int), - cudaMemcpyDeviceToDevice, - stream); - } - cudaCheckError() - ; - - //If needed, setting all vertices as undiscovered (inf distance) - //We dont use computeDistances here - //if the graph is undirected, we may need distances even if - //computeDistances is false - if (distances) - fill_vec(distances, n, vec_t::max, stream); - - //If needed, setting all predecessors to non-existent (-1) - if (computePredecessors) - { - cudaMemsetAsync(predecessors, -1, n * sizeof(IndexType), stream); - cudaCheckError() - ; - } - - // - //Initial frontier - // - - frontier = original_frontier; - - if (distances) - { - cudaMemsetAsync(&distances[source_vertex], 0, sizeof(IndexType), stream); - cudaCheckError() - ; - } - - //Setting source_vertex as visited - //There may be bit already set on that bmap (isolated vertices) - if the graph is undirected - int current_visited_bmap_source_vert = 0; - - if (!directed) { - cudaMemcpyAsync(¤t_visited_bmap_source_vert, - &visited_bmap[source_vertex / INT_SIZE], - 
sizeof(int), - cudaMemcpyDeviceToHost); - cudaCheckError() - ; - //We need current_visited_bmap_source_vert - cudaStreamSynchronize(stream); - cudaCheckError() - ; - //We could detect that source is isolated here - } - - int m = (1 << (source_vertex % INT_SIZE)); - - //In that case, source is isolated, done now - if (!directed && (m & current_visited_bmap_source_vert)) { - //Init distances and predecessors are done, (cf Streamsync in previous if) - cudaCheckError() - ; - return NVGRAPH_OK; - } - - m |= current_visited_bmap_source_vert; - - cudaMemcpyAsync( &visited_bmap[source_vertex / INT_SIZE], - &m, - sizeof(int), - cudaMemcpyHostToDevice, - stream); - cudaCheckError() - ; - - //Adding source_vertex to init frontier - cudaMemcpyAsync( &frontier[0], - &source_vertex, - sizeof(IndexType), - cudaMemcpyHostToDevice, - stream); - cudaCheckError() - ; - - //mf : edges in frontier - //nf : vertices in frontier - //mu : edges undiscovered - //nu : nodes undiscovered - //lvl : current frontier's depth - IndexType mf, nf, mu, nu; - bool growing; - IndexType lvl = 1; - - //Frontier has one vertex - nf = 1; - - //all edges are undiscovered (by def isolated vertices have 0 edges) - mu = nnz; - - //all non isolated vertices are undiscovered (excepted source vertex, which is in frontier) - //That number is wrong if source_vertex is also isolated - but it's not important - nu = n - nisolated - nf; - - //Last frontier was 0, now it is 1 - growing = true; - - IndexType size_last_left_unvisited_queue = n; //we just need value > 0 - IndexType size_last_unvisited_queue = 0; //queue empty - - //Typical pre-top down workflow. set_frontier_degree + exclusive-scan - set_frontier_degree(frontier_vertex_degree, frontier, vertex_degree, nf, stream); - exclusive_sum( d_cub_exclusive_sum_storage, - cub_exclusive_sum_storage_bytes, - frontier_vertex_degree, - exclusive_sum_frontier_vertex_degree, - nf + 1, - stream); - - cudaMemcpyAsync( &mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError() - ; - - //We need mf - cudaStreamSynchronize(stream); - cudaCheckError() - ; - - //At first we know we have to use top down - BFS_ALGO_STATE algo_state = TOPDOWN; - - //useDistances : we check if a vertex is a parent using distances in bottom up - distances become working data - //undirected g : need parents to be in children's neighbors - bool can_use_bottom_up = !directed && distances; - - while (nf > 0) { - //Each vertices can appear only once in the frontierer array - we know it will fit - new_frontier = frontier + nf; - IndexType old_nf = nf; - resetDevicePointers(); - - if (can_use_bottom_up) { - //Choosing algo - //Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf - - switch (algo_state) { - case TOPDOWN: - if (mf > mu / alpha) - algo_state = BOTTOMUP; - break; - case BOTTOMUP: - if (!growing && nf < n / beta) { - - //We need to prepare the switch back to top down - //We couldnt keep track of mu during bottom up - because we dont know what mf is. Computing mu here - count_unvisited_edges( unvisited_queue, - size_last_unvisited_queue, - visited_bmap, - vertex_degree, - d_mu, - stream); - - //Typical pre-top down workflow. 
set_frontier_degree + exclusive-scan - set_frontier_degree(frontier_vertex_degree, frontier, vertex_degree, nf, stream); - exclusive_sum( d_cub_exclusive_sum_storage, - cub_exclusive_sum_storage_bytes, - frontier_vertex_degree, - exclusive_sum_frontier_vertex_degree, - nf + 1, - stream); - - cudaMemcpyAsync( &mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError() - ; - - cudaMemcpyAsync(&mu, d_mu, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - cudaCheckError() - ; - - //We will need mf and mu - cudaStreamSynchronize(stream); - cudaCheckError() - ; - - algo_state = TOPDOWN; - } - break; - } - } - - //Executing algo - - switch (algo_state) { - case TOPDOWN: - compute_bucket_offsets( exclusive_sum_frontier_vertex_degree, - exclusive_sum_frontier_vertex_buckets_offsets, - nf, - mf, - stream); - frontier_expand( row_offsets, - col_indices, - frontier, - nf, - mf, - lvl, - new_frontier, - d_new_frontier_cnt, - exclusive_sum_frontier_vertex_degree, - exclusive_sum_frontier_vertex_buckets_offsets, - visited_bmap, - distances, - predecessors, - edge_mask, - isolated_bmap, - directed, - stream, - deterministic); - - mu -= mf; - - cudaMemcpyAsync( &nf, - d_new_frontier_cnt, - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError(); - - //We need nf - cudaStreamSynchronize(stream); - cudaCheckError(); - - if (nf) { - - //Typical pre-top down workflow. set_frontier_degree + exclusive-scan - set_frontier_degree(frontier_vertex_degree, new_frontier, vertex_degree, nf, stream); - exclusive_sum( d_cub_exclusive_sum_storage, - cub_exclusive_sum_storage_bytes, - frontier_vertex_degree, - exclusive_sum_frontier_vertex_degree, - nf + 1, - stream); - cudaMemcpyAsync( &mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError() - ; - - //We need mf - cudaStreamSynchronize(stream); - cudaCheckError() - ; - } - break; - - case BOTTOMUP: - fill_unvisited_queue(visited_bmap, - vertices_bmap_size, - n, - unvisited_queue, - d_unvisited_cnt, - stream, - deterministic); - - size_last_unvisited_queue = nu; - - bottom_up_main(unvisited_queue, - size_last_unvisited_queue, - left_unvisited_queue, - d_left_unvisited_cnt, - visited_bmap, - row_offsets, - col_indices, - lvl, - new_frontier, - d_new_frontier_cnt, - distances, - predecessors, - edge_mask, - stream, - deterministic); - - //The number of vertices left unvisited decreases - //If it wasnt necessary last time, it wont be this time - if (size_last_left_unvisited_queue) { - cudaMemcpyAsync( &size_last_left_unvisited_queue, - d_left_unvisited_cnt, - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError() - ; - //We need last_left_unvisited_size - cudaStreamSynchronize(stream); - cudaCheckError() - ; - bottom_up_large( left_unvisited_queue, - size_last_left_unvisited_queue, - visited_bmap, - row_offsets, - col_indices, - lvl, - new_frontier, - d_new_frontier_cnt, - distances, - predecessors, - edge_mask, - stream, - deterministic); - } - cudaMemcpyAsync( &nf, - d_new_frontier_cnt, - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError() - ; - - //We will need nf - cudaStreamSynchronize(stream); - cudaCheckError() - ; - - break; - } - - //Updating undiscovered edges count - nu -= nf; - - //Using new frontier - frontier = new_frontier; - growing = (nf > old_nf); - - ++lvl; - } - - cudaCheckError() - ; - return NVGRAPH_OK; - } - - //Just used for benchmarks now - template - 
NVGRAPH_ERROR Bfs::traverse(IndexType *source_vertices, IndexType nsources) { - for (IndexType i = 0; i < nsources; ++i) - traverse(source_vertices[i]); - - return NVGRAPH_OK; - } - - template - void Bfs::resetDevicePointers() { - cudaMemsetAsync(d_counters_pad, 0, 4 * sizeof(IndexType), stream); - cudaCheckError() - ; - } - - template - void Bfs::clean() { - cudaCheckError() - ; - - //the vectors have a destructor that takes care of cleaning - cudaFree(original_frontier); - cudaFree(visited_bmap); - cudaFree(isolated_bmap); - cudaFree(vertex_degree); - cudaFree(d_cub_exclusive_sum_storage); - cudaFree(buffer_np1_1); - cudaFree(buffer_np1_2); - cudaFree(exclusive_sum_frontier_vertex_buckets_offsets); - cudaFree(d_counters_pad); - - //In that case, distances is a working data - if (directed && !computeDistances) - cudaFree(distances); - - cudaCheckError() - ; - } - - template class Bfs ; + enum BFS_ALGO_STATE { + TOPDOWN, BOTTOMUP + }; + + template + NVGRAPH_ERROR Bfs::setup() { + + // Determinism flag, false by default + deterministic = false; + + auto rmm_result = RMM_SUCCESS; + + //Working data + //Each vertex can be in the frontier at most once + rmm_result = RMM_ALLOC(&frontier, n * sizeof(IndexType), stream); + rmmCheckError(rmm_result); + + //We will update frontier during the execution + //We need the orig to reset frontier, or cudaFree + original_frontier = frontier; + + //size of bitmaps for vertices + vertices_bmap_size = (n / (8 * sizeof(int)) + 1); + //ith bit of visited_bmap is set <=> ith vertex is visited + rmm_result = RMM_ALLOC(&visited_bmap, sizeof(int) * vertices_bmap_size, stream); + rmmCheckError(rmm_result); + + //ith bit of isolated_bmap is set <=> degree of ith vertex = 0 + rmm_result = RMM_ALLOC(&isolated_bmap, sizeof(int) * vertices_bmap_size, stream); + rmmCheckError(rmm_result); + + //vertices_degree[i] = degree of vertex i + rmm_result = RMM_ALLOC(&vertex_degree, sizeof(IndexType) * n, stream); + rmmCheckError(rmm_result); + + //Cub working data + cub_exclusive_sum_alloc(n + 1, d_cub_exclusive_sum_storage, cub_exclusive_sum_storage_bytes); + + //We will need (n+1) ints buffer for two differents things (bottom up or top down) - sharing it since those uses are mutually exclusive + rmm_result = RMM_ALLOC(&buffer_np1_1, (n + 1) * sizeof(IndexType), stream); + rmmCheckError(rmm_result); + + rmm_result = RMM_ALLOC(&buffer_np1_2, (n + 1) * sizeof(IndexType), stream); + rmmCheckError(rmm_result); + + //Using buffers : top down + + //frontier_vertex_degree[i] is the degree of vertex frontier[i] + frontier_vertex_degree = buffer_np1_1; + //exclusive sum of frontier_vertex_degree + exclusive_sum_frontier_vertex_degree = buffer_np1_2; + + //Using buffers : bottom up + + //contains list of unvisited vertices + unvisited_queue = buffer_np1_1; + //size of the "last" unvisited queue : size_last_unvisited_queue + //refers to the size of unvisited_queue + //which may not be up to date (the queue may contains vertices that are now visited) + + //We may leave vertices unvisited after bottom up main kernels - storing them here + left_unvisited_queue = buffer_np1_2; + + //We use buckets of edges (32 edges per bucket for now, see exact macro in bfs_kernels). 
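// --- Editorial sketch, not part of this diff -----------------------------------
// Host analog of the bucket bookkeeping described in the surrounding comment and
// built later by compute_bucket_offsets: the frontier's edges are cut into
// fixed-size buckets and, for each bucket start, we record the largest frontier
// index k whose exclusive degree sum is <= that edge slot, so the per-thread
// binary search in the expand kernel only has to cover one bucket.
// Worked example: degrees {3, 1, 4} give the exclusive sum {0, 3, 4, 8}; edge
// slot 5 falls in [4, 8) and therefore belongs to frontier[2].
// The real kernel also appends a trailing bucket and clamps the last edge slot;
// that detail is omitted here. Helper name is illustrative only.
inline void bucket_offsets_reference(const int *exsum, int nf,        // exsum has nf + 1 entries
                                     int total_degree, int bucket_size,
                                     int *bucket_offsets) {
    int b = 0;
    for (int edge = 0; edge < total_degree; edge += bucket_size) {
        int k = 0;
        while (k + 1 < nf && exsum[k + 1] <= edge)
            ++k;                                 // largest k with exsum[k] <= edge
        bucket_offsets[b++] = k;                 // frontier[k] owns this bucket's first edge
    }
}
// --------------------------------------------------------------------------------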
frontier_vertex_degree_buckets_offsets[i] is the index k such as frontier[k] is the source of the first edge of the bucket + //See top down kernels for more details + rmm_result = RMM_ALLOC(&exclusive_sum_frontier_vertex_buckets_offsets, + ((nnz / TOP_DOWN_EXPAND_DIMX + 1) * NBUCKETS_PER_BLOCK + 2) * sizeof(IndexType), + stream); + rmmCheckError(rmm_result); + + //Init device-side counters + //Those counters must be/can be reset at each bfs iteration + //Keeping them adjacent in memory allow use call only one cudaMemset - launch latency is the current bottleneck + rmm_result = RMM_ALLOC(&d_counters_pad, 4 * sizeof(IndexType), stream); + rmmCheckError(rmm_result); + + d_new_frontier_cnt = &d_counters_pad[0]; + d_mu = &d_counters_pad[1]; + d_unvisited_cnt = &d_counters_pad[2]; + d_left_unvisited_cnt = &d_counters_pad[3]; + + //Lets use this int* for the next 3 lines + //Its dereferenced value is not initialized - so we dont care about what we put in it + IndexType * d_nisolated = d_new_frontier_cnt; + cudaMemsetAsync(d_nisolated, 0, sizeof(IndexType), stream); + cudaCheckError() + ; + + //Computing isolated_bmap + //Only dependent on graph - not source vertex - done once + flag_isolated_vertices(n, isolated_bmap, row_offsets, vertex_degree, d_nisolated, stream); + cudaMemcpyAsync(&nisolated, d_nisolated, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); + cudaCheckError() + ; + + //We need nisolated to be ready to use + cudaStreamSynchronize(stream); + cudaCheckError() + ; + + return NVGRAPH_OK; + } + + template + NVGRAPH_ERROR Bfs::configure( IndexType *_distances, + IndexType *_predecessors, + int *_edge_mask) + { + distances = _distances; + predecessors = _predecessors; + edge_mask = _edge_mask; + + useEdgeMask = (edge_mask != NULL); + computeDistances = (distances != NULL); + computePredecessors = (predecessors != NULL); + + //We need distances to use bottom up + if (directed && !computeDistances) { + auto rmm_result = RMM_ALLOC(&distances, n * sizeof(IndexType), stream); + rmmCheckError(rmm_result); + } + + return NVGRAPH_OK; + } + + template + NVGRAPH_ERROR Bfs::traverse(IndexType source_vertex) { + + //Init visited_bmap + //If the graph is undirected, we not that + //we will never discover isolated vertices (in degree = out degree = 0) + //we avoid a lot of work by flagging them now + //in g500 graphs they represent ~25% of total vertices + //more than that for wiki and twitter graphs + + if (directed) { + cudaMemsetAsync(visited_bmap, 0, vertices_bmap_size * sizeof(int), stream); + } else { + cudaMemcpyAsync( visited_bmap, + isolated_bmap, + vertices_bmap_size * sizeof(int), + cudaMemcpyDeviceToDevice, + stream); + } + cudaCheckError() + ; + + //If needed, setting all vertices as undiscovered (inf distance) + //We dont use computeDistances here + //if the graph is undirected, we may need distances even if + //computeDistances is false + if (distances) + fill_vec(distances, n, vec_t::max, stream); + + //If needed, setting all predecessors to non-existent (-1) + if (computePredecessors) + { + cudaMemsetAsync(predecessors, -1, n * sizeof(IndexType), stream); + cudaCheckError() + ; + } + + // + //Initial frontier + // + + frontier = original_frontier; + + if (distances) + { + cudaMemsetAsync(&distances[source_vertex], 0, sizeof(IndexType), stream); + cudaCheckError() + ; + } + + //Setting source_vertex as visited + //There may be bit already set on that bmap (isolated vertices) - if the graph is undirected + int current_visited_bmap_source_vert = 0; + + if (!directed) { + 
cudaMemcpyAsync(¤t_visited_bmap_source_vert, + &visited_bmap[source_vertex / INT_SIZE], + sizeof(int), + cudaMemcpyDeviceToHost); + cudaCheckError() + ; + //We need current_visited_bmap_source_vert + cudaStreamSynchronize(stream); + cudaCheckError() + ; + //We could detect that source is isolated here + } + + int m = (1 << (source_vertex % INT_SIZE)); + + //In that case, source is isolated, done now + if (!directed && (m & current_visited_bmap_source_vert)) { + //Init distances and predecessors are done, (cf Streamsync in previous if) + cudaCheckError() + ; + return NVGRAPH_OK; + } + + m |= current_visited_bmap_source_vert; + + cudaMemcpyAsync( &visited_bmap[source_vertex / INT_SIZE], + &m, + sizeof(int), + cudaMemcpyHostToDevice, + stream); + cudaCheckError() + ; + + //Adding source_vertex to init frontier + cudaMemcpyAsync( &frontier[0], + &source_vertex, + sizeof(IndexType), + cudaMemcpyHostToDevice, + stream); + cudaCheckError() + ; + + //mf : edges in frontier + //nf : vertices in frontier + //mu : edges undiscovered + //nu : nodes undiscovered + //lvl : current frontier's depth + IndexType mf, nf, mu, nu; + bool growing; + IndexType lvl = 1; + + //Frontier has one vertex + nf = 1; + + //all edges are undiscovered (by def isolated vertices have 0 edges) + mu = nnz; + + //all non isolated vertices are undiscovered (excepted source vertex, which is in frontier) + //That number is wrong if source_vertex is also isolated - but it's not important + nu = n - nisolated - nf; + + //Last frontier was 0, now it is 1 + growing = true; + + IndexType size_last_left_unvisited_queue = n; //we just need value > 0 + IndexType size_last_unvisited_queue = 0; //queue empty + + //Typical pre-top down workflow. set_frontier_degree + exclusive-scan + set_frontier_degree(frontier_vertex_degree, frontier, vertex_degree, nf, stream); + exclusive_sum( d_cub_exclusive_sum_storage, + cub_exclusive_sum_storage_bytes, + frontier_vertex_degree, + exclusive_sum_frontier_vertex_degree, + nf + 1, + stream); + + cudaMemcpyAsync( &mf, + &exclusive_sum_frontier_vertex_degree[nf], + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + ; + + //We need mf + cudaStreamSynchronize(stream); + cudaCheckError() + ; + + //At first we know we have to use top down + BFS_ALGO_STATE algo_state = TOPDOWN; + + //useDistances : we check if a vertex is a parent using distances in bottom up - distances become working data + //undirected g : need parents to be in children's neighbors + bool can_use_bottom_up = !directed && distances; + + while (nf > 0) { + //Each vertices can appear only once in the frontierer array - we know it will fit + new_frontier = frontier + nf; + IndexType old_nf = nf; + resetDevicePointers(); + + if (can_use_bottom_up) { + //Choosing algo + //Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf + + switch (algo_state) { + case TOPDOWN: + if (mf > mu / alpha) + algo_state = BOTTOMUP; + break; + case BOTTOMUP: + if (!growing && nf < n / beta) { + + //We need to prepare the switch back to top down + //We couldnt keep track of mu during bottom up - because we dont know what mf is. Computing mu here + count_unvisited_edges( unvisited_queue, + size_last_unvisited_queue, + visited_bmap, + vertex_degree, + d_mu, + stream); + + //Typical pre-top down workflow. 
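// --- Editorial sketch, not part of this diff -----------------------------------
// Condensed form of the TOPDOWN/BOTTOMUP decision made just above (the
// direction-optimizing heuristic from the paper linked in the comment).
// mf = edges in the frontier, mu = edges still undiscovered, nf = frontier size,
// n = number of vertices, growing = the frontier grew last level; alpha and beta
// are the tuning parameters of this class, defined elsewhere. Helper name is
// illustrative only.
template <typename IndexType>
BFS_ALGO_STATE next_direction(BFS_ALGO_STATE cur,
                              IndexType mf, IndexType mu,
                              IndexType nf, IndexType n,
                              bool growing,
                              IndexType alpha, IndexType beta) {
    if (cur == TOPDOWN && mf > mu / alpha)
        return BOTTOMUP;   // frontier touches too many edges: look for parents instead
    if (cur == BOTTOMUP && !growing && nf < n / beta)
        return TOPDOWN;    // frontier is small again: expand it directly
    return cur;            // otherwise keep the current direction
}
// --------------------------------------------------------------------------------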
set_frontier_degree + exclusive-scan + set_frontier_degree(frontier_vertex_degree, frontier, vertex_degree, nf, stream); + exclusive_sum( d_cub_exclusive_sum_storage, + cub_exclusive_sum_storage_bytes, + frontier_vertex_degree, + exclusive_sum_frontier_vertex_degree, + nf + 1, + stream); + + cudaMemcpyAsync( &mf, + &exclusive_sum_frontier_vertex_degree[nf], + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + ; + + cudaMemcpyAsync(&mu, d_mu, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); + cudaCheckError() + ; + + //We will need mf and mu + cudaStreamSynchronize(stream); + cudaCheckError() + ; + + algo_state = TOPDOWN; + } + break; + } + } + + //Executing algo + + switch (algo_state) { + case TOPDOWN: + compute_bucket_offsets( exclusive_sum_frontier_vertex_degree, + exclusive_sum_frontier_vertex_buckets_offsets, + nf, + mf, + stream); + frontier_expand( row_offsets, + col_indices, + frontier, + nf, + mf, + lvl, + new_frontier, + d_new_frontier_cnt, + exclusive_sum_frontier_vertex_degree, + exclusive_sum_frontier_vertex_buckets_offsets, + visited_bmap, + distances, + predecessors, + edge_mask, + isolated_bmap, + directed, + stream, + deterministic); + + mu -= mf; + + cudaMemcpyAsync( &nf, + d_new_frontier_cnt, + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError(); + + //We need nf + cudaStreamSynchronize(stream); + cudaCheckError(); + + if (nf) { + + //Typical pre-top down workflow. set_frontier_degree + exclusive-scan + set_frontier_degree(frontier_vertex_degree, new_frontier, vertex_degree, nf, stream); + exclusive_sum( d_cub_exclusive_sum_storage, + cub_exclusive_sum_storage_bytes, + frontier_vertex_degree, + exclusive_sum_frontier_vertex_degree, + nf + 1, + stream); + cudaMemcpyAsync( &mf, + &exclusive_sum_frontier_vertex_degree[nf], + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + ; + + //We need mf + cudaStreamSynchronize(stream); + cudaCheckError() + ; + } + break; + + case BOTTOMUP: + fill_unvisited_queue(visited_bmap, + vertices_bmap_size, + n, + unvisited_queue, + d_unvisited_cnt, + stream, + deterministic); + + size_last_unvisited_queue = nu; + + bottom_up_main(unvisited_queue, + size_last_unvisited_queue, + left_unvisited_queue, + d_left_unvisited_cnt, + visited_bmap, + row_offsets, + col_indices, + lvl, + new_frontier, + d_new_frontier_cnt, + distances, + predecessors, + edge_mask, + stream, + deterministic); + + //The number of vertices left unvisited decreases + //If it wasnt necessary last time, it wont be this time + if (size_last_left_unvisited_queue) { + cudaMemcpyAsync( &size_last_left_unvisited_queue, + d_left_unvisited_cnt, + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + ; + //We need last_left_unvisited_size + cudaStreamSynchronize(stream); + cudaCheckError() + ; + bottom_up_large( left_unvisited_queue, + size_last_left_unvisited_queue, + visited_bmap, + row_offsets, + col_indices, + lvl, + new_frontier, + d_new_frontier_cnt, + distances, + predecessors, + edge_mask, + stream, + deterministic); + } + cudaMemcpyAsync( &nf, + d_new_frontier_cnt, + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + ; + + //We will need nf + cudaStreamSynchronize(stream); + cudaCheckError() + ; + + break; + } + + //Updating undiscovered edges count + nu -= nf; + + //Using new frontier + frontier = new_frontier; + growing = (nf > old_nf); + + ++lvl; + } + + cudaCheckError() + ; + return NVGRAPH_OK; + } + + //Just used for benchmarks now + template + 
NVGRAPH_ERROR Bfs::traverse(IndexType *source_vertices, IndexType nsources) { + for (IndexType i = 0; i < nsources; ++i) + traverse(source_vertices[i]); + + return NVGRAPH_OK; + } + + template + void Bfs::resetDevicePointers() { + cudaMemsetAsync(d_counters_pad, 0, 4 * sizeof(IndexType), stream); + cudaCheckError() + ; + } + + template + void Bfs::clean() { + cudaCheckError() + ; + + //the vectors have a destructor that takes care of cleaning + RMM_FREE(original_frontier, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(visited_bmap, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(isolated_bmap, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(vertex_degree, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(d_cub_exclusive_sum_storage, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(buffer_np1_1, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(buffer_np1_2, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(exclusive_sum_frontier_vertex_buckets_offsets, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(d_counters_pad, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + + //In that case, distances is a working data + if (directed && !computeDistances) + RMM_FREE(distances, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. 
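// --- Editorial sketch, not part of this diff -----------------------------------
// The pattern this PR applies throughout setup()/configure()/clean(): cudaMalloc
// and cudaFree are replaced by the stream-ordered RMM macros and the returned
// status is checked. Only calls already used above (RMM_ALLOC, RMM_FREE,
// rmmCheckError) are assumed; the helper names below are illustrative only.
template <typename T>
NVGRAPH_ERROR rmm_alloc_checked(T **ptr, size_t nelems, cudaStream_t stream) {
    auto rmm_result = RMM_ALLOC(ptr, nelems * sizeof(T), stream);  // pool-backed allocation
    rmmCheckError(rmm_result);                                     // same check as in setup() above
    return NVGRAPH_OK;
}

template <typename T>
void rmm_free_unchecked(T *ptr, cudaStream_t stream) {
    RMM_FREE(ptr, stream);  // error handling deferred here, as noted in clean() above
}
// --------------------------------------------------------------------------------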
+ + cudaCheckError() + ; + } + + template class Bfs ; } // end namespace nvgraph diff --git a/cpp/nvgraph/cpp/src/bfs_kernels.cu b/cpp/nvgraph/cpp/src/bfs_kernels.cu index 594e2b980ca..7024036def5 100644 --- a/cpp/nvgraph/cpp/src/bfs_kernels.cu +++ b/cpp/nvgraph/cpp/src/bfs_kernels.cu @@ -18,6 +18,9 @@ #include #include + +#include + #include #define MAXBLOCKS 65535 @@ -85,1496 +88,1497 @@ using namespace nvgraph; namespace bfs_kernels { - // - // gives the equivalent vectors from a type - // for the max val, would be better to use numeric_limits<>::max() once - // cpp11 is allowed in nvgraph - // - - template - struct vec_t { - typedef int4 vec4; - typedef int2 vec2; - }; - - template<> - struct vec_t { - typedef int4 vec4; - typedef int2 vec2; - static const int max = INT_MAX; - }; - - template<> - struct vec_t { - typedef longlong4 vec4; - typedef longlong2 vec2; - static const long long int max = LLONG_MAX; - }; - - // - // ------------------------- Helper device functions ------------------- - // - - __forceinline__ __device__ int getMaskNRightmostBitSet(int n) { - if (n == INT_SIZE) - return (~0); - int mask = (1 << n) - 1; - return mask; - } - - __forceinline__ __device__ int getMaskNLeftmostBitSet(int n) { - if (n == 0) - return 0; - int mask = ~((1 << (INT_SIZE - n)) - 1); - return mask; - } - - __forceinline__ __device__ int getNextZeroBit(int& val) { - int ibit = __ffs(~val) - 1; - val |= (1 << ibit); - - return ibit; - } - - struct BitwiseAnd - { - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return (a & b); - } - }; - - struct BitwiseOr - { - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return (a | b); - } - }; - - template - __device__ IndexType binsearch_maxle( const IndexType *vec, - const IndexType val, - IndexType low, - IndexType high) { - while (true) { - if (low == high) - return low; //we know it exists - if ((low + 1) == high) - return (vec[high] <= val) ? 
high : low; - - IndexType mid = low + (high - low) / 2; - - if (vec[mid] > val) - high = mid - 1; - else - low = mid; - - } - } - - // - // ------------------------- Bottom up ------------------------- - // - - // - // fill_unvisited_queue_kernel - // - // Finding unvisited vertices in the visited_bmap, and putting them in the queue - // Vertices represented by the same int in the bitmap are adjacent in the queue, and sorted - // For instance, the queue can look like this : - // 34 38 45 58 61 4 18 24 29 71 84 85 90 - // Because they are represented by those ints in the bitmap : - // [34 38 45 58 61] [4 18 24 29] [71 84 85 90] - - //visited_bmap_nints = the visited_bmap is made of that number of ints - - template - __global__ void fill_unvisited_queue_kernel( int *visited_bmap, - IndexType visited_bmap_nints, - IndexType n, - IndexType *unvisited, - IndexType *unvisited_cnt) { - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage scan_temp_storage; - - //When filling the "unvisited" queue, we use "unvisited_cnt" to know where to write in the queue (equivalent of int off = atomicAddd(unvisited_cnt, 1) ) - //We will actually do only one atomicAdd per block - we first do a scan, then call one atomicAdd, and store the common offset for the block in - //unvisited_common_block_offset - __shared__ IndexType unvisited_common_block_offset; - - //We don't want threads divergence in the loop (we're going to call __syncthreads) - //Using a block-only dependent in the condition of the loop - for (IndexType block_v_idx = blockIdx.x * blockDim.x; - block_v_idx < visited_bmap_nints; - block_v_idx += blockDim.x * gridDim.x) { - - //Index of visited_bmap that this thread will compute - IndexType v_idx = block_v_idx + threadIdx.x; - - int thread_visited_int = (v_idx < visited_bmap_nints) - ? 
visited_bmap[v_idx] - : - (~0); //will be neutral in the next lines (virtual vertices all visited) - - //The last int can only be partially valid - //If we are indeed taking care of the last visited int in this thread, - //We need to first disable (ie set as "visited") the inactive bits (vertices >= n) - if (v_idx == (visited_bmap_nints - 1)) { - int active_bits = n - (INT_SIZE * v_idx); - int inactive_bits = INT_SIZE - active_bits; - int mask = getMaskNLeftmostBitSet(inactive_bits); - thread_visited_int |= mask; //Setting inactive bits as visited - } - - //Counting number of unvisited vertices represented by this int - int n_unvisited_in_int = __popc(~thread_visited_int); - int unvisited_thread_offset; - - //We will need to write n_unvisited_in_int unvisited vertices to the unvisited queue - //We ask for that space when computing the block scan, that will tell where to write those - //vertices in the queue, using the common offset of the block (see below) - BlockScan(scan_temp_storage).ExclusiveSum(n_unvisited_in_int, unvisited_thread_offset); - - //Last thread knows how many vertices will be written to the queue by this block - //Asking for that space in the queue using the global count, and saving the common offset - if (threadIdx.x == (FILL_UNVISITED_QUEUE_DIMX - 1)) { - IndexType total = unvisited_thread_offset + n_unvisited_in_int; - unvisited_common_block_offset = atomicAdd(unvisited_cnt, total); - } - - //syncthreads for two reasons : - // - we need to broadcast unvisited_common_block_offset - // - we will reuse scan_temp_storage (cf CUB doc) - __syncthreads(); - - IndexType current_unvisited_index = unvisited_common_block_offset - + unvisited_thread_offset; - int nvertices_to_write = n_unvisited_in_int; - - // getNextZeroBit uses __ffs, which gives least significant bit set - // which means that as long as n_unvisited_in_int is valid, - // we will use valid bits - - while (nvertices_to_write > 0) { - if (nvertices_to_write >= 4 && (current_unvisited_index % 4) == 0) { - typename vec_t::vec4 vec_v; - - vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - vec_v.z = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - vec_v.w = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - - typename vec_t::vec4 *unvisited_i4 = reinterpret_cast::vec4*>(&unvisited[current_unvisited_index]); - *unvisited_i4 = vec_v; - - current_unvisited_index += 4; - nvertices_to_write -= 4; - } - else if (nvertices_to_write >= 2 && (current_unvisited_index % 2) == 0) { - typename vec_t::vec2 vec_v; - - vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - - typename vec_t::vec2 *unvisited_i2 = reinterpret_cast::vec2*>(&unvisited[current_unvisited_index]); - *unvisited_i2 = vec_v; - - current_unvisited_index += 2; - nvertices_to_write -= 2; - } else { - IndexType v = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - - unvisited[current_unvisited_index] = v; - - current_unvisited_index += 1; - nvertices_to_write -= 1; - } - - } - } - } - - //Wrapper - template - void fill_unvisited_queue( int *visited_bmap, - IndexType visited_bmap_nints, - IndexType n, - IndexType *unvisited, - IndexType *unvisited_cnt, - cudaStream_t m_stream, - bool deterministic) { - dim3 grid, block; - block.x = FILL_UNVISITED_QUEUE_DIMX; - - grid.x = min((IndexType) MAXBLOCKS, (visited_bmap_nints + block.x - 1) / block.x); - - fill_unvisited_queue_kernel<<>>( 
visited_bmap, - visited_bmap_nints, - n, - unvisited, - unvisited_cnt); - cudaCheckError() - ; - } - - // - // count_unvisited_edges_kernel - // Couting the total number of unvisited edges in the graph - using an potentially unvisited queue - // We need the current unvisited vertices to be in the unvisited queue - // But visited vertices can be in the potentially_unvisited queue - // We first check if the vertex is still unvisited before using it - // Useful when switching from "Bottom up" to "Top down" - // - - template - __global__ void count_unvisited_edges_kernel(const IndexType *potentially_unvisited, - const IndexType potentially_unvisited_size, - const int *visited_bmap, - IndexType *degree_vertices, - IndexType *mu) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage reduce_temp_storage; - - //number of undiscovered edges counted by this thread - IndexType thread_unvisited_edges_count = 0; - - for (IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < potentially_unvisited_size; - idx += blockDim.x * gridDim.x) { - - IndexType u = potentially_unvisited[idx]; - int u_visited_bmap = visited_bmap[u / INT_SIZE]; - int is_visited = u_visited_bmap & (1 << (u % INT_SIZE)); - - if (!is_visited) - thread_unvisited_edges_count += degree_vertices[u]; - - } - - //We need all thread_unvisited_edges_count to be ready before reducing - __syncthreads(); - - IndexType block_unvisited_edges_count = - BlockReduce(reduce_temp_storage).Sum(thread_unvisited_edges_count); - - //block_unvisited_edges_count is only defined is th.x == 0 - if (threadIdx.x == 0) - atomicAdd(mu, block_unvisited_edges_count); - } - - //Wrapper - template - void count_unvisited_edges(const IndexType *potentially_unvisited, - const IndexType potentially_unvisited_size, - const int *visited_bmap, - IndexType *node_degree, - IndexType *mu, - cudaStream_t m_stream) { - dim3 grid, block; - block.x = COUNT_UNVISITED_EDGES_DIMX; - grid.x = min((IndexType) MAXBLOCKS, (potentially_unvisited_size + block.x - 1) / block.x); - - count_unvisited_edges_kernel<<>>( potentially_unvisited, - potentially_unvisited_size, - visited_bmap, - node_degree, - mu); - cudaCheckError() - ; - } - - // - // Main Bottom Up kernel - // Here we will start to process unvisited vertices in the unvisited queue - // We will only consider the first MAIN_BOTTOMUP_MAX_EDGES edges - // If it's not possible to define a valid parent using only those edges, - // add it to the "left_unvisited_queue" - // - - // - // We will use the "vertices represented by the same int in the visited bmap are adjacents and sorted in the unvisited queue" property - // It is used to do a reduction locally and fully build the new visited_bmap - // - - template - __global__ void main_bottomup_kernel( const IndexType *unvisited, - const IndexType unvisited_size, - IndexType *left_unvisited, - IndexType *left_unvisited_cnt, - int *visited_bmap, - const IndexType *row_ptr, - const IndexType *col_ind, - IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - IndexType *distances, - IndexType *predecessors, - int *edge_mask) { - typedef cub::BlockDiscontinuity BlockDiscontinuity; - typedef cub::WarpReduce WarpReduce; - typedef cub::BlockScan BlockScan; - - __shared__ typename BlockDiscontinuity::TempStorage discontinuity_temp_storage; - __shared__ typename WarpReduce::TempStorage reduce_temp_storage; - __shared__ typename BlockScan::TempStorage scan_temp_storage; - - //To write vertices in the frontier, - //We will use a block scan to 
locally compute the offsets - //frontier_common_block_offset contains the common offset for the block - __shared__ IndexType frontier_common_block_offset; - - // When building the new visited_bmap, we reduce (using a bitwise and) the visited_bmap ints - // from the vertices represented by the same int (for instance vertices 1, 5, 9, 13, 23) - // vertices represented by the same int will be designed as part of the same "group" - // To detect the deliminations between those groups, we use BlockDiscontinuity - // Then we need to create the new "visited_bmap" within those group. - // We use a warp reduction that takes into account limits between groups to do it - // But a group can be cut in two different warps : in that case, the second warp - // put the result of its local reduction in local_visited_bmap_warp_head - // the first warp will then read it and finish the reduction - - __shared__ int local_visited_bmap_warp_head[MAIN_BOTTOMUP_NWARPS]; - - const int warpid = threadIdx.x / WARP_SIZE; - const int laneid = threadIdx.x % WARP_SIZE; - - // we will call __syncthreads inside the loop - // we need to keep complete block active - for (IndexType block_off = blockIdx.x * blockDim.x; - block_off < unvisited_size; - block_off += blockDim.x * gridDim.x) - { - IndexType idx = block_off + threadIdx.x; - - // This thread will take care of unvisited_vertex - // in the visited_bmap, it is represented by the int at index - // visited_bmap_index = unvisited_vertex/INT_SIZE - // it will be used by BlockDiscontinuity - // to flag the separation between groups of vertices (vertices represented by different in in visited_bmap) - IndexType visited_bmap_index[1]; //this is an array of size 1 because CUB needs one - visited_bmap_index[0] = -1; - IndexType unvisited_vertex = -1; - - // local_visited_bmap gives info on the visited bit of unvisited_vertex - // - // By default, everything is visited - // This is because we only take care of unvisited vertices here, - // The other are by default unvisited - // If a vertex remain unvisited, we will notice it here - // That's why by default we consider everything visited ( ie ~0 ) - // If we fail to assign one parent to an unvisited vertex, we will - // explicitly unset the bit - int local_visited_bmap = (~0); - int found = 0; - int more_to_visit = 0; - IndexType valid_parent; - IndexType left_unvisited_off; - - if (idx < unvisited_size) - { - //Processing first STPV edges of unvisited v - //If bigger than that, push to left_unvisited queue - unvisited_vertex = unvisited[idx]; - - IndexType edge_begin = row_ptr[unvisited_vertex]; - IndexType edge_end = row_ptr[unvisited_vertex + 1]; - - visited_bmap_index[0] = unvisited_vertex / INT_SIZE; - - IndexType degree = edge_end - edge_begin; - - for (IndexType edge = edge_begin; - edge < min(edge_end, edge_begin + MAIN_BOTTOMUP_MAX_EDGES); ++edge) - { - if (edge_mask && !edge_mask[edge]) - continue; - - IndexType parent_candidate = col_ind[edge]; - - if (distances[parent_candidate] == (lvl - 1)) - { - found = 1; - valid_parent = parent_candidate; - break; - } - } - - // This vertex will remain unvisited at the end of this kernel - // Explicitly say it - if (!found) - local_visited_bmap &= ~(1 << (unvisited_vertex % INT_SIZE)); //let this one unvisited - else - { - if (distances) - distances[unvisited_vertex] = lvl; - if (predecessors) - predecessors[unvisited_vertex] = valid_parent; - } - - //If we haven't found a parent and there's more edge to check - if (!found && degree > MAIN_BOTTOMUP_MAX_EDGES) - { - 
left_unvisited_off = atomicAdd(left_unvisited_cnt, (IndexType) 1); //TODO scan - more_to_visit = 1; - } - - } - - // - // We will separate vertices in group - // Two vertices are in the same group if represented by same int in visited_bmap - // ie u and v in same group <=> u/32 == v/32 - // - // We will now flag the head of those group (first element of each group) - // - // 1) All vertices within the same group are adjacent in the queue (cf fill_unvisited_queue) - // 2) A group is of size <= 32, so a warp will contain at least one head, and a group will be contained - // at most by two warps - - int is_head_a[1]; //CUB need an array - BlockDiscontinuity(discontinuity_temp_storage).FlagHeads(is_head_a, - visited_bmap_index, - cub::Inequality()); - int is_head = is_head_a[0]; - - // Computing the warp reduce within group - // This primitive uses the is_head flags to know where the limits of the groups are - // We use bitwise and as operator, because of the fact that 1 is the default value - // If a vertex is unvisited, we have to explicitly ask for it - int local_bmap_agg = - WarpReduce(reduce_temp_storage).HeadSegmentedReduce( local_visited_bmap, - is_head, - BitwiseAnd()); - - // We need to take care of the groups cut in two in two different warps - // Saving second part of the reduce here, then applying it on the first part bellow - // Corner case : if the first thread of the warp is a head, then this group is not cut in two - // and then we have to be neutral (for an bitwise and, it's an ~0) - if (laneid == 0) - { - local_visited_bmap_warp_head[warpid] = (is_head) ? (~0) : local_bmap_agg; - } - - //broadcasting local_visited_bmap_warp_head - __syncthreads(); - - int head_ballot = nvgraph::utils::ballot(is_head); - - //As long as idx < unvisited_size, we know there's at least one head per warp - int laneid_last_head_in_warp = INT_SIZE - 1 - __clz(head_ballot); - - int is_last_head_in_warp = (laneid == laneid_last_head_in_warp); - - // if laneid == 0 && is_last_head_in_warp, it's a special case where - // a group of size 32 starts exactly at lane 0 - // in that case, nothing to do (this group is not cut by a warp delimitation) - // we also have to make sure that a warp actually exists after this one (this corner case is handled after) - if (laneid != 0 && is_last_head_in_warp & (warpid + 1) < MAIN_BOTTOMUP_NWARPS) - { - local_bmap_agg &= local_visited_bmap_warp_head[warpid + 1]; - } - - //Three cases : - // -> This is the first group of the block - it may be cut in two (with previous block) - // -> This is the last group of the block - same thing - // -> This group is completely contained in this block - - if (warpid == 0 && laneid == 0) - { - //The first elt of this group considered in this block is unvisited_vertex - //We know that's the case because elts are sorted in a group, and we are at laneid == 0 - //We will do an atomicOr - we have to be neutral about elts < unvisited_vertex - int iv = unvisited_vertex % INT_SIZE; // we know that this unvisited_vertex is valid - int mask = getMaskNLeftmostBitSet(INT_SIZE - iv); - local_bmap_agg &= mask; //we have to be neutral for elts < unvisited_vertex - atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); - } - else if (warpid == (MAIN_BOTTOMUP_NWARPS - 1) && - laneid >= laneid_last_head_in_warp && // We need the other ones to go in else case - idx < unvisited_size //we could be out - ) - { - //Last head of the block - //We don't know if this group is complete - - //last_v is the last unvisited_vertex of the group IN THIS 
block - //we dont know about the rest - we have to be neutral about elts > last_v - - //the destination thread of the __shfl is active - int laneid_max = min((IndexType) (WARP_SIZE - 1), - (unvisited_size - (block_off + 32 * warpid))); - IndexType last_v = nvgraph::utils::shfl( unvisited_vertex, - laneid_max, - WARP_SIZE, - __activemask()); - - if (is_last_head_in_warp) - { - int ilast_v = last_v % INT_SIZE + 1; - int mask = getMaskNRightmostBitSet(ilast_v); - local_bmap_agg &= mask; //we have to be neutral for elts > last_unvisited_vertex - atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); - } - } - else - { - //group completely in block - if (is_head && idx < unvisited_size) { - visited_bmap[unvisited_vertex / INT_SIZE] = local_bmap_agg; //no atomics needed, we know everything about this int - } - } - - //Saving in frontier - - int thread_frontier_offset; - BlockScan(scan_temp_storage).ExclusiveSum(found, thread_frontier_offset); - IndexType inclusive_sum = thread_frontier_offset + found; - if (threadIdx.x == (MAIN_BOTTOMUP_DIMX - 1) && inclusive_sum) - { - frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); - } - - //1) Broadcasting frontier_common_block_offset - //2) we want to reuse the *_temp_storage - __syncthreads(); - - if (found) - new_frontier[frontier_common_block_offset + thread_frontier_offset] = unvisited_vertex; - if (more_to_visit) - left_unvisited[left_unvisited_off] = unvisited_vertex; - - } - } - - template - void bottom_up_main( IndexType *unvisited, - IndexType unvisited_size, - IndexType *left_unvisited, - IndexType *d_left_unvisited_idx, - int *visited, - const IndexType *row_ptr, - const IndexType *col_ind, - IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_idx, - IndexType *distances, - IndexType *predecessors, - int *edge_mask, - cudaStream_t m_stream, - bool deterministic) { - dim3 grid, block; - block.x = MAIN_BOTTOMUP_DIMX; - - grid.x = min((IndexType) MAXBLOCKS, ((unvisited_size + block.x - 1)) / block.x); - - main_bottomup_kernel<<>>(unvisited, - unvisited_size, - left_unvisited, - d_left_unvisited_idx, - visited, - row_ptr, - col_ind, - lvl, - new_frontier, - new_frontier_idx, - distances, - predecessors, - edge_mask); - cudaCheckError() - ; - } - - // - // bottom_up_large_degree_kernel - // finishing the work started in main_bottomup_kernel for vertex with degree > MAIN_BOTTOMUP_MAX_EDGES && no parent found - // - template - __global__ void bottom_up_large_degree_kernel( IndexType *left_unvisited, - IndexType left_unvisited_size, - int *visited, - const IndexType *row_ptr, - const IndexType *col_ind, - IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - IndexType *distances, - IndexType *predecessors, - int *edge_mask) { - - int logical_lane_id = threadIdx.x % BOTTOM_UP_LOGICAL_WARP_SIZE; - int logical_warp_id = threadIdx.x / BOTTOM_UP_LOGICAL_WARP_SIZE; - int logical_warps_per_block = blockDim.x / BOTTOM_UP_LOGICAL_WARP_SIZE; - - //Inactive threads are not a pb for __ballot (known behaviour) - for (IndexType idx = logical_warps_per_block * blockIdx.x + logical_warp_id; - idx < left_unvisited_size; - idx += gridDim.x * logical_warps_per_block) { - - //Unvisited vertices - potentially in the next frontier - IndexType v = left_unvisited[idx]; - - //Used only with symmetric graphs - //Parents are included in v's neighbors - IndexType first_i_edge = row_ptr[v] + MAIN_BOTTOMUP_MAX_EDGES; //we already have checked the first MAIN_BOTTOMUP_MAX_EDGES edges in find_unvisited - - 
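// --- Editorial sketch, not part of this diff -----------------------------------
// Host reference of one bottom-up level (what main_bottomup_kernel and this
// large-degree kernel compute together): every still-unvisited vertex scans its
// own adjacency list for any neighbor discovered at the previous level and adopts
// it as parent. The MAIN_BOTTOMUP_MAX_EDGES split handled here is only a
// load-balancing detail and is omitted; -1 stands in for "undiscovered".
inline void bottom_up_level_reference(const int *row_ptr, const int *col_ind, int n,
                                      int lvl, int *distances, int *predecessors,
                                      int *new_frontier, int *new_frontier_cnt) {
    for (int v = 0; v < n; ++v) {
        if (distances[v] != -1) continue;                 // already discovered
        for (int e = row_ptr[v]; e < row_ptr[v + 1]; ++e) {
            int u = col_ind[e];
            if (distances[u] == lvl - 1) {                // u was reached at the previous level
                distances[v] = lvl;
                predecessors[v] = u;
                new_frontier[(*new_frontier_cnt)++] = v;  // the kernels do this with atomicAdd
                break;                                    // one valid parent is enough
            }
        }
    }
}
// --------------------------------------------------------------------------------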
IndexType end_i_edge = row_ptr[v + 1]; - - //We can have warp divergence in the next loop - //It's not a pb because the behaviour of __ballot - //is know with inactive threads - for (IndexType i_edge = first_i_edge + logical_lane_id; - i_edge < end_i_edge; - i_edge += BOTTOM_UP_LOGICAL_WARP_SIZE) { - - IndexType valid_parent = -1; - - if (!edge_mask || edge_mask[i_edge]) { - IndexType u = col_ind[i_edge]; - IndexType lvl_u = distances[u]; - - if (lvl_u == (lvl - 1)) { - valid_parent = u; - } - } - - unsigned int warp_valid_p_ballot = nvgraph::utils::ballot((valid_parent != -1)); - - int logical_warp_id_in_warp = (threadIdx.x % WARP_SIZE) / BOTTOM_UP_LOGICAL_WARP_SIZE; - unsigned int mask = (1 << BOTTOM_UP_LOGICAL_WARP_SIZE) - 1; - unsigned int logical_warp_valid_p_ballot = warp_valid_p_ballot - >> (BOTTOM_UP_LOGICAL_WARP_SIZE * logical_warp_id_in_warp); - logical_warp_valid_p_ballot &= mask; - - int chosen_thread = __ffs(logical_warp_valid_p_ballot) - 1; - - if (chosen_thread == logical_lane_id) { - //Using only one valid parent (reduce bw) - IndexType off = atomicAdd(new_frontier_cnt, (IndexType) 1); - int m = 1 << (v % INT_SIZE); - atomicOr(&visited[v / INT_SIZE], m); - distances[v] = lvl; - - if (predecessors) - predecessors[v] = valid_parent; - - new_frontier[off] = v; - } - - if (logical_warp_valid_p_ballot) { - break; - } - } - - } - } - - template - void bottom_up_large(IndexType *left_unvisited, - IndexType left_unvisited_size, - int *visited, - const IndexType *row_ptr, - const IndexType *col_ind, - IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_idx, - IndexType *distances, - IndexType *predecessors, - int *edge_mask, - cudaStream_t m_stream, - bool deterministic) { - dim3 grid, block; - block.x = LARGE_BOTTOMUP_DIMX; - grid.x = min( (IndexType) MAXBLOCKS, - ((left_unvisited_size + block.x - 1) * BOTTOM_UP_LOGICAL_WARP_SIZE) / block.x); - - bottom_up_large_degree_kernel<<>>(left_unvisited, - left_unvisited_size, - visited, - row_ptr, - col_ind, - lvl, - new_frontier, - new_frontier_idx, - distances, - predecessors, - edge_mask); - cudaCheckError() - ; - } - - // - // - // ------------------------------ Top down ------------------------------ - // - // - - // - // compute_bucket_offsets_kernel - // simply compute the position in the frontier corresponding all valid edges with index=TOP_DOWN_BUCKET_SIZE * k, k integer - // - - template - __global__ void compute_bucket_offsets_kernel( const IndexType *frontier_degrees_exclusive_sum, - IndexType *bucket_offsets, - const IndexType frontier_size, - IndexType total_degree) { - IndexType end = ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX - * NBUCKETS_PER_BLOCK + 1); - - for (IndexType bid = blockIdx.x * blockDim.x + threadIdx.x; - bid <= end; - bid += gridDim.x * blockDim.x) { - - IndexType eid = min(bid * TOP_DOWN_BUCKET_SIZE, total_degree - 1); - - bucket_offsets[bid] = binsearch_maxle( frontier_degrees_exclusive_sum, - eid, - (IndexType) 0, - frontier_size - 1); - - } - } - - template - void compute_bucket_offsets( IndexType *cumul, - IndexType *bucket_offsets, - IndexType frontier_size, - IndexType total_degree, - cudaStream_t m_stream) { - dim3 grid, block; - block.x = COMPUTE_BUCKET_OFFSETS_DIMX; - - grid.x = min( (IndexType) MAXBLOCKS, - ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX - * NBUCKETS_PER_BLOCK + 1 + block.x - 1) / block.x); - - compute_bucket_offsets_kernel<<>>(cumul, - bucket_offsets, - frontier_size, - total_degree); - cudaCheckError() - ; - } - - // - // 
topdown_expand_kernel - // Read current frontier and compute new one with top down paradigm - // One thread = One edge - // To know origin of edge, we have to find where is index_edge in the values of frontier_degrees_exclusive_sum (using a binary search, max less or equal than) - // This index k will give us the origin of this edge, which is frontier[k] - // This thread will then process the (linear_idx_thread - frontier_degrees_exclusive_sum[k])-ith edge of vertex frontier[k] - // - // To process blockDim.x = TOP_DOWN_EXPAND_DIMX edges, we need to first load NBUCKETS_PER_BLOCK bucket offsets - those will help us do the binary searches - // We can load up to TOP_DOWN_EXPAND_DIMX of those bucket offsets - that way we prepare for the next MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x edges - // - // Once we have those offsets, we may still need a few values from frontier_degrees_exclusive_sum to compute exact index k - // To be able to do it, we will load the values that we need from frontier_degrees_exclusive_sum in shared memory - // We know that it will fit because we never add node with degree == 0 in the frontier, so we have an upper bound on the number of value to load (see below) - // - // We will then look which vertices are not visited yet : - // 1) if the unvisited vertex is isolated (=> degree == 0), we mark it as visited, update distances and predecessors, and move on - // 2) if the unvisited vertex has degree > 0, we add it to the "frontier_candidates" queue - // - // We then treat the candidates queue using the threadIdx.x < ncandidates - // If we are indeed the first thread to discover that vertex (result of atomicOr(visited)) - // We add it to the new frontier - // - - template - __global__ void topdown_expand_kernel( const IndexType *row_ptr, - const IndexType *col_ind, - const IndexType *frontier, - const IndexType frontier_size, - const IndexType totaldegree, - const IndexType max_items_per_thread, - const IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - const IndexType *frontier_degrees_exclusive_sum, - const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, - int *bmap, - IndexType *distances, - IndexType *predecessors, - const int *edge_mask, - const int *isolated_bmap, - bool directed) { - //BlockScan - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage scan_storage; - - // We will do a scan to know where to write in frontier - // This will contain the common offset of the block - __shared__ IndexType frontier_common_block_offset; - - __shared__ IndexType shared_buckets_offsets[TOP_DOWN_EXPAND_DIMX - NBUCKETS_PER_BLOCK + 1]; - __shared__ IndexType shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX + 1]; - - // - // Frontier candidates local queue - // We process TOP_DOWN_BATCH_SIZE vertices in parallel, so we need to be able to store everything - // We also save the predecessors here, because we will not be able to retrieve it after - // - __shared__ IndexType shared_local_new_frontier_candidates[TOP_DOWN_BATCH_SIZE - * TOP_DOWN_EXPAND_DIMX]; - __shared__ IndexType shared_local_new_frontier_predecessors[TOP_DOWN_BATCH_SIZE - * TOP_DOWN_EXPAND_DIMX]; - __shared__ IndexType block_n_frontier_candidates; - - IndexType block_offset = (blockDim.x * blockIdx.x) * max_items_per_thread; - IndexType n_items_per_thread_left = (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1) - / TOP_DOWN_EXPAND_DIMX; - - n_items_per_thread_left = min(max_items_per_thread, n_items_per_thread_left); - - for (; - 
(n_items_per_thread_left > 0) && (block_offset < totaldegree); - - block_offset += MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x, - n_items_per_thread_left -= MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD) { - - // In this loop, we will process batch_set_size batches - IndexType nitems_per_thread = min( n_items_per_thread_left, - (IndexType) MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD); - - // Loading buckets offset (see compute_bucket_offsets_kernel) - - if (threadIdx.x < (nitems_per_thread * NBUCKETS_PER_BLOCK + 1)) - shared_buckets_offsets[threadIdx.x] = - frontier_degrees_exclusive_sum_buckets_offsets[block_offset / TOP_DOWN_BUCKET_SIZE - + threadIdx.x]; - - // We will use shared_buckets_offsets - __syncthreads(); - - // - // shared_buckets_offsets gives us a range of the possible indexes - // for edge of linear_threadx, we are looking for the value k such as - // k is the max value such as frontier_degrees_exclusive_sum[k] <= linear_threadx - // - // we have 0 <= k < frontier_size - // but we also have : - // - // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE] - // <= k - // <= frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE + 1] - // - // To find the exact value in that range, we need a few values from frontier_degrees_exclusive_sum (see below) - // We will load them here - // We will load as much as we can - if it doesn't fit we will make multiple iteration of the next loop - // Because all vertices in frontier have degree > 0, we know it will fits if left + 1 = right (see below) - - //We're going to load values in frontier_degrees_exclusive_sum for batch [left; right[ - //If it doesn't fit, --right until it does, then loop - //It is excepted to fit on the first try, that's why we start right = nitems_per_thread - - IndexType left = 0; - IndexType right = nitems_per_thread; - - while (left < nitems_per_thread) { - // - // Values that are necessary to compute the local binary searches - // We only need those with indexes between extremes indexes of buckets_offsets - // We need the next val for the binary search, hence the +1 - // - - IndexType nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] - - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; - - //If left = right + 1 we are sure to have nvalues_to_load < TOP_DOWN_EXPAND_DIMX+1 - while (nvalues_to_load > (TOP_DOWN_EXPAND_DIMX + 1)) { - --right; - - nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] - - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; - } - - IndexType nitems_per_thread_for_this_load = right - left; - - IndexType frontier_degrees_exclusive_sum_block_offset = shared_buckets_offsets[left - * NBUCKETS_PER_BLOCK]; - - //TODO put again the nvalues_to_load == 1 - if (threadIdx.x < nvalues_to_load) { - shared_frontier_degrees_exclusive_sum[threadIdx.x] = - frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset - + threadIdx.x]; - } - - if (nvalues_to_load == (TOP_DOWN_EXPAND_DIMX + 1) && threadIdx.x == 0) { - shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX] = - frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset - + TOP_DOWN_EXPAND_DIMX]; - } - - //shared_frontier_degrees_exclusive_sum is in shared mem, we will use it, sync - //TODO we don't use it if nvalues_to_load == 1 - __syncthreads(); - - // Now we will process the edges - // Here each thread will process nitems_per_thread_for_this_load - for (IndexType item_index = 0; - item_index < 
nitems_per_thread_for_this_load; - item_index += TOP_DOWN_BATCH_SIZE) { - - // We process TOP_DOWN_BATCH_SIZE edge in parallel (instruction parallism) - // Reduces latency - - IndexType current_max_edge_index = min(block_offset - + (left - + nitems_per_thread_for_this_load) - * blockDim.x, - totaldegree); - - //We will need vec_u (source of the edge) until the end if we need to save the predecessors - //For others informations, we will reuse pointers on the go (nvcc does not color well the registers in that case) - - IndexType vec_u[TOP_DOWN_BATCH_SIZE]; - IndexType local_buf1[TOP_DOWN_BATCH_SIZE]; - IndexType local_buf2[TOP_DOWN_BATCH_SIZE]; - - IndexType *vec_frontier_degrees_exclusive_sum_index = &local_buf2[0]; + // + // gives the equivalent vectors from a type + // for the max val, would be better to use numeric_limits<>::max() once + // cpp11 is allowed in nvgraph + // + + template + struct vec_t { + typedef int4 vec4; + typedef int2 vec2; + }; + + template<> + struct vec_t { + typedef int4 vec4; + typedef int2 vec2; + static const int max = INT_MAX; + }; + + template<> + struct vec_t { + typedef longlong4 vec4; + typedef longlong2 vec2; + static const long long int max = LLONG_MAX; + }; + + // + // ------------------------- Helper device functions ------------------- + // + + __forceinline__ __device__ int getMaskNRightmostBitSet(int n) { + if (n == INT_SIZE) + return (~0); + int mask = (1 << n) - 1; + return mask; + } + + __forceinline__ __device__ int getMaskNLeftmostBitSet(int n) { + if (n == 0) + return 0; + int mask = ~((1 << (INT_SIZE - n)) - 1); + return mask; + } + + __forceinline__ __device__ int getNextZeroBit(int& val) { + int ibit = __ffs(~val) - 1; + val |= (1 << ibit); + + return ibit; + } + + struct BitwiseAnd + { + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return (a & b); + } + }; + + struct BitwiseOr + { + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return (a | b); + } + }; + + template + __device__ IndexType binsearch_maxle( const IndexType *vec, + const IndexType val, + IndexType low, + IndexType high) { + while (true) { + if (low == high) + return low; //we know it exists + if ((low + 1) == high) + return (vec[high] <= val) ? 
high : low; + + IndexType mid = low + (high - low) / 2; + + if (vec[mid] > val) + high = mid - 1; + else + low = mid; + + } + } + + // + // ------------------------- Bottom up ------------------------- + // + + // + // fill_unvisited_queue_kernel + // + // Finding unvisited vertices in the visited_bmap, and putting them in the queue + // Vertices represented by the same int in the bitmap are adjacent in the queue, and sorted + // For instance, the queue can look like this : + // 34 38 45 58 61 4 18 24 29 71 84 85 90 + // Because they are represented by those ints in the bitmap : + // [34 38 45 58 61] [4 18 24 29] [71 84 85 90] + + //visited_bmap_nints = the visited_bmap is made of that number of ints + + template + __global__ void fill_unvisited_queue_kernel( int *visited_bmap, + IndexType visited_bmap_nints, + IndexType n, + IndexType *unvisited, + IndexType *unvisited_cnt) { + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage scan_temp_storage; + + //When filling the "unvisited" queue, we use "unvisited_cnt" to know where to write in the queue (equivalent of int off = atomicAddd(unvisited_cnt, 1) ) + //We will actually do only one atomicAdd per block - we first do a scan, then call one atomicAdd, and store the common offset for the block in + //unvisited_common_block_offset + __shared__ IndexType unvisited_common_block_offset; + + //We don't want threads divergence in the loop (we're going to call __syncthreads) + //Using a block-only dependent in the condition of the loop + for (IndexType block_v_idx = blockIdx.x * blockDim.x; + block_v_idx < visited_bmap_nints; + block_v_idx += blockDim.x * gridDim.x) { + + //Index of visited_bmap that this thread will compute + IndexType v_idx = block_v_idx + threadIdx.x; + + int thread_visited_int = (v_idx < visited_bmap_nints) + ? 
visited_bmap[v_idx] + : + (~0); //will be neutral in the next lines (virtual vertices all visited) + + //The last int can only be partially valid + //If we are indeed taking care of the last visited int in this thread, + //We need to first disable (ie set as "visited") the inactive bits (vertices >= n) + if (v_idx == (visited_bmap_nints - 1)) { + int active_bits = n - (INT_SIZE * v_idx); + int inactive_bits = INT_SIZE - active_bits; + int mask = getMaskNLeftmostBitSet(inactive_bits); + thread_visited_int |= mask; //Setting inactive bits as visited + } + + //Counting number of unvisited vertices represented by this int + int n_unvisited_in_int = __popc(~thread_visited_int); + int unvisited_thread_offset; + + //We will need to write n_unvisited_in_int unvisited vertices to the unvisited queue + //We ask for that space when computing the block scan, that will tell where to write those + //vertices in the queue, using the common offset of the block (see below) + BlockScan(scan_temp_storage).ExclusiveSum(n_unvisited_in_int, unvisited_thread_offset); + + //Last thread knows how many vertices will be written to the queue by this block + //Asking for that space in the queue using the global count, and saving the common offset + if (threadIdx.x == (FILL_UNVISITED_QUEUE_DIMX - 1)) { + IndexType total = unvisited_thread_offset + n_unvisited_in_int; + unvisited_common_block_offset = atomicAdd(unvisited_cnt, total); + } + + //syncthreads for two reasons : + // - we need to broadcast unvisited_common_block_offset + // - we will reuse scan_temp_storage (cf CUB doc) + __syncthreads(); + + IndexType current_unvisited_index = unvisited_common_block_offset + + unvisited_thread_offset; + int nvertices_to_write = n_unvisited_in_int; + + // getNextZeroBit uses __ffs, which gives least significant bit set + // which means that as long as n_unvisited_in_int is valid, + // we will use valid bits + + while (nvertices_to_write > 0) { + if (nvertices_to_write >= 4 && (current_unvisited_index % 4) == 0) { + typename vec_t::vec4 vec_v; + + vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.z = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.w = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + + typename vec_t::vec4 *unvisited_i4 = reinterpret_cast::vec4*>(&unvisited[current_unvisited_index]); + *unvisited_i4 = vec_v; + + current_unvisited_index += 4; + nvertices_to_write -= 4; + } + else if (nvertices_to_write >= 2 && (current_unvisited_index % 2) == 0) { + typename vec_t::vec2 vec_v; + + vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + + typename vec_t::vec2 *unvisited_i2 = reinterpret_cast::vec2*>(&unvisited[current_unvisited_index]); + *unvisited_i2 = vec_v; + + current_unvisited_index += 2; + nvertices_to_write -= 2; + } else { + IndexType v = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + + unvisited[current_unvisited_index] = v; + + current_unvisited_index += 1; + nvertices_to_write -= 1; + } + + } + } + } + + //Wrapper + template + void fill_unvisited_queue( int *visited_bmap, + IndexType visited_bmap_nints, + IndexType n, + IndexType *unvisited, + IndexType *unvisited_cnt, + cudaStream_t m_stream, + bool deterministic) { + dim3 grid, block; + block.x = FILL_UNVISITED_QUEUE_DIMX; + + grid.x = min((IndexType) MAXBLOCKS, (visited_bmap_nints + block.x - 1) / block.x); + + fill_unvisited_queue_kernel<<>>( 
visited_bmap, + visited_bmap_nints, + n, + unvisited, + unvisited_cnt); + cudaCheckError() + ; + } + + // + // count_unvisited_edges_kernel + // Couting the total number of unvisited edges in the graph - using an potentially unvisited queue + // We need the current unvisited vertices to be in the unvisited queue + // But visited vertices can be in the potentially_unvisited queue + // We first check if the vertex is still unvisited before using it + // Useful when switching from "Bottom up" to "Top down" + // + + template + __global__ void count_unvisited_edges_kernel(const IndexType *potentially_unvisited, + const IndexType potentially_unvisited_size, + const int *visited_bmap, + IndexType *degree_vertices, + IndexType *mu) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage reduce_temp_storage; + + //number of undiscovered edges counted by this thread + IndexType thread_unvisited_edges_count = 0; + + for (IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < potentially_unvisited_size; + idx += blockDim.x * gridDim.x) { + + IndexType u = potentially_unvisited[idx]; + int u_visited_bmap = visited_bmap[u / INT_SIZE]; + int is_visited = u_visited_bmap & (1 << (u % INT_SIZE)); + + if (!is_visited) + thread_unvisited_edges_count += degree_vertices[u]; + + } + + //We need all thread_unvisited_edges_count to be ready before reducing + __syncthreads(); + + IndexType block_unvisited_edges_count = + BlockReduce(reduce_temp_storage).Sum(thread_unvisited_edges_count); + + //block_unvisited_edges_count is only defined is th.x == 0 + if (threadIdx.x == 0) + atomicAdd(mu, block_unvisited_edges_count); + } + + //Wrapper + template + void count_unvisited_edges(const IndexType *potentially_unvisited, + const IndexType potentially_unvisited_size, + const int *visited_bmap, + IndexType *node_degree, + IndexType *mu, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = COUNT_UNVISITED_EDGES_DIMX; + grid.x = min((IndexType) MAXBLOCKS, (potentially_unvisited_size + block.x - 1) / block.x); + + count_unvisited_edges_kernel<<>>( potentially_unvisited, + potentially_unvisited_size, + visited_bmap, + node_degree, + mu); + cudaCheckError() + ; + } + + // + // Main Bottom Up kernel + // Here we will start to process unvisited vertices in the unvisited queue + // We will only consider the first MAIN_BOTTOMUP_MAX_EDGES edges + // If it's not possible to define a valid parent using only those edges, + // add it to the "left_unvisited_queue" + // + + // + // We will use the "vertices represented by the same int in the visited bmap are adjacents and sorted in the unvisited queue" property + // It is used to do a reduction locally and fully build the new visited_bmap + // + + template + __global__ void main_bottomup_kernel( const IndexType *unvisited, + const IndexType unvisited_size, + IndexType *left_unvisited, + IndexType *left_unvisited_cnt, + int *visited_bmap, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + IndexType *distances, + IndexType *predecessors, + int *edge_mask) { + typedef cub::BlockDiscontinuity BlockDiscontinuity; + typedef cub::WarpReduce WarpReduce; + typedef cub::BlockScan BlockScan; + + __shared__ typename BlockDiscontinuity::TempStorage discontinuity_temp_storage; + __shared__ typename WarpReduce::TempStorage reduce_temp_storage; + __shared__ typename BlockScan::TempStorage scan_temp_storage; + + //To write vertices in the frontier, + //We will use a block scan to 
locally compute the offsets + //frontier_common_block_offset contains the common offset for the block + __shared__ IndexType frontier_common_block_offset; + + // When building the new visited_bmap, we reduce (using a bitwise and) the visited_bmap ints + // from the vertices represented by the same int (for instance vertices 1, 5, 9, 13, 23) + // vertices represented by the same int will be designed as part of the same "group" + // To detect the deliminations between those groups, we use BlockDiscontinuity + // Then we need to create the new "visited_bmap" within those group. + // We use a warp reduction that takes into account limits between groups to do it + // But a group can be cut in two different warps : in that case, the second warp + // put the result of its local reduction in local_visited_bmap_warp_head + // the first warp will then read it and finish the reduction + + __shared__ int local_visited_bmap_warp_head[MAIN_BOTTOMUP_NWARPS]; + + const int warpid = threadIdx.x / WARP_SIZE; + const int laneid = threadIdx.x % WARP_SIZE; + + // we will call __syncthreads inside the loop + // we need to keep complete block active + for (IndexType block_off = blockIdx.x * blockDim.x; + block_off < unvisited_size; + block_off += blockDim.x * gridDim.x) + { + IndexType idx = block_off + threadIdx.x; + + // This thread will take care of unvisited_vertex + // in the visited_bmap, it is represented by the int at index + // visited_bmap_index = unvisited_vertex/INT_SIZE + // it will be used by BlockDiscontinuity + // to flag the separation between groups of vertices (vertices represented by different in in visited_bmap) + IndexType visited_bmap_index[1]; //this is an array of size 1 because CUB needs one + visited_bmap_index[0] = -1; + IndexType unvisited_vertex = -1; + + // local_visited_bmap gives info on the visited bit of unvisited_vertex + // + // By default, everything is visited + // This is because we only take care of unvisited vertices here, + // The other are by default unvisited + // If a vertex remain unvisited, we will notice it here + // That's why by default we consider everything visited ( ie ~0 ) + // If we fail to assign one parent to an unvisited vertex, we will + // explicitly unset the bit + int local_visited_bmap = (~0); + int found = 0; + int more_to_visit = 0; + IndexType valid_parent; + IndexType left_unvisited_off; + + if (idx < unvisited_size) + { + //Processing first STPV edges of unvisited v + //If bigger than that, push to left_unvisited queue + unvisited_vertex = unvisited[idx]; + + IndexType edge_begin = row_ptr[unvisited_vertex]; + IndexType edge_end = row_ptr[unvisited_vertex + 1]; + + visited_bmap_index[0] = unvisited_vertex / INT_SIZE; + + IndexType degree = edge_end - edge_begin; + + for (IndexType edge = edge_begin; + edge < min(edge_end, edge_begin + MAIN_BOTTOMUP_MAX_EDGES); ++edge) + { + if (edge_mask && !edge_mask[edge]) + continue; + + IndexType parent_candidate = col_ind[edge]; + + if (distances[parent_candidate] == (lvl - 1)) + { + found = 1; + valid_parent = parent_candidate; + break; + } + } + + // This vertex will remain unvisited at the end of this kernel + // Explicitly say it + if (!found) + local_visited_bmap &= ~(1 << (unvisited_vertex % INT_SIZE)); //let this one unvisited + else + { + if (distances) + distances[unvisited_vertex] = lvl; + if (predecessors) + predecessors[unvisited_vertex] = valid_parent; + } + + //If we haven't found a parent and there's more edge to check + if (!found && degree > MAIN_BOTTOMUP_MAX_EDGES) + { + 
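                        // Deferral note: reaching this branch means no parent at distance (lvl - 1)
                        // was found among the first MAIN_BOTTOMUP_MAX_EDGES (unmasked) edges and the
                        // vertex still has unchecked edges, so it reserves a slot in left_unvisited
                        // below; bottom_up_large_degree_kernel later finishes the scan starting at
                        // row_ptr[v] + MAIN_BOTTOMUP_MAX_EDGES.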
left_unvisited_off = atomicAdd(left_unvisited_cnt, (IndexType) 1); //TODO scan + more_to_visit = 1; + } + + } + + // + // We will separate vertices in group + // Two vertices are in the same group if represented by same int in visited_bmap + // ie u and v in same group <=> u/32 == v/32 + // + // We will now flag the head of those group (first element of each group) + // + // 1) All vertices within the same group are adjacent in the queue (cf fill_unvisited_queue) + // 2) A group is of size <= 32, so a warp will contain at least one head, and a group will be contained + // at most by two warps + + int is_head_a[1]; //CUB need an array + BlockDiscontinuity(discontinuity_temp_storage).FlagHeads(is_head_a, + visited_bmap_index, + cub::Inequality()); + int is_head = is_head_a[0]; + + // Computing the warp reduce within group + // This primitive uses the is_head flags to know where the limits of the groups are + // We use bitwise and as operator, because of the fact that 1 is the default value + // If a vertex is unvisited, we have to explicitly ask for it + int local_bmap_agg = + WarpReduce(reduce_temp_storage).HeadSegmentedReduce( local_visited_bmap, + is_head, + BitwiseAnd()); + + // We need to take care of the groups cut in two in two different warps + // Saving second part of the reduce here, then applying it on the first part bellow + // Corner case : if the first thread of the warp is a head, then this group is not cut in two + // and then we have to be neutral (for an bitwise and, it's an ~0) + if (laneid == 0) + { + local_visited_bmap_warp_head[warpid] = (is_head) ? (~0) : local_bmap_agg; + } + + //broadcasting local_visited_bmap_warp_head + __syncthreads(); + + int head_ballot = nvgraph::utils::ballot(is_head); + + //As long as idx < unvisited_size, we know there's at least one head per warp + int laneid_last_head_in_warp = INT_SIZE - 1 - __clz(head_ballot); + + int is_last_head_in_warp = (laneid == laneid_last_head_in_warp); + + // if laneid == 0 && is_last_head_in_warp, it's a special case where + // a group of size 32 starts exactly at lane 0 + // in that case, nothing to do (this group is not cut by a warp delimitation) + // we also have to make sure that a warp actually exists after this one (this corner case is handled after) + if (laneid != 0 && is_last_head_in_warp & (warpid + 1) < MAIN_BOTTOMUP_NWARPS) + { + local_bmap_agg &= local_visited_bmap_warp_head[warpid + 1]; + } + + //Three cases : + // -> This is the first group of the block - it may be cut in two (with previous block) + // -> This is the last group of the block - same thing + // -> This group is completely contained in this block + + if (warpid == 0 && laneid == 0) + { + //The first elt of this group considered in this block is unvisited_vertex + //We know that's the case because elts are sorted in a group, and we are at laneid == 0 + //We will do an atomicOr - we have to be neutral about elts < unvisited_vertex + int iv = unvisited_vertex % INT_SIZE; // we know that this unvisited_vertex is valid + int mask = getMaskNLeftmostBitSet(INT_SIZE - iv); + local_bmap_agg &= mask; //we have to be neutral for elts < unvisited_vertex + atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); + } + else if (warpid == (MAIN_BOTTOMUP_NWARPS - 1) && + laneid >= laneid_last_head_in_warp && // We need the other ones to go in else case + idx < unvisited_size //we could be out + ) + { + //Last head of the block + //We don't know if this group is complete + + //last_v is the last unvisited_vertex of the group IN THIS 
block + //we dont know about the rest - we have to be neutral about elts > last_v + + //the destination thread of the __shfl is active + int laneid_max = min((IndexType) (WARP_SIZE - 1), + (unvisited_size - (block_off + 32 * warpid))); + IndexType last_v = nvgraph::utils::shfl( unvisited_vertex, + laneid_max, + WARP_SIZE, + __activemask()); + + if (is_last_head_in_warp) + { + int ilast_v = last_v % INT_SIZE + 1; + int mask = getMaskNRightmostBitSet(ilast_v); + local_bmap_agg &= mask; //we have to be neutral for elts > last_unvisited_vertex + atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); + } + } + else + { + //group completely in block + if (is_head && idx < unvisited_size) { + visited_bmap[unvisited_vertex / INT_SIZE] = local_bmap_agg; //no atomics needed, we know everything about this int + } + } + + //Saving in frontier + + int thread_frontier_offset; + BlockScan(scan_temp_storage).ExclusiveSum(found, thread_frontier_offset); + IndexType inclusive_sum = thread_frontier_offset + found; + if (threadIdx.x == (MAIN_BOTTOMUP_DIMX - 1) && inclusive_sum) + { + frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); + } + + //1) Broadcasting frontier_common_block_offset + //2) we want to reuse the *_temp_storage + __syncthreads(); + + if (found) + new_frontier[frontier_common_block_offset + thread_frontier_offset] = unvisited_vertex; + if (more_to_visit) + left_unvisited[left_unvisited_off] = unvisited_vertex; + + } + } + + template + void bottom_up_main( IndexType *unvisited, + IndexType unvisited_size, + IndexType *left_unvisited, + IndexType *d_left_unvisited_idx, + int *visited, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_idx, + IndexType *distances, + IndexType *predecessors, + int *edge_mask, + cudaStream_t m_stream, + bool deterministic) { + dim3 grid, block; + block.x = MAIN_BOTTOMUP_DIMX; + + grid.x = min((IndexType) MAXBLOCKS, ((unvisited_size + block.x - 1)) / block.x); + + main_bottomup_kernel<<>>(unvisited, + unvisited_size, + left_unvisited, + d_left_unvisited_idx, + visited, + row_ptr, + col_ind, + lvl, + new_frontier, + new_frontier_idx, + distances, + predecessors, + edge_mask); + cudaCheckError() + ; + } + + // + // bottom_up_large_degree_kernel + // finishing the work started in main_bottomup_kernel for vertex with degree > MAIN_BOTTOMUP_MAX_EDGES && no parent found + // + template + __global__ void bottom_up_large_degree_kernel( IndexType *left_unvisited, + IndexType left_unvisited_size, + int *visited, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + IndexType *distances, + IndexType *predecessors, + int *edge_mask) { + + int logical_lane_id = threadIdx.x % BOTTOM_UP_LOGICAL_WARP_SIZE; + int logical_warp_id = threadIdx.x / BOTTOM_UP_LOGICAL_WARP_SIZE; + int logical_warps_per_block = blockDim.x / BOTTOM_UP_LOGICAL_WARP_SIZE; + + //Inactive threads are not a pb for __ballot (known behaviour) + for (IndexType idx = logical_warps_per_block * blockIdx.x + logical_warp_id; + idx < left_unvisited_size; + idx += gridDim.x * logical_warps_per_block) { + + //Unvisited vertices - potentially in the next frontier + IndexType v = left_unvisited[idx]; + + //Used only with symmetric graphs + //Parents are included in v's neighbors + IndexType first_i_edge = row_ptr[v] + MAIN_BOTTOMUP_MAX_EDGES; //we already have checked the first MAIN_BOTTOMUP_MAX_EDGES edges in find_unvisited + + 
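                // Cooperative scan note: the BOTTOM_UP_LOGICAL_WARP_SIZE lanes of this logical warp
                // stride through v's remaining edges together; the ballot below combines the
                // per-lane parent candidates and a single winning lane marks v visited and appends
                // it to the new frontier.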
IndexType end_i_edge = row_ptr[v + 1]; + + //We can have warp divergence in the next loop + //It's not a pb because the behaviour of __ballot + //is know with inactive threads + for (IndexType i_edge = first_i_edge + logical_lane_id; + i_edge < end_i_edge; + i_edge += BOTTOM_UP_LOGICAL_WARP_SIZE) { + + IndexType valid_parent = -1; + + if (!edge_mask || edge_mask[i_edge]) { + IndexType u = col_ind[i_edge]; + IndexType lvl_u = distances[u]; + + if (lvl_u == (lvl - 1)) { + valid_parent = u; + } + } + + unsigned int warp_valid_p_ballot = nvgraph::utils::ballot((valid_parent != -1)); + + int logical_warp_id_in_warp = (threadIdx.x % WARP_SIZE) / BOTTOM_UP_LOGICAL_WARP_SIZE; + unsigned int mask = (1 << BOTTOM_UP_LOGICAL_WARP_SIZE) - 1; + unsigned int logical_warp_valid_p_ballot = warp_valid_p_ballot + >> (BOTTOM_UP_LOGICAL_WARP_SIZE * logical_warp_id_in_warp); + logical_warp_valid_p_ballot &= mask; + + int chosen_thread = __ffs(logical_warp_valid_p_ballot) - 1; + + if (chosen_thread == logical_lane_id) { + //Using only one valid parent (reduce bw) + IndexType off = atomicAdd(new_frontier_cnt, (IndexType) 1); + int m = 1 << (v % INT_SIZE); + atomicOr(&visited[v / INT_SIZE], m); + distances[v] = lvl; + + if (predecessors) + predecessors[v] = valid_parent; + + new_frontier[off] = v; + } + + if (logical_warp_valid_p_ballot) { + break; + } + } + + } + } + + template + void bottom_up_large(IndexType *left_unvisited, + IndexType left_unvisited_size, + int *visited, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_idx, + IndexType *distances, + IndexType *predecessors, + int *edge_mask, + cudaStream_t m_stream, + bool deterministic) { + dim3 grid, block; + block.x = LARGE_BOTTOMUP_DIMX; + grid.x = min( (IndexType) MAXBLOCKS, + ((left_unvisited_size + block.x - 1) * BOTTOM_UP_LOGICAL_WARP_SIZE) / block.x); + + bottom_up_large_degree_kernel<<>>(left_unvisited, + left_unvisited_size, + visited, + row_ptr, + col_ind, + lvl, + new_frontier, + new_frontier_idx, + distances, + predecessors, + edge_mask); + cudaCheckError() + ; + } + + // + // + // ------------------------------ Top down ------------------------------ + // + // + + // + // compute_bucket_offsets_kernel + // simply compute the position in the frontier corresponding all valid edges with index=TOP_DOWN_BUCKET_SIZE * k, k integer + // + + template + __global__ void compute_bucket_offsets_kernel( const IndexType *frontier_degrees_exclusive_sum, + IndexType *bucket_offsets, + const IndexType frontier_size, + IndexType total_degree) { + IndexType end = ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX + * NBUCKETS_PER_BLOCK + 1); + + for (IndexType bid = blockIdx.x * blockDim.x + threadIdx.x; + bid <= end; + bid += gridDim.x * blockDim.x) { + + IndexType eid = min(bid * TOP_DOWN_BUCKET_SIZE, total_degree - 1); + + bucket_offsets[bid] = binsearch_maxle( frontier_degrees_exclusive_sum, + eid, + (IndexType) 0, + frontier_size - 1); + + } + } + + template + void compute_bucket_offsets( IndexType *cumul, + IndexType *bucket_offsets, + IndexType frontier_size, + IndexType total_degree, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = COMPUTE_BUCKET_OFFSETS_DIMX; + + grid.x = min( (IndexType) MAXBLOCKS, + ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX + * NBUCKETS_PER_BLOCK + 1 + block.x - 1) / block.x); + + compute_bucket_offsets_kernel<<>>(cumul, + bucket_offsets, + frontier_size, + total_degree); + cudaCheckError() + ; + } + + // + // 
topdown_expand_kernel + // Read current frontier and compute new one with top down paradigm + // One thread = One edge + // To know origin of edge, we have to find where is index_edge in the values of frontier_degrees_exclusive_sum (using a binary search, max less or equal than) + // This index k will give us the origin of this edge, which is frontier[k] + // This thread will then process the (linear_idx_thread - frontier_degrees_exclusive_sum[k])-ith edge of vertex frontier[k] + // + // To process blockDim.x = TOP_DOWN_EXPAND_DIMX edges, we need to first load NBUCKETS_PER_BLOCK bucket offsets - those will help us do the binary searches + // We can load up to TOP_DOWN_EXPAND_DIMX of those bucket offsets - that way we prepare for the next MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x edges + // + // Once we have those offsets, we may still need a few values from frontier_degrees_exclusive_sum to compute exact index k + // To be able to do it, we will load the values that we need from frontier_degrees_exclusive_sum in shared memory + // We know that it will fit because we never add node with degree == 0 in the frontier, so we have an upper bound on the number of value to load (see below) + // + // We will then look which vertices are not visited yet : + // 1) if the unvisited vertex is isolated (=> degree == 0), we mark it as visited, update distances and predecessors, and move on + // 2) if the unvisited vertex has degree > 0, we add it to the "frontier_candidates" queue + // + // We then treat the candidates queue using the threadIdx.x < ncandidates + // If we are indeed the first thread to discover that vertex (result of atomicOr(visited)) + // We add it to the new frontier + // + + template + __global__ void topdown_expand_kernel( const IndexType *row_ptr, + const IndexType *col_ind, + const IndexType *frontier, + const IndexType frontier_size, + const IndexType totaldegree, + const IndexType max_items_per_thread, + const IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + const IndexType *frontier_degrees_exclusive_sum, + const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, + int *bmap, + IndexType *distances, + IndexType *predecessors, + const int *edge_mask, + const int *isolated_bmap, + bool directed) { + //BlockScan + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage scan_storage; + + // We will do a scan to know where to write in frontier + // This will contain the common offset of the block + __shared__ IndexType frontier_common_block_offset; + + __shared__ IndexType shared_buckets_offsets[TOP_DOWN_EXPAND_DIMX - NBUCKETS_PER_BLOCK + 1]; + __shared__ IndexType shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX + 1]; + + // + // Frontier candidates local queue + // We process TOP_DOWN_BATCH_SIZE vertices in parallel, so we need to be able to store everything + // We also save the predecessors here, because we will not be able to retrieve it after + // + __shared__ IndexType shared_local_new_frontier_candidates[TOP_DOWN_BATCH_SIZE + * TOP_DOWN_EXPAND_DIMX]; + __shared__ IndexType shared_local_new_frontier_predecessors[TOP_DOWN_BATCH_SIZE + * TOP_DOWN_EXPAND_DIMX]; + __shared__ IndexType block_n_frontier_candidates; + + IndexType block_offset = (blockDim.x * blockIdx.x) * max_items_per_thread; + IndexType n_items_per_thread_left = (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1) + / TOP_DOWN_EXPAND_DIMX; + + n_items_per_thread_left = min(max_items_per_thread, n_items_per_thread_left); + + for (; + 
(n_items_per_thread_left > 0) && (block_offset < totaldegree); + + block_offset += MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x, + n_items_per_thread_left -= MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD) { + + // In this loop, we will process batch_set_size batches + IndexType nitems_per_thread = min( n_items_per_thread_left, + (IndexType) MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD); + + // Loading buckets offset (see compute_bucket_offsets_kernel) + + if (threadIdx.x < (nitems_per_thread * NBUCKETS_PER_BLOCK + 1)) + shared_buckets_offsets[threadIdx.x] = + frontier_degrees_exclusive_sum_buckets_offsets[block_offset / TOP_DOWN_BUCKET_SIZE + + threadIdx.x]; + + // We will use shared_buckets_offsets + __syncthreads(); + + // + // shared_buckets_offsets gives us a range of the possible indexes + // for edge of linear_threadx, we are looking for the value k such as + // k is the max value such as frontier_degrees_exclusive_sum[k] <= linear_threadx + // + // we have 0 <= k < frontier_size + // but we also have : + // + // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE] + // <= k + // <= frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE + 1] + // + // To find the exact value in that range, we need a few values from frontier_degrees_exclusive_sum (see below) + // We will load them here + // We will load as much as we can - if it doesn't fit we will make multiple iteration of the next loop + // Because all vertices in frontier have degree > 0, we know it will fits if left + 1 = right (see below) + + //We're going to load values in frontier_degrees_exclusive_sum for batch [left; right[ + //If it doesn't fit, --right until it does, then loop + //It is excepted to fit on the first try, that's why we start right = nitems_per_thread + + IndexType left = 0; + IndexType right = nitems_per_thread; + + while (left < nitems_per_thread) { + // + // Values that are necessary to compute the local binary searches + // We only need those with indexes between extremes indexes of buckets_offsets + // We need the next val for the binary search, hence the +1 + // + + IndexType nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] + - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; + + //If left = right + 1 we are sure to have nvalues_to_load < TOP_DOWN_EXPAND_DIMX+1 + while (nvalues_to_load > (TOP_DOWN_EXPAND_DIMX + 1)) { + --right; + + nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] + - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; + } + + IndexType nitems_per_thread_for_this_load = right - left; + + IndexType frontier_degrees_exclusive_sum_block_offset = shared_buckets_offsets[left + * NBUCKETS_PER_BLOCK]; + + //TODO put again the nvalues_to_load == 1 + if (threadIdx.x < nvalues_to_load) { + shared_frontier_degrees_exclusive_sum[threadIdx.x] = + frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + + threadIdx.x]; + } + + if (nvalues_to_load == (TOP_DOWN_EXPAND_DIMX + 1) && threadIdx.x == 0) { + shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX] = + frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + + TOP_DOWN_EXPAND_DIMX]; + } + + //shared_frontier_degrees_exclusive_sum is in shared mem, we will use it, sync + //TODO we don't use it if nvalues_to_load == 1 + __syncthreads(); + + // Now we will process the edges + // Here each thread will process nitems_per_thread_for_this_load + for (IndexType item_index = 0; + item_index < 
nitems_per_thread_for_this_load; + item_index += TOP_DOWN_BATCH_SIZE) { + + // We process TOP_DOWN_BATCH_SIZE edge in parallel (instruction parallism) + // Reduces latency + + IndexType current_max_edge_index = min(block_offset + + (left + + nitems_per_thread_for_this_load) + * blockDim.x, + totaldegree); + + //We will need vec_u (source of the edge) until the end if we need to save the predecessors + //For others informations, we will reuse pointers on the go (nvcc does not color well the registers in that case) + + IndexType vec_u[TOP_DOWN_BATCH_SIZE]; + IndexType local_buf1[TOP_DOWN_BATCH_SIZE]; + IndexType local_buf2[TOP_DOWN_BATCH_SIZE]; + + IndexType *vec_frontier_degrees_exclusive_sum_index = &local_buf2[0]; #pragma unroll - for (IndexType iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - - IndexType ibatch = left + item_index + iv; - IndexType gid = block_offset + ibatch * blockDim.x + threadIdx.x; - - if (gid < current_max_edge_index) { - IndexType start_off_idx = (ibatch * blockDim.x + threadIdx.x) - / TOP_DOWN_BUCKET_SIZE; - IndexType bucket_start = shared_buckets_offsets[start_off_idx] - - frontier_degrees_exclusive_sum_block_offset; - IndexType bucket_end = shared_buckets_offsets[start_off_idx + 1] - - frontier_degrees_exclusive_sum_block_offset; - - IndexType k = binsearch_maxle(shared_frontier_degrees_exclusive_sum, - gid, - bucket_start, - bucket_end) - + frontier_degrees_exclusive_sum_block_offset; - vec_u[iv] = frontier[k]; // origin of this edge - vec_frontier_degrees_exclusive_sum_index[iv] = - frontier_degrees_exclusive_sum[k]; - } else { - vec_u[iv] = -1; - vec_frontier_degrees_exclusive_sum_index[iv] = -1; - } - - } - - IndexType *vec_row_ptr_u = &local_buf1[0]; + for (IndexType iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + + IndexType ibatch = left + item_index + iv; + IndexType gid = block_offset + ibatch * blockDim.x + threadIdx.x; + + if (gid < current_max_edge_index) { + IndexType start_off_idx = (ibatch * blockDim.x + threadIdx.x) + / TOP_DOWN_BUCKET_SIZE; + IndexType bucket_start = shared_buckets_offsets[start_off_idx] + - frontier_degrees_exclusive_sum_block_offset; + IndexType bucket_end = shared_buckets_offsets[start_off_idx + 1] + - frontier_degrees_exclusive_sum_block_offset; + + IndexType k = binsearch_maxle(shared_frontier_degrees_exclusive_sum, + gid, + bucket_start, + bucket_end) + + frontier_degrees_exclusive_sum_block_offset; + vec_u[iv] = frontier[k]; // origin of this edge + vec_frontier_degrees_exclusive_sum_index[iv] = + frontier_degrees_exclusive_sum[k]; + } else { + vec_u[iv] = -1; + vec_frontier_degrees_exclusive_sum_index[iv] = -1; + } + + } + + IndexType *vec_row_ptr_u = &local_buf1[0]; #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType u = vec_u[iv]; - //row_ptr for this vertex origin u - vec_row_ptr_u[iv] = (u != -1) - ? row_ptr[u] - : - -1; - } - - //We won't need row_ptr after that, reusing pointer - IndexType *vec_dest_v = vec_row_ptr_u; + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType u = vec_u[iv]; + //row_ptr for this vertex origin u + vec_row_ptr_u[iv] = (u != -1) + ? 
row_ptr[u] + : + -1; + } + + //We won't need row_ptr after that, reusing pointer + IndexType *vec_dest_v = vec_row_ptr_u; #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType thread_item_index = left + item_index + iv; - IndexType gid = block_offset + thread_item_index * blockDim.x + threadIdx.x; + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType thread_item_index = left + item_index + iv; + IndexType gid = block_offset + thread_item_index * blockDim.x + threadIdx.x; - IndexType row_ptr_u = vec_row_ptr_u[iv]; - IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv]; + IndexType row_ptr_u = vec_row_ptr_u[iv]; + IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv]; - if (edge_mask && !edge_mask[edge]) - row_ptr_u = -1; //disabling edge + if (edge_mask && !edge_mask[edge]) + row_ptr_u = -1; //disabling edge - //Destination of this edge - vec_dest_v[iv] = (row_ptr_u != -1) - ? col_ind[edge] - : - -1; - } + //Destination of this edge + vec_dest_v[iv] = (row_ptr_u != -1) + ? col_ind[edge] + : + -1; + } - //We don't need vec_frontier_degrees_exclusive_sum_index anymore - IndexType *vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index; + //We don't need vec_frontier_degrees_exclusive_sum_index anymore + IndexType *vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index; #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_dest_v[iv]; - vec_v_visited_bmap[iv] = (v != -1) - ? bmap[v / INT_SIZE] - : - (~0); //will look visited - } - - // From now on we will consider v as a frontier candidate - // If for some reason vec_candidate[iv] should be put in the new_frontier - // Then set vec_candidate[iv] = -1 - IndexType *vec_frontier_candidate = vec_dest_v; + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_dest_v[iv]; + vec_v_visited_bmap[iv] = (v != -1) + ? bmap[v / INT_SIZE] + : + (~0); //will look visited + } + + // From now on we will consider v as a frontier candidate + // If for some reason vec_candidate[iv] should be put in the new_frontier + // Then set vec_candidate[iv] = -1 + IndexType *vec_frontier_candidate = vec_dest_v; #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - int m = 1 << (v % INT_SIZE); + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + int m = 1 << (v % INT_SIZE); - int is_visited = vec_v_visited_bmap[iv] & m; + int is_visited = vec_v_visited_bmap[iv] & m; - if (is_visited) - vec_frontier_candidate[iv] = -1; - } + if (is_visited) + vec_frontier_candidate[iv] = -1; + } - if (directed) { - //vec_v_visited_bmap is available + if (directed) { + //vec_v_visited_bmap is available - IndexType *vec_is_isolated_bmap = vec_v_visited_bmap; + IndexType *vec_is_isolated_bmap = vec_v_visited_bmap; #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - vec_is_isolated_bmap[iv] = (v != -1) - ? isolated_bmap[v / INT_SIZE] - : - -1; - } + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + vec_is_isolated_bmap[iv] = (v != -1) + ? 
isolated_bmap[v / INT_SIZE] + : + -1; + } #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - int m = 1 << (v % INT_SIZE); - int is_isolated = vec_is_isolated_bmap[iv] & m; + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + int m = 1 << (v % INT_SIZE); + int is_isolated = vec_is_isolated_bmap[iv] & m; - //If v is isolated, we will not add it to the frontier (it's not a frontier candidate) - // 1st reason : it's useless - // 2nd reason : it will make top down algo fail - // we need each node in frontier to have a degree > 0 - // If it is isolated, we just need to mark it as visited, and save distance and predecessor here. Not need to check return value of atomicOr + //If v is isolated, we will not add it to the frontier (it's not a frontier candidate) + // 1st reason : it's useless + // 2nd reason : it will make top down algo fail + // we need each node in frontier to have a degree > 0 + // If it is isolated, we just need to mark it as visited, and save distance and predecessor here. Not need to check return value of atomicOr - if (is_isolated && v != -1) { - int m = 1 << (v % INT_SIZE); - atomicOr(&bmap[v / INT_SIZE], m); - if (distances) - distances[v] = lvl; + if (is_isolated && v != -1) { + int m = 1 << (v % INT_SIZE); + atomicOr(&bmap[v / INT_SIZE], m); + if (distances) + distances[v] = lvl; - if (predecessors) - predecessors[v] = vec_u[iv]; + if (predecessors) + predecessors[v] = vec_u[iv]; - //This is no longer a candidate, neutralize it - vec_frontier_candidate[iv] = -1; - } + //This is no longer a candidate, neutralize it + vec_frontier_candidate[iv] = -1; + } - } - } + } + } - //Number of successor candidate hold by this thread - IndexType thread_n_frontier_candidates = 0; + //Number of successor candidate hold by this thread + IndexType thread_n_frontier_candidates = 0; #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - if (v != -1) - ++thread_n_frontier_candidates; - } + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + if (v != -1) + ++thread_n_frontier_candidates; + } - // We need to have all nfrontier_candidates to be ready before doing the scan - __syncthreads(); + // We need to have all nfrontier_candidates to be ready before doing the scan + __syncthreads(); - // We will put the frontier candidates in a local queue - // Computing offsets - IndexType thread_frontier_candidate_offset = 0; //offset inside block - BlockScan(scan_storage).ExclusiveSum( thread_n_frontier_candidates, - thread_frontier_candidate_offset); + // We will put the frontier candidates in a local queue + // Computing offsets + IndexType thread_frontier_candidate_offset = 0; //offset inside block + BlockScan(scan_storage).ExclusiveSum( thread_n_frontier_candidates, + thread_frontier_candidate_offset); #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - //May have bank conflicts - IndexType frontier_candidate = vec_frontier_candidate[iv]; - - if (frontier_candidate != -1) { - shared_local_new_frontier_candidates[thread_frontier_candidate_offset] = - frontier_candidate; - shared_local_new_frontier_predecessors[thread_frontier_candidate_offset] = - vec_u[iv]; - ++thread_frontier_candidate_offset; - } - } - - if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { - //No need to add nsuccessor_candidate, even if its an - //exclusive sum - //We incremented the 
thread_frontier_candidate_offset - block_n_frontier_candidates = thread_frontier_candidate_offset; - } - - //broadcast block_n_frontier_candidates - __syncthreads(); - - IndexType naccepted_vertices = 0; - //We won't need vec_frontier_candidate after that - IndexType *vec_frontier_accepted_vertex = vec_frontier_candidate; + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + //May have bank conflicts + IndexType frontier_candidate = vec_frontier_candidate[iv]; + + if (frontier_candidate != -1) { + shared_local_new_frontier_candidates[thread_frontier_candidate_offset] = + frontier_candidate; + shared_local_new_frontier_predecessors[thread_frontier_candidate_offset] = + vec_u[iv]; + ++thread_frontier_candidate_offset; + } + } + + if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { + //No need to add nsuccessor_candidate, even if its an + //exclusive sum + //We incremented the thread_frontier_candidate_offset + block_n_frontier_candidates = thread_frontier_candidate_offset; + } + + //broadcast block_n_frontier_candidates + __syncthreads(); + + IndexType naccepted_vertices = 0; + //We won't need vec_frontier_candidate after that + IndexType *vec_frontier_accepted_vertex = vec_frontier_candidate; #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - const int idx_shared = iv * blockDim.x + threadIdx.x; - vec_frontier_accepted_vertex[iv] = -1; + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + const int idx_shared = iv * blockDim.x + threadIdx.x; + vec_frontier_accepted_vertex[iv] = -1; - if (idx_shared < block_n_frontier_candidates) { - IndexType v = shared_local_new_frontier_candidates[idx_shared]; //popping queue - int m = 1 << (v % INT_SIZE); - int q = atomicOr(&bmap[v / INT_SIZE], m); //atomicOr returns old + if (idx_shared < block_n_frontier_candidates) { + IndexType v = shared_local_new_frontier_candidates[idx_shared]; //popping queue + int m = 1 << (v % INT_SIZE); + int q = atomicOr(&bmap[v / INT_SIZE], m); //atomicOr returns old - if (!(m & q)) { //if this thread was the first to discover this node - if (distances) - distances[v] = lvl; + if (!(m & q)) { //if this thread was the first to discover this node + if (distances) + distances[v] = lvl; - if (predecessors) { - IndexType pred = shared_local_new_frontier_predecessors[idx_shared]; - predecessors[v] = pred; - } + if (predecessors) { + IndexType pred = shared_local_new_frontier_predecessors[idx_shared]; + predecessors[v] = pred; + } - vec_frontier_accepted_vertex[iv] = v; - ++naccepted_vertices; - } - } + vec_frontier_accepted_vertex[iv] = v; + ++naccepted_vertices; + } + } - } + } - //We need naccepted_vertices to be ready - __syncthreads(); + //We need naccepted_vertices to be ready + __syncthreads(); - IndexType thread_new_frontier_offset; + IndexType thread_new_frontier_offset; - BlockScan(scan_storage).ExclusiveSum(naccepted_vertices, thread_new_frontier_offset); + BlockScan(scan_storage).ExclusiveSum(naccepted_vertices, thread_new_frontier_offset); - if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { + if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { - IndexType inclusive_sum = thread_new_frontier_offset + naccepted_vertices; - //for this thread, thread_new_frontier_offset + has_successor (exclusive sum) - if (inclusive_sum) - frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); - } + IndexType inclusive_sum = thread_new_frontier_offset + naccepted_vertices; + //for this thread, thread_new_frontier_offset + has_successor (exclusive sum) + if (inclusive_sum) + frontier_common_block_offset = 
atomicAdd(new_frontier_cnt, inclusive_sum); + } - //Broadcasting frontier_common_block_offset - __syncthreads(); + //Broadcasting frontier_common_block_offset + __syncthreads(); #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - const int idx_shared = iv * blockDim.x + threadIdx.x; - if (idx_shared < block_n_frontier_candidates) { - - IndexType new_frontier_vertex = vec_frontier_accepted_vertex[iv]; - - if (new_frontier_vertex != -1) { - IndexType off = frontier_common_block_offset + thread_new_frontier_offset++; - //TODO Access is not good - new_frontier[off] = new_frontier_vertex; - } - } - } - - } - - //We need to keep shared_frontier_degrees_exclusive_sum coherent - __syncthreads(); - - //Preparing for next load - left = right; - right = nitems_per_thread; - } - - //we need to keep shared_buckets_offsets coherent - __syncthreads(); - } - - } - - template - void frontier_expand(const IndexType *row_ptr, - const IndexType *col_ind, - const IndexType *frontier, - const IndexType frontier_size, - const IndexType totaldegree, - const IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - const IndexType *frontier_degrees_exclusive_sum, - const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, - int *visited_bmap, - IndexType *distances, - IndexType *predecessors, - const int *edge_mask, - const int *isolated_bmap, - bool directed, - cudaStream_t m_stream, - bool deterministic) { - if (!totaldegree) - return; - - dim3 block; - block.x = TOP_DOWN_EXPAND_DIMX; - - IndexType max_items_per_thread = (totaldegree + MAXBLOCKS * block.x - 1) - / (MAXBLOCKS * block.x); - - dim3 grid; - grid.x = min( (totaldegree + max_items_per_thread * block.x - 1) - / (max_items_per_thread * block.x), - (IndexType) MAXBLOCKS); - - topdown_expand_kernel<<>>( row_ptr, - col_ind, - frontier, - frontier_size, - totaldegree, - max_items_per_thread, - lvl, - new_frontier, - new_frontier_cnt, - frontier_degrees_exclusive_sum, - frontier_degrees_exclusive_sum_buckets_offsets, - visited_bmap, - distances, - predecessors, - edge_mask, - isolated_bmap, - directed); - cudaCheckError() - ; - } - - template - __global__ void flag_isolated_vertices_kernel( IndexType n, - int *isolated_bmap, - const IndexType *row_ptr, - IndexType *degrees, - IndexType *nisolated) { - typedef cub::BlockLoad BlockLoad; - typedef cub::BlockStore BlockStore; - typedef cub::BlockReduce BlockReduce; - typedef cub::WarpReduce WarpReduce; - - __shared__ typename BlockLoad::TempStorage load_temp_storage; - __shared__ typename BlockStore::TempStorage store_temp_storage; - __shared__ typename BlockReduce::TempStorage block_reduce_temp_storage; - - __shared__ typename WarpReduce::TempStorage warp_reduce_temp_storage[FLAG_ISOLATED_VERTICES_DIMX - / FLAG_ISOLATED_VERTICES_THREADS_PER_INT]; - - __shared__ IndexType row_ptr_tail[FLAG_ISOLATED_VERTICES_DIMX]; - - for (IndexType block_off = FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - * (blockDim.x * blockIdx.x); - block_off < n; - block_off += FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * gridDim.x)) { - - IndexType thread_off = block_off - + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * threadIdx.x; - IndexType last_node_thread = thread_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1; - - IndexType thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; - IndexType block_valid_items = n - block_off + 1; //+1, we need row_ptr[last_node+1] - - BlockLoad(load_temp_storage).Load( row_ptr + block_off, - thread_row_ptr, - block_valid_items, - -1); - 
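            // Guarded load note: only block_valid_items entries of row_ptr are in range here;
            // cub::BlockLoad fills the out-of-range slots of thread_row_ptr with the supplied
            // default (-1), so the degree computation that follows never reads uninitialized values.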
- //To compute 4 degrees, we need 5 values of row_ptr - //Saving the "5th" value in shared memory for previous thread to use - if (threadIdx.x > 0) { - row_ptr_tail[threadIdx.x - 1] = thread_row_ptr[0]; - } - - //If this is the last thread, it needs to load its row ptr tail value - if (threadIdx.x == (FLAG_ISOLATED_VERTICES_DIMX - 1) && last_node_thread < n) { - row_ptr_tail[threadIdx.x] = row_ptr[last_node_thread + 1]; - - } - __syncthreads(); // we may reuse temp_storage - - int local_isolated_bmap = 0; - - IndexType imax = (n - thread_off); - - IndexType local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + const int idx_shared = iv * blockDim.x + threadIdx.x; + if (idx_shared < block_n_frontier_candidates) { + + IndexType new_frontier_vertex = vec_frontier_accepted_vertex[iv]; + + if (new_frontier_vertex != -1) { + IndexType off = frontier_common_block_offset + thread_new_frontier_offset++; + //TODO Access is not good + new_frontier[off] = new_frontier_vertex; + } + } + } + + } + + //We need to keep shared_frontier_degrees_exclusive_sum coherent + __syncthreads(); + + //Preparing for next load + left = right; + right = nitems_per_thread; + } + + //we need to keep shared_buckets_offsets coherent + __syncthreads(); + } + + } + + template + void frontier_expand(const IndexType *row_ptr, + const IndexType *col_ind, + const IndexType *frontier, + const IndexType frontier_size, + const IndexType totaldegree, + const IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + const IndexType *frontier_degrees_exclusive_sum, + const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, + int *visited_bmap, + IndexType *distances, + IndexType *predecessors, + const int *edge_mask, + const int *isolated_bmap, + bool directed, + cudaStream_t m_stream, + bool deterministic) { + if (!totaldegree) + return; + + dim3 block; + block.x = TOP_DOWN_EXPAND_DIMX; + + IndexType max_items_per_thread = (totaldegree + MAXBLOCKS * block.x - 1) + / (MAXBLOCKS * block.x); + + dim3 grid; + grid.x = min( (totaldegree + max_items_per_thread * block.x - 1) + / (max_items_per_thread * block.x), + (IndexType) MAXBLOCKS); + + topdown_expand_kernel<<>>( row_ptr, + col_ind, + frontier, + frontier_size, + totaldegree, + max_items_per_thread, + lvl, + new_frontier, + new_frontier_cnt, + frontier_degrees_exclusive_sum, + frontier_degrees_exclusive_sum_buckets_offsets, + visited_bmap, + distances, + predecessors, + edge_mask, + isolated_bmap, + directed); + cudaCheckError() + ; + } + + template + __global__ void flag_isolated_vertices_kernel( IndexType n, + int *isolated_bmap, + const IndexType *row_ptr, + IndexType *degrees, + IndexType *nisolated) { + typedef cub::BlockLoad BlockLoad; + typedef cub::BlockStore BlockStore; + typedef cub::BlockReduce BlockReduce; + typedef cub::WarpReduce WarpReduce; + + __shared__ typename BlockLoad::TempStorage load_temp_storage; + __shared__ typename BlockStore::TempStorage store_temp_storage; + __shared__ typename BlockReduce::TempStorage block_reduce_temp_storage; + + __shared__ typename WarpReduce::TempStorage warp_reduce_temp_storage[FLAG_ISOLATED_VERTICES_DIMX + / FLAG_ISOLATED_VERTICES_THREADS_PER_INT]; + + __shared__ IndexType row_ptr_tail[FLAG_ISOLATED_VERTICES_DIMX]; + + for (IndexType block_off = FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + * (blockDim.x * blockIdx.x); + block_off < n; + block_off += FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * gridDim.x)) { + + IndexType thread_off = 
block_off + + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * threadIdx.x; + IndexType last_node_thread = thread_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1; + + IndexType thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; + IndexType block_valid_items = n - block_off + 1; //+1, we need row_ptr[last_node+1] + + BlockLoad(load_temp_storage).Load( row_ptr + block_off, + thread_row_ptr, + block_valid_items, + -1); + + //To compute 4 degrees, we need 5 values of row_ptr + //Saving the "5th" value in shared memory for previous thread to use + if (threadIdx.x > 0) { + row_ptr_tail[threadIdx.x - 1] = thread_row_ptr[0]; + } + + //If this is the last thread, it needs to load its row ptr tail value + if (threadIdx.x == (FLAG_ISOLATED_VERTICES_DIMX - 1) && last_node_thread < n) { + row_ptr_tail[threadIdx.x] = row_ptr[last_node_thread + 1]; + + } + __syncthreads(); // we may reuse temp_storage + + int local_isolated_bmap = 0; + + IndexType imax = (n - thread_off); + + IndexType local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; #pragma unroll - for (int i = 0; i < (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1); ++i) { - IndexType degree = local_degree[i] = thread_row_ptr[i + 1] - thread_row_ptr[i]; - - if (i < imax) - local_isolated_bmap |= ((degree == 0) << i); - } - - if (last_node_thread < n) { - IndexType degree = local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1] = - row_ptr_tail[threadIdx.x] - - thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1]; - - local_isolated_bmap |= ((degree == 0) - << (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1)); - - } - - local_isolated_bmap <<= (thread_off % INT_SIZE); - - IndexType local_nisolated = __popc(local_isolated_bmap); - - //We need local_nisolated and local_isolated_bmap to be ready for next steps - __syncthreads(); - - IndexType total_nisolated = BlockReduce(block_reduce_temp_storage).Sum(local_nisolated); - - if (threadIdx.x == 0 && total_nisolated) { - atomicAdd(nisolated, total_nisolated); - } - - int logicalwarpid = threadIdx.x / FLAG_ISOLATED_VERTICES_THREADS_PER_INT; - - //Building int for bmap - int int_aggregate_isolated_bmap = - WarpReduce(warp_reduce_temp_storage[logicalwarpid]).Reduce( local_isolated_bmap, - BitwiseOr()); - - int is_head_of_visited_int = - ((threadIdx.x % (FLAG_ISOLATED_VERTICES_THREADS_PER_INT)) == 0); - if (is_head_of_visited_int) { - isolated_bmap[thread_off / INT_SIZE] = int_aggregate_isolated_bmap; - } - - BlockStore(store_temp_storage).Store(degrees + block_off, local_degree, block_valid_items); - } - } - - template - void flag_isolated_vertices( IndexType n, - int *isolated_bmap, - const IndexType *row_ptr, - IndexType *degrees, - IndexType *nisolated, - cudaStream_t m_stream) { - dim3 grid, block; - block.x = FLAG_ISOLATED_VERTICES_DIMX; - - grid.x = min( (IndexType) MAXBLOCKS, - (n / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + 1 + block.x - 1) / block.x); - - flag_isolated_vertices_kernel<<>>(n, - isolated_bmap, - row_ptr, - degrees, - nisolated); - cudaCheckError() - ; - } - - // - // - // - // Some utils functions - // - // - - //Creates CUB data for graph size n - template - void cub_exclusive_sum_alloc(IndexType n, void*& d_temp_storage, size_t &temp_storage_bytes) { - // Determine temporary device storage requirements for exclusive prefix scan - d_temp_storage = NULL; - temp_storage_bytes = 0; - IndexType *d_in = NULL, *d_out = NULL; - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, n); - // Allocate temporary storage for exclusive 
prefix scan - cudaMalloc(&d_temp_storage, temp_storage_bytes); - } - - template - __global__ void fill_kernel(IndexType *vec, IndexType n, IndexType val) { - for (IndexType u = blockDim.x * blockIdx.x + threadIdx.x; - u < n; - u += gridDim.x * blockDim.x) - vec[u] = val; - - } - - template - void fill(IndexType *vec, IndexType n, IndexType val, cudaStream_t m_stream) { - dim3 grid, block; - block.x = 256; - grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); - fill_kernel<<>>(vec, n, val); - cudaCheckError() - ; - } - - template - __global__ void set_frontier_degree_kernel( IndexType *frontier_degree, - IndexType *frontier, - const IndexType *degree, - IndexType n) { - for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x; - idx < n; - idx += gridDim.x * blockDim.x) { - IndexType u = frontier[idx]; - frontier_degree[idx] = degree[u]; - } - } - - template - void set_frontier_degree( IndexType *frontier_degree, - IndexType *frontier, - const IndexType *degree, - IndexType n, - cudaStream_t m_stream) { - dim3 grid, block; - block.x = 256; - grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); - set_frontier_degree_kernel<<>>(frontier_degree, - frontier, - degree, - n); - cudaCheckError() - ; - } - - template - void exclusive_sum( void *d_temp_storage, - size_t temp_storage_bytes, - IndexType *d_in, - IndexType *d_out, - IndexType num_items, - cudaStream_t m_stream) { - if (num_items <= 1) - return; //DeviceScan fails if n==1 - cub::DeviceScan::ExclusiveSum(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - m_stream); - } - - template - __global__ void fill_vec_kernel(T *vec, T n, T val) { - for (T idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < n; - idx += blockDim.x * gridDim.x) - vec[idx] = val; - } - - template - void fill_vec(T *vec, T n, T val, cudaStream_t stream) { - dim3 grid, block; - block.x = 256; - grid.x = (n + block.x - 1) / block.x; - - fill_vec_kernel<<>>(vec, n, val); - cudaCheckError() - ; - } + for (int i = 0; i < (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1); ++i) { + IndexType degree = local_degree[i] = thread_row_ptr[i + 1] - thread_row_ptr[i]; + + if (i < imax) + local_isolated_bmap |= ((degree == 0) << i); + } + + if (last_node_thread < n) { + IndexType degree = local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1] = + row_ptr_tail[threadIdx.x] + - thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1]; + + local_isolated_bmap |= ((degree == 0) + << (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1)); + + } + + local_isolated_bmap <<= (thread_off % INT_SIZE); + + IndexType local_nisolated = __popc(local_isolated_bmap); + + //We need local_nisolated and local_isolated_bmap to be ready for next steps + __syncthreads(); + + IndexType total_nisolated = BlockReduce(block_reduce_temp_storage).Sum(local_nisolated); + + if (threadIdx.x == 0 && total_nisolated) { + atomicAdd(nisolated, total_nisolated); + } + + int logicalwarpid = threadIdx.x / FLAG_ISOLATED_VERTICES_THREADS_PER_INT; + + //Building int for bmap + int int_aggregate_isolated_bmap = + WarpReduce(warp_reduce_temp_storage[logicalwarpid]).Reduce( local_isolated_bmap, + BitwiseOr()); + + int is_head_of_visited_int = + ((threadIdx.x % (FLAG_ISOLATED_VERTICES_THREADS_PER_INT)) == 0); + if (is_head_of_visited_int) { + isolated_bmap[thread_off / INT_SIZE] = int_aggregate_isolated_bmap; + } + + BlockStore(store_temp_storage).Store(degrees + block_off, local_degree, block_valid_items); + } + } + + template + void flag_isolated_vertices( IndexType n, + 
int *isolated_bmap, + const IndexType *row_ptr, + IndexType *degrees, + IndexType *nisolated, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = FLAG_ISOLATED_VERTICES_DIMX; + + grid.x = min( (IndexType) MAXBLOCKS, + (n / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + 1 + block.x - 1) / block.x); + + flag_isolated_vertices_kernel<<>>(n, + isolated_bmap, + row_ptr, + degrees, + nisolated); + cudaCheckError() + ; + } + + // + // + // + // Some utils functions + // + // + + //Creates CUB data for graph size n + template + void cub_exclusive_sum_alloc(IndexType n, void*& d_temp_storage, size_t &temp_storage_bytes) { + // Determine temporary device storage requirements for exclusive prefix scan + d_temp_storage = NULL; + temp_storage_bytes = 0; + IndexType *d_in = NULL, *d_out = NULL; + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, n); + // Allocate temporary storage for exclusive prefix scan + cudaStream_t stream{nullptr}; + RMM_ALLOC(&d_temp_storage, temp_storage_bytes, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + } + + template + __global__ void fill_kernel(IndexType *vec, IndexType n, IndexType val) { + for (IndexType u = blockDim.x * blockIdx.x + threadIdx.x; + u < n; + u += gridDim.x * blockDim.x) + vec[u] = val; + + } + + template + void fill(IndexType *vec, IndexType n, IndexType val, cudaStream_t m_stream) { + dim3 grid, block; + block.x = 256; + grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); + fill_kernel<<>>(vec, n, val); + cudaCheckError() + ; + } + + template + __global__ void set_frontier_degree_kernel( IndexType *frontier_degree, + IndexType *frontier, + const IndexType *degree, + IndexType n) { + for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x; + idx < n; + idx += gridDim.x * blockDim.x) { + IndexType u = frontier[idx]; + frontier_degree[idx] = degree[u]; + } + } + + template + void set_frontier_degree( IndexType *frontier_degree, + IndexType *frontier, + const IndexType *degree, + IndexType n, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = 256; + grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); + set_frontier_degree_kernel<<>>(frontier_degree, + frontier, + degree, + n); + cudaCheckError() + ; + } + + template + void exclusive_sum( void *d_temp_storage, + size_t temp_storage_bytes, + IndexType *d_in, + IndexType *d_out, + IndexType num_items, + cudaStream_t m_stream) { + if (num_items <= 1) + return; //DeviceScan fails if n==1 + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + m_stream); + } + + template + __global__ void fill_vec_kernel(T *vec, T n, T val) { + for (T idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < n; + idx += blockDim.x * gridDim.x) + vec[idx] = val; + } + + template + void fill_vec(T *vec, T n, T val, cudaStream_t stream) { + dim3 grid, block; + block.x = 256; + grid.x = (n + block.x - 1) / block.x; + + fill_vec_kernel<<>>(vec, n, val); + cudaCheckError() + ; + } } // diff --git a/cpp/nvgraph/cpp/src/convert.cu b/cpp/nvgraph/cpp/src/convert.cu index bb6c34146ee..3d1e0ad99e1 100644 --- a/cpp/nvgraph/cpp/src/convert.cu +++ b/cpp/nvgraph/cpp/src/convert.cu @@ -61,7 +61,7 @@ int *cscRowInd, int *cscColPtr, int *p, cusparseIndexBase_t idxBase){ - SHARED_PREFIX::shared_ptr pBuffer; + std::shared_ptr pBuffer; // Step 1: Allocate buffer size_t pBufferSizeInBytes = 0; @@ -79,8 +79,8 @@ void 
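// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] The cub_exclusive_sum_alloc() /
// exclusive_sum() helpers above follow CUB's two-phase convention: calling
// cub::DeviceScan::ExclusiveSum() with a null temp-storage pointer only
// reports the required scratch size, which this patch now allocates through
// RMM_ALLOC instead of cudaMalloc. A minimal, self-contained version of that
// pattern (RMM_FREE and the rmm header path are assumptions, not taken from
// this diff; as noted above, the scan is skipped when n <= 1):
#include <cub/cub.cuh>
#include <rmm/rmm.h>   // assumed header exposing RMM_ALLOC / RMM_FREE

void degrees_to_offsets(const int *d_degrees, int *d_offsets, int n,
                        cudaStream_t stream) {
  if (n <= 1) return;                              // DeviceScan fails if n == 1
  void  *d_temp     = nullptr;
  size_t temp_bytes = 0;
  // Phase 1: size query only; no scan runs while d_temp is null.
  cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_degrees, d_offsets, n, stream);
  RMM_ALLOC(&d_temp, temp_bytes, stream);          // pool-backed scratch buffer
  // Phase 2: the actual scan; d_offsets[i] = sum of d_degrees[0 .. i-1].
  cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_degrees, d_offsets, n, stream);
  RMM_FREE(d_temp, stream);                        // return scratch to the pool
}
// ---------------------------------------------------------------------------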
*dstVal, int *dstRowInd, int *dstColInd, cusparseIndexBase_t idxBase, cudaDataType_t *dataType){ size_t pBufferSizeInBytes = 0; - SHARED_PREFIX::shared_ptr pBuffer; - SHARED_PREFIX::shared_ptr P; // permutation array + std::shared_ptr pBuffer; + std::shared_ptr P; // permutation array // step 0: copy src to dst if(dstRowInd!=srcRowInd) @@ -103,8 +103,8 @@ void *dstVal, int *dstRowInd, int *dstColInd, cusparseIndexBase_t idxBase, cudaDataType_t *dataType){ size_t pBufferSizeInBytes = 0; - SHARED_PREFIX::shared_ptr pBuffer; - SHARED_PREFIX::shared_ptr P; // permutation array + std::shared_ptr pBuffer; + std::shared_ptr P; // permutation array // step 0: copy src to dst CHECK_CUDA( cudaMemcpy(dstRowInd, srcRowInd, nnz*sizeof(int), cudaMemcpyDefault) ); @@ -126,7 +126,7 @@ void *dstVal, int *dstRowInd, int *dstColPtr, cusparseIndexBase_t idxBase, cudaDataType_t *dataType){ // coos -> cood -> csc - SHARED_PREFIX::shared_ptr tmp = allocateDevice(nnz, NULL); + std::shared_ptr tmp = allocateDevice(nnz, NULL); cooSortByDestination(m, n, nnz, srcVal, srcRowInd, srcColInd, dstVal, dstRowInd, tmp.get(), idxBase, dataType); coo2csr(tmp.get(), nnz, m, dstColPtr, idxBase); } @@ -135,7 +135,7 @@ void *dstVal, int *dstRowPtr, int *dstColInd, cusparseIndexBase_t idxBase, cudaDataType_t *dataType){ // cood -> coos -> csr - SHARED_PREFIX::shared_ptr tmp = allocateDevice(nnz, NULL); + std::shared_ptr tmp = allocateDevice(nnz, NULL); cooSortBySource(m, n, nnz, srcVal, srcRowInd, srcColInd, dstVal, tmp.get(), dstColInd, idxBase, dataType); coo2csr(tmp.get(), nnz, m, dstRowPtr, idxBase); } diff --git a/cpp/nvgraph/cpp/src/nvgraph.cu b/cpp/nvgraph/cpp/src/nvgraph.cu index 38124c148e4..ee2131b9da1 100644 --- a/cpp/nvgraph/cpp/src/nvgraph.cu +++ b/cpp/nvgraph/cpp/src/nvgraph.cu @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include @@ -52,64 +52,57 @@ #include "2d_partitioning.h" #include "bfs2d.hxx" -static inline int check_context(const nvgraphHandle_t h) - { - int ret = 0; - if (h == NULL || !h->nvgraphIsInitialized) - ret = 1; - return ret; +static inline int check_context(const nvgraphHandle_t h) { + int ret = 0; + if (h == NULL || !h->nvgraphIsInitialized) + ret = 1; + return ret; } -static inline int check_graph(const nvgraphGraphDescr_t d) - { - int ret = 0; - if (d == NULL || d->graphStatus == IS_EMPTY) - ret = 1; - return ret; +static inline int check_graph(const nvgraphGraphDescr_t d) { + int ret = 0; + if (d == NULL || d->graphStatus == IS_EMPTY) + ret = 1; + return ret; } -static inline int check_topology(const nvgraphGraphDescr_t d) - { - int ret = 0; - if (d->graphStatus == IS_EMPTY) - ret = 1; - return ret; +static inline int check_topology(const nvgraphGraphDescr_t d) { + int ret = 0; + if (d->graphStatus == IS_EMPTY) + ret = 1; + return ret; } -static inline int check_int_size(size_t sz) - { - int ret = 0; - if (sz >= INT_MAX) - ret = 1; - return ret; +static inline int check_int_size(size_t sz) { + int ret = 0; + if (sz >= INT_MAX) + ret = 1; + return ret; } -static inline int check_int_ptr(const int* p) - { - int ret = 0; - if (!p) - ret = 1; - return ret; +static inline int check_int_ptr(const int* p) { + int ret = 0; + if (!p) + ret = 1; + return ret; } -static inline int check_uniform_type_array(const cudaDataType_t * t, size_t sz) - { - int ret = 0; - cudaDataType_t uniform_type = t[0]; - for (size_t i = 1; i < sz; i++) - { - if (t[i] != uniform_type) - ret = 1; - } - return ret; +static inline int check_uniform_type_array(const cudaDataType_t * t, size_t sz) { + 
int ret = 0; + cudaDataType_t uniform_type = t[0]; + for (size_t i = 1; i < sz; i++) + { + if (t[i] != uniform_type) + ret = 1; + } + return ret; } template -bool check_ptr(const T* p) - { - bool ret = false; - if (!p) - ret = true; - return ret; +bool check_ptr(const T* p) { + bool ret = false; + if (!p) + ret = true; + return ret; } namespace nvgraph @@ -120,3417 +113,3319 @@ namespace nvgraph //right now this header does not exist and including graph_concrete_visitors.hxx //doesn't compile because of the Thrust code; // - extern CsrGraph* extract_subgraph_by_vertices(CsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream); - extern MultiValuedCsrGraph* extract_subgraph_by_vertices(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream); - extern MultiValuedCsrGraph* extract_subgraph_by_vertices(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream); - - extern CsrGraph* extract_subgraph_by_edges(CsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream); - extern MultiValuedCsrGraph* extract_subgraph_by_edges(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream); - extern MultiValuedCsrGraph* extract_subgraph_by_edges(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream); + extern CsrGraph* extract_subgraph_by_vertices(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); + extern MultiValuedCsrGraph* extract_subgraph_by_vertices(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); + extern MultiValuedCsrGraph* extract_subgraph_by_vertices(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); + + extern CsrGraph* extract_subgraph_by_edges(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); + extern MultiValuedCsrGraph* extract_subgraph_by_edges(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); + extern MultiValuedCsrGraph* extract_subgraph_by_edges(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); #ifndef NVGRAPH_LIGHT - extern CsrGraph* contract_graph_csr_mul(CsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern CsrGraph* contract_graph_csr_sum(CsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern CsrGraph* contract_graph_csr_min(CsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern CsrGraph* contract_graph_csr_max(CsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern MultiValuedCsrGraph* contract_graph_mv_float_mul(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern MultiValuedCsrGraph* contract_graph_mv_float_sum(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern MultiValuedCsrGraph* contract_graph_mv_float_min(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern MultiValuedCsrGraph* 
contract_graph_mv_float_max(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern MultiValuedCsrGraph* contract_graph_mv_double_mul(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern MultiValuedCsrGraph* contract_graph_mv_double_sum(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern MultiValuedCsrGraph* contract_graph_mv_double_min(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern MultiValuedCsrGraph* contract_graph_mv_double_max(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); + extern CsrGraph* contract_graph_csr_mul(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern CsrGraph* contract_graph_csr_sum(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern CsrGraph* contract_graph_csr_min(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern CsrGraph* contract_graph_csr_max(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_float_mul(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_float_sum(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_float_min(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_float_max(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_double_mul(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_double_sum(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_double_min(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_double_max(MultiValuedCsrGraph& graph, 
+ int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); #endif - nvgraphStatus_t getCAPIStatusForError(NVGRAPH_ERROR err) - { - nvgraphStatus_t ret = NVGRAPH_STATUS_SUCCESS; - - switch (err) - { - case NVGRAPH_OK: - ret = NVGRAPH_STATUS_SUCCESS; - break; - case NVGRAPH_ERR_BAD_PARAMETERS: - ret = NVGRAPH_STATUS_INVALID_VALUE; - break; - case NVGRAPH_ERR_UNKNOWN: - ret = NVGRAPH_STATUS_INTERNAL_ERROR; - break; - case NVGRAPH_ERR_CUDA_FAILURE: - ret = NVGRAPH_STATUS_EXECUTION_FAILED; - break; - case NVGRAPH_ERR_THRUST_FAILURE: - ret = NVGRAPH_STATUS_EXECUTION_FAILED; - break; - case NVGRAPH_ERR_IO: - ret = NVGRAPH_STATUS_INTERNAL_ERROR; - break; - case NVGRAPH_ERR_NOT_IMPLEMENTED: - ret = NVGRAPH_STATUS_INVALID_VALUE; - break; - case NVGRAPH_ERR_NO_MEMORY: - ret = NVGRAPH_STATUS_ALLOC_FAILED; - break; - case NVGRAPH_ERR_NOT_CONVERGED: - ret = NVGRAPH_STATUS_NOT_CONVERGED; - break; - default: - ret = NVGRAPH_STATUS_INTERNAL_ERROR; - } - return ret; - } - - extern "C" { - const char* nvgraphStatusGetString(nvgraphStatus_t status) - { - switch (status) { - case NVGRAPH_STATUS_SUCCESS: - return "Success"; - case NVGRAPH_STATUS_NOT_INITIALIZED: - return "nvGRAPH not initialized"; - case NVGRAPH_STATUS_ALLOC_FAILED: - return "nvGRAPH alloc failed"; - case NVGRAPH_STATUS_INVALID_VALUE: - return "nvGRAPH invalid value"; - case NVGRAPH_STATUS_ARCH_MISMATCH: - return "nvGRAPH arch mismatch"; - case NVGRAPH_STATUS_MAPPING_ERROR: - return "nvGRAPH mapping error"; - case NVGRAPH_STATUS_EXECUTION_FAILED: - return "nvGRAPH execution failed"; - case NVGRAPH_STATUS_INTERNAL_ERROR: - return "nvGRAPH internal error"; - case NVGRAPH_STATUS_TYPE_NOT_SUPPORTED: - return "nvGRAPH type not supported"; - case NVGRAPH_STATUS_NOT_CONVERGED: - return "nvGRAPH algorithm failed to converge"; - case NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED: - return "nvGRAPH graph type not supported"; - default: - return "Unknown nvGRAPH Status"; - } - } - ; - } - - static nvgraphStatus_t nvgraphCreateMulti_impl(struct nvgraphContext **outCtx, - int numDevices, - int* _devices) { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - int device; - - CHECK_CUDA(cudaFree((void * )0)); - CHECK_CUDA(cudaGetDevice(&device)); - struct nvgraphContext *ctx = NULL; - ctx = (struct nvgraphContext *) malloc(sizeof(*ctx)); - if (!ctx) { - FatalError("Cannot allocate NVGRAPH context.", NVGRAPH_ERR_UNKNOWN); - } - - //cnmem - memset(&ctx->cnmem_device, 0, sizeof(ctx->cnmem_device)); // init all to 0 - ctx->cnmem_device.device = device; // cnmem runs on the device set by cudaSetDevice - - size_t init_alloc = 1; // Initial allocation tentative, it is currently 1 so this feature is basically disabeled. 
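// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] Caller-side counterpart of the
// status handling defined just above: every public entry point returns an
// nvgraphStatus_t, and nvgraphStatusGetString() renders it as text. The
// CHECK_NVGRAPH macro and the nvgraph.h include path are illustrative
// assumptions, not part of this diff.
#include <cstdio>
#include <cstdlib>
#include <nvgraph.h>

#define CHECK_NVGRAPH(call)                                                  \
  do {                                                                       \
    nvgraphStatus_t st_ = (call);                                            \
    if (st_ != NVGRAPH_STATUS_SUCCESS) {                                     \
      std::fprintf(stderr, "%s failed: %s\n", #call,                         \
                   nvgraphStatusGetString(st_));                             \
      std::exit(EXIT_FAILURE);                                               \
    }                                                                        \
  } while (0)
// Typical use: CHECK_NVGRAPH(nvgraphCreate(&handle));
// ---------------------------------------------------------------------------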
- - // Warning : Should uncomment that if using init_alloc > 1 - //size_t freeMem, totalMem; - //cudaMemGetInfo(&freeMem, &totalMem); - //if (freeMem < init_alloc) // Couldn't find enough memory to do the initial alloc - // init_alloc = 1; // (0 is used as default parameter in cnmem) - - ctx->cnmem_device.size = init_alloc; - cnmemDevice_t* devices = (cnmemDevice_t*) malloc(sizeof(cnmemDevice_t) * numDevices); - memset(devices, 0, sizeof(cnmemDevice_t) * numDevices); - for (int i = 0; i < numDevices; i++) { - devices[i].device = _devices[i]; - devices[i].size = 1; - } - cnmemStatus_t cm_status = cnmemInit(numDevices, devices, CNMEM_FLAGS_DEFAULT); - free(devices); - if (cm_status != CNMEM_STATUS_SUCCESS) - FatalError("Cannot initialize memory manager.", NVGRAPH_ERR_UNKNOWN); - - //Cublas and Cusparse - nvgraph::Cusparse::get_handle(); - nvgraph::Cublas::get_handle(); - - //others - ctx->stream = 0; - ctx->nvgraphIsInitialized = true; - - if (outCtx) { - *outCtx = ctx; - } - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - static nvgraphStatus_t nvgraphCreate_impl(struct nvgraphContext **outCtx) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - int device; - - CHECK_CUDA(cudaFree((void * )0)); - CHECK_CUDA(cudaGetDevice(&device)); - struct nvgraphContext *ctx = NULL; - ctx = (struct nvgraphContext *) malloc(sizeof(*ctx)); - if (!ctx) { - FatalError("Cannot allocate NVGRAPH context.", NVGRAPH_ERR_UNKNOWN); - } - - //cnmem - memset(&ctx->cnmem_device, 0, sizeof(ctx->cnmem_device)); // init all to 0 - ctx->cnmem_device.device = device; // cnmem runs on the device set by cudaSetDevice - - size_t init_alloc = 1; // Initial allocation tentative, it is currently 1 so this feature is basically disabeled. - - // Warning : Should uncomment that if using init_alloc > 1 - //size_t freeMem, totalMem; - //cudaMemGetInfo(&freeMem, &totalMem); - //if (freeMem < init_alloc) // Couldn't find enough memory to do the initial alloc - // init_alloc = 1; // (0 is used as default parameter in cnmem) - - ctx->cnmem_device.size = init_alloc; - - cnmemStatus_t cm_status = cnmemInit(1, &ctx->cnmem_device, CNMEM_FLAGS_DEFAULT); - if (cm_status != CNMEM_STATUS_SUCCESS) - FatalError("Cannot initialize memory manager.", NVGRAPH_ERR_UNKNOWN); - - //Cublas and Cusparse - nvgraph::Cusparse::get_handle(); - nvgraph::Cublas::get_handle(); - - //others - ctx->stream = 0; - ctx->nvgraphIsInitialized = true; - - if (outCtx) { - *outCtx = ctx; - } - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - static nvgraphStatus_t nvgraphDestroy_impl(nvgraphHandle_t handle) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle)) - FatalError("Cannot initialize memory manager.", NVGRAPH_ERR_NO_MEMORY); - - //Cublas and Cusparse - nvgraph::Cusparse::destroy_handle(); - nvgraph::Cublas::destroy_handle(); - //cnmem - -// compiler is complaining, cm_status is not used in release build -#ifdef DEBUG - cnmemStatus_t cm_status = cnmemFinalize(); - if( cm_status != CNMEM_STATUS_SUCCESS ) { - CERR() << "Warning: " << cnmemGetErrorString(cm_status) << std::endl; - } -#else - cnmemFinalize(); -#endif - //others - free(handle); - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - static nvgraphStatus_t nvgraphCreateGraphDescr_impl(nvgraphHandle_t handle, - struct nvgraphGraphDescr **outGraphDescr) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - struct nvgraphGraphDescr *descrG = 
NULL; - descrG = (struct nvgraphGraphDescr*) malloc(sizeof(*descrG)); - if (!descrG) - { - FatalError("Cannot allocate graph descriptor.", NVGRAPH_ERR_UNKNOWN); - } - descrG->graphStatus = IS_EMPTY; - if (outGraphDescr) - { - *outGraphDescr = descrG; - } - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - static nvgraphStatus_t nvgraphDestroyGraphDescr_impl(nvgraphHandle_t handle, - struct nvgraphGraphDescr *descrG) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG) - { - if (descrG->TT == NVGRAPH_2D_32I_32I) { - switch (descrG->T) { - case CUDA_R_32I: { - nvgraph::Matrix2d* m = - static_cast*>(descrG->graph_handle); - delete m; - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - else { - switch (descrG->graphStatus) { - case IS_EMPTY: { - break; - } - case HAS_TOPOLOGY: { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - delete CSRG; - break; - } - case HAS_VALUES: { - if (descrG->T == CUDA_R_32F) { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - delete MCSRG; - } - else if (descrG->T == CUDA_R_64F) { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - delete MCSRG; - } - else if (descrG->T == CUDA_R_32I) { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - delete MCSRG; - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - break; - } - default: - return NVGRAPH_STATUS_INVALID_VALUE; - } - } - free(descrG); - } - else - return NVGRAPH_STATUS_INVALID_VALUE; - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphSetStream_impl(nvgraphHandle_t handle, cudaStream_t stream) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - //CnMem - cnmemStatus_t cm_status = cnmemRegisterStream(stream); - if (cm_status != CNMEM_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - // nvgraph handle - handle->stream = stream; - //Cublas and Cusparse - nvgraph::Cublas::setStream(stream); - nvgraph::Cusparse::setStream(stream); - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphSetGraphStructure_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t TT) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (descrG->graphStatus != IS_EMPTY) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (check_ptr(topologyData)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (TT == NVGRAPH_CSR_32 || TT == NVGRAPH_CSC_32) - { - int v = 0, e = 0, *neighborhood = NULL, *edgedest = NULL; - switch (TT) - { - case NVGRAPH_CSR_32: - { - nvgraphCSRTopology32I_t t = static_cast(topologyData); - if (!t->nvertices || !t->nedges || check_ptr(t->source_offsets) - || check_ptr(t->destination_indices)) - return NVGRAPH_STATUS_INVALID_VALUE; - v = t->nvertices; - e = t->nedges; - neighborhood = t->source_offsets; - edgedest = t->destination_indices; - break; - } - case NVGRAPH_CSC_32: - { - nvgraphCSCTopology32I_t t = static_cast(topologyData); - if (!t->nvertices || !t->nedges || check_ptr(t->destination_offsets) - || check_ptr(t->source_indices)) - return NVGRAPH_STATUS_INVALID_VALUE; - 
v = t->nvertices; - e = t->nedges; - neighborhood = t->destination_offsets; - edgedest = t->source_indices; - break; - } - default: - return NVGRAPH_STATUS_INVALID_VALUE; - } - - descrG->TT = TT; - - // Create the internal CSR representation - nvgraph::CsrGraph * CSRG = new nvgraph::CsrGraph(v, e, handle->stream); - - CHECK_CUDA(cudaMemcpy(CSRG->get_raw_row_offsets(), - neighborhood, - (size_t )((CSRG->get_num_vertices() + 1) * sizeof(int)), - cudaMemcpyDefault)); - - CHECK_CUDA(cudaMemcpy(CSRG->get_raw_column_indices(), - edgedest, - (size_t )((CSRG->get_num_edges()) * sizeof(int)), - cudaMemcpyDefault)); - - // Set the graph handle - descrG->graph_handle = CSRG; - descrG->graphStatus = HAS_TOPOLOGY; - } - else if (TT == NVGRAPH_2D_32I_32I) { - nvgraph2dCOOTopology32I_t td = static_cast(topologyData); - switch (td->valueType) { - case CUDA_R_32I: { - if (!td->nvertices || !td->nedges || !td->source_indices - || !td->destination_indices || !td->numDevices || !td->devices - || !td->blockN) - return NVGRAPH_STATUS_INVALID_VALUE; - descrG->TT = TT; - descrG->graphStatus = HAS_TOPOLOGY; - if (td->values) - descrG->graphStatus = HAS_VALUES; - descrG->T = td->valueType; - std::vector devices; - for (int32_t i = 0; i < td->numDevices; i++) - devices.push_back(td->devices[i]); - nvgraph::MatrixDecompositionDescription description( td->nvertices, - td->blockN, - td->nedges, - devices); - nvgraph::Matrix2d* m = new nvgraph::Matrix2d(); - *m = nvgraph::COOto2d(description, - td->source_indices, - td->destination_indices, - (int32_t*) td->values); - descrG->graph_handle = m; - break; - } - default: { - return NVGRAPH_STATUS_INVALID_VALUE; - } - } - } - else - { - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - - } - - nvgraphStatus_t NVGRAPH_API nvgraphAttachGraphStructure_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t TT) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (descrG->graphStatus != IS_EMPTY) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (check_ptr(topologyData)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (TT == NVGRAPH_CSR_32 || TT == NVGRAPH_CSC_32) - { - int v = 0, e = 0, *neighborhood = NULL, *edgedest = NULL; - switch (TT) - { - case NVGRAPH_CSR_32: - { - nvgraphCSRTopology32I_t t = static_cast(topologyData); - if (!t->nvertices || !t->nedges || check_ptr(t->source_offsets) - || check_ptr(t->destination_indices)) - return NVGRAPH_STATUS_INVALID_VALUE; - v = t->nvertices; - e = t->nedges; - neighborhood = t->source_offsets; - edgedest = t->destination_indices; - break; - } - case NVGRAPH_CSC_32: - { - nvgraphCSCTopology32I_t t = static_cast(topologyData); - if (!t->nvertices || !t->nedges || check_ptr(t->destination_offsets) - || check_ptr(t->source_indices)) - return NVGRAPH_STATUS_INVALID_VALUE; - v = t->nvertices; - e = t->nedges; - neighborhood = t->destination_offsets; - edgedest = t->source_indices; - break; - } - default: - return NVGRAPH_STATUS_INVALID_VALUE; - } - - descrG->TT = TT; - - // Create the internal CSR representation - nvgraph::CsrGraph * CSRG = new nvgraph::CsrGraph(v, e, handle->stream); - - CSRG->set_raw_row_offsets(neighborhood); - CSRG->set_raw_column_indices(edgedest); - - // Set the graph handle - descrG->graph_handle = CSRG; - descrG->graphStatus = HAS_TOPOLOGY; - } - else if (TT == 
NVGRAPH_2D_32I_32I) { - nvgraph2dCOOTopology32I_t td = static_cast(topologyData); - switch (td->valueType) { - case CUDA_R_32I: { - if (!td->nvertices || !td->nedges || !td->source_indices - || !td->destination_indices || !td->numDevices || !td->devices - || !td->blockN) - return NVGRAPH_STATUS_INVALID_VALUE; - descrG->TT = TT; - descrG->graphStatus = HAS_TOPOLOGY; - if (td->values) - descrG->graphStatus = HAS_VALUES; - descrG->T = td->valueType; - std::vector devices; - for (int32_t i = 0; i < td->numDevices; i++) - devices.push_back(td->devices[i]); - nvgraph::MatrixDecompositionDescription description( td->nvertices, - td->blockN, - td->nedges, - devices); - nvgraph::Matrix2d* m = new nvgraph::Matrix2d(); - *m = nvgraph::COOto2d(description, - td->source_indices, - td->destination_indices, - (int32_t*) td->values); - descrG->graph_handle = m; - break; - } - default: { - return NVGRAPH_STATUS_INVALID_VALUE; - } - } - } - else - { - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - - } - - nvgraphStatus_t NVGRAPH_API nvgraphGetGraphStructure_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t* TT) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_topology(descrG)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - nvgraphTopologyType_t graphTType = descrG->TT; - - if (TT != NULL) - *TT = graphTType; - - if (topologyData != NULL) { - nvgraph::CsrGraph *CSRG = - static_cast *>(descrG->graph_handle); - int v = static_cast(CSRG->get_num_vertices()); - int e = static_cast(CSRG->get_num_edges()); - int *neighborhood = NULL, *edgedest = NULL; - - switch (graphTType) - { - case NVGRAPH_CSR_32: - { - nvgraphCSRTopology32I_t t = static_cast(topologyData); - t->nvertices = static_cast(v); - t->nedges = static_cast(e); - neighborhood = t->source_offsets; - edgedest = t->destination_indices; - break; - } - case NVGRAPH_CSC_32: - { - nvgraphCSCTopology32I_t t = static_cast(topologyData); - t->nvertices = static_cast(v); - t->nedges = static_cast(e); - neighborhood = t->destination_offsets; - edgedest = t->source_indices; - break; - } - default: - return NVGRAPH_STATUS_INTERNAL_ERROR; - } - - if (neighborhood != NULL) { - CHECK_CUDA(cudaMemcpy(neighborhood, - CSRG->get_raw_row_offsets(), - (size_t )((v + 1) * sizeof(int)), - cudaMemcpyDefault)); - } - - if (edgedest != NULL) { - CHECK_CUDA(cudaMemcpy(edgedest, - CSRG->get_raw_column_indices(), - (size_t )((e) * sizeof(int)), - cudaMemcpyDefault)); - } - - } - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphAllocateVertexData_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t numsets, - cudaDataType_t *settypes) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(numsets) - || check_ptr(settypes)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (check_uniform_type_array(settypes, numsets)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first - { - if (*settypes == CUDA_R_32F) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< - int, float>(*CSRG); - descrG->graph_handle = MCSRG; - } - else if 
(*settypes == CUDA_R_64F) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< - int, double>(*CSRG); - descrG->graph_handle = MCSRG; - } - else if (*settypes == CUDA_R_32I) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); - descrG->graph_handle = MCSRG; - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - descrG->T = *settypes; - descrG->graphStatus = HAS_VALUES; - } - else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type - { - if (*settypes != descrG->T) - return NVGRAPH_STATUS_INVALID_VALUE; - } - else - return NVGRAPH_STATUS_INVALID_VALUE; - - // Allocate and transfer - if (*settypes == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->allocateVertexData(numsets, NULL); - } - else if (*settypes == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->allocateVertexData(numsets, NULL); - } - else if (*settypes == CUDA_R_32I) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->allocateVertexData(numsets, NULL); - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphAttachVertexData_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t setnum, - cudaDataType_t settype, - void *vertexData) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(setnum)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first - { - if (settype == CUDA_R_32F) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< - int, float>(*CSRG); - descrG->graph_handle = MCSRG; - } - else if (settype == CUDA_R_64F) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< - int, double>(*CSRG); - descrG->graph_handle = MCSRG; - } - else if (settype == CUDA_R_32I) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); - descrG->graph_handle = MCSRG; - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - descrG->T = settype; - descrG->graphStatus = HAS_VALUES; - } - else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type - { - if (settype != descrG->T) - return NVGRAPH_STATUS_INVALID_VALUE; - } - else - return NVGRAPH_STATUS_INVALID_VALUE; - - // transfer - if (settype == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->attachVertexData(setnum, (float*)vertexData, NULL); - } - else if (settype == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->attachVertexData(setnum, (double*)vertexData, NULL); - } - else if (settype == CUDA_R_32I) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->attachVertexData(setnum, (int*)vertexData, NULL); - } - else - return 
NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - nvgraphStatus_t NVGRAPH_API nvgraphAllocateEdgeData_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t numsets, - cudaDataType_t *settypes) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(numsets) - || check_ptr(settypes)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (check_uniform_type_array(settypes, numsets)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - // Look at what kind of graph we have - if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first - { - if (*settypes == CUDA_R_32F) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< - int, float>(*CSRG); - descrG->graph_handle = MCSRG; - } - else if (*settypes == CUDA_R_64F) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< - int, double>(*CSRG); - descrG->graph_handle = MCSRG; - } - else if (*settypes == CUDA_R_32I) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); - descrG->graph_handle = MCSRG; - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - descrG->T = *settypes; - descrG->graphStatus = HAS_VALUES; - } - else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type - { - if (*settypes != descrG->T) - return NVGRAPH_STATUS_INVALID_VALUE; - } - else - return NVGRAPH_STATUS_INVALID_VALUE; - - // Allocate and transfer - if (*settypes == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->allocateEdgeData(numsets, NULL); - } - else if (*settypes == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->allocateEdgeData(numsets, NULL); - } - else if (*settypes == CUDA_R_32I) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->allocateEdgeData(numsets, NULL); - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphAttachEdgeData_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t setnum, - cudaDataType_t settype, - void *edgeData) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(setnum)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - // Look at what kind of graph we have - if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first - { - if (settype == CUDA_R_32F) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< - int, float>(*CSRG); - descrG->graph_handle = MCSRG; - } - else if (settype == CUDA_R_64F) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< - int, double>(*CSRG); - descrG->graph_handle = MCSRG; - } - else if (settype == CUDA_R_32I) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = 
new nvgraph::MultiValuedCsrGraph(*CSRG); - descrG->graph_handle = MCSRG; - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - descrG->T = settype; - descrG->graphStatus = HAS_VALUES; - } - else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type - { - if (settype != descrG->T) - return NVGRAPH_STATUS_INVALID_VALUE; - } - else - return NVGRAPH_STATUS_INVALID_VALUE; - - // Allocate and transfer - if (settype == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->attachEdgeData(setnum, (float*)edgeData, NULL); - } - else if (settype == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->attachEdgeData(setnum, (double*)edgeData, NULL); - } - else if (settype == CUDA_R_32I) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->attachEdgeData(setnum, (int*)edgeData, NULL); - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphSetVertexData_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *vertexData, - size_t setnum) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) - || check_ptr(vertexData)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - FatalError("Graph should have allocated values.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->T == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy(MCSRG->get_raw_vertex_dim(setnum), - (float*) vertexData, - (size_t) ((MCSRG->get_num_vertices()) * sizeof(float)), - cudaMemcpyDefault); - } - else if (descrG->T == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy(MCSRG->get_raw_vertex_dim(setnum), - (double*) vertexData, - (size_t) ((MCSRG->get_num_vertices()) * sizeof(double)), - cudaMemcpyDefault); - } - else if (descrG->T == CUDA_R_32I) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy(MCSRG->get_raw_vertex_dim(setnum), - (int*) vertexData, - (size_t) ((MCSRG->get_num_vertices()) * sizeof(int)), - cudaMemcpyDefault); - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - - cudaCheckError() - ; - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphGetVertexData_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *vertexData, - size_t setnum) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) - || check_ptr(vertexData)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - FatalError("Graph should have values.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->T == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= 
MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy((float*) vertexData, - MCSRG->get_raw_vertex_dim(setnum), - (size_t) ((MCSRG->get_num_vertices()) * sizeof(float)), - cudaMemcpyDefault); - } - else if (descrG->T == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy((double*) vertexData, - MCSRG->get_raw_vertex_dim(setnum), - (size_t) ((MCSRG->get_num_vertices()) * sizeof(double)), - cudaMemcpyDefault); - } - else if (descrG->T == CUDA_R_32I) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy((int*) vertexData, - MCSRG->get_raw_vertex_dim(setnum), - (size_t) ((MCSRG->get_num_vertices()) * sizeof(int)), - cudaMemcpyDefault); - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - - cudaCheckError() - ; - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphConvertTopology_impl(nvgraphHandle_t handle, - nvgraphTopologyType_t srcTType, - void *srcTopology, - void *srcEdgeData, - cudaDataType_t *dataType, - nvgraphTopologyType_t dstTType, - void *dstTopology, - void *dstEdgeData) - { - - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_ptr(dstEdgeData) || check_ptr(srcEdgeData)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - size_t sizeT; - if (*dataType == CUDA_R_32F) - sizeT = sizeof(float); - else if (*dataType == CUDA_R_64F) - sizeT = sizeof(double); - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - - // Trust me, this better than nested if's. 
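// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] The dispatch that follows pairs a
// cuSPARSE structure conversion (csr2csc, csr2coo, coo2csr, the COO sorts)
// with a matching shuffle of the edge weights. Seen from the caller, and
// assuming the public nvgraphConvertTopology() mirrors the _impl signature
// here, converting CSR to CSC looks like the sketch below; every d_* pointer
// is an illustrative, caller-owned device buffer of the stated size.
void csr_to_csc_example(nvgraphHandle_t handle, int n, int nnz,
                        int *d_row_offsets, int *d_col_indices, float *d_weights,
                        int *d_csc_offsets, int *d_csc_indices, float *d_csc_weights) {
  nvgraphCSRTopology32I_st src;
  src.nvertices           = n;
  src.nedges              = nnz;
  src.source_offsets      = d_row_offsets;    // n + 1 ints, device memory
  src.destination_indices = d_col_indices;    // nnz ints, device memory

  nvgraphCSCTopology32I_st dst;               // outputs, pre-allocated by caller
  dst.destination_offsets = d_csc_offsets;    // n + 1 ints
  dst.source_indices      = d_csc_indices;    // nnz ints

  cudaDataType_t t = CUDA_R_32F;              // edge weights are float
  nvgraphConvertTopology(handle, NVGRAPH_CSR_32, &src, d_weights,
                         &t, NVGRAPH_CSC_32, &dst, d_csc_weights);
}
// ---------------------------------------------------------------------------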
- if (srcTType == NVGRAPH_CSR_32 && dstTType == NVGRAPH_CSR_32) { // CSR2CSR - nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); - nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - CHECK_CUDA(cudaMemcpy(dstT->source_offsets, - srcT->source_offsets, - (srcT->nvertices + 1) * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstT->destination_indices, - srcT->destination_indices, - srcT->nedges * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstEdgeData, - srcEdgeData, - srcT->nedges * sizeT, - cudaMemcpyDefault)); - } else if (srcTType == NVGRAPH_CSR_32 && dstTType == NVGRAPH_CSC_32) { // CSR2CSC - nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); - nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - csr2csc(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_offsets, srcT->destination_indices, - dstEdgeData, - dstT->source_indices, dstT->destination_offsets, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, dataType); - } else if (srcTType == NVGRAPH_CSR_32 && dstTType == NVGRAPH_COO_32) { // CSR2COO - nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); - nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - if (dstT->tag == NVGRAPH_SORTED_BY_SOURCE || dstT->tag == NVGRAPH_DEFAULT - || dstT->tag == NVGRAPH_UNSORTED) { - csr2coo(srcT->source_offsets, - srcT->nedges, - srcT->nvertices, - dstT->source_indices, - CUSPARSE_INDEX_BASE_ZERO); - CHECK_CUDA(cudaMemcpy(dstT->destination_indices, - srcT->destination_indices, - srcT->nedges * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstEdgeData, - srcEdgeData, - srcT->nedges * sizeT, - cudaMemcpyDefault)); - } else if (dstT->tag == NVGRAPH_SORTED_BY_DESTINATION) { - // Step 1: Convert to COO_Source - csr2coo(srcT->source_offsets, - srcT->nedges, - srcT->nvertices, - dstT->source_indices, - CUSPARSE_INDEX_BASE_ZERO); - // Step 2: Convert to COO_Destination - cooSortByDestination(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - dstT->source_indices, srcT->destination_indices, - dstEdgeData, - dstT->source_indices, dstT->destination_indices, - CUSPARSE_INDEX_BASE_ZERO, - dataType); - } else { - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////// - } else if (srcTType == NVGRAPH_CSC_32 && dstTType == NVGRAPH_CSR_32) { // CSC2CSR - nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); - nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - csc2csr(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_indices, srcT->destination_offsets, - dstEdgeData, - dstT->source_offsets, dstT->destination_indices, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, dataType); - } else if (srcTType == NVGRAPH_CSC_32 && dstTType == NVGRAPH_CSC_32) { // CSC2CSC - nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); - nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - CHECK_CUDA(cudaMemcpy(dstT->destination_offsets, - srcT->destination_offsets, - (srcT->nvertices + 1) * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstT->source_indices, - 
srcT->source_indices, - srcT->nedges * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstEdgeData, - srcEdgeData, - srcT->nedges * sizeT, - cudaMemcpyDefault)); - } else if (srcTType == NVGRAPH_CSC_32 && dstTType == NVGRAPH_COO_32) { // CSC2COO - nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); - nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - if (dstT->tag == NVGRAPH_SORTED_BY_SOURCE) { - // Step 1: Convert to COO_Destination - csr2coo(srcT->destination_offsets, - srcT->nedges, - srcT->nvertices, - dstT->destination_indices, - CUSPARSE_INDEX_BASE_ZERO); - // Step 2: Convert to COO_Source - cooSortBySource(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_indices, dstT->destination_indices, - dstEdgeData, - dstT->source_indices, dstT->destination_indices, - CUSPARSE_INDEX_BASE_ZERO, - dataType); - } else if (dstT->tag == NVGRAPH_SORTED_BY_DESTINATION || dstT->tag == NVGRAPH_DEFAULT - || dstT->tag == NVGRAPH_UNSORTED) { - csr2coo(srcT->destination_offsets, - srcT->nedges, - srcT->nvertices, - dstT->destination_indices, - CUSPARSE_INDEX_BASE_ZERO); - CHECK_CUDA(cudaMemcpy(dstT->source_indices, - srcT->source_indices, - srcT->nedges * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstEdgeData, - srcEdgeData, - srcT->nedges * sizeT, - cudaMemcpyDefault)); - } else { - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////// - } else if (srcTType == NVGRAPH_COO_32 && dstTType == NVGRAPH_CSR_32) { // COO2CSR - nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); - nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - if (srcT->tag == NVGRAPH_SORTED_BY_SOURCE) { - coo2csr(srcT->source_indices, - srcT->nedges, - srcT->nvertices, - dstT->source_offsets, - CUSPARSE_INDEX_BASE_ZERO); - CHECK_CUDA(cudaMemcpy(dstT->destination_indices, - srcT->destination_indices, - srcT->nedges * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstEdgeData, - srcEdgeData, - srcT->nedges * sizeT, - cudaMemcpyDefault)); - } else if (srcT->tag == NVGRAPH_SORTED_BY_DESTINATION) { - cood2csr(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_indices, srcT->destination_indices, - dstEdgeData, - dstT->source_offsets, dstT->destination_indices, - CUSPARSE_INDEX_BASE_ZERO, - dataType); - } else if (srcT->tag == NVGRAPH_DEFAULT || srcT->tag == NVGRAPH_UNSORTED) { - coou2csr(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_indices, srcT->destination_indices, - dstEdgeData, - dstT->source_offsets, dstT->destination_indices, - CUSPARSE_INDEX_BASE_ZERO, - dataType); - } else { - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } else if (srcTType == NVGRAPH_COO_32 && dstTType == NVGRAPH_CSC_32) { // COO2CSC - nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); - nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - if (srcT->tag == NVGRAPH_SORTED_BY_SOURCE) { - coos2csc(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_indices, srcT->destination_indices, - dstEdgeData, - dstT->source_indices, dstT->destination_offsets, - CUSPARSE_INDEX_BASE_ZERO, - dataType); - } else if (srcT->tag == NVGRAPH_SORTED_BY_DESTINATION) { - 
coo2csr(srcT->destination_indices, - srcT->nedges, - srcT->nvertices, - dstT->destination_offsets, - CUSPARSE_INDEX_BASE_ZERO); - CHECK_CUDA(cudaMemcpy(dstT->source_indices, - srcT->source_indices, - srcT->nedges * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstEdgeData, - srcEdgeData, - srcT->nedges * sizeT, - cudaMemcpyDefault)); - } else if (srcT->tag == NVGRAPH_DEFAULT || srcT->tag == NVGRAPH_UNSORTED) { - coou2csc(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_indices, srcT->destination_indices, - dstEdgeData, - dstT->source_indices, dstT->destination_offsets, - CUSPARSE_INDEX_BASE_ZERO, - dataType); - } else { - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } else if (srcTType == NVGRAPH_COO_32 && dstTType == NVGRAPH_COO_32) { // COO2COO - nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); - nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - if (srcT->tag == dstT->tag || dstT->tag == NVGRAPH_DEFAULT - || dstT->tag == NVGRAPH_UNSORTED) { - CHECK_CUDA(cudaMemcpy(dstT->source_indices, - srcT->source_indices, - srcT->nedges * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstT->destination_indices, - srcT->destination_indices, - srcT->nedges * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstEdgeData, - srcEdgeData, - srcT->nedges * sizeT, - cudaMemcpyDefault)); - } else if (dstT->tag == NVGRAPH_SORTED_BY_SOURCE) { - cooSortBySource(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_indices, srcT->destination_indices, - dstEdgeData, - dstT->source_indices, dstT->destination_indices, - CUSPARSE_INDEX_BASE_ZERO, - dataType); - } else if (dstT->tag == NVGRAPH_SORTED_BY_DESTINATION) { - cooSortByDestination(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_indices, srcT->destination_indices, - dstEdgeData, - dstT->source_indices, dstT->destination_indices, - CUSPARSE_INDEX_BASE_ZERO, - dataType); - } else { - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - - /////////////////////////////////////////////////////////////////////////////////////////////////////////// - } else { - return NVGRAPH_STATUS_INVALID_VALUE; - } - - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphConvertGraph_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t srcDescrG, - nvgraphGraphDescr_t dstDescrG, - nvgraphTopologyType_t dstTType) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - nvgraphStatus_t status = NVGRAPH_STATUS_SUCCESS; - try - { - if (check_context(handle) || check_graph(srcDescrG)) // Graph must have a topology - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (dstDescrG->graphStatus != IS_EMPTY) // dst Graph must be empty - return NVGRAPH_STATUS_INVALID_VALUE; - - // graphs can only have CSR or CSC topology (EL is for storage only) - if (srcDescrG->TT != NVGRAPH_CSR_32 && srcDescrG->TT != NVGRAPH_CSC_32) - return NVGRAPH_STATUS_INTERNAL_ERROR; // invalid state, you can only create graph with CSR/CSC - if (dstTType != NVGRAPH_CSR_32 && dstTType != NVGRAPH_CSC_32) - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; // only conversion to CSR/CSC is allowed - - int nvertices, nedges; - int *srcOffsets = NULL, *srcIndices = NULL, *dstOffsets = NULL, *dstIndices = NULL; - SHARED_PREFIX::shared_ptr permutation, offsets, indices; - - // Step 1: get source graph structure - nvgraph::CsrGraph *CSRG = - static_cast *>(srcDescrG->graph_handle); - 
nvertices = static_cast(CSRG->get_num_vertices()); - nedges = static_cast(CSRG->get_num_edges()); - srcOffsets = CSRG->get_raw_row_offsets(); - srcIndices = CSRG->get_raw_column_indices(); - - // Step 2: convert topology and get permutation array. - if (srcDescrG->TT != dstTType) { // Otherwise conversion is not needed, only copy. - offsets = allocateDevice(nvertices + 1, NULL); - indices = allocateDevice(nedges, NULL); - permutation = allocateDevice(nedges, NULL); - csr2cscP(nvertices, nvertices, nedges, - srcOffsets, - srcIndices, - indices.get(), - offsets.get(), permutation.get(), CUSPARSE_INDEX_BASE_ZERO); - dstOffsets = offsets.get(); - dstIndices = indices.get(); - } else { - dstOffsets = srcOffsets; - dstIndices = srcIndices; - } - - // Step 3: Set dst graph structure - if (dstTType == NVGRAPH_CSR_32) { - nvgraphCSRTopology32I_st dstTopology; - dstTopology.nedges = nedges; - dstTopology.nvertices = nvertices; - dstTopology.source_offsets = dstOffsets; - dstTopology.destination_indices = dstIndices; - status = nvgraphSetGraphStructure(handle, dstDescrG, &dstTopology, dstTType); - } else if (dstTType == NVGRAPH_CSC_32) { - nvgraphCSCTopology32I_st dstTopology; - dstTopology.nedges = nedges; - dstTopology.nvertices = nvertices; - dstTopology.destination_offsets = dstOffsets; - dstTopology.source_indices = dstIndices; - status = nvgraphSetGraphStructure(handle, dstDescrG, &dstTopology, dstTType); - } else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - offsets.reset(); - indices.reset(); - - // Step 4: Allocate, convert and set edge+vertex data on the new graph - if (srcDescrG->graphStatus == HAS_VALUES) { - if (srcDescrG->T == CUDA_R_32F) { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(srcDescrG->graph_handle); - size_t vertexDim = MCSRG->get_num_vertex_dim(); - size_t edgesDim = MCSRG->get_num_edge_dim(); - // Step 4.1: allocate and set vertex data (no need for convert) - if (vertexDim > 0) { - std::vector vertexDataType(vertexDim); - std::fill(vertexDataType.begin(), vertexDataType.end(), srcDescrG->T); - status = nvgraphAllocateVertexData(handle, - dstDescrG, - vertexDim, - vertexDataType.data()); - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - for (size_t i = 0; i < vertexDim; ++i) { - void *vertexData = MCSRG->get_raw_vertex_dim(i); - status = nvgraphSetVertexData(handle, dstDescrG, vertexData, i); - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - } - } - // Step 4.2: allocate and set vertex data - if (edgesDim > 0) { - void *dstEdgeData = NULL; - SHARED_PREFIX::shared_ptr dstEdgeDataSP; - - std::vector edgeDataType(edgesDim); - std::fill(edgeDataType.begin(), edgeDataType.end(), srcDescrG->T); - status = nvgraphAllocateEdgeData(handle, - dstDescrG, - edgesDim, - edgeDataType.data()); - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - // allocate edge data memory (if there is a need) - if (edgesDim > 0 && srcDescrG->TT != dstTType) { - dstEdgeDataSP = allocateDevice(nedges, NULL); - dstEdgeData = dstEdgeDataSP.get(); - } - // Convert and set edge data (using permutation array) - for (size_t i = 0; i < edgesDim; ++i) { - void *srcEdgeData = (void*) (MCSRG->get_raw_edge_dim((int) i)); - if (srcDescrG->TT != dstTType) // Convert using permutation array - gthrX(nedges, - srcEdgeData, - dstEdgeData, - permutation.get(), - CUSPARSE_INDEX_BASE_ZERO, - &(srcDescrG->T)); - else - dstEdgeData = srcEdgeData; 
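// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] When the topology actually changes
// (CSR to CSC here), csr2cscP() above also emits a permutation array, and
// gthrX() then reorders each edge-value set so the weights follow their edges
// into the new ordering; when source and destination topologies match, the
// values are passed through untouched. Conceptually the gather performs
// dst[i] = src[perm[i]], as in this stand-alone CUDA kernel:
template <typename ValueT, typename IndexT>
__global__ void gather_edge_values(const ValueT *src, ValueT *dst,
                                   const IndexT *perm, IndexT nnz) {
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz;
       i += gridDim.x * blockDim.x)
    dst[i] = src[perm[i]];   // pull each weight from its old position
}
// ---------------------------------------------------------------------------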
- // set edgedata - status = nvgraphSetEdgeData(handle, dstDescrG, dstEdgeData, i); - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - } - } - } else if (srcDescrG->T == CUDA_R_64F) { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(srcDescrG->graph_handle); - size_t vertexDim = MCSRG->get_num_vertex_dim(); - size_t edgesDim = MCSRG->get_num_edge_dim(); - // Step 4.1: allocate and set vertex data (no need for convert) - if (vertexDim > 0) { - std::vector vertexDataType(vertexDim); - std::fill(vertexDataType.begin(), vertexDataType.end(), srcDescrG->T); - status = nvgraphAllocateVertexData(handle, - dstDescrG, - vertexDim, - vertexDataType.data()); - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - for (size_t i = 0; i < vertexDim; ++i) { - void *vertexData = MCSRG->get_raw_vertex_dim(i); - status = nvgraphSetVertexData(handle, dstDescrG, vertexData, i); - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - } - } - // Step 4.2: allocate and set vertex data - if (edgesDim > 0) { - void *dstEdgeData = NULL; - SHARED_PREFIX::shared_ptr dstEdgeDataSP; - - std::vector edgeDataType(edgesDim); - std::fill(edgeDataType.begin(), edgeDataType.end(), srcDescrG->T); - status = nvgraphAllocateEdgeData(handle, - dstDescrG, - edgesDim, - edgeDataType.data()); - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - // allocate edge data memory (if there is a need) - if (edgesDim > 0 && srcDescrG->TT != dstTType) { - dstEdgeDataSP = allocateDevice(nedges, NULL); - dstEdgeData = dstEdgeDataSP.get(); - } - // Convert and set edge data (using permutation array) - for (size_t i = 0; i < edgesDim; ++i) { - void *srcEdgeData = (void*) (MCSRG->get_raw_edge_dim((int) i)); - if (srcDescrG->TT != dstTType) // Convert using permutation array - gthrX(nedges, - srcEdgeData, - dstEdgeData, - permutation.get(), - CUSPARSE_INDEX_BASE_ZERO, - &(srcDescrG->T)); - else - dstEdgeData = srcEdgeData; - // set edgedata - status = nvgraphSetEdgeData(handle, dstDescrG, dstEdgeData, i); - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - } - } - } else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - - } - - nvgraphStatus_t NVGRAPH_API nvgraphSetEdgeData_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *edgeData, - size_t setnum) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) - || check_ptr(edgeData)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->T == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy(MCSRG->get_raw_edge_dim(setnum), - (float*) edgeData, - (size_t) ((MCSRG->get_num_edges()) * sizeof(float)), - cudaMemcpyDefault); - } - else if (descrG->T == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy(MCSRG->get_raw_edge_dim(setnum), - (double*) edgeData, - (size_t) ((MCSRG->get_num_edges()) * sizeof(double)), - cudaMemcpyDefault); - } - else if (descrG->T == 
CUDA_R_32I) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy(MCSRG->get_raw_edge_dim(setnum), - (int*) edgeData, - (size_t) ((MCSRG->get_num_edges()) * sizeof(int)), - cudaMemcpyDefault); - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - - cudaCheckError() - ; - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphGetEdgeData_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *edgeData, - size_t setnum) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) - || check_ptr(edgeData)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->T == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy((float*) edgeData, - MCSRG->get_raw_edge_dim(setnum), - (size_t) ((MCSRG->get_num_edges()) * sizeof(float)), - cudaMemcpyDefault); - } - else if (descrG->T == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy((double*) edgeData, - MCSRG->get_raw_edge_dim(setnum), - (size_t) ((MCSRG->get_num_edges()) * sizeof(double)), - cudaMemcpyDefault); - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - - cudaCheckError() - ; - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphSrSpmv_impl_cub(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const void *alpha, - const size_t x, - const void *beta, - const size_t y, - const nvgraphSemiring_t SR) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - - try - { - // some basic checks - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - rc = SemiringAPILauncher(handle, descrG, weight_index, alpha, x, beta, y, SR); - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphSssp_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int *source_vert, - const size_t sssp) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) - || check_int_ptr(source_vert)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->TT != NVGRAPH_CSC_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; + nvgraphStatus_t getCAPIStatusForError(NVGRAPH_ERROR err) { + nvgraphStatus_t ret = NVGRAPH_STATUS_SUCCESS; + + switch (err) { + case NVGRAPH_OK: + ret = NVGRAPH_STATUS_SUCCESS; + break; + case NVGRAPH_ERR_BAD_PARAMETERS: + ret = NVGRAPH_STATUS_INVALID_VALUE; + break; + case NVGRAPH_ERR_UNKNOWN: + ret = NVGRAPH_STATUS_INTERNAL_ERROR; + break; + case NVGRAPH_ERR_CUDA_FAILURE: + ret = NVGRAPH_STATUS_EXECUTION_FAILED; + break; + case NVGRAPH_ERR_THRUST_FAILURE: + ret = NVGRAPH_STATUS_EXECUTION_FAILED; + break; + case NVGRAPH_ERR_IO: + ret = NVGRAPH_STATUS_INTERNAL_ERROR; 
+ break; + case NVGRAPH_ERR_NOT_IMPLEMENTED: + ret = NVGRAPH_STATUS_INVALID_VALUE; + break; + case NVGRAPH_ERR_NO_MEMORY: + ret = NVGRAPH_STATUS_ALLOC_FAILED; + break; + case NVGRAPH_ERR_NOT_CONVERGED: + ret = NVGRAPH_STATUS_NOT_CONVERGED; + break; + default: + ret = NVGRAPH_STATUS_INTERNAL_ERROR; + } + return ret; + } + + extern "C" { + const char* nvgraphStatusGetString(nvgraphStatus_t status) { + switch (status) { + case NVGRAPH_STATUS_SUCCESS: + return "Success"; + case NVGRAPH_STATUS_NOT_INITIALIZED: + return "nvGRAPH not initialized"; + case NVGRAPH_STATUS_ALLOC_FAILED: + return "nvGRAPH alloc failed"; + case NVGRAPH_STATUS_INVALID_VALUE: + return "nvGRAPH invalid value"; + case NVGRAPH_STATUS_ARCH_MISMATCH: + return "nvGRAPH arch mismatch"; + case NVGRAPH_STATUS_MAPPING_ERROR: + return "nvGRAPH mapping error"; + case NVGRAPH_STATUS_EXECUTION_FAILED: + return "nvGRAPH execution failed"; + case NVGRAPH_STATUS_INTERNAL_ERROR: + return "nvGRAPH internal error"; + case NVGRAPH_STATUS_TYPE_NOT_SUPPORTED: + return "nvGRAPH type not supported"; + case NVGRAPH_STATUS_NOT_CONVERGED: + return "nvGRAPH algorithm failed to converge"; + case NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED: + return "nvGRAPH graph type not supported"; + default: + return "Unknown nvGRAPH Status"; + } + } + } + + static nvgraphStatus_t nvgraphCreateMulti_impl(struct nvgraphContext **outCtx, + int numDevices, + int* _devices) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + // First, initialize NVGraph's context + + auto ctx = static_cast(calloc(1, sizeof(struct nvgraphContext))); + if (ctx == nullptr) { + FatalError("Cannot allocate NVGRAPH context.", NVGRAPH_ERR_UNKNOWN); + } + + auto option = rmmOptions_t{}; + if (rmmIsInitialized(&option) == true) { + if ((option.allocation_mode & PoolAllocation) != 0) { + FatalError("RMM does not support multi-GPUs with pool allocation, yet.", NVGRAPH_ERR_UNKNOWN); + } + } + // if RMM is unintialized, RMM_ALLOC/RMM_FREE are just aliases for cudaMalloc/cudaFree + + ctx->stream = nullptr; + ctx->nvgraphIsInitialized = true; + + if (outCtx != nullptr) { + *outCtx = ctx; + } + + // Second, initialize Cublas and Cusparse (get_handle() creates a new handle + // if there is no existing handle). + + nvgraph::Cusparse::get_handle(); + nvgraph::Cublas::get_handle(); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + static nvgraphStatus_t nvgraphCreate_impl(struct nvgraphContext **outCtx) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + // First, initialize NVGraph's context + + auto ctx = static_cast(calloc(1, sizeof(struct nvgraphContext))); + if (ctx == nullptr) { + FatalError("Cannot allocate NVGRAPH context.", NVGRAPH_ERR_UNKNOWN); + } + + // Now NVGraph assumes that RMM is initialized outside NVGraph + // if RMM is unintialized, RMM_ALLOC/RMM_FREE are just aliases for cudaMalloc/cudaFree + + ctx->stream = nullptr; + ctx->nvgraphIsInitialized = true; + + if (outCtx != nullptr) { + *outCtx = ctx; + } + + // Second, initialize Cublas and Cusparse (get_handle() creates a new handle + // if there is no existing handle). 
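The reindented block above maps the internal NVGRAPH_ERROR codes onto the public nvgraphStatus_t values, exposes nvgraphStatusGetString(), and rewrites the context constructors so that RMM is expected to be initialized by the caller. A minimal caller-side sketch of the usual error-checking pattern, assuming the bundled nvgraph.h header and the public nvgraphCreate/nvgraphDestroy entry points that these _impl functions back (not a definitive harness):

#include <nvgraph.h>
#include <cstdio>
#include <cstdlib>

// Report any non-success status through nvgraphStatusGetString() and abort.
#define CHECK_NVGRAPH(call)                                             \
  do {                                                                  \
    nvgraphStatus_t s = (call);                                         \
    if (s != NVGRAPH_STATUS_SUCCESS) {                                  \
      std::fprintf(stderr, "%s failed: %s\n", #call,                    \
                   nvgraphStatusGetString(s));                          \
      std::exit(EXIT_FAILURE);                                          \
    }                                                                   \
  } while (0)

int main() {
  nvgraphHandle_t handle = nullptr;
  // Per the comment in nvgraphCreate_impl, RMM (if used) is initialized
  // outside nvGraph; otherwise RMM_ALLOC/RMM_FREE fall back to cudaMalloc/cudaFree.
  CHECK_NVGRAPH(nvgraphCreate(&handle));
  // ... create graph descriptors and run algorithms here ...
  CHECK_NVGRAPH(nvgraphDestroy(handle));
  return 0;
}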
+ + nvgraph::Cusparse::get_handle(); + nvgraph::Cublas::get_handle(); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + static nvgraphStatus_t nvgraphDestroy_impl(nvgraphHandle_t handle) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Cannot initialize memory manager.", NVGRAPH_ERR_NO_MEMORY); + + // First, destroy Cublas and Cusparse + + nvgraph::Cusparse::destroy_handle(); + nvgraph::Cublas::destroy_handle(); + + // Second, destroy NVGraph's context + + free(handle); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + static nvgraphStatus_t nvgraphCreateGraphDescr_impl(nvgraphHandle_t handle, + struct nvgraphGraphDescr **outGraphDescr) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + struct nvgraphGraphDescr *descrG = NULL; + descrG = (struct nvgraphGraphDescr*) malloc(sizeof(*descrG)); + if (!descrG) + { + FatalError("Cannot allocate graph descriptor.", NVGRAPH_ERR_UNKNOWN); + } + descrG->graphStatus = IS_EMPTY; + if (outGraphDescr) + { + *outGraphDescr = descrG; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + static nvgraphStatus_t nvgraphDestroyGraphDescr_impl(nvgraphHandle_t handle, + struct nvgraphGraphDescr *descrG) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG) { + if (descrG->TT == NVGRAPH_2D_32I_32I) { + switch (descrG->T) { + case CUDA_R_32I: { + nvgraph::Matrix2d* m = + static_cast*>(descrG->graph_handle); + delete m; + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + else { + switch (descrG->graphStatus) { + case IS_EMPTY: { + break; + } + case HAS_TOPOLOGY: { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + delete CSRG; + break; + } + case HAS_VALUES: { + if (descrG->T == CUDA_R_32F) { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + delete MCSRG; + } + else if (descrG->T == CUDA_R_64F) { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + delete MCSRG; + } + else if (descrG->T == CUDA_R_32I) { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + delete MCSRG; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + break; + } + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + } + free(descrG); + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphSetStream_impl(nvgraphHandle_t handle, cudaStream_t stream) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + // nvgraph handle + handle->stream = stream; + //Cublas and Cusparse + nvgraph::Cublas::setStream(stream); + nvgraph::Cusparse::setStream(stream); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphSetGraphStructure_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t TT) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (descrG->graphStatus != IS_EMPTY) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (check_ptr(topologyData)) + FatalError("Incorrect 
parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (TT == NVGRAPH_CSR_32 || TT == NVGRAPH_CSC_32) + { + int v = 0, e = 0, *neighborhood = NULL, *edgedest = NULL; + switch (TT) + { + case NVGRAPH_CSR_32: + { + nvgraphCSRTopology32I_t t = static_cast(topologyData); + if (!t->nvertices || !t->nedges || check_ptr(t->source_offsets) + || check_ptr(t->destination_indices)) + return NVGRAPH_STATUS_INVALID_VALUE; + v = t->nvertices; + e = t->nedges; + neighborhood = t->source_offsets; + edgedest = t->destination_indices; + break; + } + case NVGRAPH_CSC_32: + { + nvgraphCSCTopology32I_t t = static_cast(topologyData); + if (!t->nvertices || !t->nedges || check_ptr(t->destination_offsets) + || check_ptr(t->source_indices)) + return NVGRAPH_STATUS_INVALID_VALUE; + v = t->nvertices; + e = t->nedges; + neighborhood = t->destination_offsets; + edgedest = t->source_indices; + break; + } + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + + descrG->TT = TT; + + // Create the internal CSR representation + nvgraph::CsrGraph * CSRG = new nvgraph::CsrGraph(v, e, handle->stream); + + CHECK_CUDA(cudaMemcpy(CSRG->get_raw_row_offsets(), + neighborhood, + (size_t )((CSRG->get_num_vertices() + 1) * sizeof(int)), + cudaMemcpyDefault)); + + CHECK_CUDA(cudaMemcpy(CSRG->get_raw_column_indices(), + edgedest, + (size_t )((CSRG->get_num_edges()) * sizeof(int)), + cudaMemcpyDefault)); + + // Set the graph handle + descrG->graph_handle = CSRG; + descrG->graphStatus = HAS_TOPOLOGY; + } + else if (TT == NVGRAPH_2D_32I_32I) { + nvgraph2dCOOTopology32I_t td = static_cast(topologyData); + switch (td->valueType) { + case CUDA_R_32I: { + if (!td->nvertices || !td->nedges || !td->source_indices + || !td->destination_indices || !td->numDevices || !td->devices + || !td->blockN) + return NVGRAPH_STATUS_INVALID_VALUE; + descrG->TT = TT; + descrG->graphStatus = HAS_TOPOLOGY; + if (td->values) + descrG->graphStatus = HAS_VALUES; + descrG->T = td->valueType; + std::vector devices; + for (int32_t i = 0; i < td->numDevices; i++) + devices.push_back(td->devices[i]); + nvgraph::MatrixDecompositionDescription description(td->nvertices, + td->blockN, + td->nedges, + devices); + nvgraph::Matrix2d* m = new nvgraph::Matrix2d(); + *m = nvgraph::COOto2d(description, + td->source_indices, + td->destination_indices, + (int32_t*) td->values); + descrG->graph_handle = m; + break; + } + default: { + return NVGRAPH_STATUS_INVALID_VALUE; + } + } + } + else + { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + + } + + nvgraphStatus_t NVGRAPH_API nvgraphAttachGraphStructure_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t TT) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (descrG->graphStatus != IS_EMPTY) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (check_ptr(topologyData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (TT == NVGRAPH_CSR_32 || TT == NVGRAPH_CSC_32) + { + int v = 0, e = 0, *neighborhood = NULL, *edgedest = NULL; + switch (TT) + { + case NVGRAPH_CSR_32: + { + nvgraphCSRTopology32I_t t = static_cast(topologyData); + if (!t->nvertices || !t->nedges || check_ptr(t->source_offsets) + || check_ptr(t->destination_indices)) + return NVGRAPH_STATUS_INVALID_VALUE; + v = t->nvertices; + e = t->nedges; + neighborhood = t->source_offsets; + edgedest = t->destination_indices; + 
break; + } + case NVGRAPH_CSC_32: + { + nvgraphCSCTopology32I_t t = static_cast(topologyData); + if (!t->nvertices || !t->nedges || check_ptr(t->destination_offsets) + || check_ptr(t->source_indices)) + return NVGRAPH_STATUS_INVALID_VALUE; + v = t->nvertices; + e = t->nedges; + neighborhood = t->destination_offsets; + edgedest = t->source_indices; + break; + } + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + + descrG->TT = TT; + + // Create the internal CSR representation + nvgraph::CsrGraph * CSRG = new nvgraph::CsrGraph(v, e, handle->stream); + + CSRG->set_raw_row_offsets(neighborhood); + CSRG->set_raw_column_indices(edgedest); + + // Set the graph handle + descrG->graph_handle = CSRG; + descrG->graphStatus = HAS_TOPOLOGY; + } + else if (TT == NVGRAPH_2D_32I_32I) { + nvgraph2dCOOTopology32I_t td = static_cast(topologyData); + switch (td->valueType) { + case CUDA_R_32I: { + if (!td->nvertices || !td->nedges || !td->source_indices + || !td->destination_indices || !td->numDevices || !td->devices + || !td->blockN) + return NVGRAPH_STATUS_INVALID_VALUE; + descrG->TT = TT; + descrG->graphStatus = HAS_TOPOLOGY; + if (td->values) + descrG->graphStatus = HAS_VALUES; + descrG->T = td->valueType; + std::vector devices; + for (int32_t i = 0; i < td->numDevices; i++) + devices.push_back(td->devices[i]); + nvgraph::MatrixDecompositionDescription description(td->nvertices, + td->blockN, + td->nedges, + devices); + nvgraph::Matrix2d* m = new nvgraph::Matrix2d(); + *m = nvgraph::COOto2d(description, + td->source_indices, + td->destination_indices, + (int32_t*) td->values); + descrG->graph_handle = m; + break; + } + default: { + return NVGRAPH_STATUS_INVALID_VALUE; + } + } + } + else + { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + + } + + nvgraphStatus_t NVGRAPH_API nvgraphGetGraphStructure_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t* TT) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_topology(descrG)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + nvgraphTopologyType_t graphTType = descrG->TT; + + if (TT != NULL) + *TT = graphTType; + + if (topologyData != NULL) { + nvgraph::CsrGraph *CSRG = + static_cast *>(descrG->graph_handle); + int v = static_cast(CSRG->get_num_vertices()); + int e = static_cast(CSRG->get_num_edges()); + int *neighborhood = NULL, *edgedest = NULL; + + switch (graphTType) + { + case NVGRAPH_CSR_32: + { + nvgraphCSRTopology32I_t t = static_cast(topologyData); + t->nvertices = static_cast(v); + t->nedges = static_cast(e); + neighborhood = t->source_offsets; + edgedest = t->destination_indices; + break; + } + case NVGRAPH_CSC_32: + { + nvgraphCSCTopology32I_t t = static_cast(topologyData); + t->nvertices = static_cast(v); + t->nedges = static_cast(e); + neighborhood = t->destination_offsets; + edgedest = t->source_indices; + break; + } + default: + return NVGRAPH_STATUS_INTERNAL_ERROR; + } + + if (neighborhood != NULL) { + CHECK_CUDA(cudaMemcpy(neighborhood, + CSRG->get_raw_row_offsets(), + (size_t )((v + 1) * sizeof(int)), + cudaMemcpyDefault)); + } + + if (edgedest != NULL) { + CHECK_CUDA(cudaMemcpy(edgedest, + CSRG->get_raw_column_indices(), + (size_t )((e) * sizeof(int)), + cudaMemcpyDefault)); + } + + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphAllocateVertexData_impl(nvgraphHandle_t handle, + 
nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(numsets) + || check_ptr(settypes)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (check_uniform_type_array(settypes, numsets)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first + { + if (*settypes == CUDA_R_32F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, float>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (*settypes == CUDA_R_64F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, double>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (*settypes == CUDA_R_32I) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); + descrG->graph_handle = MCSRG; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + descrG->T = *settypes; + descrG->graphStatus = HAS_VALUES; + } + else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type + { + if (*settypes != descrG->T) + return NVGRAPH_STATUS_INVALID_VALUE; + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + + // Allocate and transfer + if (*settypes == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateVertexData(numsets, NULL); + } + else if (*settypes == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateVertexData(numsets, NULL); + } + else if (*settypes == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateVertexData(numsets, NULL); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphAttachVertexData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *vertexData) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first + { + if (settype == CUDA_R_32F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, float>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (settype == CUDA_R_64F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, double>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (settype == CUDA_R_32I) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); + descrG->graph_handle = MCSRG; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + descrG->T = settype; + descrG->graphStatus = HAS_VALUES; + } + else if (descrG->graphStatus == HAS_VALUES) // Already in 
MultiValuedCsrGraph, just need to check the type + { + if (settype != descrG->T) + return NVGRAPH_STATUS_INVALID_VALUE; + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + + // transfer + if (settype == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachVertexData(setnum, (float*)vertexData, NULL); + } + else if (settype == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachVertexData(setnum, (double*)vertexData, NULL); + } + else if (settype == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachVertexData(setnum, (int*)vertexData, NULL); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + nvgraphStatus_t NVGRAPH_API nvgraphAllocateEdgeData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(numsets) + || check_ptr(settypes)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (check_uniform_type_array(settypes, numsets)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + // Look at what kind of graph we have + if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first + { + if (*settypes == CUDA_R_32F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, float>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (*settypes == CUDA_R_64F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, double>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (*settypes == CUDA_R_32I) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); + descrG->graph_handle = MCSRG; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + descrG->T = *settypes; + descrG->graphStatus = HAS_VALUES; + } + else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type + { + if (*settypes != descrG->T) + return NVGRAPH_STATUS_INVALID_VALUE; + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + + // Allocate and transfer + if (*settypes == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateEdgeData(numsets, NULL); + } + else if (*settypes == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateEdgeData(numsets, NULL); + } + else if (*settypes == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateEdgeData(numsets, NULL); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphAttachEdgeData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *edgeData) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + // Look at what kind 
of graph we have + if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first + { + if (settype == CUDA_R_32F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, float>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (settype == CUDA_R_64F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, double>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (settype == CUDA_R_32I) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); + descrG->graph_handle = MCSRG; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + descrG->T = settype; + descrG->graphStatus = HAS_VALUES; + } + else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type + { + if (settype != descrG->T) + return NVGRAPH_STATUS_INVALID_VALUE; + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + + // Allocate and transfer + if (settype == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachEdgeData(setnum, (float*)edgeData, NULL); + } + else if (settype == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachEdgeData(setnum, (double*)edgeData, NULL); + } + else if (settype == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachEdgeData(setnum, (int*)edgeData, NULL); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphSetVertexData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) + || check_ptr(vertexData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + FatalError("Graph should have allocated values.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_vertex_dim(setnum), + (float*) vertexData, + (size_t) ((MCSRG->get_num_vertices()) * sizeof(float)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_vertex_dim(setnum), + (double*) vertexData, + (size_t) ((MCSRG->get_num_vertices()) * sizeof(double)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_vertex_dim(setnum), + (int*) vertexData, + (size_t) ((MCSRG->get_num_vertices()) * sizeof(int)), + cudaMemcpyDefault); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + cudaCheckError(); 
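Once a descriptor has topology, the Allocate*Data calls above promote the internal CsrGraph to a MultiValuedCsrGraph of the requested cudaDataType, and the Set*Data calls copy caller buffers into a given value set (again via cudaMemcpyDefault). A sketch assuming one CUDA_R_32F set each for edges and vertices, with host vectors sized to nedges and nvertices respectively (error checking omitted):

#include <nvgraph.h>
#include <vector>

// Attach one float value set per edge and per vertex, then upload host data.
void attach_weights(nvgraphHandle_t handle, nvgraphGraphDescr_t descr,
                    std::vector<float>& edge_weights,      // size == nedges
                    std::vector<float>& vertex_values) {   // size == nvertices
  cudaDataType_t t = CUDA_R_32F;

  // First allocation converts CsrGraph -> MultiValuedCsrGraph<int, float>.
  nvgraphAllocateEdgeData(handle, descr, 1, &t);
  nvgraphSetEdgeData(handle, descr, edge_weights.data(), 0);

  nvgraphAllocateVertexData(handle, descr, 1, &t);
  nvgraphSetVertexData(handle, descr, vertex_values.data(), 0);
}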
+ } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphGetVertexData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) + || check_ptr(vertexData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + FatalError("Graph should have values.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy((float*) vertexData, + MCSRG->get_raw_vertex_dim(setnum), + (size_t) ((MCSRG->get_num_vertices()) * sizeof(float)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy((double*) vertexData, + MCSRG->get_raw_vertex_dim(setnum), + (size_t) ((MCSRG->get_num_vertices()) * sizeof(double)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy((int*) vertexData, + MCSRG->get_raw_vertex_dim(setnum), + (size_t) ((MCSRG->get_num_vertices()) * sizeof(int)), + cudaMemcpyDefault); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + cudaCheckError(); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphConvertTopology_impl(nvgraphHandle_t handle, + nvgraphTopologyType_t srcTType, + void *srcTopology, + void *srcEdgeData, + cudaDataType_t *dataType, + nvgraphTopologyType_t dstTType, + void *dstTopology, + void *dstEdgeData) { + + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_ptr(dstEdgeData) || check_ptr(srcEdgeData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + size_t sizeT; + if (*dataType == CUDA_R_32F) + sizeT = sizeof(float); + else if (*dataType == CUDA_R_64F) + sizeT = sizeof(double); + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + // Trust me, this better than nested if's. 
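The conversion dispatch that follows handles every (source, destination) pairing of CSR, CSC and COO, with the caller supplying both topology structs and pre-allocated destination buffers; only CUDA_R_32F and CUDA_R_64F edge data are accepted. A hedged sketch of the CSR-to-CSC case with device-resident arrays and float weights (buffer allocation and error checks omitted):

#include <nvgraph.h>

// Convert a CSR graph to CSC. All index/offset/weight arrays referenced by
// src and dst are device pointers; dst must already provide room for
// nvertices+1 offsets, nedges indices and nedges weights.
void csr_to_csc(nvgraphHandle_t handle,
                nvgraphCSRTopology32I_t src, float* d_src_weights,
                nvgraphCSCTopology32I_t dst, float* d_dst_weights) {
  cudaDataType_t value_type = CUDA_R_32F;  // 32F/64F are the supported weight types
  nvgraphConvertTopology(handle,
                         NVGRAPH_CSR_32, src, d_src_weights,
                         &value_type,
                         NVGRAPH_CSC_32, dst, d_dst_weights);
}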
+ if (srcTType == NVGRAPH_CSR_32 && dstTType == NVGRAPH_CSR_32) { // CSR2CSR + nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + CHECK_CUDA(cudaMemcpy(dstT->source_offsets, + srcT->source_offsets, + (srcT->nvertices + 1) * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstT->destination_indices, + srcT->destination_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (srcTType == NVGRAPH_CSR_32 && dstTType == NVGRAPH_CSC_32) { // CSR2CSC + nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + csr2csc(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_offsets, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_offsets, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, dataType); + } else if (srcTType == NVGRAPH_CSR_32 && dstTType == NVGRAPH_COO_32) { // CSR2COO + nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); + nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if (dstT->tag == NVGRAPH_SORTED_BY_SOURCE || dstT->tag == NVGRAPH_DEFAULT + || dstT->tag == NVGRAPH_UNSORTED) { + csr2coo(srcT->source_offsets, + srcT->nedges, + srcT->nvertices, + dstT->source_indices, + CUSPARSE_INDEX_BASE_ZERO); + CHECK_CUDA(cudaMemcpy(dstT->destination_indices, + srcT->destination_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (dstT->tag == NVGRAPH_SORTED_BY_DESTINATION) { + // Step 1: Convert to COO_Source + csr2coo(srcT->source_offsets, + srcT->nedges, + srcT->nvertices, + dstT->source_indices, + CUSPARSE_INDEX_BASE_ZERO); + // Step 2: Convert to COO_Destination + cooSortByDestination(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + dstT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////// + } else if (srcTType == NVGRAPH_CSC_32 && dstTType == NVGRAPH_CSR_32) { // CSC2CSR + nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + csc2csr(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_offsets, + dstEdgeData, + dstT->source_offsets, dstT->destination_indices, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, dataType); + } else if (srcTType == NVGRAPH_CSC_32 && dstTType == NVGRAPH_CSC_32) { // CSC2CSC + nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + CHECK_CUDA(cudaMemcpy(dstT->destination_offsets, + srcT->destination_offsets, + (srcT->nvertices + 1) * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstT->source_indices, + 
srcT->source_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (srcTType == NVGRAPH_CSC_32 && dstTType == NVGRAPH_COO_32) { // CSC2COO + nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); + nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if (dstT->tag == NVGRAPH_SORTED_BY_SOURCE) { + // Step 1: Convert to COO_Destination + csr2coo(srcT->destination_offsets, + srcT->nedges, + srcT->nvertices, + dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO); + // Step 2: Convert to COO_Source + cooSortBySource(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, dstT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else if (dstT->tag == NVGRAPH_SORTED_BY_DESTINATION || dstT->tag == NVGRAPH_DEFAULT + || dstT->tag == NVGRAPH_UNSORTED) { + csr2coo(srcT->destination_offsets, + srcT->nedges, + srcT->nvertices, + dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO); + CHECK_CUDA(cudaMemcpy(dstT->source_indices, + srcT->source_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////// + } else if (srcTType == NVGRAPH_COO_32 && dstTType == NVGRAPH_CSR_32) { // COO2CSR + nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if (srcT->tag == NVGRAPH_SORTED_BY_SOURCE) { + coo2csr(srcT->source_indices, + srcT->nedges, + srcT->nvertices, + dstT->source_offsets, + CUSPARSE_INDEX_BASE_ZERO); + CHECK_CUDA(cudaMemcpy(dstT->destination_indices, + srcT->destination_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (srcT->tag == NVGRAPH_SORTED_BY_DESTINATION) { + cood2csr(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_offsets, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else if (srcT->tag == NVGRAPH_DEFAULT || srcT->tag == NVGRAPH_UNSORTED) { + coou2csr(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_offsets, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } else if (srcTType == NVGRAPH_COO_32 && dstTType == NVGRAPH_CSC_32) { // COO2CSC + nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if (srcT->tag == NVGRAPH_SORTED_BY_SOURCE) { + coos2csc(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_offsets, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else if (srcT->tag == NVGRAPH_SORTED_BY_DESTINATION) { + 
coo2csr(srcT->destination_indices, + srcT->nedges, + srcT->nvertices, + dstT->destination_offsets, + CUSPARSE_INDEX_BASE_ZERO); + CHECK_CUDA(cudaMemcpy(dstT->source_indices, + srcT->source_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (srcT->tag == NVGRAPH_DEFAULT || srcT->tag == NVGRAPH_UNSORTED) { + coou2csc(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_offsets, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } else if (srcTType == NVGRAPH_COO_32 && dstTType == NVGRAPH_COO_32) { // COO2COO + nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); + nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if (srcT->tag == dstT->tag || dstT->tag == NVGRAPH_DEFAULT + || dstT->tag == NVGRAPH_UNSORTED) { + CHECK_CUDA(cudaMemcpy(dstT->source_indices, + srcT->source_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstT->destination_indices, + srcT->destination_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (dstT->tag == NVGRAPH_SORTED_BY_SOURCE) { + cooSortBySource(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else if (dstT->tag == NVGRAPH_SORTED_BY_DESTINATION) { + cooSortByDestination(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + + /////////////////////////////////////////////////////////////////////////////////////////////////////////// + } else { + return NVGRAPH_STATUS_INVALID_VALUE; + } + + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphConvertGraph_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t srcDescrG, + nvgraphGraphDescr_t dstDescrG, + nvgraphTopologyType_t dstTType) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + nvgraphStatus_t status = NVGRAPH_STATUS_SUCCESS; + try + { + if (check_context(handle) || check_graph(srcDescrG)) // Graph must have a topology + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (dstDescrG->graphStatus != IS_EMPTY) // dst Graph must be empty + return NVGRAPH_STATUS_INVALID_VALUE; + + // graphs can only have CSR or CSC topology (EL is for storage only) + if (srcDescrG->TT != NVGRAPH_CSR_32 && srcDescrG->TT != NVGRAPH_CSC_32) + return NVGRAPH_STATUS_INTERNAL_ERROR; // invalid state, you can only create graph with CSR/CSC + if (dstTType != NVGRAPH_CSR_32 && dstTType != NVGRAPH_CSC_32) + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; // only conversion to CSR/CSC is allowed + + int nvertices, nedges; + int *srcOffsets = NULL, *srcIndices = NULL, *dstOffsets = NULL, *dstIndices = NULL; + std::shared_ptr permutation, offsets, indices; + + // Step 1: get source graph structure + nvgraph::CsrGraph *CSRG = + static_cast *>(srcDescrG->graph_handle); + nvertices = 
static_cast(CSRG->get_num_vertices()); + nedges = static_cast(CSRG->get_num_edges()); + srcOffsets = CSRG->get_raw_row_offsets(); + srcIndices = CSRG->get_raw_column_indices(); + + // Step 2: convert topology and get permutation array. + if (srcDescrG->TT != dstTType) { // Otherwise conversion is not needed, only copy. + offsets = allocateDevice(nvertices + 1, NULL); + indices = allocateDevice(nedges, NULL); + permutation = allocateDevice(nedges, NULL); + csr2cscP(nvertices, nvertices, nedges, + srcOffsets, + srcIndices, + indices.get(), + offsets.get(), permutation.get(), CUSPARSE_INDEX_BASE_ZERO); + dstOffsets = offsets.get(); + dstIndices = indices.get(); + } else { + dstOffsets = srcOffsets; + dstIndices = srcIndices; + } + + // Step 3: Set dst graph structure + if (dstTType == NVGRAPH_CSR_32) { + nvgraphCSRTopology32I_st dstTopology; + dstTopology.nedges = nedges; + dstTopology.nvertices = nvertices; + dstTopology.source_offsets = dstOffsets; + dstTopology.destination_indices = dstIndices; + status = nvgraphSetGraphStructure(handle, dstDescrG, &dstTopology, dstTType); + } else if (dstTType == NVGRAPH_CSC_32) { + nvgraphCSCTopology32I_st dstTopology; + dstTopology.nedges = nedges; + dstTopology.nvertices = nvertices; + dstTopology.destination_offsets = dstOffsets; + dstTopology.source_indices = dstIndices; + status = nvgraphSetGraphStructure(handle, dstDescrG, &dstTopology, dstTType); + } else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + offsets.reset(); + indices.reset(); + + // Step 4: Allocate, convert and set edge+vertex data on the new graph + if (srcDescrG->graphStatus == HAS_VALUES) { + if (srcDescrG->T == CUDA_R_32F) { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(srcDescrG->graph_handle); + size_t vertexDim = MCSRG->get_num_vertex_dim(); + size_t edgesDim = MCSRG->get_num_edge_dim(); + // Step 4.1: allocate and set vertex data (no need for convert) + if (vertexDim > 0) { + std::vector vertexDataType(vertexDim); + std::fill(vertexDataType.begin(), vertexDataType.end(), srcDescrG->T); + status = nvgraphAllocateVertexData(handle, + dstDescrG, + vertexDim, + vertexDataType.data()); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + for (size_t i = 0; i < vertexDim; ++i) { + void *vertexData = MCSRG->get_raw_vertex_dim(i); + status = nvgraphSetVertexData(handle, dstDescrG, vertexData, i); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + } + } + // Step 4.2: allocate and set vertex data + if (edgesDim > 0) { + void *dstEdgeData = NULL; + std::shared_ptr dstEdgeDataSP; + + std::vector edgeDataType(edgesDim); + std::fill(edgeDataType.begin(), edgeDataType.end(), srcDescrG->T); + status = nvgraphAllocateEdgeData(handle, + dstDescrG, + edgesDim, + edgeDataType.data()); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + // allocate edge data memory (if there is a need) + if (edgesDim > 0 && srcDescrG->TT != dstTType) { + dstEdgeDataSP = allocateDevice(nedges, NULL); + dstEdgeData = dstEdgeDataSP.get(); + } + // Convert and set edge data (using permutation array) + for (size_t i = 0; i < edgesDim; ++i) { + void *srcEdgeData = (void*) (MCSRG->get_raw_edge_dim((int) i)); + if (srcDescrG->TT != dstTType) // Convert using permutation array + gthrX(nedges, + srcEdgeData, + dstEdgeData, + permutation.get(), + CUSPARSE_INDEX_BASE_ZERO, + &(srcDescrG->T)); + else + dstEdgeData = srcEdgeData; + // set edgedata + 
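nvgraphConvertGraph_impl reuses nvgraphSetGraphStructure and the Allocate*Data calls on the destination descriptor, and permutes any attached edge values through the gthrX path when the topology actually changes. From the caller's side the whole procedure reduces to a single call; a sketch under the assumption that src_csr already holds a CSR graph with values (status checks omitted):

#include <nvgraph.h>

// Clone an existing CSR descriptor into a CSC descriptor, letting
// nvgraphConvertGraph permute any attached edge data. The destination
// descriptor must be freshly created and still empty.
nvgraphGraphDescr_t to_csc(nvgraphHandle_t handle, nvgraphGraphDescr_t src_csr) {
  nvgraphGraphDescr_t dst_csc = nullptr;
  nvgraphCreateGraphDescr(handle, &dst_csc);
  nvgraphConvertGraph(handle, src_csr, dst_csc, NVGRAPH_CSC_32);
  return dst_csc;  // caller releases it with nvgraphDestroyGraphDescr()
}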
status = nvgraphSetEdgeData(handle, dstDescrG, dstEdgeData, i); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + } + } + } else if (srcDescrG->T == CUDA_R_64F) { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(srcDescrG->graph_handle); + size_t vertexDim = MCSRG->get_num_vertex_dim(); + size_t edgesDim = MCSRG->get_num_edge_dim(); + // Step 4.1: allocate and set vertex data (no need for convert) + if (vertexDim > 0) { + std::vector vertexDataType(vertexDim); + std::fill(vertexDataType.begin(), vertexDataType.end(), srcDescrG->T); + status = nvgraphAllocateVertexData(handle, + dstDescrG, + vertexDim, + vertexDataType.data()); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + for (size_t i = 0; i < vertexDim; ++i) { + void *vertexData = MCSRG->get_raw_vertex_dim(i); + status = nvgraphSetVertexData(handle, dstDescrG, vertexData, i); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + } + } + // Step 4.2: allocate and set vertex data + if (edgesDim > 0) { + void *dstEdgeData = NULL; + std::shared_ptr dstEdgeDataSP; + + std::vector edgeDataType(edgesDim); + std::fill(edgeDataType.begin(), edgeDataType.end(), srcDescrG->T); + status = nvgraphAllocateEdgeData(handle, + dstDescrG, + edgesDim, + edgeDataType.data()); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + // allocate edge data memory (if there is a need) + if (edgesDim > 0 && srcDescrG->TT != dstTType) { + dstEdgeDataSP = allocateDevice(nedges, NULL); + dstEdgeData = dstEdgeDataSP.get(); + } + // Convert and set edge data (using permutation array) + for (size_t i = 0; i < edgesDim; ++i) { + void *srcEdgeData = (void*) (MCSRG->get_raw_edge_dim((int) i)); + if (srcDescrG->TT != dstTType) // Convert using permutation array + gthrX(nedges, + srcEdgeData, + dstEdgeData, + permutation.get(), + CUSPARSE_INDEX_BASE_ZERO, + &(srcDescrG->T)); + else + dstEdgeData = srcEdgeData; + // set edgedata + status = nvgraphSetEdgeData(handle, dstDescrG, dstEdgeData, i); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + } + } + } else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + + } + + nvgraphStatus_t NVGRAPH_API nvgraphSetEdgeData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) + || check_ptr(edgeData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_edge_dim(setnum), + (float*) edgeData, + (size_t) ((MCSRG->get_num_edges()) * sizeof(float)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_edge_dim(setnum), + (double*) edgeData, + (size_t) ((MCSRG->get_num_edges()) * sizeof(double)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_32I) + { + 
nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_edge_dim(setnum), + (int*) edgeData, + (size_t) ((MCSRG->get_num_edges()) * sizeof(int)), + cudaMemcpyDefault); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + cudaCheckError(); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphGetEdgeData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) + || check_ptr(edgeData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy((float*) edgeData, + MCSRG->get_raw_edge_dim(setnum), + (size_t) ((MCSRG->get_num_edges()) * sizeof(float)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy((double*) edgeData, + MCSRG->get_raw_edge_dim(setnum), + (size_t) ((MCSRG->get_num_edges()) * sizeof(double)), + cudaMemcpyDefault); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + cudaCheckError(); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphSrSpmv_impl_cub(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t x, + const void *beta, + const size_t y, + const nvgraphSemiring_t SR) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + + try + { + // some basic checks + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + rc = SemiringAPILauncher(handle, descrG, weight_index, alpha, x, beta, y, SR); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphSssp_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t sssp) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) + || check_int_ptr(source_vert)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->TT != NVGRAPH_CSC_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; // cudaError_t cuda_status; - if (descrG->graphStatus != HAS_VALUES) - return NVGRAPH_STATUS_INVALID_VALUE; - - switch (descrG->T) - { - case CUDA_R_32F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() || sssp >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - - int n = static_cast(MCSRG->get_num_vertices()); - nvgraph::Vector co(n, handle->stream); - nvgraph::Sssp sssp_solver(*MCSRG->get_valued_csr_graph(weight_index)); - nvgraph::set_connectivity(n, *source_vert, 0.0, FLT_MAX, 
co.raw()); - MCSRG->get_vertex_dim(sssp).copy(co); - rc = sssp_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(sssp)); - break; - } - case CUDA_R_64F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() || sssp >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - - int n = static_cast(MCSRG->get_num_vertices()); - nvgraph::Vector co(n, handle->stream); - nvgraph::Sssp sssp_solver(*MCSRG->get_valued_csr_graph(weight_index)); - nvgraph::set_connectivity(n, *source_vert, 0.0, DBL_MAX, co.raw()); - MCSRG->get_vertex_dim(sssp).copy(co); - rc = sssp_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(sssp)); - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphTraversal_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const nvgraphTraversal_t traversalT, - const int *source_vertex_ptr, - const nvgraphTraversalParameter_t params) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_ptr(source_vertex_ptr)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph (storing results) - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSR_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->T != CUDA_R_32I) //results are ints - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - - //Results (bfs distances, predecessors..) are written in dimension in mvcsrg - nvgraph::MultiValuedCsrGraph *MCSRG = static_cast*>(descrG->graph_handle); - - // - //Computing traversal parameters - // - - size_t distancesIndex, predecessorsIndex, edgeMaskIndex; - size_t undirectedFlagParam; - size_t alpha_ul, beta_ul; - - int *distances = NULL, *predecessors = NULL, *edge_mask = NULL; - - nvgraphTraversalGetDistancesIndex(params, &distancesIndex); - nvgraphTraversalGetPredecessorsIndex(params, &predecessorsIndex); - nvgraphTraversalGetEdgeMaskIndex(params, &edgeMaskIndex); - nvgraphTraversalGetUndirectedFlag(params, &undirectedFlagParam); - nvgraphTraversalGetAlpha(params, &alpha_ul); - nvgraphTraversalGetBeta(params, &beta_ul); - - int alpha = static_cast(alpha_ul); - int beta = static_cast(beta_ul); - - //If distances_index was set by user, then use it - if (distancesIndex <= MCSRG->get_num_vertex_dim()) { - distances = MCSRG->get_vertex_dim(distancesIndex).raw(); - } - - //If predecessors_index was set by user, then use it - if (predecessorsIndex <= MCSRG->get_num_vertex_dim()) { - predecessors = MCSRG->get_vertex_dim(predecessorsIndex).raw(); - } - - //If edgemask_index was set by user, then use it - if (edgeMaskIndex <= MCSRG->get_num_vertex_dim()) { - edge_mask = MCSRG->get_edge_dim(edgeMaskIndex).raw(); - } - - int source_vertex = *source_vertex_ptr; - - int n = static_cast(MCSRG->get_num_vertices()); - int nnz = static_cast(MCSRG->get_num_edges()); - int *row_offsets = MCSRG->get_raw_row_offsets(); - int *col_indices = MCSRG->get_raw_column_indices(); - - bool undirected = (bool) undirectedFlagParam; - - if (source_vertex < 0 || source_vertex >= n) { - return NVGRAPH_STATUS_INVALID_VALUE; - } - - //Calling corresponding implementation - switch (traversalT) { - case NVGRAPH_TRAVERSAL_BFS: - nvgraph::Bfs bfs_solver(n, - nnz, - row_offsets, - col_indices, - !undirected, - alpha, - 
beta, - handle->stream); - - //To easily implement multi source with single source, - //loop on those two - rc = bfs_solver.configure(distances, predecessors, edge_mask); - rc = bfs_solver.traverse(source_vertex); - break; - }; - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - /** - * CAPI Method for calling 2d BFS algorithm. - * @param handle Nvgraph context handle. - * @param descrG Graph handle (must be 2D partitioned) - * @param source_vert The source vertex ID - * @param distances Pointer to memory allocated to store the distances. - * @param predecessors Pointer to memory allocated to store the predecessors - * @return Status code. - */ - nvgraphStatus_t NVGRAPH_API nvgraph2dBfs_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const int32_t source_vert, - int32_t* distances, - int32_t* predecessors) { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try { - if (check_context(handle) || check_graph(descrG)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (descrG->graphStatus == IS_EMPTY) - return NVGRAPH_STATUS_INVALID_VALUE; - if (descrG->TT != NVGRAPH_2D_32I_32I) - return NVGRAPH_STATUS_INVALID_VALUE; - if (descrG->T != CUDA_R_32I) - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::Matrix2d* m = static_cast*>(descrG->graph_handle); -// std::cout << m->toString(); - nvgraph::Bfs2d bfs(m, true, 0, 0); - rc = bfs.configure(distances, predecessors); - rc = bfs.traverse(source_vert); - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphWidestPath_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int *source_vert, - const size_t widest_path) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) - || check_int_ptr(source_vert)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSC_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; + if (descrG->graphStatus != HAS_VALUES) + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() || sssp >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector co(n, handle->stream); + nvgraph::Sssp sssp_solver(*MCSRG->get_valued_csr_graph(weight_index)); + nvgraph::set_connectivity(n, *source_vert, 0.0, FLT_MAX, co.raw()); + MCSRG->get_vertex_dim(sssp).copy(co); + rc = sssp_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(sssp)); + break; + } + case CUDA_R_64F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() || sssp >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector co(n, handle->stream); + nvgraph::Sssp sssp_solver(*MCSRG->get_valued_csr_graph(weight_index)); + nvgraph::set_connectivity(n, *source_vert, 0.0, DBL_MAX, co.raw()); + MCSRG->get_vertex_dim(sssp).copy(co); + rc = sssp_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(sssp)); + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } 
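For orientation, the SSSP wrapper above only validates the descriptor state and dispatches on the stored value type; the edge weights and the output distances live in edge and vertex dimensions that the caller registers beforehand. A minimal host-side sketch of driving this path through the public C entry points declared in nvgraph.h follows; the 3-vertex CSC graph, the dimension indices, and the error-free flow are illustrative assumptions, not taken from this patch.

    // Hedged sketch: exercises nvgraphSssp() against the dispatch code above.
    // Assumes the public nvGraph C API from <nvgraph.h>; error handling elided.
    #include <nvgraph.h>
    #include <cstdio>

    int main() {
        // 3 vertices, 3 edges in CSC form (illustrative data, not from this patch)
        int   destination_offsets[] = {0, 1, 2, 3};
        int   source_indices[]      = {2, 0, 1};
        float weights[]             = {0.5f, 1.0f, 2.0f};

        nvgraphHandle_t handle;     nvgraphCreate(&handle);
        nvgraphGraphDescr_t graph;  nvgraphCreateGraphDescr(handle, &graph);

        nvgraphCSCTopology32I_st topo;
        topo.nvertices = 3;  topo.nedges = 3;
        topo.destination_offsets = destination_offsets;
        topo.source_indices      = source_indices;
        nvgraphSetGraphStructure(handle, graph, &topo, NVGRAPH_CSC_32);

        // One edge dimension for the weights, one vertex dimension for the distances
        cudaDataType_t etype = CUDA_R_32F, vtype = CUDA_R_32F;
        nvgraphAllocateEdgeData(handle, graph, 1, &etype);
        nvgraphAllocateVertexData(handle, graph, 1, &vtype);
        nvgraphSetEdgeData(handle, graph, weights, 0);

        int source = 0;
        nvgraphSssp(handle, graph, 0 /*weight_index*/, &source, 0 /*sssp vertex dim*/);

        float dist[3];
        nvgraphGetVertexData(handle, graph, dist, 0);
        printf("distance(0 -> 2) = %f\n", dist[2]);

        nvgraphDestroyGraphDescr(handle, graph);
        nvgraphDestroy(handle);
        return 0;
    }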
+ } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphTraversal_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const nvgraphTraversal_t traversalT, + const int *source_vertex_ptr, + const nvgraphTraversalParameter_t params) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_ptr(source_vertex_ptr)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph (storing results) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->T != CUDA_R_32I) //results are ints + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + //Results (bfs distances, predecessors..) are written in dimension in mvcsrg + nvgraph::MultiValuedCsrGraph *MCSRG = static_cast*>(descrG->graph_handle); + + // + //Computing traversal parameters + // + + size_t distancesIndex, predecessorsIndex, edgeMaskIndex; + size_t undirectedFlagParam; + size_t alpha_ul, beta_ul; + + int *distances = NULL, *predecessors = NULL, *edge_mask = NULL; + + nvgraphTraversalGetDistancesIndex(params, &distancesIndex); + nvgraphTraversalGetPredecessorsIndex(params, &predecessorsIndex); + nvgraphTraversalGetEdgeMaskIndex(params, &edgeMaskIndex); + nvgraphTraversalGetUndirectedFlag(params, &undirectedFlagParam); + nvgraphTraversalGetAlpha(params, &alpha_ul); + nvgraphTraversalGetBeta(params, &beta_ul); + + int alpha = static_cast(alpha_ul); + int beta = static_cast(beta_ul); + + //If distances_index was set by user, then use it + if (distancesIndex <= MCSRG->get_num_vertex_dim()) { + distances = MCSRG->get_vertex_dim(distancesIndex).raw(); + } + + //If predecessors_index was set by user, then use it + if (predecessorsIndex <= MCSRG->get_num_vertex_dim()) { + predecessors = MCSRG->get_vertex_dim(predecessorsIndex).raw(); + } + + //If edgemask_index was set by user, then use it + if (edgeMaskIndex <= MCSRG->get_num_vertex_dim()) { + edge_mask = MCSRG->get_edge_dim(edgeMaskIndex).raw(); + } + + int source_vertex = *source_vertex_ptr; + + int n = static_cast(MCSRG->get_num_vertices()); + int nnz = static_cast(MCSRG->get_num_edges()); + int *row_offsets = MCSRG->get_raw_row_offsets(); + int *col_indices = MCSRG->get_raw_column_indices(); + + bool undirected = (bool) undirectedFlagParam; + + if (source_vertex < 0 || source_vertex >= n) { + return NVGRAPH_STATUS_INVALID_VALUE; + } + + //Calling corresponding implementation + switch (traversalT) { + case NVGRAPH_TRAVERSAL_BFS: + nvgraph::Bfs bfs_solver(n, + nnz, + row_offsets, + col_indices, + !undirected, + alpha, + beta, + handle->stream); + + //To easily implement multi source with single source, + //loop on those two + rc = bfs_solver.configure(distances, predecessors, edge_mask); + rc = bfs_solver.traverse(source_vertex); + break; + }; + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + /** + * CAPI Method for calling 2d BFS algorithm. + * @param handle Nvgraph context handle. + * @param descrG Graph handle (must be 2D partitioned) + * @param source_vert The source vertex ID + * @param distances Pointer to memory allocated to store the distances. + * @param predecessors Pointer to memory allocated to store the predecessors + * @return Status code. 
+ */ + nvgraphStatus_t NVGRAPH_API nvgraph2dBfs_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const int32_t source_vert, + int32_t* distances, + int32_t* predecessors) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try { + if (check_context(handle) || check_graph(descrG)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (descrG->graphStatus == IS_EMPTY) + return NVGRAPH_STATUS_INVALID_VALUE; + if (descrG->TT != NVGRAPH_2D_32I_32I) + return NVGRAPH_STATUS_INVALID_VALUE; + if (descrG->T != CUDA_R_32I) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::Matrix2d* m = static_cast*>(descrG->graph_handle); +// std::cout << m->toString(); + nvgraph::Bfs2d bfs(m, true, 0, 0); + rc = bfs.configure(distances, predecessors); + rc = bfs.traverse(source_vert); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphWidestPath_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t widest_path) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) + || check_int_ptr(source_vert)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSC_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; // cudaError_t cuda_status; - switch (descrG->T) - { - case CUDA_R_32F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || widest_path >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - - int n = static_cast(MCSRG->get_num_vertices()); - nvgraph::Vector co(n, handle->stream); - nvgraph::WidestPath widest_path_solver(*MCSRG->get_valued_csr_graph(weight_index)); - nvgraph::set_connectivity(n, *source_vert, FLT_MAX, -FLT_MAX, co.raw()); - MCSRG->get_vertex_dim(widest_path).copy(co); - rc = widest_path_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(widest_path)); - break; - } - case CUDA_R_64F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || widest_path >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - - int n = static_cast(MCSRG->get_num_vertices()); - nvgraph::Vector co(n, handle->stream); - nvgraph::WidestPath widest_path_solver(*MCSRG->get_valued_csr_graph(weight_index)); - nvgraph::set_connectivity(n, *source_vert, DBL_MAX, -DBL_MAX, co.raw()); - MCSRG->get_vertex_dim(widest_path).copy(co); - rc = widest_path_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(widest_path)); - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphPagerank_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const void *alpha, - const size_t bookmark, - const int has_guess, - const size_t rank, - const float tolerance, - const int max_iter) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) - || check_ptr(alpha)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a 
MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSC_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; - - if (!(has_guess == 0 || has_guess == 1)) - return NVGRAPH_STATUS_INVALID_VALUE; - - int max_it; - float tol; - - if (max_iter > 0) - max_it = max_iter; - else - max_it = 500; - - if (tolerance == 0.0f) - tol = 1.0E-6f; - else if (tolerance < 1.0f && tolerance > 0.0f) - tol = tolerance; - else - return NVGRAPH_STATUS_INVALID_VALUE; - - switch (descrG->T) - { - case CUDA_R_32F: - { - float alphaT = *static_cast(alpha); - if (alphaT <= 0.0f || alphaT >= 1.0f) - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || bookmark >= MCSRG->get_num_vertex_dim() - || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - - int n = static_cast(MCSRG->get_num_vertices()); - nvgraph::Vector guess(n, handle->stream); - nvgraph::Vector bm(n, handle->stream); - if (has_guess) - guess.copy(MCSRG->get_vertex_dim(rank)); - else - guess.fill(static_cast(1.0 / n)); - bm.copy(MCSRG->get_vertex_dim(bookmark)); - nvgraph::Pagerank pagerank_solver( *MCSRG->get_valued_csr_graph(weight_index), - bm); - rc = pagerank_solver.solve(alphaT, guess, MCSRG->get_vertex_dim(rank), tol, max_it); - break; - } - case CUDA_R_64F: - { - double alphaT = *static_cast(alpha); - if (alphaT <= 0.0 || alphaT >= 1.0) - return NVGRAPH_STATUS_INVALID_VALUE; - - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || bookmark >= MCSRG->get_num_vertex_dim() - || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - - int n = static_cast(MCSRG->get_num_vertices()); - nvgraph::Vector guess(n, handle->stream); - nvgraph::Vector bm(n, handle->stream); - bm.copy(MCSRG->get_vertex_dim(bookmark)); - if (has_guess) - guess.copy(MCSRG->get_vertex_dim(rank)); - else - guess.fill(static_cast(1.0 / n)); - nvgraph::Pagerank pagerank_solver( *MCSRG->get_valued_csr_graph(weight_index), - bm); - rc = pagerank_solver.solve(alphaT, guess, MCSRG->get_vertex_dim(rank), tol, max_it); - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphKrylovPagerank_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const void *alpha, - const size_t bookmark, - const float tolerance, - const int max_iter, - const int subspace_size, - const int has_guess, - const size_t rank) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) - || check_ptr(alpha)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSC_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; + switch (descrG->T) + { + case CUDA_R_32F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || widest_path >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector co(n, handle->stream); + 
nvgraph::WidestPath widest_path_solver(*MCSRG->get_valued_csr_graph(weight_index)); + nvgraph::set_connectivity(n, *source_vert, FLT_MAX, -FLT_MAX, co.raw()); + MCSRG->get_vertex_dim(widest_path).copy(co); + rc = widest_path_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(widest_path)); + break; + } + case CUDA_R_64F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || widest_path >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector co(n, handle->stream); + nvgraph::WidestPath widest_path_solver(*MCSRG->get_valued_csr_graph(weight_index)); + nvgraph::set_connectivity(n, *source_vert, DBL_MAX, -DBL_MAX, co.raw()); + MCSRG->get_vertex_dim(widest_path).copy(co); + rc = widest_path_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(widest_path)); + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphPagerank_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t bookmark, + const int has_guess, + const size_t rank, + const float tolerance, + const int max_iter) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) + || check_ptr(alpha)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSC_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + if (!(has_guess == 0 || has_guess == 1)) + return NVGRAPH_STATUS_INVALID_VALUE; + + int max_it; + float tol; + + if (max_iter > 0) + max_it = max_iter; + else + max_it = 500; + + if (tolerance == 0.0f) + tol = 1.0E-6f; + else if (tolerance < 1.0f && tolerance > 0.0f) + tol = tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + float alphaT = *static_cast(alpha); + if (alphaT <= 0.0f || alphaT >= 1.0f) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || bookmark >= MCSRG->get_num_vertex_dim() + || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector guess(n, handle->stream); + nvgraph::Vector bm(n, handle->stream); + if (has_guess) + guess.copy(MCSRG->get_vertex_dim(rank)); + else + guess.fill(static_cast(1.0 / n)); + bm.copy(MCSRG->get_vertex_dim(bookmark)); + nvgraph::Pagerank pagerank_solver(*MCSRG->get_valued_csr_graph(weight_index), bm); + rc = pagerank_solver.solve(alphaT, guess, MCSRG->get_vertex_dim(rank), tol, max_it); + break; + } + case CUDA_R_64F: + { + double alphaT = *static_cast(alpha); + if (alphaT <= 0.0 || alphaT >= 1.0) + return NVGRAPH_STATUS_INVALID_VALUE; + + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || bookmark >= MCSRG->get_num_vertex_dim() + || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector guess(n, handle->stream); 
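The widest-path wrapper above mirrors the SSSP path: same CSC descriptor and dimension bookkeeping, with the semiring seeded to FLT_MAX at the source and -FLT_MAX elsewhere. A small hedged sketch of the corresponding public call, assuming a descriptor prepared exactly like the SSSP sketch earlier (one float edge dimension for weights, one float vertex dimension for the result):

    // Hedged sketch: widest-path query via the public C API; assumes a CSC
    // descriptor already populated as in the SSSP sketch above.
    #include <nvgraph.h>

    nvgraphStatus_t widest_from(nvgraphHandle_t handle, nvgraphGraphDescr_t graph,
                                int source, float* out /* nvertices floats */) {
        // weight_index 0 and widest-path vertex dimension 0 are illustrative choices
        nvgraphStatus_t st = nvgraphWidestPath(handle, graph, 0, &source, 0);
        if (st != NVGRAPH_STATUS_SUCCESS) return st;
        return nvgraphGetVertexData(handle, graph, out, 0);
    }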
+ nvgraph::Vector bm(n, handle->stream); + bm.copy(MCSRG->get_vertex_dim(bookmark)); + if (has_guess) + guess.copy(MCSRG->get_vertex_dim(rank)); + else + guess.fill(static_cast(1.0 / n)); + nvgraph::Pagerank pagerank_solver(*MCSRG->get_valued_csr_graph(weight_index), bm); + rc = pagerank_solver.solve(alphaT, guess, MCSRG->get_vertex_dim(rank), tol, max_it); + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphKrylovPagerank_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t bookmark, + const float tolerance, + const int max_iter, + const int subspace_size, + const int has_guess, + const size_t rank) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) + || check_ptr(alpha)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSC_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; // cudaError_t cuda_status; - int max_it; - int ss_sz; - float tol; - - if (max_iter > 0) - max_it = max_iter; - else - max_it = 500; - - if (subspace_size > 0) - ss_sz = subspace_size; - else - ss_sz = 8; - - if (tolerance == 0.0f) - tol = 1.0E-6f; - else if (tolerance < 1.0f && tolerance > 0.0f) - tol = tolerance; - else - return NVGRAPH_STATUS_INVALID_VALUE; - - switch (descrG->T) - { - case CUDA_R_32F: - { - float alphaT = *static_cast(alpha); - if (alphaT <= 0.0f || alphaT >= 1.0f) - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || bookmark >= MCSRG->get_num_vertex_dim() - || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - - int n = static_cast(MCSRG->get_num_vertices()); - nvgraph::Vector guess(n, handle->stream), eigVals(1, handle->stream); - if (has_guess) - guess.copy(MCSRG->get_vertex_dim(rank)); - else - guess.fill(static_cast(1.0 / n)); - nvgraph::ImplicitArnoldi iram_solver( *MCSRG->get_valued_csr_graph(weight_index), - MCSRG->get_vertex_dim(bookmark), - tol, - max_it, - alphaT); - rc = iram_solver.solve(ss_sz, 1, guess, eigVals, MCSRG->get_vertex_dim(rank)); - break; - } - case CUDA_R_64F: - { - // curently iram solver accept float for alpha - double alphaTemp = *static_cast(alpha); - float alphaT = static_cast(alphaTemp); - if (alphaT <= 0.0f || alphaT >= 1.0f) - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || bookmark >= MCSRG->get_num_vertex_dim() - || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - - int n = static_cast(MCSRG->get_num_vertices()); - nvgraph::Vector guess(n, handle->stream), eigVals(1, handle->stream); - if (has_guess) - guess.copy(MCSRG->get_vertex_dim(rank)); - else - guess.fill(static_cast(1.0 / n)); - nvgraph::ImplicitArnoldi iram_solver( *MCSRG->get_valued_csr_graph(weight_index), - MCSRG->get_vertex_dim(bookmark), - tol, - max_it, - alphaT); - rc = iram_solver.solve(ss_sz, 1, guess, eigVals, MCSRG->get_vertex_dim(rank)); - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - 
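Likewise for PageRank: the wrapper above expects transition weights in an edge dimension, the dangling-vertex bookmark in one vertex dimension and the ranks in another, and it substitutes defaults (500 iterations, 1e-6 tolerance) when max_iter or tolerance is left at zero. A hedged end-to-end sketch against the public API; the graph data, bookmark values and dimension indices are illustrative:

    // Hedged sketch: exercises nvgraphPagerank() against the wrapper above.
    // Assumes the public nvGraph C API from <nvgraph.h>; error handling elided.
    #include <nvgraph.h>
    #include <cstdio>

    int main() {
        // 3 vertices, 3 edges in CSC form; weights stand in for the
        // column-stochastic transition probabilities (illustrative values).
        int   destination_offsets[] = {0, 1, 2, 3};
        int   source_indices[]      = {2, 0, 1};
        float weights[]             = {1.0f, 1.0f, 1.0f};
        float bookmark[]            = {0.0f, 0.0f, 0.0f};  // no dangling vertices
        float alpha                 = 0.85f;               // damping factor, must be in (0, 1)

        nvgraphHandle_t handle;     nvgraphCreate(&handle);
        nvgraphGraphDescr_t graph;  nvgraphCreateGraphDescr(handle, &graph);

        nvgraphCSCTopology32I_st topo;
        topo.nvertices = 3;  topo.nedges = 3;
        topo.destination_offsets = destination_offsets;
        topo.source_indices      = source_indices;
        nvgraphSetGraphStructure(handle, graph, &topo, NVGRAPH_CSC_32);

        cudaDataType_t etype    = CUDA_R_32F;
        cudaDataType_t vtypes[] = {CUDA_R_32F, CUDA_R_32F};  // 0: bookmark, 1: rank
        nvgraphAllocateEdgeData(handle, graph, 1, &etype);
        nvgraphAllocateVertexData(handle, graph, 2, vtypes);
        nvgraphSetEdgeData(handle, graph, weights, 0);
        nvgraphSetVertexData(handle, graph, bookmark, 0);

        // tolerance 0.0f and max_iter 0 pick up the defaults (1e-6, 500) applied above
        nvgraphPagerank(handle, graph, 0, &alpha, 0, 0 /*has_guess*/, 1, 0.0f, 0);

        float ranks[3];
        nvgraphGetVertexData(handle, graph, ranks, 1);
        printf("rank[0] = %f\n", ranks[0]);

        nvgraphDestroyGraphDescr(handle, graph);
        nvgraphDestroy(handle);
        return 0;
    }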
NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByVertex_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t subdescrG, - int *subvertices, - size_t numvertices) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - typedef int IndexType; - - try - { - if (check_context(handle) || - check_graph(descrG) || - !subdescrG || - check_int_size(numvertices) || - check_ptr(subvertices)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (!numvertices) - return NVGRAPH_STATUS_INVALID_VALUE; - - subdescrG->TT = descrG->TT; - subdescrG->T = descrG->T; - - switch (descrG->graphStatus) - { - case HAS_TOPOLOGY: //CsrGraph - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - - Graph* subgraph = extract_subgraph_by_vertices(*CSRG, - subvertices, - numvertices, - handle->stream); - - subdescrG->graph_handle = subgraph; - subdescrG->graphStatus = HAS_TOPOLOGY; - } - break; - - case HAS_VALUES: //MultiValuedCsrGraph - if (descrG->T == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - - nvgraph::MultiValuedCsrGraph* subgraph = - extract_subgraph_by_vertices(*MCSRG, - subvertices, - numvertices, - handle->stream); - - subdescrG->graph_handle = subgraph; - subdescrG->graphStatus = HAS_VALUES; - } - else if (descrG->T == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - - nvgraph::MultiValuedCsrGraph* subgraph = - extract_subgraph_by_vertices(*MCSRG, - subvertices, - numvertices, - handle->stream); - - subdescrG->graph_handle = subgraph; - subdescrG->graphStatus = HAS_VALUES; - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - break; - - default: - return NVGRAPH_STATUS_INVALID_VALUE; - } - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByEdge_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t subdescrG, - int *subedges, - size_t numedges) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - //TODO: extract handle->stream info, from handler/nvgraphContext (?) 
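The vertex-induced extraction implemented above is normally reached through nvgraphExtractSubgraphByVertex on a second, freshly created descriptor, which then takes over the topology and value types of the source graph. A short hedged sketch; the vertex ids and the helper name are illustrative:

    // Hedged sketch: extracting an induced subgraph through the public C API,
    // which lands in nvgraphExtractSubgraphByVertex_impl above.
    #include <nvgraph.h>

    // Assumes 'graph' already holds CSR topology (and optionally values).
    nvgraphStatus_t extract_two_vertices(nvgraphHandle_t handle,
                                         nvgraphGraphDescr_t graph,
                                         nvgraphGraphDescr_t* subgraph_out) {
        nvgraphStatus_t st = nvgraphCreateGraphDescr(handle, subgraph_out);
        if (st != NVGRAPH_STATUS_SUCCESS) return st;

        int subvertices[] = {0, 2};   // illustrative vertex ids
        // The new descriptor inherits the topology and value types of the source.
        return nvgraphExtractSubgraphByVertex(handle, graph, *subgraph_out,
                                              subvertices, 2);
    }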
- typedef int IndexType; - - try - { - if (check_context(handle) || - check_graph(descrG) || - !subdescrG || - check_int_size(numedges) || - check_ptr(subedges)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (!numedges) - return NVGRAPH_STATUS_INVALID_VALUE; - - subdescrG->TT = descrG->TT; - subdescrG->T = descrG->T; - - switch (descrG->graphStatus) - { - case HAS_TOPOLOGY: //CsrGraph - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - Graph* subgraph = extract_subgraph_by_edges(*CSRG, - subedges, - numedges, - handle->stream); - - subdescrG->graph_handle = subgraph; - subdescrG->graphStatus = HAS_TOPOLOGY; - } - break; - - case HAS_VALUES: //MultiValuedCsrGraph - if (descrG->T == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - - nvgraph::MultiValuedCsrGraph* subgraph = - extract_subgraph_by_edges(*MCSRG, subedges, numedges, handle->stream); - - subdescrG->graph_handle = subgraph; - subdescrG->graphStatus = HAS_VALUES; - } - else if (descrG->T == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - - nvgraph::MultiValuedCsrGraph* subgraph = - extract_subgraph_by_edges(*MCSRG, subedges, numedges, handle->stream); - - subdescrG->graph_handle = subgraph; - subdescrG->graphStatus = HAS_VALUES; - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - break; - - default: - return NVGRAPH_STATUS_INVALID_VALUE; - } - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphBalancedCutClustering_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int n_clusters, - const int n_eig_vects, - const int evs_type, - const float evs_tolerance, - const int evs_max_iter, - const float kmean_tolerance, - const int kmean_max_iter, - int* clustering, - void* eig_vals, - void* eig_vects) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSR_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; - - int evs_max_it, kmean_max_it; - int iters_lanczos, iters_kmeans; - float evs_tol, kmean_tol; - - if (evs_max_iter > 0) - evs_max_it = evs_max_iter; - else - evs_max_it = 4000; - - if (evs_tolerance == 0.0f) - evs_tol = 1.0E-3f; - else if (evs_tolerance < 1.0f && evs_tolerance > 0.0f) - evs_tol = evs_tolerance; - else - return NVGRAPH_STATUS_INVALID_VALUE; - - if (kmean_max_iter > 0) - kmean_max_it = kmean_max_iter; - else - kmean_max_it = 200; - - if (kmean_tolerance == 0.0f) - kmean_tol = 1.0E-2f; - else if (kmean_tolerance < 1.0f && kmean_tolerance > 0.0f) - kmean_tol = kmean_tolerance; - else - return NVGRAPH_STATUS_INVALID_VALUE; - - if (n_clusters < 2) - return NVGRAPH_STATUS_INVALID_VALUE; - - if (n_eig_vects > n_clusters) - return NVGRAPH_STATUS_INVALID_VALUE; - - if (!(evs_type == 0 || evs_type == 1)) - return NVGRAPH_STATUS_INVALID_VALUE; - - if (clustering == NULL || eig_vals == NULL || eig_vects == NULL) - return NVGRAPH_STATUS_INVALID_VALUE; - - switch (descrG->T) - { - case CUDA_R_32F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || n_clusters > static_cast(MCSRG->get_num_vertices())) 
// base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector clust(MCSRG->get_num_vertices(), handle->stream); - Vector eigVals(n_eig_vects, handle->stream); - Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); - - if (evs_type == 0) - { - int restartIter_lanczos = 15 + n_eig_vects; - rc = partition(network, - n_clusters, - n_eig_vects, - evs_max_it, - restartIter_lanczos, - evs_tol, - kmean_max_it, - kmean_tol, - clust.raw(), - eigVals, - eigVecs, - iters_lanczos, - iters_kmeans); - } - else - { - cusolverDnHandle_t cusolverHandle; - cusolverDnCreate(&cusolverHandle); - rc = partition_lobpcg(network, - NULL, // preconditioner - cusolverHandle, - n_clusters, - n_eig_vects, - evs_max_it, - evs_tol, - kmean_max_it, - kmean_tol, - clust.raw(), - eigVals, - eigVecs, - iters_lanczos, - iters_kmeans); - } - // give a copy of results to the user - if (rc == NVGRAPH_OK) - { - CHECK_CUDA(cudaMemcpy((int* )clustering, - clust.raw(), - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy((float* )eig_vals, - eigVals.raw(), - (size_t )(n_eig_vects * sizeof(float)), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy((float* )eig_vects, - eigVecs.raw(), - (size_t )(n_eig_vects * MCSRG->get_num_vertices() - * sizeof(float)), - cudaMemcpyDefault)); - } - - break; - } - case CUDA_R_64F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector clust(MCSRG->get_num_vertices(), handle->stream); - Vector eigVals(n_eig_vects, handle->stream); - Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); - if (evs_type == 0) - { - int restartIter_lanczos = 15 + n_eig_vects; - rc = partition(network, - n_clusters, - n_eig_vects, - evs_max_it, - restartIter_lanczos, - evs_tol, - kmean_max_it, - kmean_tol, - clust.raw(), - eigVals, - eigVecs, - iters_lanczos, - iters_kmeans); - } - else - { - cusolverDnHandle_t cusolverHandle; - cusolverDnCreate(&cusolverHandle); - rc = partition_lobpcg(network, - NULL, // preconditioner - cusolverHandle, - n_clusters, - n_eig_vects, - evs_max_it, - evs_tol, - kmean_max_it, - kmean_tol, - clust.raw(), - eigVals, - eigVecs, - iters_lanczos, - iters_kmeans); - } - // give a copy of results to the user - if (rc == NVGRAPH_OK) - { - CHECK_CUDA(cudaMemcpy((int* )clustering, - clust.raw(), - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy((double* )eig_vals, - eigVals.raw(), - (size_t )(n_eig_vects * sizeof(double)), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy((double* )eig_vects, - eigVecs.raw(), - (size_t )(n_eig_vects * MCSRG->get_num_vertices() - * sizeof(double)), - cudaMemcpyDefault)); - } - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeBalancedCut_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int n_clusters, - const int* clustering, - float * edgeCut, - float * ratioCut) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) - 
FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSR_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; - - if (n_clusters < 2) - return NVGRAPH_STATUS_INVALID_VALUE; - - if (clustering == NULL || edgeCut == NULL || ratioCut == NULL) - return NVGRAPH_STATUS_INVALID_VALUE; - - switch (descrG->T) - { - case CUDA_R_32F: - { - float edge_cut, ratio_cut; - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || n_clusters > static_cast(MCSRG->get_num_vertices())) - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector clust(MCSRG->get_num_vertices(), handle->stream); - CHECK_CUDA(cudaMemcpy(clust.raw(), - (int* )clustering, - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - rc = analyzePartition(network, - n_clusters, - clust.raw(), - edge_cut, - ratio_cut); - *edgeCut = edge_cut; - *ratioCut = ratio_cut; - break; - } - case CUDA_R_64F: - { - double edge_cut, ratio_cut; - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector clust(MCSRG->get_num_vertices(), handle->stream); - CHECK_CUDA(cudaMemcpy(clust.raw(), - (int* )clustering, - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - rc = analyzePartition(network, - n_clusters, - clust.raw(), - edge_cut, - ratio_cut); - *edgeCut = static_cast(edge_cut); - *ratioCut = static_cast(ratio_cut); - break; - } - - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - - } - - nvgraphStatus_t NVGRAPH_API nvgraphHeavyEdgeMatching_impl( nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const nvgraphEdgeWeightMatching_t similarity_metric, - int* aggregates, - size_t* num_aggregates) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSR_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; - - if (aggregates == NULL) - return NVGRAPH_STATUS_INVALID_VALUE; - Matching_t sim_metric; - switch (similarity_metric) - { - case NVGRAPH_UNSCALED: { - sim_metric = USER_PROVIDED; - break; - } - case NVGRAPH_SCALED_BY_ROW_SUM: { - sim_metric = SCALED_BY_ROW_SUM; - break; - } - case NVGRAPH_SCALED_BY_DIAGONAL: { - sim_metric = SCALED_BY_DIAGONAL; - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - - switch (descrG->T) - { - case CUDA_R_32F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim()) - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector agg(MCSRG->get_num_vertices(), handle->stream); - int num_agg = 0; - nvgraph::Size2Selector 
one_phase_hand_checking(sim_metric); - rc = one_phase_hand_checking.setAggregates(network, agg, num_agg); - *num_aggregates = static_cast(num_agg); - CHECK_CUDA(cudaMemcpy((int* )aggregates, - agg.raw(), - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - break; - } - case CUDA_R_64F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim()) - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector agg(MCSRG->get_num_vertices(), handle->stream); - Vector agg_global(MCSRG->get_num_vertices(), handle->stream); - int num_agg = 0; - nvgraph::Size2Selector one_phase_hand_checking(sim_metric); - rc = one_phase_hand_checking.setAggregates(network, agg, num_agg); - *num_aggregates = static_cast(num_agg); - CHECK_CUDA(cudaMemcpy((int* )aggregates, - agg.raw(), - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - - } - - nvgraphStatus_t NVGRAPH_API nvgraphSpectralModularityMaximization_impl( nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int n_clusters, - const int n_eig_vects, - const float evs_tolerance, - const int evs_max_iter, - const float kmean_tolerance, - const int kmean_max_iter, - int* clustering, - void* eig_vals, - void* eig_vects) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSR_32) // supported topologies - return NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED; - - int evs_max_it, kmean_max_it; - int iters_lanczos, iters_kmeans; - float evs_tol, kmean_tol; - - if (evs_max_iter > 0) - evs_max_it = evs_max_iter; - else - evs_max_it = 4000; - - if (evs_tolerance == 0.0f) - evs_tol = 1.0E-3f; - else if (evs_tolerance < 1.0f && evs_tolerance > 0.0f) - evs_tol = evs_tolerance; - else - return NVGRAPH_STATUS_INVALID_VALUE; - - if (kmean_max_iter > 0) - kmean_max_it = kmean_max_iter; - else - kmean_max_it = 200; - - if (kmean_tolerance == 0.0f) - kmean_tol = 1.0E-2f; - else if (kmean_tolerance < 1.0f && kmean_tolerance > 0.0f) - kmean_tol = kmean_tolerance; - else - return NVGRAPH_STATUS_INVALID_VALUE; - - if (n_clusters < 2) - return NVGRAPH_STATUS_INVALID_VALUE; - - if (n_eig_vects > n_clusters) - return NVGRAPH_STATUS_INVALID_VALUE; - - if (clustering == NULL || eig_vals == NULL || eig_vects == NULL) - return NVGRAPH_STATUS_INVALID_VALUE; - - switch (descrG->T) - { - case CUDA_R_32F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector clust(MCSRG->get_num_vertices(), handle->stream); - Vector eigVals(n_eig_vects, handle->stream); - Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); - int restartIter_lanczos = 15 + n_eig_vects; - rc = modularity_maximization(network, - n_clusters, - n_eig_vects, - evs_max_it, - 
restartIter_lanczos, - evs_tol, - kmean_max_it, - kmean_tol, - clust.raw(), - eigVals, - eigVecs, - iters_lanczos, - iters_kmeans); - - // give a copy of results to the user - if (rc == NVGRAPH_OK) - { - CHECK_CUDA(cudaMemcpy((int* )clustering, - clust.raw(), - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy((float* )eig_vals, - eigVals.raw(), - (size_t )(n_eig_vects * sizeof(float)), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy((float* )eig_vects, - eigVecs.raw(), - (size_t )(n_eig_vects * MCSRG->get_num_vertices() - * sizeof(float)), - cudaMemcpyDefault)); - } - - break; - } - case CUDA_R_64F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector clust(MCSRG->get_num_vertices(), handle->stream); - Vector eigVals(n_eig_vects, handle->stream); - Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); - int restartIter_lanczos = 15 + n_eig_vects; - rc = modularity_maximization(network, - n_clusters, - n_eig_vects, - evs_max_it, - restartIter_lanczos, - evs_tol, - kmean_max_it, - kmean_tol, - clust.raw(), - eigVals, - eigVecs, - iters_lanczos, - iters_kmeans); - // give a copy of results to the user - if (rc == NVGRAPH_OK) - { - CHECK_CUDA(cudaMemcpy((int* )clustering, - clust.raw(), - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy((double* )eig_vals, - eigVals.raw(), - (size_t )(n_eig_vects * sizeof(double)), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy((double* )eig_vects, - eigVecs.raw(), - (size_t )(n_eig_vects * MCSRG->get_num_vertices() - * sizeof(double)), - cudaMemcpyDefault)); - } - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeModularityClustering_impl( nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int n_clusters, - const int* clustering, - float * modularity) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSR_32) // supported topologies - return NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED; - - if (n_clusters < 2) - return NVGRAPH_STATUS_INVALID_VALUE; - - if (clustering == NULL || modularity == NULL) - return NVGRAPH_STATUS_INVALID_VALUE; - - switch (descrG->T) - { - case CUDA_R_32F: - { - float mod; - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || n_clusters > static_cast(MCSRG->get_num_vertices())) - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector clust(MCSRG->get_num_vertices(), handle->stream); - CHECK_CUDA(cudaMemcpy(clust.raw(), - (int* )clustering, - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - rc = analyzeModularity(network, - n_clusters, - clust.raw(), - mod); - *modularity = mod; - break; - } - case CUDA_R_64F: - 
{ - double mod; - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - Vector clust(MCSRG->get_num_vertices(), handle->stream); - CHECK_CUDA(cudaMemcpy(clust.raw(), - (int* )clustering, - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - rc = analyzeModularity(network, - n_clusters, - clust.raw(), - mod); - *modularity = static_cast(mod); - break; - } - - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - } + int max_it; + int ss_sz; + float tol; + + if (max_iter > 0) + max_it = max_iter; + else + max_it = 500; + + if (subspace_size > 0) + ss_sz = subspace_size; + else + ss_sz = 8; + + if (tolerance == 0.0f) + tol = 1.0E-6f; + else if (tolerance < 1.0f && tolerance > 0.0f) + tol = tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + float alphaT = *static_cast(alpha); + if (alphaT <= 0.0f || alphaT >= 1.0f) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || bookmark >= MCSRG->get_num_vertex_dim() + || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector guess(n, handle->stream), eigVals(1, handle->stream); + if (has_guess) + guess.copy(MCSRG->get_vertex_dim(rank)); + else + guess.fill(static_cast(1.0 / n)); + nvgraph::ImplicitArnoldi iram_solver(*MCSRG->get_valued_csr_graph(weight_index), + MCSRG->get_vertex_dim(bookmark), + tol, + max_it, + alphaT); + rc = iram_solver.solve(ss_sz, 1, guess, eigVals, MCSRG->get_vertex_dim(rank)); + break; + } + case CUDA_R_64F: + { + // curently iram solver accept float for alpha + double alphaTemp = *static_cast(alpha); + float alphaT = static_cast(alphaTemp); + if (alphaT <= 0.0f || alphaT >= 1.0f) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || bookmark >= MCSRG->get_num_vertex_dim() + || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector guess(n, handle->stream), eigVals(1, handle->stream); + if (has_guess) + guess.copy(MCSRG->get_vertex_dim(rank)); + else + guess.fill(static_cast(1.0 / n)); + nvgraph::ImplicitArnoldi iram_solver(*MCSRG->get_valued_csr_graph(weight_index), + MCSRG->get_vertex_dim(bookmark), + tol, + max_it, + alphaT); + rc = iram_solver.solve(ss_sz, 1, guess, eigVals, MCSRG->get_vertex_dim(rank)); + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByVertex_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subvertices, + size_t numvertices) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + typedef int IndexType; + + try + { + if (check_context(handle) || + check_graph(descrG) || + !subdescrG || + check_int_size(numvertices) || + check_ptr(subvertices)) + FatalError("Incorrect 
parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (!numvertices) + return NVGRAPH_STATUS_INVALID_VALUE; + + subdescrG->TT = descrG->TT; + subdescrG->T = descrG->T; + + switch (descrG->graphStatus) + { + case HAS_TOPOLOGY: //CsrGraph + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + + Graph* subgraph = extract_subgraph_by_vertices(*CSRG, + subvertices, + numvertices, + handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_TOPOLOGY; + } + break; + + case HAS_VALUES: //MultiValuedCsrGraph + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + + nvgraph::MultiValuedCsrGraph* subgraph = + extract_subgraph_by_vertices(*MCSRG, + subvertices, + numvertices, + handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_VALUES; + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + + nvgraph::MultiValuedCsrGraph* subgraph = + extract_subgraph_by_vertices(*MCSRG, + subvertices, + numvertices, + handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_VALUES; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + break; + + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByEdge_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subedges, + size_t numedges) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + //TODO: extract handle->stream info, from handler/nvgraphContext (?) + typedef int IndexType; + + try + { + if (check_context(handle) || + check_graph(descrG) || + !subdescrG || + check_int_size(numedges) || + check_ptr(subedges)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (!numedges) + return NVGRAPH_STATUS_INVALID_VALUE; + + subdescrG->TT = descrG->TT; + subdescrG->T = descrG->T; + + switch (descrG->graphStatus) + { + case HAS_TOPOLOGY: //CsrGraph + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + Graph* subgraph = extract_subgraph_by_edges(*CSRG, + subedges, + numedges, + handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_TOPOLOGY; + } + break; + + case HAS_VALUES: //MultiValuedCsrGraph + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + + nvgraph::MultiValuedCsrGraph* subgraph = + extract_subgraph_by_edges(*MCSRG, subedges, numedges, handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_VALUES; + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + + nvgraph::MultiValuedCsrGraph* subgraph = + extract_subgraph_by_edges(*MCSRG, subedges, numedges, handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_VALUES; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + break; + + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphBalancedCutClustering_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int n_eig_vects, + const int evs_type, + const float evs_tolerance, + const int evs_max_iter, + const float kmean_tolerance, + const int 
kmean_max_iter, + int* clustering, + void* eig_vals, + void* eig_vects) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + int evs_max_it, kmean_max_it; + int iters_lanczos, iters_kmeans; + float evs_tol, kmean_tol; + + if (evs_max_iter > 0) + evs_max_it = evs_max_iter; + else + evs_max_it = 4000; + + if (evs_tolerance == 0.0f) + evs_tol = 1.0E-3f; + else if (evs_tolerance < 1.0f && evs_tolerance > 0.0f) + evs_tol = evs_tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + if (kmean_max_iter > 0) + kmean_max_it = kmean_max_iter; + else + kmean_max_it = 200; + + if (kmean_tolerance == 0.0f) + kmean_tol = 1.0E-2f; + else if (kmean_tolerance < 1.0f && kmean_tolerance > 0.0f) + kmean_tol = kmean_tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + if (n_clusters < 2) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (n_eig_vects > n_clusters) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (!(evs_type == 0 || evs_type == 1)) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (clustering == NULL || eig_vals == NULL || eig_vects == NULL) + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + Vector eigVals(n_eig_vects, handle->stream); + Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); + + if (evs_type == 0) + { + int restartIter_lanczos = 15 + n_eig_vects; + rc = partition(network, + n_clusters, + n_eig_vects, + evs_max_it, + restartIter_lanczos, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + } + else + { + cusolverDnHandle_t cusolverHandle; + cusolverDnCreate(&cusolverHandle); + rc = partition_lobpcg(network, + NULL, // preconditioner + cusolverHandle, + n_clusters, + n_eig_vects, + evs_max_it, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + } + // give a copy of results to the user + if (rc == NVGRAPH_OK) + { + CHECK_CUDA(cudaMemcpy((int* )clustering, + clust.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((float* )eig_vals, + eigVals.raw(), + (size_t )(n_eig_vects * sizeof(float)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((float* )eig_vects, + eigVecs.raw(), + (size_t )(n_eig_vects * MCSRG->get_num_vertices() + * sizeof(float)), + cudaMemcpyDefault)); + } + + break; + } + case CUDA_R_64F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + Vector eigVals(n_eig_vects, 
handle->stream); + Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); + if (evs_type == 0) + { + int restartIter_lanczos = 15 + n_eig_vects; + rc = partition(network, + n_clusters, + n_eig_vects, + evs_max_it, + restartIter_lanczos, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + } + else + { + cusolverDnHandle_t cusolverHandle; + cusolverDnCreate(&cusolverHandle); + rc = partition_lobpcg(network, + NULL, // preconditioner + cusolverHandle, + n_clusters, + n_eig_vects, + evs_max_it, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + } + // give a copy of results to the user + if (rc == NVGRAPH_OK) + { + CHECK_CUDA(cudaMemcpy((int* )clustering, + clust.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((double* )eig_vals, + eigVals.raw(), + (size_t )(n_eig_vects * sizeof(double)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((double* )eig_vects, + eigVecs.raw(), + (size_t )(n_eig_vects * MCSRG->get_num_vertices() + * sizeof(double)), + cudaMemcpyDefault)); + } + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeBalancedCut_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int* clustering, + float * edgeCut, + float * ratioCut) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + if (n_clusters < 2) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (clustering == NULL || edgeCut == NULL || ratioCut == NULL) + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + float edge_cut, ratio_cut; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + CHECK_CUDA(cudaMemcpy(clust.raw(), + (int* )clustering, + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + rc = analyzePartition(network, + n_clusters, + clust.raw(), + edge_cut, + ratio_cut); + *edgeCut = edge_cut; + *ratioCut = ratio_cut; + break; + } + case CUDA_R_64F: + { + double edge_cut, ratio_cut; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + CHECK_CUDA(cudaMemcpy(clust.raw(), + (int* )clustering, + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + rc = analyzePartition(network, + n_clusters, + clust.raw(), + edge_cut, + ratio_cut); + *edgeCut = 
static_cast(edge_cut); + *ratioCut = static_cast(ratio_cut); + break; + } + + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + + } + + nvgraphStatus_t NVGRAPH_API nvgraphHeavyEdgeMatching_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const nvgraphEdgeWeightMatching_t similarity_metric, + int* aggregates, + size_t* num_aggregates) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + if (aggregates == NULL) + return NVGRAPH_STATUS_INVALID_VALUE; + Matching_t sim_metric; + switch (similarity_metric) + { + case NVGRAPH_UNSCALED: { + sim_metric = USER_PROVIDED; + break; + } + case NVGRAPH_SCALED_BY_ROW_SUM: { + sim_metric = SCALED_BY_ROW_SUM; + break; + } + case NVGRAPH_SCALED_BY_DIAGONAL: { + sim_metric = SCALED_BY_DIAGONAL; + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + + switch (descrG->T) + { + case CUDA_R_32F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim()) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector agg(MCSRG->get_num_vertices(), handle->stream); + int num_agg = 0; + nvgraph::Size2Selector one_phase_hand_checking(sim_metric); + rc = one_phase_hand_checking.setAggregates(network, agg, num_agg); + *num_aggregates = static_cast(num_agg); + CHECK_CUDA(cudaMemcpy((int* )aggregates, + agg.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + break; + } + case CUDA_R_64F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim()) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector agg(MCSRG->get_num_vertices(), handle->stream); + Vector agg_global(MCSRG->get_num_vertices(), handle->stream); + int num_agg = 0; + nvgraph::Size2Selector one_phase_hand_checking(sim_metric); + rc = one_phase_hand_checking.setAggregates(network, agg, num_agg); + *num_aggregates = static_cast(num_agg); + CHECK_CUDA(cudaMemcpy((int* )aggregates, + agg.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + + } + + nvgraphStatus_t NVGRAPH_API nvgraphSpectralModularityMaximization_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int n_eig_vects, + const float evs_tolerance, + const int evs_max_iter, + const float kmean_tolerance, + const int kmean_max_iter, + int* clustering, + void* eig_vals, + void* eig_vects) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != 
NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED; + + int evs_max_it, kmean_max_it; + int iters_lanczos, iters_kmeans; + float evs_tol, kmean_tol; + + if (evs_max_iter > 0) + evs_max_it = evs_max_iter; + else + evs_max_it = 4000; + + if (evs_tolerance == 0.0f) + evs_tol = 1.0E-3f; + else if (evs_tolerance < 1.0f && evs_tolerance > 0.0f) + evs_tol = evs_tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + if (kmean_max_iter > 0) + kmean_max_it = kmean_max_iter; + else + kmean_max_it = 200; + + if (kmean_tolerance == 0.0f) + kmean_tol = 1.0E-2f; + else if (kmean_tolerance < 1.0f && kmean_tolerance > 0.0f) + kmean_tol = kmean_tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + if (n_clusters < 2) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (n_eig_vects > n_clusters) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (clustering == NULL || eig_vals == NULL || eig_vects == NULL) + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + Vector eigVals(n_eig_vects, handle->stream); + Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); + int restartIter_lanczos = 15 + n_eig_vects; + rc = modularity_maximization(network, + n_clusters, + n_eig_vects, + evs_max_it, + restartIter_lanczos, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + + // give a copy of results to the user + if (rc == NVGRAPH_OK) + { + CHECK_CUDA(cudaMemcpy((int* )clustering, + clust.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((float* )eig_vals, + eigVals.raw(), + (size_t )(n_eig_vects * sizeof(float)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((float* )eig_vects, + eigVecs.raw(), + (size_t )(n_eig_vects * MCSRG->get_num_vertices() + * sizeof(float)), + cudaMemcpyDefault)); + } + + break; + } + case CUDA_R_64F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + Vector eigVals(n_eig_vects, handle->stream); + Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); + int restartIter_lanczos = 15 + n_eig_vects; + rc = modularity_maximization(network, + n_clusters, + n_eig_vects, + evs_max_it, + restartIter_lanczos, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + // give a copy of results to the user + if (rc == NVGRAPH_OK) + { + CHECK_CUDA(cudaMemcpy((int* )clustering, + clust.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((double* )eig_vals, + eigVals.raw(), + (size_t )(n_eig_vects * sizeof(double)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((double* )eig_vects, + eigVecs.raw(), + (size_t )(n_eig_vects * MCSRG->get_num_vertices() + * 
sizeof(double)), + cudaMemcpyDefault)); + } + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeModularityClustering_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int* clustering, + float * modularity) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED; + + if (n_clusters < 2) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (clustering == NULL || modularity == NULL) + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + float mod; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + CHECK_CUDA(cudaMemcpy(clust.raw(), + (int* )clustering, + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + rc = analyzeModularity(network, + n_clusters, + clust.raw(), + mod); + *modularity = mod; + break; + } + case CUDA_R_64F: + { + double mod; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + Vector clust(MCSRG->get_num_vertices(), handle->stream); + CHECK_CUDA(cudaMemcpy(clust.raw(), + (int* )clustering, + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + rc = analyzeModularity(network, + n_clusters, + clust.raw(), + mod); + *modularity = static_cast(mod); + break; + } + + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } #ifndef NVGRAPH_LIGHT - nvgraphStatus_t NVGRAPH_API nvgraphContractGraph_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t contrdescrG, - int *aggregates, - size_t numaggregates, - nvgraphSemiringOps_t VertexCombineOp, - nvgraphSemiringOps_t VertexReduceOp, - nvgraphSemiringOps_t EdgeCombineOp, - nvgraphSemiringOps_t EdgeReduceOp, - int flag) //unused, for now - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - typedef int IndexType; - - try { - if (check_context(handle) || - check_graph(descrG) || - !contrdescrG || - check_int_size(numaggregates) || - check_ptr(aggregates)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - contrdescrG->TT = descrG->TT; - contrdescrG->T = descrG->T; - - switch (descrG->graphStatus) - { - case HAS_TOPOLOGY: //CsrGraph - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - - Graph* contracted_graph = NULL; - - switch (VertexCombineOp) - { - case NVGRAPH_MULTIPLY: - contracted_graph = contract_graph_csr_mul(*CSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - 
VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_SUM: - contracted_graph = contract_graph_csr_sum(*CSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_MIN: - contracted_graph = contract_graph_csr_min(*CSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_MAX: - contracted_graph = contract_graph_csr_max(*CSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - } - - contrdescrG->graph_handle = contracted_graph; - contrdescrG->graphStatus = HAS_TOPOLOGY; - } - break; - - case HAS_VALUES: //MultiValuedCsrGraph - if (descrG->T == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph* contracted_graph = NULL; - - switch (VertexCombineOp) - { - case NVGRAPH_MULTIPLY: - contracted_graph = contract_graph_mv_float_mul(*MCSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_SUM: - contracted_graph = contract_graph_mv_float_sum(*MCSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_MIN: - contracted_graph = contract_graph_mv_float_min(*MCSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_MAX: - contracted_graph = contract_graph_mv_float_max(*MCSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - } - - contrdescrG->graph_handle = contracted_graph; - contrdescrG->graphStatus = HAS_VALUES; - } - else if (descrG->T == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - - nvgraph::MultiValuedCsrGraph* contracted_graph = NULL; - - switch (VertexCombineOp) - { - case NVGRAPH_MULTIPLY: - contracted_graph = contract_graph_mv_double_mul(*MCSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_SUM: - contracted_graph = contract_graph_mv_double_sum(*MCSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_MIN: - contracted_graph = contract_graph_mv_double_min(*MCSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_MAX: - contracted_graph = contract_graph_mv_double_max(*MCSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - } - - contrdescrG->graph_handle = contracted_graph; - contrdescrG->graphStatus = HAS_VALUES; - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - break; - - default: - return NVGRAPH_STATUS_INVALID_VALUE; - } - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } + nvgraphStatus_t NVGRAPH_API nvgraphContractGraph_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t contrdescrG, + int *aggregates, + size_t numaggregates, + nvgraphSemiringOps_t VertexCombineOp, + nvgraphSemiringOps_t VertexReduceOp, + 
nvgraphSemiringOps_t EdgeCombineOp, + nvgraphSemiringOps_t EdgeReduceOp, + int flag) {//unused, for now + NVGRAPH_ERROR rc = NVGRAPH_OK; + typedef int IndexType; + + try { + if (check_context(handle) || + check_graph(descrG) || + !contrdescrG || + check_int_size(numaggregates) || + check_ptr(aggregates)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + contrdescrG->TT = descrG->TT; + contrdescrG->T = descrG->T; + + switch (descrG->graphStatus) + { + case HAS_TOPOLOGY: //CsrGraph + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + + Graph* contracted_graph = NULL; + + switch (VertexCombineOp) + { + case NVGRAPH_MULTIPLY: + contracted_graph = contract_graph_csr_mul(*CSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_SUM: + contracted_graph = contract_graph_csr_sum(*CSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_MIN: + contracted_graph = contract_graph_csr_min(*CSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_MAX: + contracted_graph = contract_graph_csr_max(*CSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + } + + contrdescrG->graph_handle = contracted_graph; + contrdescrG->graphStatus = HAS_TOPOLOGY; + } + break; + + case HAS_VALUES: //MultiValuedCsrGraph + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph* contracted_graph = NULL; + + switch (VertexCombineOp) + { + case NVGRAPH_MULTIPLY: + contracted_graph = contract_graph_mv_float_mul(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_SUM: + contracted_graph = contract_graph_mv_float_sum(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_MIN: + contracted_graph = contract_graph_mv_float_min(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_MAX: + contracted_graph = contract_graph_mv_float_max(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + } + + contrdescrG->graph_handle = contracted_graph; + contrdescrG->graphStatus = HAS_VALUES; + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + + nvgraph::MultiValuedCsrGraph* contracted_graph = NULL; + + switch (VertexCombineOp) + { + case NVGRAPH_MULTIPLY: + contracted_graph = contract_graph_mv_double_mul(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_SUM: + contracted_graph = contract_graph_mv_double_sum(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_MIN: + contracted_graph = contract_graph_mv_double_min(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + 
EdgeReduceOp); + break; + case NVGRAPH_MAX: + contracted_graph = contract_graph_mv_double_max(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + } + + contrdescrG->graph_handle = contracted_graph; + contrdescrG->graphStatus = HAS_VALUES; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + break; + + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } #endif - - nvgraphStatus_t NVGRAPH_API nvgraphSpectralClustering_impl(nvgraphHandle_t handle, // nvGRAPH library handle. - const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 or NVGRAPH_CSR_32 at least 1 edge set (weights) - const size_t weight_index, // Index of the edge set for the weights. - const struct SpectralClusteringParameter *params, //parameters, see struct SpectralClusteringParameter - int* clustering, // (output) clustering - void* eig_vals, // (output) eigenvalues - void* eig_vects) // (output) eigenvectors - { - if (check_ptr(params) || check_ptr(clustering) || check_ptr(eig_vals) || check_ptr(eig_vects)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (params->algorithm == NVGRAPH_MODULARITY_MAXIMIZATION) - return nvgraph::nvgraphSpectralModularityMaximization_impl(handle, - descrG, - weight_index, - params->n_clusters, - params->n_eig_vects, - params->evs_tolerance, - params->evs_max_iter, - params->kmean_tolerance, - params->kmean_max_iter, - clustering, - eig_vals, - eig_vects); - else if (params->algorithm == NVGRAPH_BALANCED_CUT_LANCZOS) - return nvgraph::nvgraphBalancedCutClustering_impl(handle, - descrG, - weight_index, - params->n_clusters, - params->n_eig_vects, - 0, - params->evs_tolerance, - params->evs_max_iter, - params->kmean_tolerance, - params->kmean_max_iter, - clustering, - eig_vals, - eig_vects); - else if (params->algorithm == NVGRAPH_BALANCED_CUT_LOBPCG) - return nvgraph::nvgraphBalancedCutClustering_impl(handle, - descrG, - weight_index, - params->n_clusters, - params->n_eig_vects, - 1, - params->evs_tolerance, - params->evs_max_iter, - params->kmean_tolerance, - params->kmean_max_iter, - clustering, - eig_vals, - eig_vects); - else - return NVGRAPH_STATUS_INVALID_VALUE; - } - - nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeClustering_impl(nvgraphHandle_t handle, // nvGRAPH library handle. - const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 at least 1 edge set (weights) - const size_t weight_index, // Index of the edge set for the weights. - const int n_clusters, //number of clusters - const int* clustering, // clustering to analyse - nvgraphClusteringMetric_t metric, // metric to compute to measure the clustering quality - float * score) // (output) clustering score telling how good the clustering is for the selected metric. 
- { - if (check_ptr(clustering) || check_ptr(score)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (metric == NVGRAPH_MODULARITY) - return nvgraphAnalyzeModularityClustering_impl(handle, - descrG, - weight_index, - n_clusters, - clustering, - score); - else if (metric == NVGRAPH_EDGE_CUT) - { - float dummy = 0; - return nvgraph::nvgraphAnalyzeBalancedCut_impl(handle, - descrG, - weight_index, - n_clusters, - clustering, - score, - &dummy); - } - else if (metric == NVGRAPH_RATIO_CUT) - { - float dummy = 0; - return nvgraph::nvgraphAnalyzeBalancedCut_impl(handle, - descrG, - weight_index, - n_clusters, - clustering, - &dummy, - score); - } - else - return NVGRAPH_STATUS_INVALID_VALUE; - } - - nvgraphStatus_t NVGRAPH_API nvgraphTriangleCount_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - uint64_t* result) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_ptr(result)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->TT != NVGRAPH_CSR_32 && descrG->TT != NVGRAPH_CSC_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->graphStatus != HAS_TOPOLOGY && descrG->graphStatus != HAS_VALUES) - { - return NVGRAPH_STATUS_INVALID_VALUE; // should have topology - } - - nvgraph::CsrGraph *CSRG = static_cast*>(descrG->graph_handle); - if (CSRG == NULL) - return NVGRAPH_STATUS_MAPPING_ERROR; - nvgraph::triangles_counting::TrianglesCount counter(*CSRG); /* stream, device */ - rc = counter.count(); - uint64_t s_res = counter.get_triangles_count(); - *result = static_cast(s_res); - - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - } + + nvgraphStatus_t NVGRAPH_API nvgraphSpectralClustering_impl(nvgraphHandle_t handle, // nvGRAPH library handle. + const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 or NVGRAPH_CSR_32 at least 1 edge set (weights) + const size_t weight_index, // Index of the edge set for the weights. 
+ const struct SpectralClusteringParameter *params, //parameters, see struct SpectralClusteringParameter + int* clustering, // (output) clustering + void* eig_vals, // (output) eigenvalues + void* eig_vects) {// (output) eigenvectors + if (check_ptr(params) || check_ptr(clustering) || check_ptr(eig_vals) || check_ptr(eig_vects)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (params->algorithm == NVGRAPH_MODULARITY_MAXIMIZATION) + return nvgraph::nvgraphSpectralModularityMaximization_impl(handle, + descrG, + weight_index, + params->n_clusters, + params->n_eig_vects, + params->evs_tolerance, + params->evs_max_iter, + params->kmean_tolerance, + params->kmean_max_iter, + clustering, + eig_vals, + eig_vects); + else if (params->algorithm == NVGRAPH_BALANCED_CUT_LANCZOS) + return nvgraph::nvgraphBalancedCutClustering_impl(handle, + descrG, + weight_index, + params->n_clusters, + params->n_eig_vects, + 0, + params->evs_tolerance, + params->evs_max_iter, + params->kmean_tolerance, + params->kmean_max_iter, + clustering, + eig_vals, + eig_vects); + else if (params->algorithm == NVGRAPH_BALANCED_CUT_LOBPCG) + return nvgraph::nvgraphBalancedCutClustering_impl(handle, + descrG, + weight_index, + params->n_clusters, + params->n_eig_vects, + 1, + params->evs_tolerance, + params->evs_max_iter, + params->kmean_tolerance, + params->kmean_max_iter, + clustering, + eig_vals, + eig_vects); + else + return NVGRAPH_STATUS_INVALID_VALUE; + } + + nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeClustering_impl(nvgraphHandle_t handle, // nvGRAPH library handle. + const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 at least 1 edge set (weights) + const size_t weight_index, // Index of the edge set for the weights. + const int n_clusters, //number of clusters + const int* clustering, // clustering to analyse + nvgraphClusteringMetric_t metric, // metric to compute to measure the clustering quality + float * score) {// (output) clustering score telling how good the clustering is for the selected metric. 
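Note: the dispatch above routes NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_BALANCED_CUT_LANCZOS and NVGRAPH_BALANCED_CUT_LOBPCG to the two _impl routines. A minimal host-side sketch of how a caller would drive this path through the public entry point, assuming the usual SpectralClusteringParameter and nvgraphSpectralClustering declarations from nvgraph.h (only the fields read by the implementation above are set):

// Sketch only: error handling trimmed. descrG is assumed to already hold a
// CSR_32 graph with one CUDA_R_32F edge set (the weights) at index 0.
#include <nvgraph.h>
#include <vector>

nvgraphStatus_t run_spectral_clustering(nvgraphHandle_t handle,
                                        nvgraphGraphDescr_t descrG,
                                        int n_vertices) {
    const int n_clusters  = 4;
    const int n_eig_vects = 4;   // must be <= n_clusters, as checked above

    SpectralClusteringParameter params;
    params.n_clusters      = n_clusters;
    params.n_eig_vects     = n_eig_vects;
    params.algorithm       = NVGRAPH_MODULARITY_MAXIMIZATION; // or NVGRAPH_BALANCED_CUT_LANCZOS / _LOBPCG
    params.evs_tolerance   = 0.0f;  // 0 picks the defaults applied above (1e-3 tol, 4000 iters on the modularity path)
    params.evs_max_iter    = 0;
    params.kmean_tolerance = 0.0f;  // 0 picks 1e-2 tol, 200 iters
    params.kmean_max_iter  = 0;

    // Results are copied back with cudaMemcpyDefault, so plain host buffers
    // work on UVA systems; device buffers work as well. Eigen data is float
    // because the edge set was allocated as CUDA_R_32F.
    std::vector<int>   clustering(n_vertices);
    std::vector<float> eig_vals(n_eig_vects);
    std::vector<float> eig_vects(static_cast<size_t>(n_vertices) * n_eig_vects);

    return nvgraphSpectralClustering(handle, descrG, /*weight_index=*/0, &params,
                                     clustering.data(), eig_vals.data(), eig_vects.data());
}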
+ if (check_ptr(clustering) || check_ptr(score)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (metric == NVGRAPH_MODULARITY) + return nvgraphAnalyzeModularityClustering_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + score); + else if (metric == NVGRAPH_EDGE_CUT) { + float dummy = 0; + return nvgraph::nvgraphAnalyzeBalancedCut_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + score, + &dummy); + } + else if (metric == NVGRAPH_RATIO_CUT) { + float dummy = 0; + return nvgraph::nvgraphAnalyzeBalancedCut_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + &dummy, + score); + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + } + + nvgraphStatus_t NVGRAPH_API nvgraphTriangleCount_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + uint64_t* result) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_ptr(result)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->TT != NVGRAPH_CSR_32 && descrG->TT != NVGRAPH_CSC_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->graphStatus != HAS_TOPOLOGY && descrG->graphStatus != HAS_VALUES) + { + return NVGRAPH_STATUS_INVALID_VALUE; // should have topology + } + + nvgraph::CsrGraph *CSRG = static_cast*>(descrG->graph_handle); + if (CSRG == NULL) + return NVGRAPH_STATUS_MAPPING_ERROR; + nvgraph::triangles_counting::TrianglesCount counter(*CSRG); /* stream, device */ + rc = counter.count(); + uint64_t s_res = counter.get_triangles_count(); + *result = static_cast(s_res); + + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } } /*namespace nvgraph*/ @@ -3538,337 +3433,324 @@ namespace nvgraph * API *************************/ -nvgraphStatus_t NVGRAPH_API nvgraphGetProperty(libraryPropertyType type, int *value) - { - switch (type) { - case MAJOR_VERSION: - *value = CUDART_VERSION / 1000; - break; - case MINOR_VERSION: - *value = (CUDART_VERSION % 1000) / 10; - break; - case PATCH_LEVEL: - *value = 0; - break; - default: - return NVGRAPH_STATUS_INVALID_VALUE; - } - return NVGRAPH_STATUS_SUCCESS; +nvgraphStatus_t NVGRAPH_API nvgraphGetProperty(libraryPropertyType type, int *value) { + switch (type) { + case MAJOR_VERSION: + *value = CUDART_VERSION / 1000; + break; + case MINOR_VERSION: + *value = (CUDART_VERSION % 1000) / 10; + break; + case PATCH_LEVEL: + *value = 0; + break; + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + return NVGRAPH_STATUS_SUCCESS; } -nvgraphStatus_t NVGRAPH_API nvgraphCreate(nvgraphHandle_t *handle) - { - return nvgraph::nvgraphCreate_impl(handle); +nvgraphStatus_t NVGRAPH_API nvgraphCreate(nvgraphHandle_t *handle) { + return nvgraph::nvgraphCreate_impl(handle); } nvgraphStatus_t NVGRAPH_API nvgraphCreateMulti(nvgraphHandle_t *handle, - int numDevices, - int* devices) { - return nvgraph::nvgraphCreateMulti_impl(handle, numDevices, devices); + int numDevices, + int* devices) { + return nvgraph::nvgraphCreateMulti_impl(handle, numDevices, devices); } -nvgraphStatus_t NVGRAPH_API nvgraphDestroy(nvgraphHandle_t handle) - { - return nvgraph::nvgraphDestroy_impl(handle); +nvgraphStatus_t NVGRAPH_API nvgraphDestroy(nvgraphHandle_t handle) { + return nvgraph::nvgraphDestroy_impl(handle); } nvgraphStatus_t NVGRAPH_API nvgraphCreateGraphDescr(nvgraphHandle_t handle, - nvgraphGraphDescr_t *descrG) - { - return nvgraph::nvgraphCreateGraphDescr_impl(handle, descrG); + nvgraphGraphDescr_t *descrG) { + return 
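Note: nvgraphAnalyzeClustering_impl above reuses the balanced-cut analysis for both NVGRAPH_EDGE_CUT and NVGRAPH_RATIO_CUT by pointing the unrequested output at a dummy. A hedged sketch of the corresponding host-side calls, using the public wrappers that appear further down in this file:

// Sketch only: descrG is assumed to hold CSR topology plus one edge weight
// set; clustering_labels is one int label per vertex (e.g. the output of
// nvgraphSpectralClustering above).
#include <nvgraph.h>
#include <cstdint>

nvgraphStatus_t score_clustering(nvgraphHandle_t handle, nvgraphGraphDescr_t descrG,
                                 int n_clusters, const int* clustering_labels) {
    float modularity = 0.0f, edge_cut = 0.0f;

    nvgraphStatus_t st = nvgraphAnalyzeClustering(handle, descrG, /*weight_index=*/0,
                                                  n_clusters, clustering_labels,
                                                  NVGRAPH_MODULARITY, &modularity);
    if (st != NVGRAPH_STATUS_SUCCESS) return st;

    // Internally this is nvgraphAnalyzeBalancedCut with the ratio-cut output
    // pointed at a dummy, exactly as in the dispatch above.
    st = nvgraphAnalyzeClustering(handle, descrG, /*weight_index=*/0,
                                  n_clusters, clustering_labels,
                                  NVGRAPH_EDGE_CUT, &edge_cut);
    if (st != NVGRAPH_STATUS_SUCCESS) return st;

    // Triangle counting needs only the topology (CSR_32 or CSC_32), no weights.
    uint64_t n_triangles = 0;
    return nvgraphTriangleCount(handle, descrG, &n_triangles);
}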
nvgraph::nvgraphCreateGraphDescr_impl(handle, descrG); } nvgraphStatus_t NVGRAPH_API nvgraphDestroyGraphDescr(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG) - { - return nvgraph::nvgraphDestroyGraphDescr_impl(handle, descrG); + nvgraphGraphDescr_t descrG) { + return nvgraph::nvgraphDestroyGraphDescr_impl(handle, descrG); } -nvgraphStatus_t NVGRAPH_API nvgraphSetStream(nvgraphHandle_t handle, cudaStream_t stream) - { - return nvgraph::nvgraphSetStream_impl(handle, stream); +nvgraphStatus_t NVGRAPH_API nvgraphSetStream(nvgraphHandle_t handle, cudaStream_t stream) { + return nvgraph::nvgraphSetStream_impl(handle, stream); } nvgraphStatus_t NVGRAPH_API nvgraphSetGraphStructure(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t topologyType) - { - return nvgraph::nvgraphSetGraphStructure_impl(handle, descrG, topologyData, topologyType); + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t topologyType) { + return nvgraph::nvgraphSetGraphStructure_impl(handle, descrG, topologyData, topologyType); } + nvgraphStatus_t NVGRAPH_API nvgraphGetGraphStructure(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t* topologyType) - { - return nvgraph::nvgraphGetGraphStructure_impl(handle, descrG, topologyData, topologyType); + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t* topologyType) { + return nvgraph::nvgraphGetGraphStructure_impl(handle, descrG, topologyData, topologyType); } nvgraphStatus_t NVGRAPH_API nvgraphAllocateVertexData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t numsets, - cudaDataType_t *settypes) - { - return nvgraph::nvgraphAllocateVertexData_impl(handle, descrG, numsets, settypes); + nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes) { + return nvgraph::nvgraphAllocateVertexData_impl(handle, descrG, numsets, settypes); } nvgraphStatus_t NVGRAPH_API nvgraphAllocateEdgeData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t numsets, - cudaDataType_t *settypes) - { - return nvgraph::nvgraphAllocateEdgeData_impl(handle, descrG, numsets, settypes); + nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes) { + return nvgraph::nvgraphAllocateEdgeData_impl(handle, descrG, numsets, settypes); } nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByVertex(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t subdescrG, - int *subvertices, - size_t numvertices) - { - return nvgraph::nvgraphExtractSubgraphByVertex_impl(handle, - descrG, - subdescrG, - subvertices, - numvertices); + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subvertices, + size_t numvertices) { + return nvgraph::nvgraphExtractSubgraphByVertex_impl(handle, + descrG, + subdescrG, + subvertices, + numvertices); } nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByEdge(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t subdescrG, - int *subedges, - size_t numedges) - { - return nvgraph::nvgraphExtractSubgraphByEdge_impl(handle, descrG, subdescrG, subedges, numedges); + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subedges, + size_t numedges) { + return nvgraph::nvgraphExtractSubgraphByEdge_impl(handle, descrG, subdescrG, subedges, numedges); } nvgraphStatus_t NVGRAPH_API nvgraphSetVertexData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *vertexData, - size_t setnum) - { - return 
nvgraph::nvgraphSetVertexData_impl(handle, descrG, vertexData, setnum); + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum) { + return nvgraph::nvgraphSetVertexData_impl(handle, descrG, vertexData, setnum); } nvgraphStatus_t NVGRAPH_API nvgraphGetVertexData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *vertexData, - size_t setnum) - { - return nvgraph::nvgraphGetVertexData_impl(handle, descrG, vertexData, setnum); + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum) { + return nvgraph::nvgraphGetVertexData_impl(handle, descrG, vertexData, setnum); } nvgraphStatus_t NVGRAPH_API nvgraphConvertTopology(nvgraphHandle_t handle, - nvgraphTopologyType_t srcTType, - void *srcTopology, - void *srcEdgeData, - cudaDataType_t *dataType, - nvgraphTopologyType_t dstTType, - void *dstTopology, - void *dstEdgeData) { - return nvgraph::nvgraphConvertTopology_impl(handle, - srcTType, - srcTopology, - srcEdgeData, - dataType, - dstTType, - dstTopology, - dstEdgeData); + nvgraphTopologyType_t srcTType, + void *srcTopology, + void *srcEdgeData, + cudaDataType_t *dataType, + nvgraphTopologyType_t dstTType, + void *dstTopology, + void *dstEdgeData) { + return nvgraph::nvgraphConvertTopology_impl(handle, + srcTType, + srcTopology, + srcEdgeData, + dataType, + dstTType, + dstTopology, + dstEdgeData); } nvgraphStatus_t NVGRAPH_API nvgraphConvertGraph(nvgraphHandle_t handle, - nvgraphGraphDescr_t srcDescrG, - nvgraphGraphDescr_t dstDescrG, - nvgraphTopologyType_t dstTType) { - return nvgraph::nvgraphConvertGraph_impl(handle, srcDescrG, dstDescrG, dstTType); + nvgraphGraphDescr_t srcDescrG, + nvgraphGraphDescr_t dstDescrG, + nvgraphTopologyType_t dstTType) { + return nvgraph::nvgraphConvertGraph_impl(handle, srcDescrG, dstDescrG, dstTType); } nvgraphStatus_t NVGRAPH_API nvgraphSetEdgeData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *edgeData, - size_t setnum) { - return nvgraph::nvgraphSetEdgeData_impl(handle, descrG, edgeData, setnum); + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum) { + return nvgraph::nvgraphSetEdgeData_impl(handle, descrG, edgeData, setnum); } nvgraphStatus_t NVGRAPH_API nvgraphGetEdgeData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *edgeData, - size_t setnum) { - return nvgraph::nvgraphGetEdgeData_impl(handle, descrG, edgeData, setnum); + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum) { + return nvgraph::nvgraphGetEdgeData_impl(handle, descrG, edgeData, setnum); } nvgraphStatus_t NVGRAPH_API nvgraphSrSpmv(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const void *alpha, - const size_t x, - const void *beta, - const size_t y, - const nvgraphSemiring_t SR) { - return nvgraph::nvgraphSrSpmv_impl_cub(handle, descrG, weight_index, alpha, x, beta, y, SR); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t x, + const void *beta, + const size_t y, + const nvgraphSemiring_t SR) { + return nvgraph::nvgraphSrSpmv_impl_cub(handle, descrG, weight_index, alpha, x, beta, y, SR); } nvgraphStatus_t NVGRAPH_API nvgraphSssp(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int *source_vert, - const size_t sssp) { - return nvgraph::nvgraphSssp_impl(handle, descrG, weight_index, source_vert, sssp); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t sssp) { + return nvgraph::nvgraphSssp_impl(handle, descrG, 
weight_index, source_vert, sssp); } //nvgraphTraversal typedef enum { - NVGRAPH_TRAVERSAL_DISTANCES_INDEX = 0, - NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX = 1, - NVGRAPH_TRAVERSAL_MASK_INDEX = 2, - NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX = 3, - NVGRAPH_TRAVERSAL_ALPHA = 4, - NVGRAPH_TRAVERSAL_BETA = 5 + NVGRAPH_TRAVERSAL_DISTANCES_INDEX = 0, + NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX = 1, + NVGRAPH_TRAVERSAL_MASK_INDEX = 2, + NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX = 3, + NVGRAPH_TRAVERSAL_ALPHA = 4, + NVGRAPH_TRAVERSAL_BETA = 5 } nvgraphTraversalParameterIndex_t; nvgraphStatus_t NVGRAPH_API nvgraphTraversalParameterInit(nvgraphTraversalParameter_t *param) { - if (check_ptr(param)) - return NVGRAPH_STATUS_INVALID_VALUE; + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; - param->pad[NVGRAPH_TRAVERSAL_DISTANCES_INDEX] = INT_MAX; - param->pad[NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX] = INT_MAX; - param->pad[NVGRAPH_TRAVERSAL_MASK_INDEX] = INT_MAX; - param->pad[NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX] = 0; - param->pad[NVGRAPH_TRAVERSAL_ALPHA] = TRAVERSAL_DEFAULT_ALPHA; - param->pad[NVGRAPH_TRAVERSAL_BETA] = TRAVERSAL_DEFAULT_BETA; + param->pad[NVGRAPH_TRAVERSAL_DISTANCES_INDEX] = INT_MAX; + param->pad[NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX] = INT_MAX; + param->pad[NVGRAPH_TRAVERSAL_MASK_INDEX] = INT_MAX; + param->pad[NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX] = 0; + param->pad[NVGRAPH_TRAVERSAL_ALPHA] = TRAVERSAL_DEFAULT_ALPHA; + param->pad[NVGRAPH_TRAVERSAL_BETA] = TRAVERSAL_DEFAULT_BETA; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetDistancesIndex(nvgraphTraversalParameter_t *param, - const size_t value) { - if (check_ptr(param)) - return NVGRAPH_STATUS_INVALID_VALUE; + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; - param->pad[NVGRAPH_TRAVERSAL_DISTANCES_INDEX] = value; + param->pad[NVGRAPH_TRAVERSAL_DISTANCES_INDEX] = value; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } -nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetDistancesIndex( const nvgraphTraversalParameter_t param, - size_t *value) { - if (check_ptr(value)) - return NVGRAPH_STATUS_INVALID_VALUE; +nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetDistancesIndex(const nvgraphTraversalParameter_t param, + size_t *value) { + if (check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; - *value = param.pad[NVGRAPH_TRAVERSAL_DISTANCES_INDEX]; + *value = param.pad[NVGRAPH_TRAVERSAL_DISTANCES_INDEX]; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetPredecessorsIndex(nvgraphTraversalParameter_t *param, - const size_t value) { - if (check_ptr(param)) - return NVGRAPH_STATUS_INVALID_VALUE; + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; - param->pad[NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX] = value; + param->pad[NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX] = value; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } -nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetPredecessorsIndex( const nvgraphTraversalParameter_t param, - size_t *value) { - if (check_ptr(value)) - return NVGRAPH_STATUS_INVALID_VALUE; +nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetPredecessorsIndex(const nvgraphTraversalParameter_t param, + size_t *value) { + if (check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; - *value = param.pad[NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX]; + *value = param.pad[NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX]; - 
return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetEdgeMaskIndex(nvgraphTraversalParameter_t *param, - const size_t value) { - if (check_ptr(param)) - return NVGRAPH_STATUS_INVALID_VALUE; + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; - param->pad[NVGRAPH_TRAVERSAL_MASK_INDEX] = value; + param->pad[NVGRAPH_TRAVERSAL_MASK_INDEX] = value; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } -nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetEdgeMaskIndex( const nvgraphTraversalParameter_t param, - size_t *value) { - if (check_ptr(value)) - return NVGRAPH_STATUS_INVALID_VALUE; +nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetEdgeMaskIndex(const nvgraphTraversalParameter_t param, + size_t *value) { + if (check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; - *value = param.pad[NVGRAPH_TRAVERSAL_MASK_INDEX]; + *value = param.pad[NVGRAPH_TRAVERSAL_MASK_INDEX]; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetUndirectedFlag(nvgraphTraversalParameter_t *param, - const size_t value) { - if (check_ptr(param)) - return NVGRAPH_STATUS_INVALID_VALUE; + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; - param->pad[NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX] = value; + param->pad[NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX] = value; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } -nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetUndirectedFlag( const nvgraphTraversalParameter_t param, - size_t *value) { - if (check_ptr(value)) - return NVGRAPH_STATUS_INVALID_VALUE; +nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetUndirectedFlag(const nvgraphTraversalParameter_t param, + size_t *value) { + if (check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; - *value = param.pad[NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX]; + *value = param.pad[NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX]; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetAlpha(nvgraphTraversalParameter_t *param, - const size_t value) { - if (check_ptr(param)) - return NVGRAPH_STATUS_INVALID_VALUE; + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; - param->pad[NVGRAPH_TRAVERSAL_ALPHA] = value; + param->pad[NVGRAPH_TRAVERSAL_ALPHA] = value; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetAlpha(const nvgraphTraversalParameter_t param, - size_t *value) { - if (check_ptr(value)) - return NVGRAPH_STATUS_INVALID_VALUE; + size_t *value) { + if (check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; - *value = param.pad[NVGRAPH_TRAVERSAL_ALPHA]; + *value = param.pad[NVGRAPH_TRAVERSAL_ALPHA]; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetBeta(nvgraphTraversalParameter_t *param, - const size_t value) { - if (check_ptr(param)) - return NVGRAPH_STATUS_INVALID_VALUE; + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; - param->pad[NVGRAPH_TRAVERSAL_BETA] = value; + param->pad[NVGRAPH_TRAVERSAL_BETA] = value; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetBeta(const nvgraphTraversalParameter_t param, - size_t *value) { - if (check_ptr(value)) - return NVGRAPH_STATUS_INVALID_VALUE; + size_t *value) { + if 
(check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; - *value = param.pad[NVGRAPH_TRAVERSAL_BETA]; + *value = param.pad[NVGRAPH_TRAVERSAL_BETA]; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversal(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const nvgraphTraversal_t traversalT, - const int *source_vert, - const nvgraphTraversalParameter_t params) { - return nvgraph::nvgraphTraversal_impl(handle, descrG, traversalT, source_vert, params); + const nvgraphGraphDescr_t descrG, + const nvgraphTraversal_t traversalT, + const int *source_vert, + const nvgraphTraversalParameter_t params) { + return nvgraph::nvgraphTraversal_impl(handle, descrG, traversalT, source_vert, params); } /** @@ -3881,243 +3763,234 @@ nvgraphStatus_t NVGRAPH_API nvgraphTraversal(nvgraphHandle_t handle, * @return Status code. */ nvgraphStatus_t NVGRAPH_API nvgraph2dBfs(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const int32_t source_vert, - int32_t* distances, - int32_t* predecessors) { - return nvgraph::nvgraph2dBfs_impl(handle, descrG, source_vert, distances, predecessors); + const nvgraphGraphDescr_t descrG, + const int32_t source_vert, + int32_t* distances, + int32_t* predecessors) { + return nvgraph::nvgraph2dBfs_impl(handle, descrG, source_vert, distances, predecessors); } //nvgraphWidestPath nvgraphStatus_t NVGRAPH_API nvgraphWidestPath(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int *source_vert, - const size_t widest_path) - { - return nvgraph::nvgraphWidestPath_impl(handle, descrG, weight_index, source_vert, widest_path); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t widest_path) { + return nvgraph::nvgraphWidestPath_impl(handle, descrG, weight_index, source_vert, widest_path); } nvgraphStatus_t NVGRAPH_API nvgraphPagerank(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const void *alpha, - const size_t bookmark, - const int has_guess, - const size_t pagerank_index, - const float tolerance, - const int max_iter) - { - return nvgraph::nvgraphPagerank_impl(handle, - descrG, - weight_index, - alpha, - bookmark, - has_guess, - pagerank_index, - tolerance, - max_iter); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t bookmark, + const int has_guess, + const size_t pagerank_index, + const float tolerance, + const int max_iter) { + return nvgraph::nvgraphPagerank_impl(handle, + descrG, + weight_index, + alpha, + bookmark, + has_guess, + pagerank_index, + tolerance, + max_iter); } nvgraphStatus_t NVGRAPH_API nvgraphKrylovPagerank(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const void *alpha, - const size_t bookmark, - const float tolerance, - const int max_iter, - const int subspace_size, - const int has_guess, - const size_t rank) - { - return nvgraph::nvgraphKrylovPagerank_impl(handle, - descrG, - weight_index, - alpha, - bookmark, - tolerance, - max_iter, - subspace_size, - has_guess, - rank); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t bookmark, + const float tolerance, + const int max_iter, + const int subspace_size, + const int has_guess, + const size_t rank) { + return nvgraph::nvgraphKrylovPagerank_impl(handle, + descrG, + weight_index, + alpha, + bookmark, + tolerance, + max_iter, + subspace_size, + 
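Note: the traversal setters above just pack indices and flags into param.pad[], so a caller composes a parameter block and hands it to nvgraphTraversal by value. A minimal sketch, assuming the BFS enumerator (NVGRAPH_TRAVERSAL_BFS) declared in nvgraph.h:

// Sketch only: the distance and predecessor outputs are vertex data sets that
// must already exist on descrG (e.g. allocated via nvgraphAllocateVertexData).
#include <nvgraph.h>

nvgraphStatus_t bfs_from(nvgraphHandle_t handle, nvgraphGraphDescr_t descrG,
                         int source_vertex) {
    nvgraphTraversalParameter_t params;
    nvgraphTraversalParameterInit(&params);            // INT_MAX sentinels + default alpha/beta

    nvgraphTraversalSetDistancesIndex(&params, 0);     // vertex set 0 receives distances
    nvgraphTraversalSetPredecessorsIndex(&params, 1);  // vertex set 1 receives predecessors
    nvgraphTraversalSetUndirectedFlag(&params, 1);     // treat edges as undirected

    // NVGRAPH_TRAVERSAL_BFS is assumed here; the parameter block is passed by value.
    return nvgraphTraversal(handle, descrG, NVGRAPH_TRAVERSAL_BFS,
                            &source_vertex, params);
}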
has_guess, + rank); } nvgraphStatus_t NVGRAPH_API nvgraphBalancedCutClustering(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int n_clusters, - const int n_eig_vects, - const int evs_type, - const float evs_tolerance, - const int evs_max_iter, - const float kmean_tolerance, - const int kmean_max_iter, - int* clustering, - void* eig_vals, - void* eig_vects) - { - return nvgraph::nvgraphBalancedCutClustering_impl(handle, - descrG, - weight_index, - n_clusters, - n_eig_vects, - evs_type, - evs_tolerance, - evs_max_iter, - kmean_tolerance, - kmean_max_iter, - clustering, - eig_vals, - eig_vects); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int n_eig_vects, + const int evs_type, + const float evs_tolerance, + const int evs_max_iter, + const float kmean_tolerance, + const int kmean_max_iter, + int* clustering, + void* eig_vals, + void* eig_vects) { + return nvgraph::nvgraphBalancedCutClustering_impl(handle, + descrG, + weight_index, + n_clusters, + n_eig_vects, + evs_type, + evs_tolerance, + evs_max_iter, + kmean_tolerance, + kmean_max_iter, + clustering, + eig_vals, + eig_vects); } nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeBalancedCut(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int n_clusters, - const int* clustering, - float * edgeCut, - float * ratioCut) - { - return nvgraph::nvgraphAnalyzeBalancedCut_impl(handle, - descrG, - weight_index, - n_clusters, - clustering, - edgeCut, - ratioCut); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int* clustering, + float * edgeCut, + float * ratioCut) { + return nvgraph::nvgraphAnalyzeBalancedCut_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + edgeCut, + ratioCut); } -nvgraphStatus_t NVGRAPH_API nvgraphHeavyEdgeMatching( nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const nvgraphEdgeWeightMatching_t similarity_metric, - int* aggregates, - size_t* num_aggregates) - { - return nvgraph::nvgraphHeavyEdgeMatching_impl(handle, - descrG, - weight_index, - similarity_metric, - aggregates, - num_aggregates); +nvgraphStatus_t NVGRAPH_API nvgraphHeavyEdgeMatching(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const nvgraphEdgeWeightMatching_t similarity_metric, + int* aggregates, + size_t* num_aggregates) { + return nvgraph::nvgraphHeavyEdgeMatching_impl(handle, + descrG, + weight_index, + similarity_metric, + aggregates, + num_aggregates); } nvgraphStatus_t NVGRAPH_API nvgraphSpectralModularityMaximization(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int n_clusters, - const int n_eig_vects, - const float evs_tolerance, - const int evs_max_iter, - const float kmean_tolerance, - const int kmean_max_iter, - int* clustering, - void* eig_vals, - void* eig_vects) - { - return nvgraph::nvgraphSpectralModularityMaximization_impl(handle, - descrG, - weight_index, - n_clusters, - n_eig_vects, - evs_tolerance, - evs_max_iter, - kmean_tolerance, - kmean_max_iter, - clustering, - eig_vals, - eig_vects); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int n_eig_vects, + const float evs_tolerance, + const int evs_max_iter, + const float kmean_tolerance, + const int kmean_max_iter, + int* clustering, + void* eig_vals, + void* eig_vects) { + return 
nvgraph::nvgraphSpectralModularityMaximization_impl(handle, + descrG, + weight_index, + n_clusters, + n_eig_vects, + evs_tolerance, + evs_max_iter, + kmean_tolerance, + kmean_max_iter, + clustering, + eig_vals, + eig_vects); } nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeModularityClustering(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int n_clusters, - const int* clustering, - float * modularity) - { - return nvgraph::nvgraphAnalyzeModularityClustering_impl(handle, - descrG, - weight_index, - n_clusters, - clustering, - modularity); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int* clustering, + float * modularity) { + return nvgraph::nvgraphAnalyzeModularityClustering_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + modularity); } #ifndef NVGRAPH_LIGHT nvgraphStatus_t NVGRAPH_API nvgraphContractGraph(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t contrdescrG, - int *aggregates, - size_t numaggregates, - nvgraphSemiringOps_t VertexCombineOp, - nvgraphSemiringOps_t VertexReduceOp, - nvgraphSemiringOps_t EdgeCombineOp, - nvgraphSemiringOps_t EdgeReduceOp, - int flag) - { - return nvgraph::nvgraphContractGraph_impl(handle, - descrG, - contrdescrG, - aggregates, - numaggregates, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp, - flag); + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t contrdescrG, + int *aggregates, + size_t numaggregates, + nvgraphSemiringOps_t VertexCombineOp, + nvgraphSemiringOps_t VertexReduceOp, + nvgraphSemiringOps_t EdgeCombineOp, + nvgraphSemiringOps_t EdgeReduceOp, + int flag) { + return nvgraph::nvgraphContractGraph_impl(handle, + descrG, + contrdescrG, + aggregates, + numaggregates, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp, + flag); } -#endif +#endif nvgraphStatus_t NVGRAPH_API nvgraphSpectralClustering(nvgraphHandle_t handle, // nvGRAPH library handle. - const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 or NVGRAPH_CSR_32 at least 1 edge set (weights) - const size_t weight_index, // Index of the edge set for the weights. - const struct SpectralClusteringParameter *params, //parameters, see struct SpectralClusteringParameter - int* clustering, // (output) clustering - void* eig_vals, // (output) eigenvalues - void* eig_vects) // (output) eigenvectors - { - return nvgraph::nvgraphSpectralClustering_impl(handle, - descrG, - weight_index, - params, - clustering, - eig_vals, - eig_vects); + const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 or NVGRAPH_CSR_32 at least 1 edge set (weights) + const size_t weight_index, // Index of the edge set for the weights. + const struct SpectralClusteringParameter *params, //parameters, see struct SpectralClusteringParameter + int* clustering, // (output) clustering + void* eig_vals, // (output) eigenvalues + void* eig_vects) // (output) eigenvectors +{ + return nvgraph::nvgraphSpectralClustering_impl(handle, + descrG, + weight_index, + params, + clustering, + eig_vals, + eig_vects); } nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeClustering(nvgraphHandle_t handle, // nvGRAPH library handle. 
- const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 at least 1 edge set (weights) - const size_t weight_index, // Index of the edge set for the weights. - const int n_clusters, //number of clusters - const int* clustering, // clustering to analyse - nvgraphClusteringMetric_t metric, // metric to compute to measure the clustering quality - float * score) // (output) clustering score telling how good the clustering is for the selected metric. - { - return nvgraph::nvgraphAnalyzeClustering_impl(handle, - descrG, - weight_index, - n_clusters, - clustering, - metric, - score); + const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 at least 1 edge set (weights) + const size_t weight_index, // Index of the edge set for the weights. + const int n_clusters, //number of clusters + const int* clustering, // clustering to analyse + nvgraphClusteringMetric_t metric, // metric to compute to measure the clustering quality + float * score) // (output) clustering score telling how good the clustering is for the selected metric. +{ + return nvgraph::nvgraphAnalyzeClustering_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + metric, + score); } nvgraphStatus_t NVGRAPH_API nvgraphTriangleCount(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - uint64_t* result) - { - return nvgraph::nvgraphTriangleCount_impl(handle, descrG, result); + const nvgraphGraphDescr_t descrG, + uint64_t* result) +{ + return nvgraph::nvgraphTriangleCount_impl(handle, descrG, result); } -nvgraphStatus_t NVGRAPH_API nvgraphLouvain (cudaDataType_t index_type, cudaDataType_t val_type, const size_t num_vertex, const size_t num_edges, - void* csr_ptr, void* csr_ind, void* csr_val, int weighted, int has_init_cluster, void* init_cluster, +nvgraphStatus_t NVGRAPH_API nvgraphLouvain (cudaDataType_t index_type, cudaDataType_t val_type, const size_t num_vertex, const size_t num_edges, + void* csr_ptr, void* csr_ind, void* csr_val, int weighted, int has_init_cluster, void* init_cluster, void* final_modularity, void* best_cluster_vec, void* num_level) { NVLOUVAIN_STATUS status = NVLOUVAIN_OK; - if ((csr_ptr == NULL) || (csr_ind == NULL) || ((csr_val == NULL) && (weighted == 1)) || + if ((csr_ptr == NULL) || (csr_ind == NULL) || ((csr_val == NULL) && (weighted == 1)) || ((init_cluster == NULL) && (has_init_cluster == 1)) || (final_modularity == NULL) || (best_cluster_vec == NULL) || (num_level == NULL)) return NVGRAPH_STATUS_INVALID_VALUE; @@ -4125,71 +3998,72 @@ nvgraphStatus_t NVGRAPH_API nvgraphLouvain (cudaDataType_t index_type, cudaDataT bool weighted_b = weighted; bool has_init_cluster_b = has_init_cluster; if (val_type == CUDA_R_32F) - status = nvlouvain::louvain ((int*)csr_ptr, (int*)csr_ind, (float*)csr_val, num_vertex, num_edges, - weighted_b, has_init_cluster_b, (int*)init_cluster, *((float*)final_modularity), + status = nvlouvain::louvain ((int*)csr_ptr, (int*)csr_ind, (float*)csr_val, num_vertex, num_edges, + weighted_b, has_init_cluster_b, (int*)init_cluster, *((float*)final_modularity), (int*)best_cluster_vec,*((int*)num_level), log); else - status = nvlouvain::louvain ((int*)csr_ptr, (int*)csr_ind, (double*)csr_val, num_vertex, num_edges, - weighted_b, has_init_cluster_b, (int*)init_cluster, *((double*)final_modularity), + status = nvlouvain::louvain ((int*)csr_ptr, (int*)csr_ind, (double*)csr_val, num_vertex, num_edges, + weighted_b, has_init_cluster_b, 
(int*)init_cluster, *((double*)final_modularity), (int*)best_cluster_vec,*((int*)num_level), log); if (status != NVLOUVAIN_OK) return NVGRAPH_STATUS_INTERNAL_ERROR; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } -nvgraphStatus_t NVGRAPH_API nvgraphJaccard (cudaDataType_t index_type, cudaDataType_t val_type, const size_t n, +nvgraphStatus_t NVGRAPH_API nvgraphJaccard (cudaDataType_t index_type, cudaDataType_t val_type, const size_t n, const size_t e, void* csr_ptr, void* csr_ind, void* csr_val, int weighted, void* v, void* gamma, void* weight_j) { int status = 0; - if ((csr_ptr == NULL) || (csr_ind == NULL) || ((csr_val == NULL) && (weighted == 1)) || (gamma == NULL) || (weight_j == NULL)) + if ((csr_ptr == NULL) || (csr_ind == NULL) || ((csr_val == NULL) && (weighted == 1)) || (gamma == NULL) || (weight_j == NULL)) return NVGRAPH_STATUS_INVALID_VALUE; bool weighted_b = weighted; + cudaStream_t stream{nullptr}; if (val_type == CUDA_R_32F) { - float* weight_i = NULL, *weight_s = NULL, *work = NULL; - NVG_CUDA_TRY(cudaMalloc ((void**)&weight_i, sizeof(float) * e)); - NVG_CUDA_TRY(cudaMalloc ((void**)&weight_s, sizeof(float) * e)); + float* weight_i = NULL, *weight_s = NULL, *work = NULL; + NVG_RMM_TRY(RMM_ALLOC((void**)&weight_i, sizeof(float) * e, stream)); + NVG_RMM_TRY(RMM_ALLOC((void**)&weight_s, sizeof(float) * e, stream)); if (weighted_b == true) { - NVG_CUDA_TRY(cudaMalloc ((void**)&work, sizeof(float) * n)); + NVG_RMM_TRY(RMM_ALLOC((void**)&work, sizeof(float) * n, stream)); status = nvlouvain::jaccard (n, e, (int*) csr_ptr, (int*) csr_ind, (float*) csr_val, (float*) v, work, *((float*) gamma), weight_i, weight_s, (float*)weight_j); - NVG_CUDA_TRY(cudaFree (work)); + NVG_RMM_TRY(RMM_FREE(work, stream)); } else { - NVG_CUDA_TRY(cudaMalloc ((void**)&work, sizeof(float) * n)); + NVG_RMM_TRY(RMM_ALLOC((void**)&work, sizeof(float) * n, stream)); nvlouvain::fill(e, (float*)weight_j, (float)1.0); status = nvlouvain::jaccard (n, e, (int*) csr_ptr, (int*) csr_ind, (float*) csr_val, (float*) v, work, *((float*) gamma), weight_i, weight_s, (float*)weight_j); - NVG_CUDA_TRY(cudaFree (work)); + NVG_RMM_TRY(RMM_FREE(work, stream)); } - NVG_CUDA_TRY(cudaFree (weight_s)); - NVG_CUDA_TRY(cudaFree (weight_i)); + NVG_RMM_TRY(RMM_FREE(weight_s, stream)); + NVG_RMM_TRY(RMM_FREE(weight_i, stream)); } else { - double* weight_i = NULL, *weight_s = NULL, *work = NULL; - NVG_CUDA_TRY(cudaMalloc ((void**)&weight_i, sizeof(double) * e)); - NVG_CUDA_TRY(cudaMalloc ((void**)&weight_s, sizeof(double) * e)); + double* weight_i = NULL, *weight_s = NULL, *work = NULL; + NVG_RMM_TRY(RMM_ALLOC((void**)&weight_i, sizeof(double) * e, stream)); + NVG_RMM_TRY(RMM_ALLOC((void**)&weight_s, sizeof(double) * e, stream)); if (weighted_b == true) { - NVG_CUDA_TRY(cudaMalloc ((void**)&work, sizeof(double) * n)); + NVG_RMM_TRY(RMM_ALLOC((void**)&work, sizeof(double) * n, stream)); status = nvlouvain::jaccard (n, e, (int*) csr_ptr, (int*) csr_ind, (double*) csr_val, (double*) v, work, *((double*) gamma), weight_i, weight_s, (double*)weight_j); - NVG_CUDA_TRY(cudaFree (work)); + NVG_RMM_TRY(RMM_FREE(work, stream)); } else { - NVG_CUDA_TRY(cudaMalloc ((void**)&work, sizeof(double) * n)); + NVG_RMM_TRY(RMM_ALLOC((void**)&work, sizeof(double) * n, stream)); nvlouvain::fill(e, (double*)weight_j, (double)1.0); status = nvlouvain::jaccard (n, e, (int*) csr_ptr, (int*) csr_ind, (double*) csr_val, (double*) v, work, *((double*) gamma), weight_i, weight_s, (double*)weight_j); - NVG_CUDA_TRY(cudaFree (work)); + 
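Note: nvgraphLouvain above is a thin C wrapper over nvlouvain::louvain that only switches on val_type. A hedged usage sketch, assuming its declaration is visible from nvgraph.h, that the CSR arrays and the cluster output live on the device, and that final_modularity and num_level are host scalars (the wrapper dereferences them directly):

// Sketch only: d_offsets / d_indices / d_weights form a device CSR graph and
// d_best_cluster is a device int array of length num_vertex.
#include <nvgraph.h>
#include <cstddef>

nvgraphStatus_t louvain_float(std::size_t num_vertex, std::size_t num_edges,
                              int* d_offsets, int* d_indices, float* d_weights,
                              int* d_best_cluster) {
    float final_modularity = 0.0f;   // written on the host by the wrapper
    int   num_level        = 0;

    return nvgraphLouvain(CUDA_R_32I, CUDA_R_32F, num_vertex, num_edges,
                          d_offsets, d_indices, d_weights,
                          /*weighted=*/1, /*has_init_cluster=*/0, /*init_cluster=*/nullptr,
                          &final_modularity, d_best_cluster, &num_level);
}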
NVG_RMM_TRY(RMM_FREE(work, stream)); } - NVG_CUDA_TRY(cudaFree (weight_s)); - NVG_CUDA_TRY(cudaFree (weight_i)); + NVG_RMM_TRY(RMM_FREE(weight_s, stream)); + NVG_RMM_TRY(RMM_FREE(weight_i, stream)); } if (status != 0) @@ -4198,27 +4072,25 @@ nvgraphStatus_t NVGRAPH_API nvgraphJaccard (cudaDataType_t index_type, cudaDataT return NVGRAPH_STATUS_SUCCESS; } - nvgraphStatus_t NVGRAPH_API nvgraphAttachGraphStructure(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t TT) { - return nvgraph::nvgraphAttachGraphStructure_impl( handle, descrG, topologyData, TT); + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t TT) { + return nvgraph::nvgraphAttachGraphStructure_impl( handle, descrG, topologyData, TT); } nvgraphStatus_t NVGRAPH_API nvgraphAttachVertexData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t setnum, - cudaDataType_t settype, - void *vertexData) { - return nvgraph::nvgraphAttachVertexData_impl( handle, descrG, setnum, settype, vertexData); + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *vertexData) { + return nvgraph::nvgraphAttachVertexData_impl( handle, descrG, setnum, settype, vertexData); } nvgraphStatus_t NVGRAPH_API nvgraphAttachEdgeData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t setnum, - cudaDataType_t settype, - void *edgeData) { - return nvgraph::nvgraphAttachEdgeData_impl( handle, descrG, setnum, settype, edgeData); + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *edgeData) { + return nvgraph::nvgraphAttachEdgeData_impl( handle, descrG, setnum, settype, edgeData); } - diff --git a/cpp/nvgraph/cpp/src/pagerank.cu b/cpp/nvgraph/cpp/src/pagerank.cu index 30ecc3165f5..479bd2326d9 100644 --- a/cpp/nvgraph/cpp/src/pagerank.cu +++ b/cpp/nvgraph/cpp/src/pagerank.cu @@ -30,12 +30,6 @@ #include #include - -#include "debug_macros.h" -#ifdef DEBUG - #define PR_VERBOSE -#endif - namespace nvgraph { template @@ -167,18 +161,6 @@ bool Pagerank::solve_it() template NVGRAPH_ERROR Pagerank::solve(ValueType damping_factor, Vector& initial_guess, Vector& pagerank_vector, float tolerance, int max_it) { - - #ifdef PR_VERBOSE - std::stringstream ss; - ss.str(std::string()); - size_t used_mem, free_mem, total_mem; - ss <<" ------------------PageRank------------------"<< std::endl; - ss <<" --------------------------------------------"<< std::endl; - ss << std::setw(10) << "Iteration" << std::setw(20) << " Mem Usage (MB)" << std::setw(15) << "Residual" << std::endl; - ss <<" --------------------------------------------"<< std::endl; - COUT()<(tolerance); setup(damping_factor, initial_guess, pagerank_vector); @@ -190,25 +172,9 @@ NVGRAPH_ERROR Pagerank::solve(ValueType damping_factor, m_iterations = i; converged = solve_it(); i++; - #ifdef PR_VERBOSE - ss.str(std::string()); - cnmemMemGetInfo(&free_mem, &total_mem, NULL); - used_mem=total_mem-free_mem; - ss << std::setw(10) << i ; - ss.precision(3); - ss << std::setw(20) << std::fixed << used_mem/1024.0/1024.0; - ss << std::setw(15) << std::scientific << m_residual << std::endl; - COUT()< -#include "debug_macros.h" -#ifdef DEBUG - #define SP_VERBOSE 0 -#endif + namespace nvgraph { template @@ -136,41 +133,12 @@ NVGRAPH_ERROR Sssp::solve(IndexType source_index, Vector bool converged = false; int max_it = static_cast(m_network.get_num_edges()), i = 0; - - #ifdef SP_VERBOSE - //int n = static_cast(m_network.get_num_vertices()), nnz = static_cast(m_network.get_num_edges()); - 
//dump_raw_vec(m_network.get_raw_row_offsets(), n, 0); - //dump_raw_vec(m_network.get_raw_column_indices(),n, 0); - //dump_raw_vec(m_network.get_raw_values(), nnz, 0); - - std::stringstream ss; - ss.str(std::string()); - size_t used_mem, free_mem, total_mem; - ss <<" --------------------Sssp--------------------"<< std::endl; - ss <<" --------------------------------------------"<< std::endl; - ss << std::setw(10) << "Iteration" << std::setw(20) << " Mem Usage (MB)" << std::setw(15) << "Residual" << std::endl; - ss <<" --------------------------------------------"<< std::endl; - COUT()<; diff --git a/cpp/nvgraph/cpp/src/triangles_counting_kernels.cu b/cpp/nvgraph/cpp/src/triangles_counting_kernels.cu index d2a961ebd84..740a2afd341 100644 --- a/cpp/nvgraph/cpp/src/triangles_counting_kernels.cu +++ b/cpp/nvgraph/cpp/src/triangles_counting_kernels.cu @@ -33,7 +33,7 @@ #include "sm_utils.h" using namespace cub; -#include "cnmem.h" +#include "rmm/rmm.h" #define TH_CENT_K_LOCLEN (34) #define WP_LEN_TH1 (24) @@ -58,29 +58,25 @@ namespace nvgraph namespace triangles_counting { - -// hide behind - void* tmp_get(size_t size, cudaStream_t stream) - { - void *t = NULL; - cnmemStatus_t status = cnmemMalloc(&t, size, stream); - if (status == CNMEM_STATUS_OUT_OF_MEMORY) - { + // Better return std::unique_ptr than a raw pointer, but we haven't decide + // whether to create our own unique_ptr with RMM's deleter or to implement + // this in librmm. So, we may wait till this decision is made. + void* get_temp_storage(size_t size, cudaStream_t stream) { + auto t = static_cast(nullptr); + auto status = RMM_ALLOC(&t, size, stream); + if (status == RMM_ERROR_OUT_OF_MEMORY) { FatalError("Not enough memory", NVGRAPH_ERR_NO_MEMORY); } - else if (status != CNMEM_STATUS_SUCCESS) - { + else if (status != RMM_SUCCESS) { FatalError("Memory manager internal error (alloc)", NVGRAPH_ERR_UNKNOWN); } return t; } - void tmp_release(void* ptr, cudaStream_t stream) - { - cnmemStatus_t status = cnmemFree(ptr, stream); - if (status != CNMEM_STATUS_SUCCESS) - { + void free_temp_storage(void* ptr, cudaStream_t stream) { + auto status = RMM_FREE(ptr, stream); + if (status != RMM_SUCCESS) { FatalError("Memory manager internal error (release)", NVGRAPH_ERR_UNKNOWN); } } @@ -107,7 +103,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, @@ -115,7 +111,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -135,14 +131,14 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -165,7 +161,7 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, @@ -174,7 +170,7 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - 
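The comment on get_temp_storage() above leaves open whether a std::unique_ptr carrying an RMM deleter should replace the raw pointer. One hypothetical shape for such a wrapper, built only on the RMM_ALLOC/RMM_FREE macros introduced in this patch; rmm_deleter, temp_storage_ptr and make_temp_storage are illustrative names, not librmm or nvgraph API:

#include <memory>
#include <cuda_runtime.h>
#include "rmm/rmm.h"

// Deleter that returns pool memory on the stream it was allocated on.
struct rmm_deleter {
    cudaStream_t stream;
    void operator()(void* p) const { (void)RMM_FREE(p, stream); }
};

using temp_storage_ptr = std::unique_ptr<void, rmm_deleter>;

// Allocates `size` bytes from the RMM pool; the returned owner frees them
// automatically. A real implementation would mirror the FatalError() paths
// of get_temp_storage() instead of handing back an empty pointer.
inline temp_storage_ptr make_temp_storage(size_t size, cudaStream_t stream)
{
    void* p = nullptr;
    if (RMM_ALLOC(&p, size, stream) != RMM_SUCCESS)
        return temp_storage_ptr{nullptr, rmm_deleter{stream}};
    return temp_storage_ptr{p, rmm_deleter{stream}};
}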
tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -200,7 +196,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, @@ -210,7 +206,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -235,7 +231,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, @@ -243,7 +239,7 @@ namespace nvgraph num_items, begin_bit, end_bit, stream, debug_synchronous); - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -267,7 +263,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, @@ -275,7 +271,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -301,7 +297,7 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, @@ -310,7 +306,7 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -331,14 +327,14 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -359,14 +355,14 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -392,7 +388,7 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, @@ -401,7 +397,7 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -427,7 +423,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, 
stream); cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, @@ -435,7 +431,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -456,14 +452,14 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -484,14 +480,14 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -525,7 +521,7 @@ namespace nvgraph num_items, stream, debug_synchronous); cudaCheckError(); - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, @@ -536,7 +532,7 @@ namespace nvgraph num_items, stream, debug_synchronous); cudaCheckError(); - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -1175,12 +1171,12 @@ namespace nvgraph return; thrust::counting_iterator it(0); NonEmptyRow temp_func(roff); - T* d_out_num = (T*) tmp_get(sizeof(*n_nonempty), stream); + T* d_out_num = (T*) get_temp_storage(sizeof(*n_nonempty), stream); cubIf(it, p_nonempty, d_out_num, n, temp_func, stream); cudaMemcpy(n_nonempty, d_out_num, sizeof(*n_nonempty), cudaMemcpyDeviceToHost); cudaCheckError(); - tmp_release(d_out_num, stream); + free_temp_storage(d_out_num, stream); cudaCheckError(); } @@ -1188,13 +1184,13 @@ namespace nvgraph uint64_t reduce(uint64_t *v_d, T n, cudaStream_t stream) { uint64_t n_h; - uint64_t *n_d = (uint64_t *) tmp_get(sizeof(*n_d), stream); + uint64_t *n_d = (uint64_t *) get_temp_storage(sizeof(*n_d), stream); cubSum(v_d, n_d, n, stream); cudaCheckError(); cudaMemcpy(&n_h, n_d, sizeof(*n_d), cudaMemcpyDeviceToHost); cudaCheckError(); - tmp_release(n_d, stream); + free_temp_storage(n_d, stream); return n_h; } diff --git a/cpp/nvgraph/cpp/src/valued_csr_graph.cpp b/cpp/nvgraph/cpp/src/valued_csr_graph.cpp index 3882c1607c2..abc46d80524 100644 --- a/cpp/nvgraph/cpp/src/valued_csr_graph.cpp +++ b/cpp/nvgraph/cpp/src/valued_csr_graph.cpp @@ -15,7 +15,6 @@ */ #include "valued_csr_graph.hxx" -#include "cnmem_shared_ptr.hxx" // interface with CuMem (memory pool lib) for shared ptr namespace nvgraph { diff --git a/cpp/nvgraph/cpp/src/widest_path.cu b/cpp/nvgraph/cpp/src/widest_path.cu index 4da42856574..101dde6a4e0 100644 --- a/cpp/nvgraph/cpp/src/widest_path.cu +++ b/cpp/nvgraph/cpp/src/widest_path.cu @@ -30,10 +30,6 @@ #include "nvgraph_csrmv.hxx" #include "widest_path.hxx" -#include "debug_macros.h" -#ifdef DEBUG -#define MF_VERBOSE 0 -#endif namespace nvgraph { template @@ -157,35 +153,12 @@ NVGRAPH_ERROR WidestPath::solve(IndexType source_index, setup(source_index, source_connection, widest_path_result); bool converged = false; int max_it = 100000, i = 0; - 
#ifdef MF_VERBOSE - std::stringstream ss; - ss.str(std::string()); - size_t used_mem, free_mem, total_mem; - ss <<" ------------------WidestPath------------------"<< std::endl; - ss <<" --------------------------------------------"<< std::endl; - ss << std::setw(10) << "Iteration" << std::setw(20) << " Mem Usage (MB)" << std::setw(15) << "Residual" << std::endl; - ss <<" --------------------------------------------"<< std::endl; - COUT()<; diff --git a/cpp/nvgraph/cpp/tests/CMakeLists.txt b/cpp/nvgraph/cpp/tests/CMakeLists.txt index eda3443f398..2db70117943 100644 --- a/cpp/nvgraph/cpp/tests/CMakeLists.txt +++ b/cpp/nvgraph/cpp/tests/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.12 FATAL_ERROR) -project(CUDF_TESTS LANGUAGES C CXX CUDA) +project(NVGRAPH_TESTS LANGUAGES C CXX CUDA) ################################################################################################### # - compiler function ----------------------------------------------------------------------------- @@ -8,7 +8,7 @@ project(CUDF_TESTS LANGUAGES C CXX CUDA) function(ConfigureTest CMAKE_TEST_NAME CMAKE_TEST_SRC) add_executable(${CMAKE_TEST_NAME} ${CMAKE_TEST_SRC}) set_target_properties(${CMAKE_TEST_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_link_libraries(${CMAKE_TEST_NAME} gmock gtest gmock_main gtest_main pthread nvgraph_rapids cublas cusparse curand cusolver cudart) + target_link_libraries(${CMAKE_TEST_NAME} gmock gtest gmock_main gtest_main pthread nvgraph_rapids) set_target_properties(${CMAKE_TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gtests") add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) @@ -20,10 +20,10 @@ endfunction(ConfigureTest) include_directories( "${CMAKE_BINARY_DIR}/include" "${CMAKE_SOURCE_DIR}/include" - "${CMAKE_SOURCE_DIR}/thirdparty/cnmem/include" "${CMAKE_SOURCE_DIR}/thirdparty/cub" "${CMAKE_SOURCE_DIR}/../external" "${CMAKE_SOURCE_DIR}/../external/cusp" + "${RMM_INCLUDE}" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" ) @@ -32,7 +32,8 @@ include_directories( link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link directories for nvcc "${CMAKE_BINARY_DIR}/lib" - "${GTEST_LIBRARY_DIR}") + "${GTEST_LIBRARY_DIR}" + "${RMM_LIBRARY_DIR}") ################################################################################################### ### test sources ################################################################################## diff --git a/cpp/nvgraph/cpp/thirdparty/cnmem b/cpp/nvgraph/cpp/thirdparty/cnmem deleted file mode 160000 index 37896cc9bfc..00000000000 --- a/cpp/nvgraph/cpp/thirdparty/cnmem +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 37896cc9bfc6536a8c878a1e675835c22d827821 diff --git a/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_spmv_orig.cuh b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_spmv_orig.cuh index 7432d58d69a..7a4691b55c3 100644 --- a/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_spmv_orig.cuh +++ b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_spmv_orig.cuh @@ -762,7 +762,7 @@ struct DispatchSpmv #if (CUB_PTX_ARCH == 0) // Init textures - if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break; + if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x, spmv_params.num_cols * sizeof(ValueT)))) break; #endif if (search_grid_size < sm_count) diff --git 
a/cpp/nvgraph/external/cub_semiring/iterator/tex_obj_input_iterator.cuh b/cpp/nvgraph/external/cub_semiring/iterator/tex_obj_input_iterator.cuh index 623609452fd..d47b214ca82 100644 --- a/cpp/nvgraph/external/cub_semiring/iterator/tex_obj_input_iterator.cuh +++ b/cpp/nvgraph/external/cub_semiring/iterator/tex_obj_input_iterator.cuh @@ -161,7 +161,7 @@ public: template cudaError_t BindTexture( QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment - size_t bytes = size_t(-1), ///< Number of bytes in the range + size_t bytes, ///< Number of bytes in the range size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator { this->ptr = const_cast::Type *>(ptr); diff --git a/cpp/nvgraph/external/cub_semiring/iterator/tex_ref_input_iterator.cuh b/cpp/nvgraph/external/cub_semiring/iterator/tex_ref_input_iterator.cuh index 0305c78cd2c..e67b52c07f0 100644 --- a/cpp/nvgraph/external/cub_semiring/iterator/tex_ref_input_iterator.cuh +++ b/cpp/nvgraph/external/cub_semiring/iterator/tex_ref_input_iterator.cuh @@ -91,13 +91,13 @@ struct IteratorTexRef static TexRef ref; /// Bind texture - static cudaError_t BindTexture(void *d_in, size_t &offset) + static cudaError_t BindTexture(void *d_in, size_t &bytes, size_t &offset) { if (d_in) { cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc(); ref.channelDesc = tex_desc; - return (CubDebug(cudaBindTexture(&offset, ref, d_in))); + return (CubDebug(cudaBindTexture(&offset, ref, d_in, bytes))); } return cudaSuccess; @@ -245,12 +245,12 @@ public: template cudaError_t BindTexture( QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment - size_t bytes = size_t(-1), ///< Number of bytes in the range + size_t bytes, ///< Number of bytes in the range size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator { this->ptr = const_cast::Type *>(ptr); size_t offset; - cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, offset); + cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, bytes, offset); this->tex_offset = (difference_type) (offset / sizeof(QualifiedT)); return retval; } diff --git a/cpp/src/bfs.cu b/cpp/src/bfs.cu deleted file mode 100644 index 903a514018d..00000000000 --- a/cpp/src/bfs.cu +++ /dev/null @@ -1,471 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. 
- * - */ - -#include -#include -#include "bfs.cuh" -#include -#include "rmm_utils.h" - -#include "graph_utils.cuh" -#include "bfs_kernels.cuh" - -using namespace bfs_kernels; - -namespace cugraph { - enum BFS_ALGO_STATE { - TOPDOWN, BOTTOMUP - }; - - template - void Bfs::setup() { - - // Determinism flag, false by default - deterministic = false; - //Working data - //Each vertex can be in the frontier at most once - ALLOC_MANAGED_TRY(&frontier, n * sizeof(IndexType), nullptr); - - //We will update frontier during the execution - //We need the orig to reset frontier, or cudaFree - original_frontier = frontier; - - //size of bitmaps for vertices - vertices_bmap_size = (n / (8 * sizeof(int)) + 1); - //ith bit of visited_bmap is set <=> ith vertex is visited - ALLOC_MANAGED_TRY(&visited_bmap, sizeof(int) * vertices_bmap_size, nullptr); - - //ith bit of isolated_bmap is set <=> degree of ith vertex = 0 - ALLOC_MANAGED_TRY(&isolated_bmap, sizeof(int) * vertices_bmap_size, nullptr); - - //vertices_degree[i] = degree of vertex i - ALLOC_MANAGED_TRY(&vertex_degree, sizeof(IndexType) * n, nullptr); - - //Cub working data - cub_exclusive_sum_alloc(n + 1, d_cub_exclusive_sum_storage, cub_exclusive_sum_storage_bytes); - - //We will need (n+1) ints buffer for two differents things (bottom up or top down) - sharing it since those uses are mutually exclusive - ALLOC_MANAGED_TRY(&buffer_np1_1, (n + 1) * sizeof(IndexType), nullptr); - ALLOC_MANAGED_TRY(&buffer_np1_2, (n + 1) * sizeof(IndexType), nullptr); - - //Using buffers : top down - - //frontier_vertex_degree[i] is the degree of vertex frontier[i] - frontier_vertex_degree = buffer_np1_1; - //exclusive sum of frontier_vertex_degree - exclusive_sum_frontier_vertex_degree = buffer_np1_2; - - //Using buffers : bottom up - //contains list of unvisited vertices - unvisited_queue = buffer_np1_1; - //size of the "last" unvisited queue : size_last_unvisited_queue - //refers to the size of unvisited_queue - //which may not be up to date (the queue may contains vertices that are now visited) - - //We may leave vertices unvisited after bottom up main kernels - storing them here - left_unvisited_queue = buffer_np1_2; - - //We use buckets of edges (32 edges per bucket for now, see exact macro in bfs_kernels). 
frontier_vertex_degree_buckets_offsets[i] is the index k such as frontier[k] is the source of the first edge of the bucket - //See top down kernels for more details - ALLOC_MANAGED_TRY(&exclusive_sum_frontier_vertex_buckets_offsets, - ((nnz / TOP_DOWN_EXPAND_DIMX + 1) * NBUCKETS_PER_BLOCK + 2) * sizeof(IndexType), nullptr); - - //Init device-side counters - //Those counters must be/can be reset at each bfs iteration - //Keeping them adjacent in memory allow use call only one cudaMemset - launch latency is the current bottleneck - ALLOC_MANAGED_TRY(&d_counters_pad, 4 * sizeof(IndexType), nullptr); - - d_new_frontier_cnt = &d_counters_pad[0]; - d_mu = &d_counters_pad[1]; - d_unvisited_cnt = &d_counters_pad[2]; - d_left_unvisited_cnt = &d_counters_pad[3]; - - //Lets use this int* for the next 3 lines - //Its dereferenced value is not initialized - so we dont care about what we put in it - IndexType * d_nisolated = d_new_frontier_cnt; - cudaMemsetAsync(d_nisolated, 0, sizeof(IndexType), stream); - - //Computing isolated_bmap - //Only dependent on graph - not source vertex - done once - flag_isolated_vertices(n, isolated_bmap, row_offsets, vertex_degree, d_nisolated, stream); - cudaMemcpyAsync(&nisolated, d_nisolated, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - - //We need nisolated to be ready to use - cudaStreamSynchronize(stream); - } - - template - void Bfs::configure(IndexType *_distances, - IndexType *_predecessors, - int *_edge_mask) - { - distances = _distances; - predecessors = _predecessors; - edge_mask = _edge_mask; - - useEdgeMask = (edge_mask != NULL); - computeDistances = (distances != NULL); - computePredecessors = (predecessors != NULL); - - //We need distances to use bottom up - if (directed && !computeDistances) - ALLOC_MANAGED_TRY(&distances, n * sizeof(IndexType), nullptr); - } - - template - void Bfs::traverse(IndexType source_vertex) { - - //Init visited_bmap - //If the graph is undirected, we not that - //we will never discover isolated vertices (in degree = out degree = 0) - //we avoid a lot of work by flagging them now - //in g500 graphs they represent ~25% of total vertices - //more than that for wiki and twitter graphs - - if (directed) { - cudaMemsetAsync(visited_bmap, 0, vertices_bmap_size * sizeof(int), stream); - } - else { - cudaMemcpyAsync(visited_bmap, - isolated_bmap, - vertices_bmap_size * sizeof(int), - cudaMemcpyDeviceToDevice, - stream); - } - - //If needed, setting all vertices as undiscovered (inf distance) - //We dont use computeDistances here - //if the graph is undirected, we may need distances even if - //computeDistances is false - if (distances) - fill_vec(distances, n, vec_t::max, stream); - - //If needed, setting all predecessors to non-existent (-1) - if (computePredecessors) { - cudaMemsetAsync(predecessors, -1, n * sizeof(IndexType), stream); - } - - // - //Initial frontier - // - - frontier = original_frontier; - - if (distances) { - cudaMemsetAsync(&distances[source_vertex], 0, sizeof(IndexType), stream); - } - - //Setting source_vertex as visited - //There may be bit already set on that bmap (isolated vertices) - if the graph is undirected - int current_visited_bmap_source_vert = 0; - - if (!directed) { - cudaMemcpyAsync(¤t_visited_bmap_source_vert, - &visited_bmap[source_vertex / INT_SIZE], - sizeof(int), - cudaMemcpyDeviceToHost); - //We need current_visited_bmap_source_vert - cudaStreamSynchronize(stream); - } - - int m = (1 << (source_vertex % INT_SIZE)); - - //In that case, source is isolated, done now - if (!directed && (m & 
current_visited_bmap_source_vert)) { - //Init distances and predecessors are done, (cf Streamsync in previous if) - return; - } - - m |= current_visited_bmap_source_vert; - - cudaMemcpyAsync(&visited_bmap[source_vertex / INT_SIZE], - &m, - sizeof(int), - cudaMemcpyHostToDevice, - stream); - - //Adding source_vertex to init frontier - cudaMemcpyAsync(&frontier[0], - &source_vertex, - sizeof(IndexType), - cudaMemcpyHostToDevice, - stream); - - //mf : edges in frontier - //nf : vertices in frontier - //mu : edges undiscovered - //nu : nodes undiscovered - //lvl : current frontier's depth - IndexType mf, nf, mu, nu; - bool growing; - IndexType lvl = 1; - - //Frontier has one vertex - nf = 1; - - //all edges are undiscovered (by def isolated vertices have 0 edges) - mu = nnz; - - //all non isolated vertices are undiscovered (excepted source vertex, which is in frontier) - //That number is wrong if source_vertex is also isolated - but it's not important - nu = n - nisolated - nf; - - //Last frontier was 0, now it is 1 - growing = true; - - IndexType size_last_left_unvisited_queue = n; //we just need value > 0 - IndexType size_last_unvisited_queue = 0; //queue empty - - //Typical pre-top down workflow. set_frontier_degree + exclusive-scan - set_frontier_degree(frontier_vertex_degree, frontier, vertex_degree, nf, stream); - exclusive_sum(d_cub_exclusive_sum_storage, - cub_exclusive_sum_storage_bytes, - frontier_vertex_degree, - exclusive_sum_frontier_vertex_degree, - nf + 1, - stream); - - cudaMemcpyAsync(&mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - - //We need mf - cudaStreamSynchronize(stream); - - //At first we know we have to use top down - BFS_ALGO_STATE algo_state = TOPDOWN; - - //useDistances : we check if a vertex is a parent using distances in bottom up - distances become working data - //undirected g : need parents to be in children's neighbors - bool can_use_bottom_up = !directed && distances; - - while (nf > 0) { - //Each vertices can appear only once in the frontierer array - we know it will fit - new_frontier = frontier + nf; - IndexType old_nf = nf; - resetDevicePointers(); - - if (can_use_bottom_up) { - //Choosing algo - //Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf - - switch (algo_state) { - case TOPDOWN: - if (mf > mu / alpha) - algo_state = BOTTOMUP; - break; - case BOTTOMUP: - if (!growing && nf < n / beta) { - - //We need to prepare the switch back to top down - //We couldnt keep track of mu during bottom up - because we dont know what mf is. Computing mu here - count_unvisited_edges(unvisited_queue, - size_last_unvisited_queue, - visited_bmap, - vertex_degree, - d_mu, - stream); - - //Typical pre-top down workflow. 
set_frontier_degree + exclusive-scan - set_frontier_degree(frontier_vertex_degree, - frontier, - vertex_degree, - nf, - stream); - exclusive_sum(d_cub_exclusive_sum_storage, - cub_exclusive_sum_storage_bytes, - frontier_vertex_degree, - exclusive_sum_frontier_vertex_degree, - nf + 1, - stream); - - cudaMemcpyAsync(&mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - - cudaMemcpyAsync(&mu, d_mu, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - - //We will need mf and mu - cudaStreamSynchronize(stream); - algo_state = TOPDOWN; - } - break; - } - } - - //Executing algo - - switch (algo_state) { - case TOPDOWN: - compute_bucket_offsets(exclusive_sum_frontier_vertex_degree, - exclusive_sum_frontier_vertex_buckets_offsets, - nf, - mf, - stream); - frontier_expand(row_offsets, - col_indices, - frontier, - nf, - mf, - lvl, - new_frontier, - d_new_frontier_cnt, - exclusive_sum_frontier_vertex_degree, - exclusive_sum_frontier_vertex_buckets_offsets, - visited_bmap, - distances, - predecessors, - edge_mask, - isolated_bmap, - directed, - stream, - deterministic); - - mu -= mf; - - cudaMemcpyAsync(&nf, - d_new_frontier_cnt, - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError(); - - //We need nf - cudaStreamSynchronize(stream); - - if (nf) { - //Typical pre-top down workflow. set_frontier_degree + exclusive-scan - set_frontier_degree(frontier_vertex_degree, - new_frontier, - vertex_degree, - nf, - stream); - exclusive_sum(d_cub_exclusive_sum_storage, - cub_exclusive_sum_storage_bytes, - frontier_vertex_degree, - exclusive_sum_frontier_vertex_degree, - nf + 1, - stream); - cudaMemcpyAsync(&mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - - //We need mf - cudaStreamSynchronize(stream); - } - break; - - case BOTTOMUP: - fill_unvisited_queue(visited_bmap, - vertices_bmap_size, - n, - unvisited_queue, - d_unvisited_cnt, - stream, - deterministic); - - size_last_unvisited_queue = nu; - - bottom_up_main(unvisited_queue, - size_last_unvisited_queue, - left_unvisited_queue, - d_left_unvisited_cnt, - visited_bmap, - row_offsets, - col_indices, - lvl, - new_frontier, - d_new_frontier_cnt, - distances, - predecessors, - edge_mask, - stream, - deterministic); - - //The number of vertices left unvisited decreases - //If it wasnt necessary last time, it wont be this time - if (size_last_left_unvisited_queue) { - cudaMemcpyAsync(&size_last_left_unvisited_queue, - d_left_unvisited_cnt, - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError() - //We need last_left_unvisited_size - cudaStreamSynchronize(stream); - bottom_up_large(left_unvisited_queue, - size_last_left_unvisited_queue, - visited_bmap, - row_offsets, - col_indices, - lvl, - new_frontier, - d_new_frontier_cnt, - distances, - predecessors, - edge_mask, - stream, - deterministic); - } - cudaMemcpyAsync(&nf, - d_new_frontier_cnt, - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError() - - //We will need nf - cudaStreamSynchronize(stream); - break; - } - - //Updating undiscovered edges count - nu -= nf; - - //Using new frontier - frontier = new_frontier; - growing = (nf > old_nf); - - ++lvl; - } - } - - template - void Bfs::resetDevicePointers() { - cudaMemsetAsync(d_counters_pad, 0, 4 * sizeof(IndexType), stream); - } - - template - void Bfs::clean() { - //the vectors have a destructor that takes care of cleaning - ALLOC_FREE_TRY(original_frontier, nullptr); - 
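The traverse() loop removed above implements Beamer-style direction-optimizing BFS: it switches to bottom-up once the frontier's outgoing edges (mf) dominate the remaining undiscovered edges (mu), and returns to top-down when the frontier stops growing and becomes small relative to n. A compact restatement of that switching rule, with alpha and beta kept as the tuning constants the deleted file used:

// Sketch of the finite state machine in the deleted traverse(); mf, mu, nf,
// n and growing carry the same meaning as in that code, and alpha/beta are
// the heuristic constants from the removed bfs sources.
enum class Direction { TopDown, BottomUp };

template <typename IndexType>
Direction next_direction(Direction current,
                         IndexType mf,        // edges incident to the frontier
                         IndexType mu,        // edges still undiscovered
                         IndexType nf,        // vertices in the frontier
                         IndexType n,         // total vertices
                         bool growing,        // frontier grew since last level
                         IndexType alpha,
                         IndexType beta)
{
    if (current == Direction::TopDown && mf > mu / alpha)
        return Direction::BottomUp;   // frontier touches too many edges
    if (current == Direction::BottomUp && !growing && nf < n / beta)
        return Direction::TopDown;    // frontier shrank enough again
    return current;
}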
ALLOC_FREE_TRY(visited_bmap, nullptr); - ALLOC_FREE_TRY(isolated_bmap, nullptr); - ALLOC_FREE_TRY(vertex_degree, nullptr); - ALLOC_FREE_TRY(d_cub_exclusive_sum_storage, nullptr); - ALLOC_FREE_TRY(buffer_np1_1, nullptr); - ALLOC_FREE_TRY(buffer_np1_2, nullptr); - ALLOC_FREE_TRY(exclusive_sum_frontier_vertex_buckets_offsets, nullptr); - ALLOC_FREE_TRY(d_counters_pad, nullptr); - - //In that case, distances is a working data - if (directed && !computeDistances) - ALLOC_FREE_TRY(distances, nullptr); - } - - template class Bfs ; -} // end namespace cugraph diff --git a/cpp/src/bfs_kernels.cuh b/cpp/src/bfs_kernels.cuh deleted file mode 100644 index c12ac40f533..00000000000 --- a/cpp/src/bfs_kernels.cuh +++ /dev/null @@ -1,1575 +0,0 @@ -/* - * Copyright (c) 2018 NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include - -#include -#include - -#define MAXBLOCKS 65535 -#define WARP_SIZE 32 -#define INT_SIZE 32 - -// -// Bottom up macros -// - -#define FILL_UNVISITED_QUEUE_DIMX 256 - -#define COUNT_UNVISITED_EDGES_DIMX 256 - -#define MAIN_BOTTOMUP_DIMX 256 -#define MAIN_BOTTOMUP_NWARPS (MAIN_BOTTOMUP_DIMX/WARP_SIZE) - -#define LARGE_BOTTOMUP_DIMX 256 - -//Number of edges processed in the main bottom up kernel -#define MAIN_BOTTOMUP_MAX_EDGES 6 - -//Power of 2 < 32 (strict <) -#define BOTTOM_UP_LOGICAL_WARP_SIZE 4 - -// -// Top down macros -// - -// We will precompute the results the binsearch_maxle every TOP_DOWN_BUCKET_SIZE edges -#define TOP_DOWN_BUCKET_SIZE 32 - -// DimX of the kernel -#define TOP_DOWN_EXPAND_DIMX 256 - -// TOP_DOWN_EXPAND_DIMX edges -> NBUCKETS_PER_BLOCK buckets -#define NBUCKETS_PER_BLOCK (TOP_DOWN_EXPAND_DIMX/TOP_DOWN_BUCKET_SIZE) - -// How many items_per_thread we can process with one bucket_offset loading -// the -1 is here because we need the +1 offset -#define MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD (TOP_DOWN_BUCKET_SIZE - 1) - -// instruction parallelism -// for how many edges will we create instruction parallelism -#define TOP_DOWN_BATCH_SIZE 2 - -#define COMPUTE_BUCKET_OFFSETS_DIMX 512 - -//Other macros - -#define FLAG_ISOLATED_VERTICES_DIMX 128 - -//Number of vertices handled by one thread -//Must be power of 2, lower than 32 -#define FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD 4 - -//Number of threads involved in the "construction" of one int in the bitset -#define FLAG_ISOLATED_VERTICES_THREADS_PER_INT (INT_SIZE/FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD) - -// -// Parameters of the heuristic to switch between bottomup/topdown -//Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf -// - -using namespace cugraph; - -namespace bfs_kernels { - // - // gives the equivalent vectors from a type - // for the max val, would be better to use numeric_limits<>::max() once - // cpp11 is allowed in nvgraph - // - - template - struct vec_t { - typedef int4 vec4; - typedef int2 vec2; - }; - - template<> - struct vec_t { - typedef int4 vec4; - typedef int2 vec2; - static const int max = INT_MAX; - }; - - 
template<> - struct vec_t { - typedef longlong4 vec4; - typedef longlong2 vec2; - static const long long int max = LLONG_MAX; - }; - - // - // ------------------------- Helper device functions ------------------- - // - - __forceinline__ __device__ int getMaskNRightmostBitSet(int n) { - if (n == INT_SIZE) - return (~0); - int mask = (1 << n) - 1; - return mask; - } - - __forceinline__ __device__ int getMaskNLeftmostBitSet(int n) { - if (n == 0) - return 0; - int mask = ~((1 << (INT_SIZE - n)) - 1); - return mask; - } - - __forceinline__ __device__ int getNextZeroBit(int& val) { - int ibit = __ffs(~val) - 1; - val |= (1 << ibit); - - return ibit; - } - - struct BitwiseAnd - { - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return (a & b); - } - }; - - struct BitwiseOr - { - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return (a | b); - } - }; - - template - __device__ IndexType binsearch_maxle( const IndexType *vec, - const IndexType val, - IndexType low, - IndexType high) { - while (true) { - if (low == high) - return low; //we know it exists - if ((low + 1) == high) - return (vec[high] <= val) ? high : low; - - IndexType mid = low + (high - low) / 2; - - if (vec[mid] > val) - high = mid - 1; - else - low = mid; - - } - } - - // - // ------------------------- Bottom up ------------------------- - // - - // - // fill_unvisited_queue_kernel - // - // Finding unvisited vertices in the visited_bmap, and putting them in the queue - // Vertices represented by the same int in the bitmap are adjacent in the queue, and sorted - // For instance, the queue can look like this : - // 34 38 45 58 61 4 18 24 29 71 84 85 90 - // Because they are represented by those ints in the bitmap : - // [34 38 45 58 61] [4 18 24 29] [71 84 85 90] - - //visited_bmap_nints = the visited_bmap is made of that number of ints - - template - __global__ void fill_unvisited_queue_kernel( int *visited_bmap, - IndexType visited_bmap_nints, - IndexType n, - IndexType *unvisited, - IndexType *unvisited_cnt) { - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage scan_temp_storage; - - //When filling the "unvisited" queue, we use "unvisited_cnt" to know where to write in the queue (equivalent of int off = atomicAddd(unvisited_cnt, 1) ) - //We will actually do only one atomicAdd per block - we first do a scan, then call one atomicAdd, and store the common offset for the block in - //unvisited_common_block_offset - __shared__ IndexType unvisited_common_block_offset; - - //We don't want threads divergence in the loop (we're going to call __syncthreads) - //Using a block-only dependent in the condition of the loop - for (IndexType block_v_idx = blockIdx.x * blockDim.x; - block_v_idx < visited_bmap_nints; - block_v_idx += blockDim.x * gridDim.x) { - - //Index of visited_bmap that this thread will compute - IndexType v_idx = block_v_idx + threadIdx.x; - - int thread_visited_int = (v_idx < visited_bmap_nints) - ? 
visited_bmap[v_idx] - : - (~0); //will be neutral in the next lines (virtual vertices all visited) - - //The last int can only be partially valid - //If we are indeed taking care of the last visited int in this thread, - //We need to first disable (ie set as "visited") the inactive bits (vertices >= n) - if (v_idx == (visited_bmap_nints - 1)) { - int active_bits = n - (INT_SIZE * v_idx); - int inactive_bits = INT_SIZE - active_bits; - int mask = getMaskNLeftmostBitSet(inactive_bits); - thread_visited_int |= mask; //Setting inactive bits as visited - } - - //Counting number of unvisited vertices represented by this int - int n_unvisited_in_int = __popc(~thread_visited_int); - int unvisited_thread_offset; - - //We will need to write n_unvisited_in_int unvisited vertices to the unvisited queue - //We ask for that space when computing the block scan, that will tell where to write those - //vertices in the queue, using the common offset of the block (see below) - BlockScan(scan_temp_storage).ExclusiveSum(n_unvisited_in_int, unvisited_thread_offset); - - //Last thread knows how many vertices will be written to the queue by this block - //Asking for that space in the queue using the global count, and saving the common offset - if (threadIdx.x == (FILL_UNVISITED_QUEUE_DIMX - 1)) { - IndexType total = unvisited_thread_offset + n_unvisited_in_int; - unvisited_common_block_offset = atomicAdd(unvisited_cnt, total); - } - - //syncthreads for two reasons : - // - we need to broadcast unvisited_common_block_offset - // - we will reuse scan_temp_storage (cf CUB doc) - __syncthreads(); - - IndexType current_unvisited_index = unvisited_common_block_offset - + unvisited_thread_offset; - int nvertices_to_write = n_unvisited_in_int; - - // getNextZeroBit uses __ffs, which gives least significant bit set - // which means that as long as n_unvisited_in_int is valid, - // we will use valid bits - - while (nvertices_to_write > 0) { - if (nvertices_to_write >= 4 && (current_unvisited_index % 4) == 0) { - typename vec_t::vec4 vec_v; - - vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - vec_v.z = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - vec_v.w = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - - typename vec_t::vec4 *unvisited_i4 = reinterpret_cast::vec4*>(&unvisited[current_unvisited_index]); - *unvisited_i4 = vec_v; - - current_unvisited_index += 4; - nvertices_to_write -= 4; - } - else if (nvertices_to_write >= 2 && (current_unvisited_index % 2) == 0) { - typename vec_t::vec2 vec_v; - - vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - - typename vec_t::vec2 *unvisited_i2 = reinterpret_cast::vec2*>(&unvisited[current_unvisited_index]); - *unvisited_i2 = vec_v; - - current_unvisited_index += 2; - nvertices_to_write -= 2; - } else { - IndexType v = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - - unvisited[current_unvisited_index] = v; - - current_unvisited_index += 1; - nvertices_to_write -= 1; - } - - } - } - } - - //Wrapper - template - void fill_unvisited_queue( int *visited_bmap, - IndexType visited_bmap_nints, - IndexType n, - IndexType *unvisited, - IndexType *unvisited_cnt, - cudaStream_t m_stream, - bool deterministic) { - dim3 grid, block; - block.x = FILL_UNVISITED_QUEUE_DIMX; - - grid.x = min((IndexType) MAXBLOCKS, (visited_bmap_nints + block.x - 1) / block.x); - - fill_unvisited_queue_kernel<<>>( 
visited_bmap, - visited_bmap_nints, - n, - unvisited, - unvisited_cnt); - cudaCheckError() - ; - } - - // - // count_unvisited_edges_kernel - // Couting the total number of unvisited edges in the graph - using an potentially unvisited queue - // We need the current unvisited vertices to be in the unvisited queue - // But visited vertices can be in the potentially_unvisited queue - // We first check if the vertex is still unvisited before using it - // Useful when switching from "Bottom up" to "Top down" - // - - template - __global__ void count_unvisited_edges_kernel(const IndexType *potentially_unvisited, - const IndexType potentially_unvisited_size, - const int *visited_bmap, - IndexType *degree_vertices, - IndexType *mu) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage reduce_temp_storage; - - //number of undiscovered edges counted by this thread - IndexType thread_unvisited_edges_count = 0; - - for (IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < potentially_unvisited_size; - idx += blockDim.x * gridDim.x) { - - IndexType u = potentially_unvisited[idx]; - int u_visited_bmap = visited_bmap[u / INT_SIZE]; - int is_visited = u_visited_bmap & (1 << (u % INT_SIZE)); - - if (!is_visited) - thread_unvisited_edges_count += degree_vertices[u]; - - } - - //We need all thread_unvisited_edges_count to be ready before reducing - __syncthreads(); - - IndexType block_unvisited_edges_count = - BlockReduce(reduce_temp_storage).Sum(thread_unvisited_edges_count); - - //block_unvisited_edges_count is only defined is th.x == 0 - if (threadIdx.x == 0) - atomicAdd(mu, block_unvisited_edges_count); - } - - //Wrapper - template - void count_unvisited_edges(const IndexType *potentially_unvisited, - const IndexType potentially_unvisited_size, - const int *visited_bmap, - IndexType *node_degree, - IndexType *mu, - cudaStream_t m_stream) { - dim3 grid, block; - block.x = COUNT_UNVISITED_EDGES_DIMX; - grid.x = min((IndexType) MAXBLOCKS, (potentially_unvisited_size + block.x - 1) / block.x); - - count_unvisited_edges_kernel<<>>( potentially_unvisited, - potentially_unvisited_size, - visited_bmap, - node_degree, - mu); - cudaCheckError() - ; - } - - // - // Main Bottom Up kernel - // Here we will start to process unvisited vertices in the unvisited queue - // We will only consider the first MAIN_BOTTOMUP_MAX_EDGES edges - // If it's not possible to define a valid parent using only those edges, - // add it to the "left_unvisited_queue" - // - - // - // We will use the "vertices represented by the same int in the visited bmap are adjacents and sorted in the unvisited queue" property - // It is used to do a reduction locally and fully build the new visited_bmap - // - - template - __global__ void main_bottomup_kernel( const IndexType *unvisited, - const IndexType unvisited_size, - IndexType *left_unvisited, - IndexType *left_unvisited_cnt, - int *visited_bmap, - const IndexType *row_ptr, - const IndexType *col_ind, - IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - IndexType *distances, - IndexType *predecessors, - int *edge_mask) { - typedef cub::BlockDiscontinuity BlockDiscontinuity; - typedef cub::WarpReduce WarpReduce; - typedef cub::BlockScan BlockScan; - - __shared__ typename BlockDiscontinuity::TempStorage discontinuity_temp_storage; - __shared__ typename WarpReduce::TempStorage reduce_temp_storage; - __shared__ typename BlockScan::TempStorage scan_temp_storage; - - //To write vertices in the frontier, - //We will use a block scan to 
locally compute the offsets - //frontier_common_block_offset contains the common offset for the block - __shared__ IndexType frontier_common_block_offset; - - // When building the new visited_bmap, we reduce (using a bitwise and) the visited_bmap ints - // from the vertices represented by the same int (for instance vertices 1, 5, 9, 13, 23) - // vertices represented by the same int will be designed as part of the same "group" - // To detect the deliminations between those groups, we use BlockDiscontinuity - // Then we need to create the new "visited_bmap" within those group. - // We use a warp reduction that takes into account limits between groups to do it - // But a group can be cut in two different warps : in that case, the second warp - // put the result of its local reduction in local_visited_bmap_warp_head - // the first warp will then read it and finish the reduction - - __shared__ int local_visited_bmap_warp_head[MAIN_BOTTOMUP_NWARPS]; - - const int warpid = threadIdx.x / WARP_SIZE; - const int laneid = threadIdx.x % WARP_SIZE; - - // we will call __syncthreads inside the loop - // we need to keep complete block active - for (IndexType block_off = blockIdx.x * blockDim.x; - block_off < unvisited_size; - block_off += blockDim.x * gridDim.x) - { - IndexType idx = block_off + threadIdx.x; - - // This thread will take care of unvisited_vertex - // in the visited_bmap, it is represented by the int at index - // visited_bmap_index = unvisited_vertex/INT_SIZE - // it will be used by BlockDiscontinuity - // to flag the separation between groups of vertices (vertices represented by different in in visited_bmap) - IndexType visited_bmap_index[1]; //this is an array of size 1 because CUB needs one - visited_bmap_index[0] = -1; - IndexType unvisited_vertex = -1; - - // local_visited_bmap gives info on the visited bit of unvisited_vertex - // - // By default, everything is visited - // This is because we only take care of unvisited vertices here, - // The other are by default unvisited - // If a vertex remain unvisited, we will notice it here - // That's why by default we consider everything visited ( ie ~0 ) - // If we fail to assign one parent to an unvisited vertex, we will - // explicitly unset the bit - int local_visited_bmap = (~0); - int found = 0; - int more_to_visit = 0; - IndexType valid_parent; - IndexType left_unvisited_off; - - if (idx < unvisited_size) - { - //Processing first STPV edges of unvisited v - //If bigger than that, push to left_unvisited queue - unvisited_vertex = unvisited[idx]; - - IndexType edge_begin = row_ptr[unvisited_vertex]; - IndexType edge_end = row_ptr[unvisited_vertex + 1]; - - visited_bmap_index[0] = unvisited_vertex / INT_SIZE; - - IndexType degree = edge_end - edge_begin; - - for (IndexType edge = edge_begin; - edge < min(edge_end, edge_begin + MAIN_BOTTOMUP_MAX_EDGES); ++edge) - { - if (edge_mask && !edge_mask[edge]) - continue; - - IndexType parent_candidate = col_ind[edge]; - - if (distances[parent_candidate] == (lvl - 1)) - { - found = 1; - valid_parent = parent_candidate; - break; - } - } - - // This vertex will remain unvisited at the end of this kernel - // Explicitly say it - if (!found) - local_visited_bmap &= ~(1 << (unvisited_vertex % INT_SIZE)); //let this one unvisited - else - { - if (distances) - distances[unvisited_vertex] = lvl; - if (predecessors) - predecessors[unvisited_vertex] = valid_parent; - } - - //If we haven't found a parent and there's more edge to check - if (!found && degree > MAIN_BOTTOMUP_MAX_EDGES) - { - 
left_unvisited_off = atomicAdd(left_unvisited_cnt, (IndexType) 1); - more_to_visit = 1; - } - - } - - // - // We will separate vertices in group - // Two vertices are in the same group if represented by same int in visited_bmap - // ie u and v in same group <=> u/32 == v/32 - // - // We will now flag the head of those group (first element of each group) - // - // 1) All vertices within the same group are adjacent in the queue (cf fill_unvisited_queue) - // 2) A group is of size <= 32, so a warp will contain at least one head, and a group will be contained - // at most by two warps - - int is_head_a[1]; //CUB need an array - BlockDiscontinuity(discontinuity_temp_storage).FlagHeads(is_head_a, - visited_bmap_index, - cub::Inequality()); - int is_head = is_head_a[0]; - - // Computing the warp reduce within group - // This primitive uses the is_head flags to know where the limits of the groups are - // We use bitwise and as operator, because of the fact that 1 is the default value - // If a vertex is unvisited, we have to explicitly ask for it - int local_bmap_agg = - WarpReduce(reduce_temp_storage).HeadSegmentedReduce( local_visited_bmap, - is_head, - BitwiseAnd()); - - // We need to take care of the groups cut in two in two different warps - // Saving second part of the reduce here, then applying it on the first part bellow - // Corner case : if the first thread of the warp is a head, then this group is not cut in two - // and then we have to be neutral (for an bitwise and, it's an ~0) - if (laneid == 0) - { - local_visited_bmap_warp_head[warpid] = (is_head) ? (~0) : local_bmap_agg; - } - - //broadcasting local_visited_bmap_warp_head - __syncthreads(); - - int head_ballot = cugraph::utils::ballot(is_head); - - //As long as idx < unvisited_size, we know there's at least one head per warp - int laneid_last_head_in_warp = INT_SIZE - 1 - __clz(head_ballot); - - int is_last_head_in_warp = (laneid == laneid_last_head_in_warp); - - // if laneid == 0 && is_last_head_in_warp, it's a special case where - // a group of size 32 starts exactly at lane 0 - // in that case, nothing to do (this group is not cut by a warp delimitation) - // we also have to make sure that a warp actually exists after this one (this corner case is handled after) - if (laneid != 0 && (is_last_head_in_warp & (warpid + 1) < MAIN_BOTTOMUP_NWARPS)) - { - local_bmap_agg &= local_visited_bmap_warp_head[warpid + 1]; - } - - //Three cases : - // -> This is the first group of the block - it may be cut in two (with previous block) - // -> This is the last group of the block - same thing - // -> This group is completely contained in this block - - if (warpid == 0 && laneid == 0) - { - //The first elt of this group considered in this block is unvisited_vertex - //We know that's the case because elts are sorted in a group, and we are at laneid == 0 - //We will do an atomicOr - we have to be neutral about elts < unvisited_vertex - int iv = unvisited_vertex % INT_SIZE; // we know that this unvisited_vertex is valid - int mask = getMaskNLeftmostBitSet(INT_SIZE - iv); - local_bmap_agg &= mask; //we have to be neutral for elts < unvisited_vertex - atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); - } - else if (warpid == (MAIN_BOTTOMUP_NWARPS - 1) && - laneid >= laneid_last_head_in_warp && // We need the other ones to go in else case - idx < unvisited_size //we could be out - ) - { - //Last head of the block - //We don't know if this group is complete - - //last_v is the last unvisited_vertex of the group IN THIS block - //we 
dont know about the rest - we have to be neutral about elts > last_v - - //the destination thread of the __shfl is active - int laneid_max = min((IndexType) (WARP_SIZE - 1), - (unvisited_size - (block_off + 32 * warpid))); - IndexType last_v = cugraph::utils::shfl( unvisited_vertex, - laneid_max, - WARP_SIZE, - __activemask()); - - if (is_last_head_in_warp) - { - int ilast_v = last_v % INT_SIZE + 1; - int mask = getMaskNRightmostBitSet(ilast_v); - local_bmap_agg &= mask; //we have to be neutral for elts > last_unvisited_vertex - atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); - } - } - else - { - //group completely in block - if (is_head && idx < unvisited_size) { - visited_bmap[unvisited_vertex / INT_SIZE] = local_bmap_agg; //no atomics needed, we know everything about this int - } - } - - //Saving in frontier - - int thread_frontier_offset; - BlockScan(scan_temp_storage).ExclusiveSum(found, thread_frontier_offset); - IndexType inclusive_sum = thread_frontier_offset + found; - if (threadIdx.x == (MAIN_BOTTOMUP_DIMX - 1) && inclusive_sum) - { - frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); - } - - //1) Broadcasting frontier_common_block_offset - //2) we want to reuse the *_temp_storage - __syncthreads(); - - if (found) - new_frontier[frontier_common_block_offset + thread_frontier_offset] = unvisited_vertex; - if (more_to_visit) - left_unvisited[left_unvisited_off] = unvisited_vertex; - - } - } - - template - void bottom_up_main( IndexType *unvisited, - IndexType unvisited_size, - IndexType *left_unvisited, - IndexType *d_left_unvisited_idx, - int *visited, - const IndexType *row_ptr, - const IndexType *col_ind, - IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_idx, - IndexType *distances, - IndexType *predecessors, - int *edge_mask, - cudaStream_t m_stream, - bool deterministic) { - dim3 grid, block; - block.x = MAIN_BOTTOMUP_DIMX; - - grid.x = min((IndexType) MAXBLOCKS, ((unvisited_size + block.x - 1)) / block.x); - - main_bottomup_kernel<<>>(unvisited, - unvisited_size, - left_unvisited, - d_left_unvisited_idx, - visited, - row_ptr, - col_ind, - lvl, - new_frontier, - new_frontier_idx, - distances, - predecessors, - edge_mask); - cudaCheckError() - ; - } - - // - // bottom_up_large_degree_kernel - // finishing the work started in main_bottomup_kernel for vertex with degree > MAIN_BOTTOMUP_MAX_EDGES && no parent found - // - template - __global__ void bottom_up_large_degree_kernel( IndexType *left_unvisited, - IndexType left_unvisited_size, - int *visited, - const IndexType *row_ptr, - const IndexType *col_ind, - IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - IndexType *distances, - IndexType *predecessors, - int *edge_mask) { - - int logical_lane_id = threadIdx.x % BOTTOM_UP_LOGICAL_WARP_SIZE; - int logical_warp_id = threadIdx.x / BOTTOM_UP_LOGICAL_WARP_SIZE; - int logical_warps_per_block = blockDim.x / BOTTOM_UP_LOGICAL_WARP_SIZE; - - //Inactive threads are not a pb for __ballot (known behaviour) - for (IndexType idx = logical_warps_per_block * blockIdx.x + logical_warp_id; - idx < left_unvisited_size; - idx += gridDim.x * logical_warps_per_block) { - - //Unvisited vertices - potentially in the next frontier - IndexType v = left_unvisited[idx]; - - //Used only with symmetric graphs - //Parents are included in v's neighbors - IndexType first_i_edge = row_ptr[v] + MAIN_BOTTOMUP_MAX_EDGES; //we already have checked the first MAIN_BOTTOMUP_MAX_EDGES edges in find_unvisited - - IndexType 
end_i_edge = row_ptr[v + 1]; - - //We can have warp divergence in the next loop - //It's not a pb because the behaviour of __ballot - //is know with inactive threads - for (IndexType i_edge = first_i_edge + logical_lane_id; - i_edge < end_i_edge; - i_edge += BOTTOM_UP_LOGICAL_WARP_SIZE) { - - IndexType valid_parent = -1; - - if (!edge_mask || edge_mask[i_edge]) { - IndexType u = col_ind[i_edge]; - IndexType lvl_u = distances[u]; - - if (lvl_u == (lvl - 1)) { - valid_parent = u; - } - } - - unsigned int warp_valid_p_ballot = cugraph::utils::ballot((valid_parent != -1)); - - int logical_warp_id_in_warp = (threadIdx.x % WARP_SIZE) / BOTTOM_UP_LOGICAL_WARP_SIZE; - unsigned int mask = (1 << BOTTOM_UP_LOGICAL_WARP_SIZE) - 1; - unsigned int logical_warp_valid_p_ballot = warp_valid_p_ballot - >> (BOTTOM_UP_LOGICAL_WARP_SIZE * logical_warp_id_in_warp); - logical_warp_valid_p_ballot &= mask; - - int chosen_thread = __ffs(logical_warp_valid_p_ballot) - 1; - - if (chosen_thread == logical_lane_id) { - //Using only one valid parent (reduce bw) - IndexType off = atomicAdd(new_frontier_cnt, (IndexType) 1); - int m = 1 << (v % INT_SIZE); - atomicOr(&visited[v / INT_SIZE], m); - distances[v] = lvl; - - if (predecessors) - predecessors[v] = valid_parent; - - new_frontier[off] = v; - } - - if (logical_warp_valid_p_ballot) { - break; - } - } - - } - } - - template - void bottom_up_large(IndexType *left_unvisited, - IndexType left_unvisited_size, - int *visited, - const IndexType *row_ptr, - const IndexType *col_ind, - IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_idx, - IndexType *distances, - IndexType *predecessors, - int *edge_mask, - cudaStream_t m_stream, - bool deterministic) { - dim3 grid, block; - block.x = LARGE_BOTTOMUP_DIMX; - grid.x = min( (IndexType) MAXBLOCKS, - ((left_unvisited_size + block.x - 1) * BOTTOM_UP_LOGICAL_WARP_SIZE) / block.x); - - bottom_up_large_degree_kernel<<>>(left_unvisited, - left_unvisited_size, - visited, - row_ptr, - col_ind, - lvl, - new_frontier, - new_frontier_idx, - distances, - predecessors, - edge_mask); - cudaCheckError() - ; - } - - // - // - // ------------------------------ Top down ------------------------------ - // - // - - // - // compute_bucket_offsets_kernel - // simply compute the position in the frontier corresponding all valid edges with index=TOP_DOWN_BUCKET_SIZE * k, k integer - // - - template - __global__ void compute_bucket_offsets_kernel( const IndexType *frontier_degrees_exclusive_sum, - IndexType *bucket_offsets, - const IndexType frontier_size, - IndexType total_degree) { - IndexType end = ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX - * NBUCKETS_PER_BLOCK + 1); - - for (IndexType bid = blockIdx.x * blockDim.x + threadIdx.x; - bid <= end; - bid += gridDim.x * blockDim.x) { - - IndexType eid = min(bid * TOP_DOWN_BUCKET_SIZE, total_degree - 1); - - bucket_offsets[bid] = binsearch_maxle( frontier_degrees_exclusive_sum, - eid, - (IndexType) 0, - frontier_size - 1); - - } - } - - template - void compute_bucket_offsets( IndexType *cumul, - IndexType *bucket_offsets, - IndexType frontier_size, - IndexType total_degree, - cudaStream_t m_stream) { - dim3 grid, block; - block.x = COMPUTE_BUCKET_OFFSETS_DIMX; - - grid.x = min( (IndexType) MAXBLOCKS, - ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX - * NBUCKETS_PER_BLOCK + 1 + block.x - 1) / block.x); - - compute_bucket_offsets_kernel<<>>(cumul, - bucket_offsets, - frontier_size, - total_degree); - cudaCheckError() - ; - } - - // - // 
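In main_bottomup_kernel above, each unvisited vertex scans at most MAIN_BOTTOMUP_MAX_EDGES of its neighbors for one sitting on the previous BFS level; if none is found and edges remain, the vertex is deferred to the left_unvisited queue and finished by bottom_up_large_degree_kernel. A serial sketch of that per-vertex rule under the same array names (the CUDA version additionally updates the visited bitmap with warp-level reductions, which this sketch leaves out):

#include <algorithm>

// Returns true if a parent on level lvl-1 was found within the first
// max_edges edges of v; sets defer_to_large when the remaining edges still
// need the large-degree pass. Illustrative host-side restatement only.
template <typename IndexType>
bool bottom_up_try_vertex(IndexType v, IndexType lvl,
                          const IndexType* row_ptr, const IndexType* col_ind,
                          const int* edge_mask,
                          IndexType* distances, IndexType* predecessors,
                          IndexType max_edges, bool& defer_to_large)
{
    IndexType edge_begin = row_ptr[v];
    IndexType edge_end   = row_ptr[v + 1];
    defer_to_large = false;

    for (IndexType e = edge_begin; e < std::min(edge_end, edge_begin + max_edges); ++e) {
        if (edge_mask && !edge_mask[e])
            continue;                              // masked-out edge
        IndexType candidate = col_ind[e];
        if (distances[candidate] == lvl - 1) {     // neighbor is on the previous level
            distances[v] = lvl;
            if (predecessors)
                predecessors[v] = candidate;
            return true;
        }
    }
    // No parent among the first max_edges edges; high-degree vertices are
    // retried by bottom_up_large_degree_kernel.
    defer_to_large = (edge_end - edge_begin) > max_edges;
    return false;
}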
topdown_expand_kernel - // Read current frontier and compute new one with top down paradigm - // One thread = One edge - // To know origin of edge, we have to find where is index_edge in the values of frontier_degrees_exclusive_sum (using a binary search, max less or equal than) - // This index k will give us the origin of this edge, which is frontier[k] - // This thread will then process the (linear_idx_thread - frontier_degrees_exclusive_sum[k])-ith edge of vertex frontier[k] - // - // To process blockDim.x = TOP_DOWN_EXPAND_DIMX edges, we need to first load NBUCKETS_PER_BLOCK bucket offsets - those will help us do the binary searches - // We can load up to TOP_DOWN_EXPAND_DIMX of those bucket offsets - that way we prepare for the next MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x edges - // - // Once we have those offsets, we may still need a few values from frontier_degrees_exclusive_sum to compute exact index k - // To be able to do it, we will load the values that we need from frontier_degrees_exclusive_sum in shared memory - // We know that it will fit because we never add node with degree == 0 in the frontier, so we have an upper bound on the number of value to load (see below) - // - // We will then look which vertices are not visited yet : - // 1) if the unvisited vertex is isolated (=> degree == 0), we mark it as visited, update distances and predecessors, and move on - // 2) if the unvisited vertex has degree > 0, we add it to the "frontier_candidates" queue - // - // We then treat the candidates queue using the threadIdx.x < ncandidates - // If we are indeed the first thread to discover that vertex (result of atomicOr(visited)) - // We add it to the new frontier - // - - template - __global__ void topdown_expand_kernel( const IndexType *row_ptr, - const IndexType *col_ind, - const IndexType *frontier, - const IndexType frontier_size, - const IndexType totaldegree, - const IndexType max_items_per_thread, - const IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - const IndexType *frontier_degrees_exclusive_sum, - const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, - int *bmap, - IndexType *distances, - IndexType *predecessors, - const int *edge_mask, - const int *isolated_bmap, - bool directed) { - //BlockScan - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage scan_storage; - - // We will do a scan to know where to write in frontier - // This will contain the common offset of the block - __shared__ IndexType frontier_common_block_offset; - - __shared__ IndexType shared_buckets_offsets[TOP_DOWN_EXPAND_DIMX - NBUCKETS_PER_BLOCK + 1]; - __shared__ IndexType shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX + 1]; - - // - // Frontier candidates local queue - // We process TOP_DOWN_BATCH_SIZE vertices in parallel, so we need to be able to store everything - // We also save the predecessors here, because we will not be able to retrieve it after - // - __shared__ IndexType shared_local_new_frontier_candidates[TOP_DOWN_BATCH_SIZE - * TOP_DOWN_EXPAND_DIMX]; - __shared__ IndexType shared_local_new_frontier_predecessors[TOP_DOWN_BATCH_SIZE - * TOP_DOWN_EXPAND_DIMX]; - __shared__ IndexType block_n_frontier_candidates; - - IndexType block_offset = (blockDim.x * blockIdx.x) * max_items_per_thread; - IndexType n_items_per_thread_left = (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1) - / TOP_DOWN_EXPAND_DIMX; - - n_items_per_thread_left = min(max_items_per_thread, n_items_per_thread_left); - - for (; - 
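// Editor's sketch (not part of the patch): the comment block above reduces top-down
// expansion to one question per edge: find the largest k with
// frontier_degrees_exclusive_sum[k] <= edge index. A host-side version of that
// max-less-or-equal search; the device binsearch_maxle used here is assumed to behave
// the same way.
template <typename IndexType>
IndexType binsearch_maxle_host(const IndexType *prefix, IndexType val,
                               IndexType left, IndexType right) {
  // assumes prefix is non-decreasing and prefix[left] <= val
  while (left < right) {
    IndexType mid = left + (right - left + 1) / 2;  // upper mid so the range shrinks
    if (prefix[mid] <= val)
      left = mid;        // mid still satisfies the predicate, search to its right
    else
      right = mid - 1;   // mid is too far, search to its left
  }
  return left;
}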
(n_items_per_thread_left > 0) && (block_offset < totaldegree); - - block_offset += MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x, - n_items_per_thread_left -= MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD) { - - // In this loop, we will process batch_set_size batches - IndexType nitems_per_thread = min( n_items_per_thread_left, - (IndexType) MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD); - - // Loading buckets offset (see compute_bucket_offsets_kernel) - - if (threadIdx.x < (nitems_per_thread * NBUCKETS_PER_BLOCK + 1)) - shared_buckets_offsets[threadIdx.x] = - frontier_degrees_exclusive_sum_buckets_offsets[block_offset / TOP_DOWN_BUCKET_SIZE - + threadIdx.x]; - - // We will use shared_buckets_offsets - __syncthreads(); - - // - // shared_buckets_offsets gives us a range of the possible indexes - // for edge of linear_threadx, we are looking for the value k such as - // k is the max value such as frontier_degrees_exclusive_sum[k] <= linear_threadx - // - // we have 0 <= k < frontier_size - // but we also have : - // - // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE] - // <= k - // <= frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE + 1] - // - // To find the exact value in that range, we need a few values from frontier_degrees_exclusive_sum (see below) - // We will load them here - // We will load as much as we can - if it doesn't fit we will make multiple iteration of the next loop - // Because all vertices in frontier have degree > 0, we know it will fits if left + 1 = right (see below) - - //We're going to load values in frontier_degrees_exclusive_sum for batch [left; right[ - //If it doesn't fit, --right until it does, then loop - //It is excepted to fit on the first try, that's why we start right = nitems_per_thread - - IndexType left = 0; - IndexType right = nitems_per_thread; - - while (left < nitems_per_thread) { - // - // Values that are necessary to compute the local binary searches - // We only need those with indexes between extremes indexes of buckets_offsets - // We need the next val for the binary search, hence the +1 - // - - IndexType nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] - - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; - - //If left = right + 1 we are sure to have nvalues_to_load < TOP_DOWN_EXPAND_DIMX+1 - while (nvalues_to_load > (TOP_DOWN_EXPAND_DIMX + 1)) { - --right; - - nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] - - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; - } - - IndexType nitems_per_thread_for_this_load = right - left; - - IndexType frontier_degrees_exclusive_sum_block_offset = shared_buckets_offsets[left - * NBUCKETS_PER_BLOCK]; - - if (threadIdx.x < nvalues_to_load) { - shared_frontier_degrees_exclusive_sum[threadIdx.x] = - frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset - + threadIdx.x]; - } - - if (nvalues_to_load == (TOP_DOWN_EXPAND_DIMX + 1) && threadIdx.x == 0) { - shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX] = - frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset - + TOP_DOWN_EXPAND_DIMX]; - } - - //shared_frontier_degrees_exclusive_sum is in shared mem, we will use it, sync - __syncthreads(); - - // Now we will process the edges - // Here each thread will process nitems_per_thread_for_this_load - for (IndexType item_index = 0; - item_index < nitems_per_thread_for_this_load; - item_index += TOP_DOWN_BATCH_SIZE) { - - // We process TOP_DOWN_BATCH_SIZE 
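// Editor's sketch (not part of the patch): a tiny host-only illustration, with made-up
// frontier and degrees, of how the exclusive degree sum maps a linear edge id to its
// source vertex and local edge offset. The kernel does the same thing on the
// shared-memory slices loaded above, using the binary search.
#include <cstdio>
int main() {
  int frontier[3] = {7, 2, 9};     // example frontier vertices
  int exsum[4]    = {0, 3, 4, 6};  // exclusive sum of their degrees {3, 1, 2}
  for (int gid = 0; gid < 6; ++gid) {
    int k = 0;
    while (k + 1 < 3 && exsum[k + 1] <= gid) ++k;  // max k with exsum[k] <= gid
    printf("edge %d -> source %d, local edge %d\n", gid, frontier[k], gid - exsum[k]);
  }
  return 0;
}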
edge in parallel (instruction parallism) - // Reduces latency - - IndexType current_max_edge_index = min(block_offset - + (left - + nitems_per_thread_for_this_load) - * blockDim.x, - totaldegree); - - //We will need vec_u (source of the edge) until the end if we need to save the predecessors - //For others informations, we will reuse pointers on the go (nvcc does not color well the registers in that case) - - IndexType vec_u[TOP_DOWN_BATCH_SIZE]; - IndexType local_buf1[TOP_DOWN_BATCH_SIZE]; - IndexType local_buf2[TOP_DOWN_BATCH_SIZE]; - - IndexType *vec_frontier_degrees_exclusive_sum_index = &local_buf2[0]; - -#pragma unroll - for (IndexType iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - - IndexType ibatch = left + item_index + iv; - IndexType gid = block_offset + ibatch * blockDim.x + threadIdx.x; - - if (gid < current_max_edge_index) { - IndexType start_off_idx = (ibatch * blockDim.x + threadIdx.x) - / TOP_DOWN_BUCKET_SIZE; - IndexType bucket_start = shared_buckets_offsets[start_off_idx] - - frontier_degrees_exclusive_sum_block_offset; - IndexType bucket_end = shared_buckets_offsets[start_off_idx + 1] - - frontier_degrees_exclusive_sum_block_offset; - - IndexType k = binsearch_maxle(shared_frontier_degrees_exclusive_sum, - gid, - bucket_start, - bucket_end) - + frontier_degrees_exclusive_sum_block_offset; - vec_u[iv] = frontier[k]; // origin of this edge - vec_frontier_degrees_exclusive_sum_index[iv] = - frontier_degrees_exclusive_sum[k]; - } else { - vec_u[iv] = -1; - vec_frontier_degrees_exclusive_sum_index[iv] = -1; - } - - } - - IndexType *vec_row_ptr_u = &local_buf1[0]; -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType u = vec_u[iv]; - //row_ptr for this vertex origin u - vec_row_ptr_u[iv] = (u != -1) - ? row_ptr[u] - : - -1; - } - - //We won't need row_ptr after that, reusing pointer - IndexType *vec_dest_v = vec_row_ptr_u; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType thread_item_index = left + item_index + iv; - IndexType gid = block_offset + thread_item_index * blockDim.x + threadIdx.x; - - IndexType row_ptr_u = vec_row_ptr_u[iv]; - IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv]; - - if (edge_mask && !edge_mask[edge]) - row_ptr_u = -1; //disabling edge - - //Destination of this edge - vec_dest_v[iv] = (row_ptr_u != -1) - ? col_ind[edge] - : - -1; - } - - //We don't need vec_frontier_degrees_exclusive_sum_index anymore - IndexType *vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index; -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_dest_v[iv]; - vec_v_visited_bmap[iv] = (v != -1) - ? bmap[v / INT_SIZE] - : - (~0); //will look visited - } - - // From now on we will consider v as a frontier candidate - // If for some reason vec_candidate[iv] should be put in the new_frontier - // Then set vec_candidate[iv] = -1 - IndexType *vec_frontier_candidate = vec_dest_v; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - int m = 1 << (v % INT_SIZE); - - int is_visited = vec_v_visited_bmap[iv] & m; - - if (is_visited) - vec_frontier_candidate[iv] = -1; - } - - if (directed) { - //vec_v_visited_bmap is available - - IndexType *vec_is_isolated_bmap = vec_v_visited_bmap; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - vec_is_isolated_bmap[iv] = (v != -1) - ? 
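// Editor's sketch (not part of the patch): the TOP_DOWN_BATCH_SIZE vectors above
// (vec_u, local_buf1, local_buf2) exist for instruction-level parallelism: each thread
// issues a batch of independent loads before consuming any of them. Reduced version of
// that pattern, with an illustrative batch size and names not taken from the source.
constexpr int BATCH = 4;   // illustrative; the real TOP_DOWN_BATCH_SIZE may differ
__global__ void batched_gather(const int *col_ind, const int *edge_of_thread,
                               int *out, int n) {
  int base = (blockIdx.x * blockDim.x + threadIdx.x) * BATCH;
  int vals[BATCH];
#pragma unroll
  for (int i = 0; i < BATCH; ++i)          // issue all independent loads first
    vals[i] = (base + i < n) ? col_ind[edge_of_thread[base + i]] : -1;
#pragma unroll
  for (int i = 0; i < BATCH; ++i)          // then consume the results
    if (base + i < n) out[base + i] = vals[i];
}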
isolated_bmap[v / INT_SIZE] - : - -1; - } - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - int m = 1 << (v % INT_SIZE); - int is_isolated = vec_is_isolated_bmap[iv] & m; - - //If v is isolated, we will not add it to the frontier (it's not a frontier candidate) - // 1st reason : it's useless - // 2nd reason : it will make top down algo fail - // we need each node in frontier to have a degree > 0 - // If it is isolated, we just need to mark it as visited, and save distance and predecessor here. Not need to check return value of atomicOr - - if (is_isolated && v != -1) { - int m = 1 << (v % INT_SIZE); - atomicOr(&bmap[v / INT_SIZE], m); - if (distances) - distances[v] = lvl; - - if (predecessors) - predecessors[v] = vec_u[iv]; - - //This is no longer a candidate, neutralize it - vec_frontier_candidate[iv] = -1; - } - - } - } - - //Number of successor candidate hold by this thread - IndexType thread_n_frontier_candidates = 0; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - if (v != -1) - ++thread_n_frontier_candidates; - } - - // We need to have all nfrontier_candidates to be ready before doing the scan - __syncthreads(); - - // We will put the frontier candidates in a local queue - // Computing offsets - IndexType thread_frontier_candidate_offset = 0; //offset inside block - BlockScan(scan_storage).ExclusiveSum( thread_n_frontier_candidates, - thread_frontier_candidate_offset); - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - //May have bank conflicts - IndexType frontier_candidate = vec_frontier_candidate[iv]; - - if (frontier_candidate != -1) { - shared_local_new_frontier_candidates[thread_frontier_candidate_offset] = - frontier_candidate; - shared_local_new_frontier_predecessors[thread_frontier_candidate_offset] = - vec_u[iv]; - ++thread_frontier_candidate_offset; - } - } - - if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { - //No need to add nsuccessor_candidate, even if its an - //exclusive sum - //We incremented the thread_frontier_candidate_offset - block_n_frontier_candidates = thread_frontier_candidate_offset; - } - - //broadcast block_n_frontier_candidates - __syncthreads(); - - IndexType naccepted_vertices = 0; - //We won't need vec_frontier_candidate after that - IndexType *vec_frontier_accepted_vertex = vec_frontier_candidate; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - const int idx_shared = iv * blockDim.x + threadIdx.x; - vec_frontier_accepted_vertex[iv] = -1; - - if (idx_shared < block_n_frontier_candidates) { - IndexType v = shared_local_new_frontier_candidates[idx_shared]; //popping queue - int m = 1 << (v % INT_SIZE); - int q = atomicOr(&bmap[v / INT_SIZE], m); //atomicOr returns old - - if (!(m & q)) { //if this thread was the first to discover this node - if (distances) - distances[v] = lvl; - - if (predecessors) { - IndexType pred = shared_local_new_frontier_predecessors[idx_shared]; - predecessors[v] = pred; - } - - vec_frontier_accepted_vertex[iv] = v; - ++naccepted_vertices; - } - } - - } - - //We need naccepted_vertices to be ready - __syncthreads(); - - IndexType thread_new_frontier_offset; - - BlockScan(scan_storage).ExclusiveSum(naccepted_vertices, thread_new_frontier_offset); - - if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { - - IndexType inclusive_sum = thread_new_frontier_offset + naccepted_vertices; - //for this thread, thread_new_frontier_offset + has_successor 
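// Editor's sketch (not part of the patch): the candidate queue above is the standard
// block-level compaction recipe: an exclusive cub::BlockScan gives each thread a slot,
// the last thread reserves space in the global output with one atomicAdd, and everyone
// writes after a barrier. Self-contained version compacting positive values; names are
// not from the source.
#include <cub/cub.cuh>
template <int DIMX>
__global__ void compact_positive(const int *in, int n, int *out, int *out_count) {
  typedef cub::BlockScan<int, DIMX> BlockScan;
  __shared__ typename BlockScan::TempStorage scan_storage;
  __shared__ int block_base;

  int gid  = blockIdx.x * DIMX + threadIdx.x;
  int flag = (gid < n && in[gid] > 0) ? 1 : 0;
  int offset;
  BlockScan(scan_storage).ExclusiveSum(flag, offset);

  if (threadIdx.x == DIMX - 1)                        // last thread knows the block total
    block_base = atomicAdd(out_count, offset + flag); // reserve space once per block
  __syncthreads();                                    // broadcast block_base

  if (flag)
    out[block_base + offset] = in[gid];
}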
(exclusive sum) - if (inclusive_sum) - frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); - } - - //Broadcasting frontier_common_block_offset - __syncthreads(); - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - const int idx_shared = iv * blockDim.x + threadIdx.x; - if (idx_shared < block_n_frontier_candidates) { - - IndexType new_frontier_vertex = vec_frontier_accepted_vertex[iv]; - - if (new_frontier_vertex != -1) { - IndexType off = frontier_common_block_offset + thread_new_frontier_offset++; - new_frontier[off] = new_frontier_vertex; - } - } - } - - } - - //We need to keep shared_frontier_degrees_exclusive_sum coherent - __syncthreads(); - - //Preparing for next load - left = right; - right = nitems_per_thread; - } - - //we need to keep shared_buckets_offsets coherent - __syncthreads(); - } - - } - - template - void frontier_expand(const IndexType *row_ptr, - const IndexType *col_ind, - const IndexType *frontier, - const IndexType frontier_size, - const IndexType totaldegree, - const IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - const IndexType *frontier_degrees_exclusive_sum, - const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, - int *visited_bmap, - IndexType *distances, - IndexType *predecessors, - const int *edge_mask, - const int *isolated_bmap, - bool directed, - cudaStream_t m_stream, - bool deterministic) { - if (!totaldegree) - return; - - dim3 block; - block.x = TOP_DOWN_EXPAND_DIMX; - - IndexType max_items_per_thread = (totaldegree + MAXBLOCKS * block.x - 1) - / (MAXBLOCKS * block.x); - - dim3 grid; - grid.x = min( (totaldegree + max_items_per_thread * block.x - 1) - / (max_items_per_thread * block.x), - (IndexType) MAXBLOCKS); - - topdown_expand_kernel<<>>( row_ptr, - col_ind, - frontier, - frontier_size, - totaldegree, - max_items_per_thread, - lvl, - new_frontier, - new_frontier_cnt, - frontier_degrees_exclusive_sum, - frontier_degrees_exclusive_sum_buckets_offsets, - visited_bmap, - distances, - predecessors, - edge_mask, - isolated_bmap, - directed); - cudaCheckError() - ; - } - - template - __global__ void flag_isolated_vertices_kernel( IndexType n, - int *isolated_bmap, - const IndexType *row_ptr, - IndexType *degrees, - IndexType *nisolated) { - typedef cub::BlockLoad BlockLoad; - typedef cub::BlockStore BlockStore; - typedef cub::BlockReduce BlockReduce; - typedef cub::WarpReduce WarpReduce; - - __shared__ typename BlockLoad::TempStorage load_temp_storage; - __shared__ typename BlockStore::TempStorage store_temp_storage; - __shared__ typename BlockReduce::TempStorage block_reduce_temp_storage; - - __shared__ typename WarpReduce::TempStorage warp_reduce_temp_storage[FLAG_ISOLATED_VERTICES_DIMX - / FLAG_ISOLATED_VERTICES_THREADS_PER_INT]; - - __shared__ IndexType row_ptr_tail[FLAG_ISOLATED_VERTICES_DIMX]; - - for (IndexType block_off = FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - * (blockDim.x * blockIdx.x); - block_off < n; - block_off += FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * gridDim.x)) { - - IndexType thread_off = block_off - + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * threadIdx.x; - IndexType last_node_thread = thread_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1; - - IndexType thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; - IndexType block_valid_items = n - block_off + 1; //+1, we need row_ptr[last_node+1] - - BlockLoad(load_temp_storage).Load( row_ptr + block_off, - thread_row_ptr, - block_valid_items, - -1); - - //To compute 4 
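// Editor's sketch (not part of the patch): frontier_expand sizes the launch so the grid
// never exceeds MAXBLOCKS: it ceiling-divides the total degree over MAXBLOCKS * blockDim
// to get items per thread, then ceiling-divides again to get the grid. Host-only
// illustration with made-up constants.
#include <algorithm>
#include <cstdio>
int main() {
  const long long MAXBLOCKS = 65535, DIMX = 256;   // illustrative values
  long long totaldegree = 10000000;
  long long items = (totaldegree + MAXBLOCKS * DIMX - 1) / (MAXBLOCKS * DIMX);
  long long grid  = std::min((totaldegree + items * DIMX - 1) / (items * DIMX), MAXBLOCKS);
  printf("items per thread = %lld, grid = %lld blocks of %lld threads\n", items, grid, DIMX);
  return 0;
}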
degrees, we need 5 values of row_ptr - //Saving the "5th" value in shared memory for previous thread to use - if (threadIdx.x > 0) { - row_ptr_tail[threadIdx.x - 1] = thread_row_ptr[0]; - } - - //If this is the last thread, it needs to load its row ptr tail value - if (threadIdx.x == (FLAG_ISOLATED_VERTICES_DIMX - 1) && last_node_thread < n) { - row_ptr_tail[threadIdx.x] = row_ptr[last_node_thread + 1]; - - } - __syncthreads(); // we may reuse temp_storage - - int local_isolated_bmap = 0; - - IndexType imax = (n - thread_off); - - IndexType local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; - -#pragma unroll - for (int i = 0; i < (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1); ++i) { - IndexType degree = local_degree[i] = thread_row_ptr[i + 1] - thread_row_ptr[i]; - - if (i < imax) - local_isolated_bmap |= ((degree == 0) << i); - } - - if (last_node_thread < n) { - IndexType degree = local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1] = - row_ptr_tail[threadIdx.x] - - thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1]; - - local_isolated_bmap |= ((degree == 0) - << (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1)); - - } - - local_isolated_bmap <<= (thread_off % INT_SIZE); - - IndexType local_nisolated = __popc(local_isolated_bmap); - - //We need local_nisolated and local_isolated_bmap to be ready for next steps - __syncthreads(); - - IndexType total_nisolated = BlockReduce(block_reduce_temp_storage).Sum(local_nisolated); - - if (threadIdx.x == 0 && total_nisolated) { - atomicAdd(nisolated, total_nisolated); - } - - int logicalwarpid = threadIdx.x / FLAG_ISOLATED_VERTICES_THREADS_PER_INT; - - //Building int for bmap - int int_aggregate_isolated_bmap = - WarpReduce(warp_reduce_temp_storage[logicalwarpid]).Reduce( local_isolated_bmap, - BitwiseOr()); - - int is_head_of_visited_int = - ((threadIdx.x % (FLAG_ISOLATED_VERTICES_THREADS_PER_INT)) == 0); - if (is_head_of_visited_int) { - isolated_bmap[thread_off / INT_SIZE] = int_aggregate_isolated_bmap; - } - - BlockStore(store_temp_storage).Store(degrees + block_off, local_degree, block_valid_items); - } - } - - template - void flag_isolated_vertices( IndexType n, - int *isolated_bmap, - const IndexType *row_ptr, - IndexType *degrees, - IndexType *nisolated, - cudaStream_t m_stream) { - dim3 grid, block; - block.x = FLAG_ISOLATED_VERTICES_DIMX; - - grid.x = min( (IndexType) MAXBLOCKS, - (n / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + 1 + block.x - 1) / block.x); - - flag_isolated_vertices_kernel<<>>(n, - isolated_bmap, - row_ptr, - degrees, - nisolated); - cudaCheckError() - ; - } - - // - // - // - // Some utils functions - // - // - - //Creates CUB data for graph size n - template - void cub_exclusive_sum_alloc(IndexType n, void*& d_temp_storage, size_t &temp_storage_bytes) { - // Determine temporary device storage requirements for exclusive prefix scan - d_temp_storage = NULL; - temp_storage_bytes = 0; - IndexType *d_in = NULL, *d_out = NULL; - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, n); - // Allocate temporary storage for exclusive prefix scan - cudaMalloc(&d_temp_storage, temp_storage_bytes); - } - - template - __global__ void fill_kernel(IndexType *vec, IndexType n, IndexType val) { - for (IndexType u = blockDim.x * blockIdx.x + threadIdx.x; - u < n; - u += gridDim.x * blockDim.x) - vec[u] = val; - - } - - template - void fill(IndexType *vec, IndexType n, IndexType val, cudaStream_t m_stream) { - dim3 grid, block; - block.x = 256; - grid.x = min((n + block.x - 1) / 
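// Editor's sketch (not part of the patch): flag_isolated_vertices_kernel derives each
// degree as the difference of two consecutive row_ptr entries and folds the zero-degree
// flags into the packed bitmap. The degree part alone is an adjacent difference over the
// CSR offsets; a Thrust-only version of that step (the kernel fuses it with the bitmap
// packing and the block reduction).
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
void degrees_from_offsets(const thrust::device_vector<int> &row_ptr,
                          thrust::device_vector<int> &degree) {
  int n = static_cast<int>(row_ptr.size()) - 1;
  degree.resize(n);
  // degree[i] = row_ptr[i + 1] - row_ptr[i]
  thrust::transform(row_ptr.begin() + 1, row_ptr.end(),
                    row_ptr.begin(), degree.begin(),
                    thrust::minus<int>());
}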
block.x, (IndexType) MAXBLOCKS); - fill_kernel<<>>(vec, n, val); - cudaCheckError() - ; - } - - template - __global__ void set_frontier_degree_kernel( IndexType *frontier_degree, - IndexType *frontier, - const IndexType *degree, - IndexType n) { - for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x; - idx < n; - idx += gridDim.x * blockDim.x) { - IndexType u = frontier[idx]; - frontier_degree[idx] = degree[u]; - } - } - - template - void set_frontier_degree( IndexType *frontier_degree, - IndexType *frontier, - const IndexType *degree, - IndexType n, - cudaStream_t m_stream) { - dim3 grid, block; - block.x = 256; - grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); - set_frontier_degree_kernel<<>>(frontier_degree, - frontier, - degree, - n); - cudaCheckError() - ; - } - - template - void exclusive_sum( void *d_temp_storage, - size_t temp_storage_bytes, - IndexType *d_in, - IndexType *d_out, - IndexType num_items, - cudaStream_t m_stream) { - if (num_items <= 1) - return; //DeviceScan fails if n==1 - cub::DeviceScan::ExclusiveSum(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - m_stream); - } - - template - __global__ void fill_vec_kernel(T *vec, T n, T val) { - for (T idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < n; - idx += blockDim.x * gridDim.x) - vec[idx] = val; - } - - template - void fill_vec(T *vec, T n, T val, cudaStream_t stream) { - dim3 grid, block; - block.x = 256; - grid.x = (n + block.x - 1) / block.x; - - fill_vec_kernel<<>>(vec, n, val); - cudaCheckError() - ; - } -} -// diff --git a/cpp/src/nvgraph_gdf.cu b/cpp/src/community/nvgraph_gdf.cu similarity index 54% rename from cpp/src/nvgraph_gdf.cu rename to cpp/src/community/nvgraph_gdf.cu index 7f493ecafe9..4e605fb91f3 100644 --- a/cpp/src/nvgraph_gdf.cu +++ b/cpp/src/community/nvgraph_gdf.cu @@ -21,248 +21,19 @@ * @file nvgraph_gdf.cu * ---------------------------------------------------------------------------**/ +#include #include #include #include #include #include "utilities/error_utils.h" +#include "converters/nvgraph.cuh" //RMM: // #include -template -using Vector = thrust::device_vector>; - -gdf_error nvgraph2gdf_error(nvgraphStatus_t nvg_stat) { - switch (nvg_stat) { - case NVGRAPH_STATUS_SUCCESS: - return GDF_SUCCESS; - case NVGRAPH_STATUS_NOT_INITIALIZED: - return GDF_INVALID_API_CALL; - case NVGRAPH_STATUS_INVALID_VALUE: - return GDF_INVALID_API_CALL; - case NVGRAPH_STATUS_TYPE_NOT_SUPPORTED: - return GDF_UNSUPPORTED_DTYPE; - case NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED: - return GDF_INVALID_API_CALL; - default: - return GDF_CUDA_ERROR; - } -} - -gdf_error nvgraph2gdf_error_verbose(nvgraphStatus_t nvg_stat) { - switch (nvg_stat) { - case NVGRAPH_STATUS_NOT_INITIALIZED: - std::cerr << "nvGRAPH not initialized"; - return GDF_CUDA_ERROR; - case NVGRAPH_STATUS_ALLOC_FAILED: - std::cerr << "nvGRAPH alloc failed"; - return GDF_CUDA_ERROR; - case NVGRAPH_STATUS_INVALID_VALUE: - std::cerr << "nvGRAPH invalid value"; - return GDF_CUDA_ERROR; - case NVGRAPH_STATUS_ARCH_MISMATCH: - std::cerr << "nvGRAPH arch mismatch"; - return GDF_CUDA_ERROR; - case NVGRAPH_STATUS_MAPPING_ERROR: - std::cerr << "nvGRAPH mapping error"; - return GDF_CUDA_ERROR; - case NVGRAPH_STATUS_EXECUTION_FAILED: - std::cerr << "nvGRAPH execution failed"; - return GDF_CUDA_ERROR; - case NVGRAPH_STATUS_INTERNAL_ERROR: - std::cerr << "nvGRAPH internal error"; - return GDF_CUDA_ERROR; - case NVGRAPH_STATUS_TYPE_NOT_SUPPORTED: - std::cerr << "nvGRAPH type not supported"; - return GDF_CUDA_ERROR; - case 
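// Editor's sketch (not part of the patch): cub_exclusive_sum_alloc and exclusive_sum
// above split the usual two-phase CUB protocol across two helpers: calling
// DeviceScan::ExclusiveSum with a null temp pointer only reports the required byte
// count, and the second call does the scan. Same protocol in one function, using plain
// cudaMalloc for brevity (the file routes allocation through its own helpers).
#include <cub/cub.cuh>
void exclusive_sum_once(const int *d_in, int *d_out, int n, cudaStream_t stream) {
  void  *d_temp     = nullptr;
  size_t temp_bytes = 0;
  // pass 1: d_temp == nullptr, so CUB only fills in temp_bytes
  cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n, stream);
  cudaMalloc(&d_temp, temp_bytes);
  // pass 2: the actual scan
  cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n, stream);
  cudaFree(d_temp);
}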
NVGRAPH_STATUS_NOT_CONVERGED: - std::cerr << "nvGRAPH algorithm failed to converge"; - return GDF_CUDA_ERROR; - case NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED: - std::cerr << "nvGRAPH graph type not supported"; - return GDF_CUDA_ERROR; - default: - std::cerr << "Unknown nvGRAPH Status"; - return GDF_CUDA_ERROR; - } -} - -#ifdef VERBOSE -#define NVG_TRY(call) \ -{ \ - if ((call)!=NVGRAPH_STATUS_SUCCESS) \ - return nvgraph2gdf_error_verbose((call)); \ -} -#else -#define NVG_TRY(call) \ -{ \ - nvgraphStatus_t err_code = (call); \ - if (err_code != NVGRAPH_STATUS_SUCCESS) \ - return nvgraph2gdf_error(err_code); \ -} -#endif - -gdf_error gdf_createGraph_nvgraph(nvgraphHandle_t nvg_handle, - gdf_graph* gdf_G, - nvgraphGraphDescr_t* nvg_G, - bool use_transposed) { - - // check input - GDF_REQUIRE(!((gdf_G->edgeList == nullptr) && - (gdf_G->adjList == nullptr) && - (gdf_G->transposedAdjList == nullptr)), - GDF_INVALID_API_CALL); - nvgraphTopologyType_t TT; - cudaDataType_t settype; - // create an nvgraph graph handle - NVG_TRY(nvgraphCreateGraphDescr(nvg_handle, nvg_G)); - // setup nvgraph variables - if (use_transposed) { - // convert edgeList to transposedAdjList - if (gdf_G->transposedAdjList == nullptr) { - GDF_TRY(gdf_add_transposed_adj_list(gdf_G)); - } - // using exiting transposedAdjList if it exisits and if adjList is missing - TT = NVGRAPH_CSC_32; - nvgraphCSCTopology32I_st topoData; - topoData.nvertices = gdf_G->transposedAdjList->offsets->size - 1; - topoData.nedges = gdf_G->transposedAdjList->indices->size; - topoData.destination_offsets = (int *) gdf_G->transposedAdjList->offsets->data; - topoData.source_indices = (int *) gdf_G->transposedAdjList->indices->data; - // attach the transposed adj list - NVG_TRY(nvgraphAttachGraphStructure(nvg_handle, *nvg_G, (void * )&topoData, TT)); - //attach edge values - if (gdf_G->transposedAdjList->edge_data) { - switch (gdf_G->transposedAdjList->edge_data->dtype) { - case GDF_FLOAT32: - settype = CUDA_R_32F; - NVG_TRY(nvgraphAttachEdgeData(nvg_handle, - *nvg_G, - 0, - settype, - (float * ) gdf_G->transposedAdjList->edge_data->data)) - break; - case GDF_FLOAT64: - settype = CUDA_R_64F; - NVG_TRY(nvgraphAttachEdgeData(nvg_handle, - *nvg_G, - 0, - settype, - (double * ) gdf_G->transposedAdjList->edge_data->data)) - break; - default: - return GDF_UNSUPPORTED_DTYPE; - } - } - - } - else { - // convert edgeList to adjList - if (gdf_G->adjList == nullptr) { - GDF_TRY(gdf_add_adj_list(gdf_G)); - } - TT = NVGRAPH_CSR_32; - nvgraphCSRTopology32I_st topoData; - topoData.nvertices = gdf_G->adjList->offsets->size - 1; - topoData.nedges = gdf_G->adjList->indices->size; - topoData.source_offsets = (int *) gdf_G->adjList->offsets->data; - topoData.destination_indices = (int *) gdf_G->adjList->indices->data; - - // attach adj list - NVG_TRY(nvgraphAttachGraphStructure(nvg_handle, *nvg_G, (void * )&topoData, TT)); - //attach edge values - if (gdf_G->adjList->edge_data) { - switch (gdf_G->adjList->edge_data->dtype) { - case GDF_FLOAT32: - settype = CUDA_R_32F; - NVG_TRY(nvgraphAttachEdgeData(nvg_handle, - *nvg_G, - 0, - settype, - (float * ) gdf_G->adjList->edge_data->data)) - break; - case GDF_FLOAT64: - settype = CUDA_R_64F; - NVG_TRY(nvgraphAttachEdgeData(nvg_handle, - *nvg_G, - 0, - settype, - (double * ) gdf_G->adjList->edge_data->data)) - break; - default: - return GDF_UNSUPPORTED_DTYPE; - } - } - } - return GDF_SUCCESS; -} - -gdf_error gdf_sssp_nvgraph(gdf_graph *gdf_G, - const int *source_vert, - gdf_column *sssp_distances) { - - GDF_REQUIRE(gdf_G != nullptr, 
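// Editor's note (not part of the patch): the VERBOSE branch of NVG_TRY above expands
// `call` twice, so the nvGraph call runs a second time just to fetch the status for the
// error message. A single-evaluation variant is sketched below; the macro name is
// hypothetical and not from the source.
#define NVG_TRY_ONCE(call)                                   \
{                                                            \
  nvgraphStatus_t nvg_status = (call);   /* evaluate once */ \
  if (nvg_status != NVGRAPH_STATUS_SUCCESS)                  \
    return nvgraph2gdf_error_verbose(nvg_status);            \
}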
GDF_INVALID_API_CALL); - GDF_REQUIRE(*source_vert >= 0, GDF_INVALID_API_CALL); - GDF_REQUIRE(*source_vert < sssp_distances->size, GDF_INVALID_API_CALL); - GDF_REQUIRE(sssp_distances != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(sssp_distances->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!sssp_distances->valid, GDF_VALIDITY_UNSUPPORTED); - GDF_REQUIRE(sssp_distances->size > 0, GDF_INVALID_API_CALL); - - // init nvgraph - // TODO : time this call - nvgraphHandle_t nvg_handle = 0; - nvgraphGraphDescr_t nvgraph_G = 0; - cudaDataType_t settype; - - NVG_TRY(nvgraphCreate(&nvg_handle)); - GDF_TRY(gdf_createGraph_nvgraph(nvg_handle, gdf_G, &nvgraph_G, true)); - - int sssp_index = 0; - int weight_index = 0; - Vector d_val; - - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - if (gdf_G->transposedAdjList->edge_data == nullptr) { - // use a fp32 vector [1,...,1] - settype = CUDA_R_32F; - d_val.resize(gdf_G->transposedAdjList->indices->size); - thrust::fill(thrust::cuda::par(allocator).on(stream), d_val.begin(), d_val.end(), 1.0); - NVG_TRY(nvgraphAttachEdgeData(nvg_handle, - nvgraph_G, - weight_index, - settype, - (void * ) thrust::raw_pointer_cast(d_val.data()))); - } - else { - switch (gdf_G->transposedAdjList->edge_data->dtype) { - case GDF_FLOAT32: - settype = CUDA_R_32F; - break; - case GDF_FLOAT64: - settype = CUDA_R_64F; - break; - default: - return GDF_UNSUPPORTED_DTYPE; - } - } - - NVG_TRY(nvgraphAttachVertexData(nvg_handle, nvgraph_G, 0, settype, sssp_distances->data)); - - NVG_TRY(nvgraphSssp(nvg_handle, nvgraph_G, weight_index, source_vert, sssp_index)); - - NVG_TRY(nvgraphDestroyGraphDescr(nvg_handle, nvgraph_G)); - NVG_TRY(nvgraphDestroy(nvg_handle)); - - return GDF_SUCCESS; -} - gdf_error gdf_balancedCutClustering_nvgraph(gdf_graph* gdf_G, const int num_clusters, const int num_eigen_vects, @@ -285,37 +56,37 @@ gdf_error gdf_balancedCutClustering_nvgraph(gdf_graph* gdf_G, nvgraphHandle_t nvg_handle = nullptr; nvgraphGraphDescr_t nvgraph_G = nullptr; cudaDataType_t settype; - Vector d_val; + rmm::device_vector d_val; NVG_TRY(nvgraphCreate(&nvg_handle)); GDF_TRY(gdf_createGraph_nvgraph(nvg_handle, gdf_G, &nvgraph_G, false)); int weight_index = 0; - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - if (gdf_G->adjList->edge_data == nullptr) { - // use a fp64 vector [1,...,1] - settype = CUDA_R_64F; - d_val.resize(gdf_G->adjList->indices->size); - thrust::fill(thrust::cuda::par(allocator).on(stream), d_val.begin(), d_val.end(), 1.0); - NVG_TRY(nvgraphAttachEdgeData(nvg_handle, - nvgraph_G, - weight_index, - settype, - (void * ) thrust::raw_pointer_cast(d_val.data()))); - } - else { - switch (gdf_G->adjList->edge_data->dtype) { - case GDF_FLOAT32: - settype = CUDA_R_32F; - break; - case GDF_FLOAT64: - settype = CUDA_R_64F; - break; - default: - return GDF_UNSUPPORTED_DTYPE; - } - } + cudaStream_t stream{nullptr}; + + if (gdf_G->adjList->edge_data == nullptr) { + // use a fp64 vector [1,...,1] + settype = CUDA_R_64F; + d_val.resize(gdf_G->adjList->indices->size); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_val.begin(), d_val.end(), 1.0); + NVG_TRY(nvgraphAttachEdgeData(nvg_handle, + nvgraph_G, + weight_index, + settype, + (void * ) thrust::raw_pointer_cast(d_val.data()))); + } + else { + switch (gdf_G->adjList->edge_data->dtype) { + case GDF_FLOAT32: + settype = CUDA_R_32F; + break; + case GDF_FLOAT64: + settype = CUDA_R_64F; + break; + default: + return GDF_UNSUPPORTED_DTYPE; + } + } // Pack parameters 
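// Editor's sketch (not part of the patch): when edge_data is absent, the clustering path
// above fabricates unit weights: resize an rmm::device_vector, fill it with 1.0 through
// the RMM execution policy, and hand the raw pointer to nvgraphAttachEdgeData. Just that
// setup, assuming the rmm::device_vector / rmm::exec_policy utilities this file already
// includes.
#include <thrust/fill.h>
void make_unit_weights(size_t nnz, cudaStream_t stream,
                       rmm::device_vector<double> &d_val) {
  // d_val must outlive the nvGraph call that consumes its raw pointer
  d_val.resize(nnz);
  thrust::fill(rmm::exec_policy(stream)->on(stream), d_val.begin(), d_val.end(), 1.0);
  // void *weights = (void *) thrust::raw_pointer_cast(d_val.data());
}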
for call to Nvgraph @@ -442,36 +213,36 @@ gdf_error gdf_AnalyzeClustering_edge_cut_nvgraph(gdf_graph* gdf_G, nvgraphHandle_t nvg_handle = nullptr; nvgraphGraphDescr_t nvgraph_G = nullptr; cudaDataType_t settype; - Vector d_val; + rmm::device_vector d_val; NVG_TRY(nvgraphCreate(&nvg_handle)); GDF_TRY(gdf_createGraph_nvgraph(nvg_handle, gdf_G, &nvgraph_G, false)); int weight_index = 0; - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - if (gdf_G->adjList->edge_data == nullptr) { - // use a fp64 vector [1,...,1] - settype = CUDA_R_64F; - d_val.resize(gdf_G->adjList->indices->size); - thrust::fill(thrust::cuda::par(allocator).on(stream), d_val.begin(), d_val.end(), 1.0); - NVG_TRY(nvgraphAttachEdgeData(nvg_handle, - nvgraph_G, - weight_index, - settype, - (void * ) thrust::raw_pointer_cast(d_val.data()))); - } - else { - switch (gdf_G->adjList->edge_data->dtype) { - case GDF_FLOAT32: - settype = CUDA_R_32F; - break; - case GDF_FLOAT64: - settype = CUDA_R_64F; - break; - default: - return GDF_UNSUPPORTED_DTYPE; - } + cudaStream_t stream{nullptr}; + + if (gdf_G->adjList->edge_data == nullptr) { + // use a fp64 vector [1,...,1] + settype = CUDA_R_64F; + d_val.resize(gdf_G->adjList->indices->size); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_val.begin(), d_val.end(), 1.0); + NVG_TRY(nvgraphAttachEdgeData(nvg_handle, + nvgraph_G, + weight_index, + settype, + (void * ) thrust::raw_pointer_cast(d_val.data()))); + } + else { + switch (gdf_G->adjList->edge_data->dtype) { + case GDF_FLOAT32: + settype = CUDA_R_32F; + break; + case GDF_FLOAT64: + settype = CUDA_R_64F; + break; + default: + return GDF_UNSUPPORTED_DTYPE; + } } // Make Nvgraph call @@ -560,8 +331,8 @@ gdf_error gdf_extract_subgraph_vertex_nvgraph(gdf_graph* gdf_G, cudaStream_t stream { nullptr }; - ALLOC_MANAGED_TRY((void**) &offsets, sizeof(int32_t) * (num_verts + 1), stream); - ALLOC_MANAGED_TRY((void**) &indices, sizeof(int32_t) * num_edges, stream); + ALLOC_TRY((void**) &offsets, sizeof(int32_t) * (num_verts + 1), stream); + ALLOC_TRY((void**) &indices, sizeof(int32_t) * num_edges, stream); gdf_column_view(result->adjList->offsets, offsets, @@ -598,3 +369,48 @@ gdf_error gdf_triangle_count_nvgraph(gdf_graph* G, uint64_t* result) { NVG_TRY(nvgraphTriangleCount(nvg_handle, nvg_G, result)); return GDF_SUCCESS; } + +gdf_error gdf_louvain(gdf_graph *graph, void *final_modularity, void *num_level, gdf_column *louvain_parts) { + GDF_REQUIRE(graph->adjList != nullptr || graph->edgeList != nullptr, GDF_INVALID_API_CALL); + gdf_error err = gdf_add_adj_list(graph); + if (err != GDF_SUCCESS) + return err; + + size_t n = graph->adjList->offsets->size - 1; + size_t e = graph->adjList->indices->size; + + void* offsets_ptr = graph->adjList->offsets->data; + void* indices_ptr = graph->adjList->indices->data; + + void* value_ptr; + rmm::device_vector d_values; + if(graph->adjList->edge_data) { + value_ptr = graph->adjList->edge_data->data; + } + else { + cudaStream_t stream {nullptr}; + d_values.resize(graph->adjList->indices->size); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_values.begin(), d_values.end(), 1.0); + value_ptr = (void * ) thrust::raw_pointer_cast(d_values.data()); + } + + void* louvain_parts_ptr = louvain_parts->data; + + auto gdf_to_cudadtype= [](gdf_column *col){ + cudaDataType_t cuda_dtype; + switch(col->dtype){ + case GDF_INT8: cuda_dtype = CUDA_R_8I; break; + case GDF_INT32: cuda_dtype = CUDA_R_32I; break; + case GDF_FLOAT32: cuda_dtype = CUDA_R_32F; break; + case GDF_FLOAT64: 
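// Editor's sketch (not part of the patch): gdf_louvain's gdf_to_cudadtype lambda is the
// bridge between gdf dtypes and cudaDataType_t codes. The same mapping as a standalone
// helper; note the lambda in the patch throws a pointer (throw new std::invalid_argument),
// whereas throwing by value, as here, is the conventional C++ form.
#include <stdexcept>
cudaDataType_t gdf_to_cuda_dtype(const gdf_column *col) {
  switch (col->dtype) {
    case GDF_INT8:    return CUDA_R_8I;
    case GDF_INT32:   return CUDA_R_32I;
    case GDF_FLOAT32: return CUDA_R_32F;
    case GDF_FLOAT64: return CUDA_R_64F;
    default: throw std::invalid_argument("Cannot convert data type");
  }
}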
cuda_dtype = CUDA_R_64F; break; + default: throw new std::invalid_argument("Cannot convert data type"); + }return cuda_dtype; + }; + + cudaDataType_t index_type = gdf_to_cudadtype(graph->adjList->indices); + cudaDataType_t val_type = graph->adjList->edge_data? gdf_to_cudadtype(graph->adjList->edge_data): CUDA_R_32F; + + nvgraphLouvain(index_type, val_type, n, e, offsets_ptr, indices_ptr, value_ptr, 1, 0, NULL, + final_modularity, louvain_parts_ptr, num_level); + return GDF_SUCCESS; +} diff --git a/cpp/src/COOtoCSR.cuh b/cpp/src/converters/COOtoCSR.cuh similarity index 63% rename from cpp/src/COOtoCSR.cuh rename to cpp/src/converters/COOtoCSR.cuh index 2ed2da4cd50..f00b352f0e4 100644 --- a/cpp/src/COOtoCSR.cuh +++ b/cpp/src/converters/COOtoCSR.cuh @@ -38,33 +38,33 @@ template struct CSR_Result { - std::int64_t size; - std::int64_t nnz; - T* rowOffsets; - T* colIndices; + std::int64_t size; + std::int64_t nnz; + T* rowOffsets; + T* colIndices; - CSR_Result() : size(0), nnz(0), rowOffsets(nullptr), colIndices(nullptr){} + CSR_Result() : size(0), nnz(0), rowOffsets(nullptr), colIndices(nullptr){} }; template struct CSR_Result_Weighted { - std::int64_t size; - std::int64_t nnz; - T* rowOffsets; - T* colIndices; - W* edgeWeights; + std::int64_t size; + std::int64_t nnz; + T* rowOffsets; + T* colIndices; + W* edgeWeights; - CSR_Result_Weighted() : size(0), nnz(0), rowOffsets(nullptr), colIndices(nullptr), edgeWeights(nullptr){} + CSR_Result_Weighted() : size(0), nnz(0), rowOffsets(nullptr), colIndices(nullptr), edgeWeights(nullptr){} }; // Define kernel for copying run length encoded values into offset slots. template __global__ void offsetsKernel(T runCounts, T* unique, T* counts, T* offsets) { - uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid < runCounts) - offsets[unique[tid]] = counts[tid]; + uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < runCounts) + offsets[unique[tid]] = counts[tid]; } // Method for constructing CSR from COO @@ -73,13 +73,11 @@ gdf_error ConvertCOOtoCSR(T* sources, T* destinations, int64_t nnz, CSR_Resulton(stream), dests, dests + nnz, srcs); + thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), srcs, srcs + nnz, dests); - // Find max id (since this may be in the dests array but not the srcs array we need to check both) + // Find max id (since this may be in the dests array but not the srcs array we need to check both) T maxId = -1; // Max from srcs after sorting is just the last element CUDA_TRY(cudaMemcpy(&maxId, &(srcs[nnz-1]), sizeof(T), cudaMemcpyDefault)); - auto maxId_it = thrust::max_element(thrust::cuda::par(allocator).on(stream), dests, dests + nnz); + auto maxId_it = thrust::max_element(rmm::exec_policy(stream)->on(stream), dests, dests + nnz); T maxId2; CUDA_TRY(cudaMemcpy(&maxId2, maxId_it, sizeof(T), cudaMemcpyDefault)); maxId = maxId > maxId2 ? 
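// Editor's sketch (not part of the patch): ConvertCOOtoCSR's core is to sort the edge
// list by source, count edges per row, then exclusive-scan the counts into row offsets.
// Host-only illustration of those three steps on a made-up 3-vertex, 4-edge graph; the
// real code does them on the GPU with Thrust and CUB.
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>
int main() {
  std::vector<std::pair<int, int>> coo = {{2, 0}, {0, 1}, {1, 2}, {0, 2}};  // (src, dst)
  std::stable_sort(coo.begin(), coo.end());      // order by source (then destination)

  int n = 3;
  std::vector<int> offsets(n + 1, 0), indices;
  for (auto &e : coo) { ++offsets[e.first + 1]; indices.push_back(e.second); }
  for (int i = 0; i < n; ++i) offsets[i + 1] += offsets[i];   // exclusive scan of counts

  for (int i = 0; i <= n; ++i) printf("offsets[%d] = %d\n", i, offsets[i]);  // 0 2 3 4
  for (size_t i = 0; i < indices.size(); ++i) printf("indices[%zu] = %d\n", i, indices[i]);
  return 0;
}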
maxId : maxId2; result.size = maxId + 1; // Allocate offsets array - ALLOC_MANAGED_TRY((void**)&result.rowOffsets, (maxId + 2) * sizeof(T), stream); + ALLOC_TRY((void**)&result.rowOffsets, (maxId + 2) * sizeof(T), stream); // Set all values in offsets array to zeros CUDA_TRY(cudaMemset(result.rowOffsets, 0,(maxId + 2) * sizeof(int))); // Allocate temporary arrays same size as sources array, and single value to get run counts T* unique{nullptr}, *counts{nullptr}, *runCount{nullptr}; - ALLOC_MANAGED_TRY((void**)&unique, (maxId + 1) * sizeof(T), stream); - ALLOC_MANAGED_TRY((void**)&counts, (maxId + 1) * sizeof(T), stream); - ALLOC_MANAGED_TRY((void**)&runCount, sizeof(T), stream); + ALLOC_TRY((void**)&unique, (maxId + 1) * sizeof(T), stream); + ALLOC_TRY((void**)&counts, (maxId + 1) * sizeof(T), stream); + ALLOC_TRY((void**)&runCount, sizeof(T), stream); // Use CUB run length encoding to get unique values and run lengths tmpStorage = nullptr; - cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); - ALLOC_MANAGED_TRY((void**)&tmpStorage, tmpBytes, stream); - cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); + CUDA_TRY(cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz)); + ALLOC_TRY((void**)&tmpStorage, tmpBytes, stream); + CUDA_TRY(cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz)); ALLOC_FREE_TRY(tmpStorage, stream); // Set offsets to run sizes for each index @@ -128,7 +126,7 @@ gdf_error ConvertCOOtoCSR(T* sources, T* destinations, int64_t nnz, CSR_Result>>(runCount_h, unique, counts, result.rowOffsets); // Scan offsets to get final offsets - thrust::exclusive_scan(thrust::cuda::par(allocator).on(stream), result.rowOffsets, result.rowOffsets + maxId + 2, result.rowOffsets); + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), result.rowOffsets, result.rowOffsets + maxId + 2, result.rowOffsets); // Clean up temporary allocations result.nnz = nnz; @@ -149,26 +147,24 @@ gdf_error ConvertCOOtoCSR_weighted(T* sources, T* destinations, W* edgeWeights, T* dests{nullptr}; W* weights{nullptr}; - //RMM: - // - cudaStream_t stream{nullptr}; - rmm_temp_allocator allocator(stream); - ALLOC_MANAGED_TRY((void**)&srcs, sizeof(T) * nnz, stream); - ALLOC_MANAGED_TRY((void**)&dests, sizeof(T) * nnz, stream); - ALLOC_MANAGED_TRY((void**)&weights, sizeof(W) * nnz, stream); + cudaStream_t stream {nullptr}; + + ALLOC_TRY((void**)&srcs, sizeof(T) * nnz, stream); + ALLOC_TRY((void**)&dests, sizeof(T) * nnz, stream); + ALLOC_TRY((void**)&weights, sizeof(W) * nnz, stream); CUDA_TRY(cudaMemcpy(srcs, sources, sizeof(T) * nnz, cudaMemcpyDefault)); CUDA_TRY(cudaMemcpy(dests, destinations, sizeof(T) * nnz, cudaMemcpyDefault)); CUDA_TRY(cudaMemcpy(weights, edgeWeights, sizeof(W) * nnz, cudaMemcpyDefault)); // Call Thrust::sort_by_key to sort the arrays with srcs as keys: - thrust::stable_sort_by_key(thrust::cuda::par(allocator).on(stream), dests, dests + nnz, thrust::make_zip_iterator(thrust::make_tuple(srcs, weights))); - thrust::stable_sort_by_key(thrust::cuda::par(allocator).on(stream), srcs, srcs + nnz, thrust::make_zip_iterator(thrust::make_tuple(dests, weights))); + thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), dests, dests + nnz, thrust::make_zip_iterator(thrust::make_tuple(srcs, weights))); + thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), srcs, srcs + nnz, 
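// Editor's sketch (not part of the patch): for the sorted source array,
// DeviceRunLengthEncode::Encode produces the distinct row ids, the length of each run,
// and the number of runs; offsetsKernel then scatters counts[i] to rowOffsets[unique[i]]
// so rows with no outgoing edges keep their zero before the scan. Host-only illustration
// of what those outputs look like.
#include <cstdio>
#include <vector>
int main() {
  std::vector<int> srcs = {0, 0, 3, 3, 3};   // sorted sources; rows 1 and 2 are empty
  std::vector<int> unique, counts;
  for (int v : srcs) {
    if (unique.empty() || unique.back() != v) { unique.push_back(v); counts.push_back(1); }
    else ++counts.back();
  }
  // unique = {0, 3}, counts = {2, 3}, runCount = 2; scattering gives
  // rowOffsets = {2, 0, 0, 3, 0} before the exclusive scan
  for (size_t i = 0; i < unique.size(); ++i)
    printf("row %d has %d outgoing edges\n", unique[i], counts[i]);
  return 0;
}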
thrust::make_zip_iterator(thrust::make_tuple(dests, weights))); - // Find max id (since this may be in the dests array but not the srcs array we need to check both) + // Find max id (since this may be in the dests array but not the srcs array we need to check both) T maxId = -1; // Max from srcs after sorting is just the last element CUDA_TRY(cudaMemcpy(&maxId, &(srcs[nnz-1]), sizeof(T), cudaMemcpyDefault)); - auto maxId_it = thrust::max_element(thrust::cuda::par(allocator).on(stream), dests, dests + nnz); + auto maxId_it = thrust::max_element(rmm::exec_policy(stream)->on(stream), dests, dests + nnz); // Max from dests requires a scan to find T maxId2; CUDA_TRY(cudaMemcpy(&maxId2, maxId_it, sizeof(T), cudaMemcpyDefault)); @@ -176,7 +172,7 @@ gdf_error ConvertCOOtoCSR_weighted(T* sources, T* destinations, W* edgeWeights, result.size = maxId + 1; // Allocate offsets array - ALLOC_MANAGED_TRY((void**)&result.rowOffsets, (maxId + 2) * sizeof(T), stream); + ALLOC_TRY((void**)&result.rowOffsets, (maxId + 2) * sizeof(T), stream); // Set all values in offsets array to zeros // /CUDA_TRY( @@ -186,16 +182,16 @@ gdf_error ConvertCOOtoCSR_weighted(T* sources, T* destinations, W* edgeWeights, // Allocate temporary arrays same size as sources array, and single value to get run counts T* unique, *counts, *runCount; - ALLOC_MANAGED_TRY((void**)&unique, (maxId + 1) * sizeof(T), stream); - ALLOC_MANAGED_TRY((void**)&counts, (maxId + 1) * sizeof(T), stream); - ALLOC_MANAGED_TRY((void**)&runCount, sizeof(T), stream); + ALLOC_TRY((void**)&unique, (maxId + 1) * sizeof(T), stream); + ALLOC_TRY((void**)&counts, (maxId + 1) * sizeof(T), stream); + ALLOC_TRY((void**)&runCount, sizeof(T), stream); // Use CUB run length encoding to get unique values and run lengths void *tmpStorage = nullptr; size_t tmpBytes = 0; - cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); - ALLOC_MANAGED_TRY(&tmpStorage, tmpBytes, stream); - cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); + CUDA_TRY(cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz)); + ALLOC_TRY(&tmpStorage, tmpBytes, stream); + CUDA_TRY(cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz)); ALLOC_FREE_TRY(tmpStorage, stream); // Set offsets to run sizes for each index @@ -206,7 +202,7 @@ gdf_error ConvertCOOtoCSR_weighted(T* sources, T* destinations, W* edgeWeights, offsetsKernel<<>>(runCount_h, unique, counts, result.rowOffsets); // Scan offsets to get final offsets - thrust::exclusive_scan(thrust::cuda::par(allocator).on(stream), result.rowOffsets, result.rowOffsets + maxId + 2, result.rowOffsets); + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), result.rowOffsets, result.rowOffsets + maxId + 2, result.rowOffsets); // Clean up temporary allocations result.nnz = nnz; diff --git a/cpp/src/converters/nvgraph.cu b/cpp/src/converters/nvgraph.cu new file mode 100644 index 00000000000..cc448b54494 --- /dev/null +++ b/cpp/src/converters/nvgraph.cu @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
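// Editor's sketch (not part of the patch): the weighted conversion keeps (dest, weight)
// glued to each source during the sort by passing a zip iterator as the values of
// stable_sort_by_key. Minimal Thrust version of that key/value sort on device_vectors;
// the file works on raw pointers with the RMM execution policy.
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>
void sort_weighted_coo(thrust::device_vector<int>   &srcs,
                       thrust::device_vector<int>   &dests,
                       thrust::device_vector<float> &weights) {
  auto vals = thrust::make_zip_iterator(thrust::make_tuple(dests.begin(), weights.begin()));
  // sorting by source drags each (dest, weight) pair along with its key
  thrust::stable_sort_by_key(srcs.begin(), srcs.end(), vals);
}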
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** ---------------------------------------------------------------------------* + * @brief Wrapper functions for Nvgraph + * + * @file nvgraph_gdf.cu + * ---------------------------------------------------------------------------**/ + +#include +#include +#include "utilities/error_utils.h" +#include "converters/nvgraph.cuh" + +gdf_error gdf_createGraph_nvgraph(nvgraphHandle_t nvg_handle, + gdf_graph* gdf_G, + nvgraphGraphDescr_t* nvg_G, + bool use_transposed) { + + // check input + GDF_REQUIRE(!((gdf_G->edgeList == nullptr) && + (gdf_G->adjList == nullptr) && + (gdf_G->transposedAdjList == nullptr)), + GDF_INVALID_API_CALL); + nvgraphTopologyType_t TT; + cudaDataType_t settype; + // create an nvgraph graph handle + NVG_TRY(nvgraphCreateGraphDescr(nvg_handle, nvg_G)); + // setup nvgraph variables + if (use_transposed) { + // convert edgeList to transposedAdjList + if (gdf_G->transposedAdjList == nullptr) { + GDF_TRY(gdf_add_transposed_adj_list(gdf_G)); + } + // using exiting transposedAdjList if it exisits and if adjList is missing + TT = NVGRAPH_CSC_32; + nvgraphCSCTopology32I_st topoData; + topoData.nvertices = gdf_G->transposedAdjList->offsets->size - 1; + topoData.nedges = gdf_G->transposedAdjList->indices->size; + topoData.destination_offsets = (int *) gdf_G->transposedAdjList->offsets->data; + topoData.source_indices = (int *) gdf_G->transposedAdjList->indices->data; + // attach the transposed adj list + NVG_TRY(nvgraphAttachGraphStructure(nvg_handle, *nvg_G, (void * )&topoData, TT)); + //attach edge values + if (gdf_G->transposedAdjList->edge_data) { + switch (gdf_G->transposedAdjList->edge_data->dtype) { + case GDF_FLOAT32: + settype = CUDA_R_32F; + NVG_TRY(nvgraphAttachEdgeData(nvg_handle, + *nvg_G, + 0, + settype, + (float * ) gdf_G->transposedAdjList->edge_data->data)) + break; + case GDF_FLOAT64: + settype = CUDA_R_64F; + NVG_TRY(nvgraphAttachEdgeData(nvg_handle, + *nvg_G, + 0, + settype, + (double * ) gdf_G->transposedAdjList->edge_data->data)) + break; + default: + return GDF_UNSUPPORTED_DTYPE; + } + } + + } + else { + // convert edgeList to adjList + if (gdf_G->adjList == nullptr) { + GDF_TRY(gdf_add_adj_list(gdf_G)); + } + TT = NVGRAPH_CSR_32; + nvgraphCSRTopology32I_st topoData; + topoData.nvertices = gdf_G->adjList->offsets->size - 1; + topoData.nedges = gdf_G->adjList->indices->size; + topoData.source_offsets = (int *) gdf_G->adjList->offsets->data; + topoData.destination_indices = (int *) gdf_G->adjList->indices->data; + + // attach adj list + NVG_TRY(nvgraphAttachGraphStructure(nvg_handle, *nvg_G, (void * )&topoData, TT)); + //attach edge values + if (gdf_G->adjList->edge_data) { + switch (gdf_G->adjList->edge_data->dtype) { + case GDF_FLOAT32: + settype = CUDA_R_32F; + NVG_TRY(nvgraphAttachEdgeData(nvg_handle, + *nvg_G, + 0, + settype, + (float * ) gdf_G->adjList->edge_data->data)) + break; + case GDF_FLOAT64: + settype = CUDA_R_64F; + NVG_TRY(nvgraphAttachEdgeData(nvg_handle, + *nvg_G, + 0, + settype, + (double * ) gdf_G->adjList->edge_data->data)) + break; + default: + return GDF_UNSUPPORTED_DTYPE; + } + } + } + return GDF_SUCCESS; +} diff 
--git a/cpp/src/converters/nvgraph.cuh b/cpp/src/converters/nvgraph.cuh new file mode 100644 index 00000000000..76c1ff97b69 --- /dev/null +++ b/cpp/src/converters/nvgraph.cuh @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +/** + * Takes a GDF graph and wraps its data with an Nvgraph graph object. + * @param nvg_handle The Nvgraph handle + * @param gdf_G Pointer to GDF graph object + * @param nvgraph_G Pointer to the Nvgraph graph descriptor + * @param use_transposed True if we are transposing the input graph while wrapping + * @return Error code + */ +gdf_error gdf_createGraph_nvgraph(nvgraphHandle_t nvg_handle, + gdf_graph* gdf_G, + nvgraphGraphDescr_t * nvgraph_G, +bool use_transposed = false); diff --git a/cpp/src/converters/renumber.cu b/cpp/src/converters/renumber.cu new file mode 100644 index 00000000000..d7821ab6f55 --- /dev/null +++ b/cpp/src/converters/renumber.cu @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Renumber vertices +// Author: Chuck Hastings charlesh@nvidia.com + +#include "renumber.cuh" + +gdf_error gdf_renumber_vertices(const gdf_column *src, const gdf_column *dst, + gdf_column *src_renumbered, gdf_column *dst_renumbered, + gdf_column *numbering_map) { + GDF_REQUIRE( src->size == dst->size, GDF_COLUMN_SIZE_MISMATCH ); + GDF_REQUIRE( src->dtype == dst->dtype, GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( ((src->dtype == GDF_INT32) || (src->dtype == GDF_INT64)), GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( src->size > 0, GDF_DATASET_EMPTY ); + + // + // TODO: we're currently renumbering without using valid. We need to + // worry about that at some point, but for now we'll just + // copy the valid pointers to the new columns and go from there. + // + cudaStream_t stream{nullptr}; + + size_t src_size = src->size; + size_t new_size; + + // + // TODO: I assume int64_t for output. A few thoughts: + // + // * I could match src->dtype - since if the raw values fit in an int32_t, + // then the renumbered values must fit within an int32_t + // * If new_size < (2^31 - 1) then I could allocate 32-bit integers + // and copy them in order to make the final footprint smaller. 
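// Editor's sketch (not part of the patch): gdf_renumber_vertices maps arbitrary, possibly
// sparse or 64-bit, vertex ids onto a dense 0..n-1 range and returns the map back to the
// original ids. The GPU implementation below does this with hash bins; this host-only
// illustration only shows what the outputs mean. Helper name and id ordering are
// illustrative, not the library's.
#include <cstdio>
#include <unordered_map>
#include <vector>
int main() {
  std::vector<long long> src = {100, 5, 100}, dst = {5, 42, 42};
  std::unordered_map<long long, int> to_new;
  std::vector<long long> numbering_map;            // new id -> original id
  auto renumber = [&](long long v) {
    auto it = to_new.find(v);
    if (it != to_new.end()) return it->second;
    int id = (int)numbering_map.size();
    numbering_map.push_back(v);
    to_new[v] = id;
    return id;
  };
  for (size_t i = 0; i < src.size(); ++i) {
    int s = renumber(src[i]);
    int d = renumber(dst[i]);
    printf("edge (%lld,%lld) -> (%d,%d)\n", src[i], dst[i], s, d);
  }
  return 0;
}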
+ // + // + // NOTE: Forcing match right now - it appears that cugraph is artficially + // forcing the type to be 32 + if (src->dtype == GDF_INT32) { + int32_t *tmp; + + ALLOC_TRY((void**) &tmp, sizeof(int32_t) * src->size, stream); + gdf_column_view(src_renumbered, tmp, src->valid, src->size, src->dtype); + + ALLOC_TRY((void**) &tmp, sizeof(int32_t) * src->size, stream); + gdf_column_view(dst_renumbered, tmp, dst->valid, dst->size, dst->dtype); + + gdf_error err = cugraph::renumber_vertices(src_size, + (const int32_t *) src->data, + (const int32_t *) dst->data, + (int32_t *) src_renumbered->data, + (int32_t *) dst_renumbered->data, + &new_size, &tmp); + if (err != GDF_SUCCESS) + return err; + + gdf_column_view(numbering_map, tmp, nullptr, new_size, src->dtype); + } else if (src->dtype == GDF_INT64) { + + // + // NOTE: At the moment, we force the renumbered graph to use + // 32-bit integer ids. Since renumbering is going to make + // the vertex range dense, this limits us to 2 billion + // vertices. + // + // The renumbering code supports 64-bit integer generation + // so we can run this with int64_t output if desired... + // but none of the algorithms support that. + // + int64_t *tmp; + ALLOC_TRY((void**) &tmp, sizeof(int32_t) * src->size, stream); + gdf_column_view(src_renumbered, tmp, src->valid, src->size, GDF_INT32); + + ALLOC_TRY((void**) &tmp, sizeof(int32_t) * src->size, stream); + gdf_column_view(dst_renumbered, tmp, dst->valid, dst->size, GDF_INT32); + + gdf_error err = cugraph::renumber_vertices(src_size, + (const int64_t *) src->data, + (const int64_t *) dst->data, + (int32_t *) src_renumbered->data, + (int32_t *) dst_renumbered->data, + &new_size, &tmp); + if (err != GDF_SUCCESS) + return err; + + // + // If there are too many vertices then the renumbering overflows so we'll + // return an error. 
+ // + if (new_size > 0x7fffffff) { + ALLOC_FREE_TRY(src_renumbered, stream); + ALLOC_FREE_TRY(dst_renumbered, stream); + return GDF_COLUMN_SIZE_TOO_BIG; + } + + gdf_column_view(numbering_map, tmp, nullptr, new_size, src->dtype); + } else { + return GDF_UNSUPPORTED_DTYPE; + } + + return GDF_SUCCESS; +} diff --git a/cpp/src/renumber.cuh b/cpp/src/converters/renumber.cuh similarity index 90% rename from cpp/src/renumber.cuh rename to cpp/src/converters/renumber.cuh index 0b05135e3cc..5e2fa069267 100644 --- a/cpp/src/renumber.cuh +++ b/cpp/src/converters/renumber.cuh @@ -34,8 +34,8 @@ #include #include "utilities/error_utils.h" -#include "graph_utils.cuh" -#include "heap.cuh" +#include "utilities/graph_utils.cuh" +#include "utilities/heap.cuh" #include "rmm_utils.h" namespace cugraph { @@ -134,25 +134,27 @@ namespace cugraph { } - __global__ void SetupHash(hash_type hash_size, index_type *hash_bins_start, index_type *hash_bins_end) { + template + __global__ void SetupHash(H hash_size, I *hash_bins_start, I *hash_bins_end) { hash_bins_end[0] = 0; - for (hash_type i = 0 ; i < hash_size ; ++i) { + for (H i = 0 ; i < hash_size ; ++i) { hash_bins_end[i+1] = hash_bins_end[i] + hash_bins_start[i]; } - for (hash_type i = 0 ; i < (hash_size + 1) ; ++i) { + for (H i = 0 ; i < (hash_size + 1) ; ++i) { hash_bins_start[i] = hash_bins_end[i]; } } - __global__ void ComputeBase(hash_type hash_size, index_type *hash_bins_base) { - index_type sum = 0; - for (hash_type i = 0 ; i < hash_size ; ++i) { + template + __global__ void ComputeBase(H hash_size, I *hash_bins_base) { + I sum = 0; + for (H i = 0 ; i < hash_size ; ++i) { sum += hash_bins_base[i]; } hash_bins_base[hash_size] = sum; - for (hash_type i = hash_size ; i > 0 ; --i) { + for (H i = hash_size ; i > 0 ; --i) { hash_bins_base[i-1] = hash_bins_base[i] - hash_bins_base[i-1]; } } @@ -202,9 +204,9 @@ namespace cugraph { // // We need 3 for hashing, and one array for data // - cudaStream_t stream{nullptr}; - rmm_temp_allocator allocator(stream); - + + cudaStream_t stream {nullptr}; + T_in *hash_data; detail::HashFunctionObject hash(hash_size); @@ -218,10 +220,10 @@ namespace cugraph { int hash_threads_per_block = min((int) hash_size, max_threads_per_block); int hash_thread_blocks = min(((int) hash_size + hash_threads_per_block - 1) / hash_threads_per_block, max_blocks); - ALLOC_TRY(&hash_data, 2 * size * sizeof(T_in), nullptr); - ALLOC_TRY(&hash_bins_start, (1 + hash_size) * sizeof(detail::index_type), nullptr); - ALLOC_TRY(&hash_bins_end, (1 + hash_size) * sizeof(detail::index_type), nullptr); - ALLOC_TRY(&hash_bins_base, (1 + hash_size) * sizeof(detail::index_type), nullptr); + ALLOC_TRY(&hash_data, 2 * size * sizeof(T_in), stream); + ALLOC_TRY(&hash_bins_start, (1 + hash_size) * sizeof(detail::index_type), stream); + ALLOC_TRY(&hash_bins_end, (1 + hash_size) * sizeof(detail::index_type), stream); + ALLOC_TRY(&hash_bins_base, (1 + hash_size) * sizeof(detail::index_type), stream); // // Pass 1: count how many vertex ids end up in each hash bin @@ -229,13 +231,13 @@ namespace cugraph { CUDA_TRY(cudaMemset(hash_bins_start, 0, (1 + hash_size) * sizeof(detail::index_type))); CUDA_TRY(cudaMemset(hash_bins_base, 0, (1 + hash_size) * sizeof(detail::index_type))); - thrust::for_each(thrust::cuda::par(allocator).on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream), src, src + size, [hash_bins_start, hash] __device__ (T_in vid) { atomicAdd(hash_bins_start + hash(vid), detail::index_type{1}); }); - 
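// Editor's sketch (not part of the patch): pass 1 of the renumbering above is a
// histogram: a thrust::for_each over the vertex ids where each element atomically bumps
// its hash bin. Reduced version with plain Thrust; the file uses the RMM execution
// policy and its own hash functor, and device lambdas need nvcc's extended-lambda flag,
// as in the real build.
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
void count_hash_bins(const int *d_ids, size_t n, unsigned int *d_bins,
                     unsigned int num_bins, cudaStream_t stream) {
  thrust::for_each(thrust::cuda::par.on(stream), d_ids, d_ids + n,
                   [d_bins, num_bins] __device__ (int id) {
                     // each id increments the counter of its bin
                     atomicAdd(d_bins + (static_cast<unsigned int>(id) % num_bins), 1u);
                   });
}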
thrust::for_each(thrust::cuda::par(allocator).on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream), dst, dst + size, [hash_bins_start, hash] __device__ (T_in vid) { atomicAdd(hash_bins_start + hash(vid), detail::index_type{1}); @@ -252,7 +254,7 @@ namespace cugraph { // Pass 2: Populate hash_data with data from the hash bins. This implementation // will do some partial deduplication, but we'll need to fully dedupe later. // - thrust::for_each(thrust::cuda::par(allocator).on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream), src, src + size, [hash_bins_end, hash_data, hash] __device__ (T_in vid) { uint32_t hash_index = hash(vid); @@ -260,7 +262,7 @@ namespace cugraph { hash_data[hash_offset] = vid; }); - thrust::for_each(thrust::cuda::par(allocator).on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream), dst, dst + size, [hash_bins_end, hash_data, hash] __device__ (T_in vid) { uint32_t hash_index = hash(vid); @@ -282,7 +284,7 @@ namespace cugraph { // Finally, we'll iterate over src and dst and populate src_renumbered // and dst_renumbered. // - thrust::for_each(thrust::cuda::par(allocator).on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(size), [hash_data, hash_bins_start, hash_bins_end, @@ -293,7 +295,7 @@ namespace cugraph { src_renumbered[idx] = hash_bins_base[tmp] + (id - (hash_data + hash_bins_start[tmp])); }); - thrust::for_each(thrust::cuda::par(allocator).on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(size), [hash_data, hash_bins_start, hash_bins_end, @@ -312,7 +314,7 @@ namespace cugraph { T_in * local_numbering_map = *numbering_map; - thrust::for_each(thrust::cuda::par(allocator).on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(hash_size), [hash_data, hash_bins_start, hash_bins_end, diff --git a/cpp/src/cugraph.cu b/cpp/src/cugraph.cu deleted file mode 100644 index 80975930de7..00000000000 --- a/cpp/src/cugraph.cu +++ /dev/null @@ -1,671 +0,0 @@ -// -*-c++-*- - - /* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. - * - */ - -// Graph analytics features -// Author: Alex Fender afender@nvidia.com - -#include -#include "graph_utils.cuh" -#include "pagerank.cuh" -#include "COOtoCSR.cuh" -#include "utilities/error_utils.h" -#include "bfs.cuh" -#include "renumber.cuh" -#include "snmg/spmv.cuh" -#include -#include -#include - -#include - -template -using Vector = thrust::device_vector>; - -/* - * cudf has gdf_column_free and using this is, in general, better design than - * creating our own, but we will keep this as cudf is planning to remove the - * function. cudf plans to redesign cudf::column to fundamentally solve this - * problem, so once they finished the redesign, we need to update this code to - * use their new features. Until that time, we may rely on this as a temporary - * solution. 
- */ -void gdf_col_delete(gdf_column* col) { - if (col != nullptr) { - auto stream = cudaStream_t{nullptr}; - if (col->data != nullptr) { - ALLOC_FREE_TRY(col->data, stream); - } - if (col->valid != nullptr) { - ALLOC_FREE_TRY(col->valid, stream); - } -#if 0/* Currently, gdf_column_view does not set col_name, and col_name can have - an arbitrary value, so freeing col_name can lead to freeing a ranodom - address. This problem should be cleaned up once cudf finishes - redesigning cudf::column. */ - if (col->col_name != nullptr) { - free(col->col_name); - } -#endif - delete col; - } -} - -void gdf_col_release(gdf_column* col) { - delete col; -} - -void cpy_column_view(const gdf_column *in, gdf_column *out) { - if (in != nullptr && out !=nullptr) { - gdf_column_view(out, in->data, in->valid, in->size, in->dtype); - } -} - -gdf_error gdf_adj_list_view(gdf_graph *graph, const gdf_column *offsets, - const gdf_column *indices, const gdf_column *edge_data) { - //This function returns an error if this graph object has at least one graph - //representation to prevent a single object storing two different graphs. - GDF_REQUIRE( ((graph->edgeList == nullptr) && (graph->adjList == nullptr) && - (graph->transposedAdjList == nullptr)), GDF_INVALID_API_CALL); - GDF_REQUIRE( offsets->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); - GDF_REQUIRE( indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); - GDF_REQUIRE( (offsets->dtype == indices->dtype), GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( ((offsets->dtype == GDF_INT32) || (offsets->dtype == GDF_INT64)), GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( (offsets->size > 0), GDF_DATASET_EMPTY ); - - graph->adjList = new gdf_adj_list; - graph->adjList->offsets = new gdf_column; - graph->adjList->indices = new gdf_column; - graph->adjList->ownership = 0; - - cpy_column_view(offsets, graph->adjList->offsets); - cpy_column_view(indices, graph->adjList->indices); - if (edge_data) { - GDF_REQUIRE( indices->size == edge_data->size, GDF_COLUMN_SIZE_MISMATCH ); - graph->adjList->edge_data = new gdf_column; - cpy_column_view(edge_data, graph->adjList->edge_data); - } - else { - graph->adjList->edge_data = nullptr; - } - return GDF_SUCCESS; -} - -gdf_error gdf_adj_list::get_vertex_identifiers(gdf_column *identifiers) { - GDF_REQUIRE( offsets != nullptr , GDF_INVALID_API_CALL); - GDF_REQUIRE( offsets->data != nullptr , GDF_INVALID_API_CALL); - cugraph::sequence((int)offsets->size-1, (int*)identifiers->data); - return GDF_SUCCESS; -} - -gdf_error gdf_adj_list::get_source_indices (gdf_column *src_indices) { - GDF_REQUIRE( offsets != nullptr , GDF_INVALID_API_CALL); - GDF_REQUIRE( offsets->data != nullptr , GDF_INVALID_API_CALL); - GDF_REQUIRE( src_indices->size == indices->size, GDF_COLUMN_SIZE_MISMATCH ); - GDF_REQUIRE( src_indices->dtype == indices->dtype, GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( src_indices->size > 0, GDF_DATASET_EMPTY ); - cugraph::offsets_to_indices((int*)offsets->data, offsets->size-1, (int*)src_indices->data); - - return GDF_SUCCESS; -} - -gdf_error gdf_renumber_vertices(const gdf_column *src, const gdf_column *dst, - gdf_column *src_renumbered, gdf_column *dst_renumbered, - gdf_column *numbering_map) { - - GDF_REQUIRE( src->size == dst->size, GDF_COLUMN_SIZE_MISMATCH ); - GDF_REQUIRE( src->dtype == dst->dtype, GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( ((src->dtype == GDF_INT32) || (src->dtype == GDF_INT64)), GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( src->size > 0, GDF_DATASET_EMPTY ); - - // - // TODO: we're currently renumbering without using valid. 
We need to - // worry about that at some point, but for now we'll just - // copy the valid pointers to the new columns and go from there. - // - cudaStream_t stream{nullptr}; - - size_t src_size = src->size; - size_t new_size; - - // - // TODO: I assume int64_t for output. A few thoughts: - // - // * I could match src->dtype - since if the raw values fit in an int32_t, - // then the renumbered values must fit within an int32_t - // * If new_size < (2^31 - 1) then I could allocate 32-bit integers - // and copy them in order to make the final footprint smaller. - // - // - // NOTE: Forcing match right now - it appears that cugraph is artficially - // forcing the type to be 32 - if (src->dtype == GDF_INT32) { - int32_t *tmp; - - ALLOC_MANAGED_TRY((void**) &tmp, sizeof(int32_t) * src->size, stream); - gdf_column_view(src_renumbered, tmp, src->valid, src->size, src->dtype); - - ALLOC_MANAGED_TRY((void**) &tmp, sizeof(int32_t) * src->size, stream); - gdf_column_view(dst_renumbered, tmp, dst->valid, dst->size, dst->dtype); - - gdf_error err = cugraph::renumber_vertices(src_size, - (const int32_t *) src->data, - (const int32_t *) dst->data, - (int32_t *) src_renumbered->data, - (int32_t *) dst_renumbered->data, - &new_size, &tmp); - if (err != GDF_SUCCESS) - return err; - - gdf_column_view(numbering_map, tmp, nullptr, new_size, src->dtype); - } else if (src->dtype == GDF_INT64) { - - // - // NOTE: At the moment, we force the renumbered graph to use - // 32-bit integer ids. Since renumbering is going to make - // the vertex range dense, this limits us to 2 billion - // vertices. - // - // The renumbering code supports 64-bit integer generation - // so we can run this with int64_t output if desired... - // but none of the algorithms support that. - // - int64_t *tmp; - ALLOC_MANAGED_TRY((void**) &tmp, sizeof(int32_t) * src->size, stream); - gdf_column_view(src_renumbered, tmp, src->valid, src->size, GDF_INT32); - - ALLOC_MANAGED_TRY((void**) &tmp, sizeof(int32_t) * src->size, stream); - gdf_column_view(dst_renumbered, tmp, dst->valid, dst->size, GDF_INT32); - - gdf_error err = cugraph::renumber_vertices(src_size, - (const int64_t *) src->data, - (const int64_t *) dst->data, - (int32_t *) src_renumbered->data, - (int32_t *) dst_renumbered->data, - &new_size, &tmp); - if (err != GDF_SUCCESS) - return err; - - // - // If there are too many vertices then the renumbering overflows so we'll - // return an error. - // - if (new_size > 0x7fffffff) { - ALLOC_FREE_TRY(src_renumbered, stream); - ALLOC_FREE_TRY(dst_renumbered, stream); - return GDF_COLUMN_SIZE_TOO_BIG; - } - - gdf_column_view(numbering_map, tmp, nullptr, new_size, src->dtype); - } else { - return GDF_UNSUPPORTED_DTYPE; - } - - return GDF_SUCCESS; -} - -gdf_error gdf_edge_list_view(gdf_graph *graph, const gdf_column *src_indices, - const gdf_column *dest_indices, const gdf_column *edge_data) { - //This function returns an error if this graph object has at least one graph - //representation to prevent a single object storing two different graphs. 
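// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch itself): wrapping existing device
// buffers as gdf_columns and registering them as the edge list of a freshly
// constructed gdf_graph, which is what the check described above requires.
// Buffer and function names are hypothetical; the calls themselves appear in
// this file.
// ---------------------------------------------------------------------------
#include "cugraph.h"   // assumed to declare gdf_column_view and gdf_edge_list_view

gdf_error build_graph_from_coo(int32_t* d_src, int32_t* d_dst, int num_edges,
                               gdf_graph* graph /* must hold no representation yet */) {
  gdf_column src{}, dst{};
  gdf_column_view(&src, d_src, nullptr, num_edges, GDF_INT32);
  gdf_column_view(&dst, d_dst, nullptr, num_edges, GDF_INT32);

  gdf_error err = gdf_edge_list_view(graph, &src, &dst, nullptr /* unweighted */);
  if (err != GDF_SUCCESS) return err;

  // The CSR form can then be materialized on demand.
  return gdf_add_adj_list(graph);
}
// ---------------------------------------------------------------------------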
- GDF_REQUIRE( ((graph->edgeList == nullptr) && (graph->adjList == nullptr) && - (graph->transposedAdjList == nullptr)), GDF_INVALID_API_CALL); - GDF_REQUIRE( src_indices->size == dest_indices->size, GDF_COLUMN_SIZE_MISMATCH ); - GDF_REQUIRE( src_indices->dtype == dest_indices->dtype, GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( ((src_indices->dtype == GDF_INT32) || (src_indices->dtype == GDF_INT64)), GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( src_indices->size > 0, GDF_DATASET_EMPTY ); - GDF_REQUIRE( src_indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); - GDF_REQUIRE( dest_indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); - - graph->edgeList = new gdf_edge_list; - graph->edgeList->src_indices = new gdf_column; - graph->edgeList->dest_indices = new gdf_column; - graph->edgeList->ownership = 0; - - cpy_column_view(src_indices, graph->edgeList->src_indices); - cpy_column_view(dest_indices, graph->edgeList->dest_indices); - if (edge_data) { - GDF_REQUIRE( src_indices->size == edge_data->size, GDF_COLUMN_SIZE_MISMATCH ); - graph->edgeList->edge_data = new gdf_column; - cpy_column_view(edge_data, graph->edgeList->edge_data); - } - else { - graph->edgeList->edge_data = nullptr; - } - - return GDF_SUCCESS; -} - -template -gdf_error gdf_add_adj_list_impl (gdf_graph *graph) { - if (graph->adjList == nullptr) { - GDF_REQUIRE( graph->edgeList != nullptr , GDF_INVALID_API_CALL); - int nnz = graph->edgeList->src_indices->size, status = 0; - graph->adjList = new gdf_adj_list; - graph->adjList->offsets = new gdf_column; - graph->adjList->indices = new gdf_column; - graph->adjList->ownership = 1; - - if (graph->edgeList->edge_data!= nullptr) { - graph->adjList->edge_data = new gdf_column; - - CSR_Result_Weighted adj_list; - status = ConvertCOOtoCSR_weighted((int*)graph->edgeList->src_indices->data, (int*)graph->edgeList->dest_indices->data, (WT*)graph->edgeList->edge_data->data, nnz, adj_list); - - gdf_column_view(graph->adjList->offsets, adj_list.rowOffsets, - nullptr, adj_list.size+1, graph->edgeList->src_indices->dtype); - gdf_column_view(graph->adjList->indices, adj_list.colIndices, - nullptr, adj_list.nnz, graph->edgeList->src_indices->dtype); - gdf_column_view(graph->adjList->edge_data, adj_list.edgeWeights, - nullptr, adj_list.nnz, graph->edgeList->edge_data->dtype); - } - else { - CSR_Result adj_list; - status = ConvertCOOtoCSR((int*)graph->edgeList->src_indices->data,(int*)graph->edgeList->dest_indices->data, nnz, adj_list); - gdf_column_view(graph->adjList->offsets, adj_list.rowOffsets, - nullptr, adj_list.size+1, graph->edgeList->src_indices->dtype); - gdf_column_view(graph->adjList->indices, adj_list.colIndices, - nullptr, adj_list.nnz, graph->edgeList->src_indices->dtype); - } - if (status !=0) { - std::cerr << "Could not generate the adj_list" << std::endl; - return GDF_CUDA_ERROR; - } - } - return GDF_SUCCESS; -} - -gdf_error gdf_add_edge_list (gdf_graph *graph) { - if (graph->edgeList == nullptr) { - GDF_REQUIRE( graph->adjList != nullptr , GDF_INVALID_API_CALL); - int *d_src; - graph->edgeList = new gdf_edge_list; - graph->edgeList->src_indices = new gdf_column; - graph->edgeList->dest_indices = new gdf_column; - graph->edgeList->ownership = 2; - - CUDA_TRY(cudaMallocManaged ((void**)&d_src, sizeof(int) * graph->adjList->indices->size)); - - cugraph::offsets_to_indices((int*)graph->adjList->offsets->data, - graph->adjList->offsets->size-1, - (int*)d_src); - - gdf_column_view(graph->edgeList->src_indices, d_src, - nullptr, graph->adjList->indices->size, graph->adjList->indices->dtype); - 
cpy_column_view(graph->adjList->indices, graph->edgeList->dest_indices); - - if (graph->adjList->edge_data != nullptr) { - graph->edgeList->edge_data = new gdf_column; - cpy_column_view(graph->adjList->edge_data, graph->edgeList->edge_data); - } - } - return GDF_SUCCESS; -} - - -template -gdf_error gdf_add_transposed_adj_list_impl (gdf_graph *graph) { - if (graph->transposedAdjList == nullptr ) { - GDF_REQUIRE( graph->edgeList != nullptr , GDF_INVALID_API_CALL); - int nnz = graph->edgeList->src_indices->size, status = 0; - graph->transposedAdjList = new gdf_adj_list; - graph->transposedAdjList->offsets = new gdf_column; - graph->transposedAdjList->indices = new gdf_column; - graph->transposedAdjList->ownership = 1; - - if (graph->edgeList->edge_data) { - graph->transposedAdjList->edge_data = new gdf_column; - CSR_Result_Weighted adj_list; - status = ConvertCOOtoCSR_weighted( (int*)graph->edgeList->dest_indices->data, (int*)graph->edgeList->src_indices->data, (WT*)graph->edgeList->edge_data->data, nnz, adj_list); - gdf_column_view(graph->transposedAdjList->offsets, adj_list.rowOffsets, - nullptr, adj_list.size+1, graph->edgeList->src_indices->dtype); - gdf_column_view(graph->transposedAdjList->indices, adj_list.colIndices, - nullptr, adj_list.nnz, graph->edgeList->src_indices->dtype); - gdf_column_view(graph->transposedAdjList->edge_data, adj_list.edgeWeights, - nullptr, adj_list.nnz, graph->edgeList->edge_data->dtype); - } - else { - - CSR_Result adj_list; - status = ConvertCOOtoCSR((int*)graph->edgeList->dest_indices->data, (int*)graph->edgeList->src_indices->data, nnz, adj_list); - gdf_column_view(graph->transposedAdjList->offsets, adj_list.rowOffsets, - nullptr, adj_list.size+1, graph->edgeList->src_indices->dtype); - gdf_column_view(graph->transposedAdjList->indices, adj_list.colIndices, - nullptr, adj_list.nnz, graph->edgeList->src_indices->dtype); - } - if (status !=0) { - std::cerr << "Could not generate the adj_list" << std::endl; - return GDF_CUDA_ERROR; - } - } - return GDF_SUCCESS; -} - -gdf_error gdf_degree_impl(int n, int e, gdf_column* col_ptr, gdf_column* degree, bool offsets) { - if(offsets == true) { - dim3 nthreads, nblocks; - nthreads.x = min(n, CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((n + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - - switch (col_ptr->dtype) { - case GDF_INT32: cugraph::degree_offsets <<>>(n, e, static_cast(col_ptr->data), static_cast(degree->data));break; - default: return GDF_UNSUPPORTED_DTYPE; - } - } - else { - dim3 nthreads, nblocks; - nthreads.x = min(e, CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - - switch (col_ptr->dtype) { - case GDF_INT32: cugraph::degree_coo <<>>(n, e, static_cast(col_ptr->data), static_cast(degree->data));break; - default: return GDF_UNSUPPORTED_DTYPE; - } - } - return GDF_SUCCESS; -} - - -gdf_error gdf_degree(gdf_graph *graph, gdf_column *degree, int x) { - // Calculates the degree of all vertices of the graph - // x = 0: in+out degree - // x = 1: in-degree - // x = 2: out-degree - GDF_REQUIRE(graph->adjList != nullptr || graph->transposedAdjList != nullptr, GDF_INVALID_API_CALL); - int n; - int e; - if(graph->adjList != nullptr) { - n = graph->adjList->offsets->size -1; - e = graph->adjList->indices->size; - } - else { - n = graph->transposedAdjList->offsets->size - 1; - e = graph->transposedAdjList->indices->size; - } - - 
if(x!=1) { - // Computes out-degree for x=0 and x=2 - if(graph->adjList) - gdf_degree_impl(n, e, graph->adjList->offsets, degree, true); - else - gdf_degree_impl(n, e, graph->transposedAdjList->indices, degree, false); - } - - if(x!=2) { - // Computes in-degree for x=0 and x=1 - if(graph->adjList) - gdf_degree_impl(n, e, graph->adjList->indices, degree, false); - else - gdf_degree_impl(n, e, graph->transposedAdjList->offsets, degree, true); - } - return GDF_SUCCESS; -} - - -template -gdf_error gdf_pagerank_impl (gdf_graph *graph, - gdf_column *pagerank, float alpha = 0.85, - float tolerance = 1e-4, int max_iter = 200, - bool has_guess = false) { - GDF_REQUIRE( graph->edgeList != nullptr, GDF_VALIDITY_UNSUPPORTED ); - GDF_REQUIRE( graph->edgeList->src_indices->size == graph->edgeList->dest_indices->size, GDF_COLUMN_SIZE_MISMATCH ); - GDF_REQUIRE( graph->edgeList->src_indices->dtype == graph->edgeList->dest_indices->dtype, GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( graph->edgeList->src_indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); - GDF_REQUIRE( graph->edgeList->dest_indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); - GDF_REQUIRE( pagerank != nullptr , GDF_INVALID_API_CALL ); - GDF_REQUIRE( pagerank->data != nullptr , GDF_INVALID_API_CALL ); - GDF_REQUIRE( pagerank->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); - GDF_REQUIRE( pagerank->size > 0 , GDF_INVALID_API_CALL ); - - int m=pagerank->size, nnz = graph->edgeList->src_indices->size, status = 0; - WT *d_pr, *d_val = nullptr, *d_leaf_vector = nullptr; - WT res = 1.0; - WT *residual = &res; - - if (graph->transposedAdjList == nullptr) { - gdf_add_transposed_adj_list(graph); - } - cudaStream_t stream{nullptr}; - ALLOC_MANAGED_TRY((void**)&d_leaf_vector, sizeof(WT) * m, stream); - ALLOC_MANAGED_TRY((void**)&d_val, sizeof(WT) * nnz , stream); - ALLOC_MANAGED_TRY((void**)&d_pr, sizeof(WT) * m, stream); - - // The templating for HT_matrix_csc_coo assumes that m, nnz and data are all the same type - cugraph::HT_matrix_csc_coo(m, nnz, (int *)graph->transposedAdjList->offsets->data, (int *)graph->transposedAdjList->indices->data, d_val, d_leaf_vector); - - if (has_guess) - { - GDF_REQUIRE( pagerank->data != nullptr, GDF_VALIDITY_UNSUPPORTED ); - cugraph::copy(m, (WT*)pagerank->data, d_pr); - } - - status = cugraph::pagerank( m,nnz, (int*)graph->transposedAdjList->offsets->data, (int*)graph->transposedAdjList->indices->data, - d_val, alpha, d_leaf_vector, false, tolerance, max_iter, d_pr, residual); - - if (status !=0) - switch ( status ) { - case -1: std::cerr<< "Error : bad parameters in Pagerank"<(m, d_pr, (WT*)pagerank->data); - - ALLOC_FREE_TRY(d_val, stream); - ALLOC_FREE_TRY(d_pr, stream); - ALLOC_FREE_TRY(d_leaf_vector, stream); - - return GDF_SUCCESS; -} - -gdf_error gdf_add_adj_list(gdf_graph *graph) { - if (graph->adjList != nullptr) - return GDF_SUCCESS; - - GDF_REQUIRE( graph->edgeList != nullptr , GDF_INVALID_API_CALL); - GDF_REQUIRE( graph->edgeList->src_indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE ); - - if (graph->edgeList->edge_data != nullptr) { - switch (graph->edgeList->edge_data->dtype) { - case GDF_FLOAT32: return gdf_add_adj_list_impl(graph); - case GDF_FLOAT64: return gdf_add_adj_list_impl(graph); - default: return GDF_UNSUPPORTED_DTYPE; - } - } - else { - return gdf_add_adj_list_impl(graph); - } -} - -gdf_error gdf_add_transposed_adj_list(gdf_graph *graph) { - if (graph->edgeList == nullptr) - gdf_add_edge_list(graph); - - GDF_REQUIRE(graph->edgeList->src_indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); 
- GDF_REQUIRE(graph->edgeList->dest_indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); - - if (graph->edgeList->edge_data != nullptr) { - switch (graph->edgeList->edge_data->dtype) { - case GDF_FLOAT32: return gdf_add_transposed_adj_list_impl(graph); - case GDF_FLOAT64: return gdf_add_transposed_adj_list_impl(graph); - default: return GDF_UNSUPPORTED_DTYPE; - } - } - else { - return gdf_add_transposed_adj_list_impl(graph); - } -} - -gdf_error gdf_delete_adj_list(gdf_graph *graph) { - if (graph->adjList) { - delete graph->adjList; - } - graph->adjList = nullptr; - return GDF_SUCCESS; -} - -gdf_error gdf_delete_edge_list(gdf_graph *graph) { - if (graph->edgeList) { - delete graph->edgeList; - } - graph->edgeList = nullptr; - return GDF_SUCCESS; -} - -gdf_error gdf_delete_transposed_adj_list(gdf_graph *graph) { - if (graph->transposedAdjList) { - delete graph->transposedAdjList; - } - graph->transposedAdjList = nullptr; - return GDF_SUCCESS; -} - -gdf_error gdf_pagerank(gdf_graph *graph, gdf_column *pagerank, float alpha, float tolerance, int max_iter, bool has_guess) { - // - // page rank operates on CSR and can't currently support 64-bit integers. - // - // If csr doesn't exist, create it. Then check type to make sure it is 32-bit. - // - GDF_REQUIRE(graph->adjList != nullptr || graph->edgeList != nullptr, GDF_INVALID_API_CALL); - gdf_error err = gdf_add_adj_list(graph); - if (err != GDF_SUCCESS) - return err; - - GDF_REQUIRE(graph->adjList->offsets->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); - GDF_REQUIRE(graph->adjList->indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); - - switch (pagerank->dtype) { - case GDF_FLOAT32: return gdf_pagerank_impl(graph, pagerank, alpha, tolerance, max_iter, has_guess); - case GDF_FLOAT64: return gdf_pagerank_impl(graph, pagerank, alpha, tolerance, max_iter, has_guess); - default: return GDF_UNSUPPORTED_DTYPE; - } -} - -gdf_error gdf_bfs(gdf_graph *graph, gdf_column *distances, gdf_column *predecessors, int start_vertex, bool directed) { - GDF_REQUIRE(graph->adjList != nullptr || graph->edgeList != nullptr, GDF_INVALID_API_CALL); - gdf_error err = gdf_add_adj_list(graph); - if (err != GDF_SUCCESS) - return err; - GDF_REQUIRE(graph->adjList->offsets->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); - GDF_REQUIRE(graph->adjList->indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); - GDF_REQUIRE(distances->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); - GDF_REQUIRE(predecessors->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); - - int n = graph->adjList->offsets->size - 1; - int e = graph->adjList->indices->size; - int* offsets_ptr = (int*)graph->adjList->offsets->data; - int* indices_ptr = (int*)graph->adjList->indices->data; - int* distances_ptr = (int*)distances->data; - int* predecessors_ptr = (int*)predecessors->data; - int alpha = 15; - int beta = 18; - - cugraph::Bfs bfs(n, e, offsets_ptr, indices_ptr, directed, alpha, beta); - bfs.configure(distances_ptr, predecessors_ptr, nullptr); - bfs.traverse(start_vertex); - return GDF_SUCCESS; -} - -gdf_error gdf_louvain(gdf_graph *graph, void *final_modularity, void *num_level, gdf_column *louvain_parts) { - GDF_REQUIRE(graph->adjList != nullptr || graph->edgeList != nullptr, GDF_INVALID_API_CALL); - gdf_error err = gdf_add_adj_list(graph); - if (err != GDF_SUCCESS) - return err; - - size_t n = graph->adjList->offsets->size - 1; - size_t e = graph->adjList->indices->size; - - void* offsets_ptr = graph->adjList->offsets->data; - void* indices_ptr = graph->adjList->indices->data; - - void* value_ptr; - Vector d_values; - 
if(graph->adjList->edge_data) { - value_ptr = graph->adjList->edge_data->data; - } - else { - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - d_values.resize(graph->adjList->indices->size); - thrust::fill(thrust::cuda::par(allocator).on(stream), d_values.begin(), d_values.end(), 1.0); - value_ptr = (void * ) thrust::raw_pointer_cast(d_values.data()); - } - - void* louvain_parts_ptr = louvain_parts->data; - - auto gdf_to_cudadtype= [](gdf_column *col){ - cudaDataType_t cuda_dtype; - switch(col->dtype){ - case GDF_INT8: cuda_dtype = CUDA_R_8I; break; - case GDF_INT32: cuda_dtype = CUDA_R_32I; break; - case GDF_FLOAT32: cuda_dtype = CUDA_R_32F; break; - case GDF_FLOAT64: cuda_dtype = CUDA_R_64F; break; - default: throw new std::invalid_argument("Cannot convert data type"); - }return cuda_dtype; - }; - - cudaDataType_t index_type = gdf_to_cudadtype(graph->adjList->indices); - cudaDataType_t val_type = graph->adjList->edge_data? gdf_to_cudadtype(graph->adjList->edge_data): CUDA_R_32F; - - nvgraphLouvain(index_type, val_type, n, e, offsets_ptr, indices_ptr, value_ptr, 1, 0, NULL, - final_modularity, louvain_parts_ptr, num_level); - return GDF_SUCCESS; -} - -template -gdf_error gdf_snmg_csrmv_impl (size_t * part_offsets, gdf_column * off, gdf_column * ind, gdf_column * val, gdf_column ** x_cols){ - - GDF_REQUIRE( part_offsets != nullptr, GDF_INVALID_API_CALL ); - GDF_REQUIRE( off != nullptr, GDF_INVALID_API_CALL ); - GDF_REQUIRE( ind != nullptr, GDF_INVALID_API_CALL ); - GDF_REQUIRE( val != nullptr, GDF_INVALID_API_CALL ); - GDF_REQUIRE( x_cols != nullptr, GDF_INVALID_API_CALL ); - GDF_REQUIRE( off->size > 0, GDF_INVALID_API_CALL ); - GDF_REQUIRE( ind->size > 0, GDF_INVALID_API_CALL ); - GDF_REQUIRE( val->size > 0, GDF_INVALID_API_CALL ); - GDF_REQUIRE( ind->size == val->size, GDF_COLUMN_SIZE_MISMATCH ); - GDF_REQUIRE( off->dtype == ind->dtype, GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( off->null_count + ind->null_count + val->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); - - gdf_error status; - auto p = omp_get_num_threads(); - - val_t* x[p]; - for (auto i = 0; i < p; ++i) - { - GDF_REQUIRE( x_cols[i] != nullptr, GDF_INVALID_API_CALL ); - GDF_REQUIRE( x_cols[i]->size > 0, GDF_INVALID_API_CALL ); - x[i]= static_cast(x_cols[i]->data); - } - status = cugraph::snmg_csrmv(part_offsets, - static_cast(off->data), - static_cast(ind->data), - static_cast(val->data), - x); - return status; -} - -gdf_error gdf_snmg_csrmv (size_t * part_offsets, gdf_column * off, gdf_column * ind, gdf_column * val, gdf_column ** x_cols){ - switch (val->dtype) { - case GDF_FLOAT32: return gdf_snmg_csrmv_impl(part_offsets, off, ind, val, x_cols); - case GDF_FLOAT64: return gdf_snmg_csrmv_impl(part_offsets, off, ind, val, x_cols); - default: return GDF_UNSUPPORTED_DTYPE; - } -} diff --git a/cpp/src/graph_utils.cuh b/cpp/src/graph_utils.cuh deleted file mode 100644 index 190c71d9926..00000000000 --- a/cpp/src/graph_utils.cuh +++ /dev/null @@ -1,602 +0,0 @@ -/* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. 
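// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch itself): driving the analytics
// entry points defined above on a graph that already holds an edge list or
// adjacency list. Output columns are assumed to be preallocated with one value
// per vertex; parameter values mirror the defaults of gdf_pagerank_impl.
// ---------------------------------------------------------------------------
#include "cugraph.h"   // assumed to declare gdf_degree, gdf_pagerank and gdf_bfs

gdf_error run_basic_analytics(gdf_graph* graph,
                              gdf_column* degree,        // GDF_INT32
                              gdf_column* pagerank,      // GDF_FLOAT32 or GDF_FLOAT64
                              gdf_column* distances,     // GDF_INT32
                              gdf_column* predecessors)  // GDF_INT32
{
  // x = 0: in+out degree, x = 1: in-degree, x = 2: out-degree (see gdf_degree).
  gdf_error err = gdf_degree(graph, degree, 0);
  if (err != GDF_SUCCESS) return err;

  // Damping factor, convergence tolerance, iteration cap, and no initial guess.
  err = gdf_pagerank(graph, pagerank, 0.85f, 1e-4f, 200, false);
  if (err != GDF_SUCCESS) return err;

  // BFS from vertex 0, treating the graph as directed.
  return gdf_bfs(graph, distances, predecessors, 0, true);
}
// ---------------------------------------------------------------------------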
- * - */ - -// Interanl helper functions -// Author: Alex Fender afender@nvidia.com -#pragma once - -#include -#include -//#include -//#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#define USE_CG 1 -//#define DEBUG 1 - -namespace cugraph -{ - -#define CUDA_MAX_BLOCKS 65535 -#define CUDA_MAX_KERNEL_THREADS 256 //kernel will launch at most 256 threads per block -#define DEFAULT_MASK 0xffffffff -#define US - -//error check -#ifdef DEBUG -#define WHERE " at: " << __FILE__ << ':' << __LINE__ -#define cudaCheckError() { \ - cudaError_t e=cudaGetLastError(); \ - if(e!=cudaSuccess) { \ - std::cerr << "Cuda failure: " << cudaGetErrorString(e) << WHERE << std::endl; \ - } \ - } -#else -#define cudaCheckError() -#define WHERE "" -#endif - - template - static __device__ __forceinline__ T shfl_up(T r, int offset, int bound = 32, int mask = - DEFAULT_MASK) - { -#if __CUDA_ARCH__ >= 300 -#if USE_CG - return __shfl_up_sync( mask, r, offset, bound ); -#else - return __shfl_up( r, offset, bound ); -#endif -#else - return 0.0f; -#endif - } - - template - static __device__ __forceinline__ T shfl(T r, int lane, int bound = 32, int mask = DEFAULT_MASK) - { -#if __CUDA_ARCH__ >= 300 -#if USE_CG - return __shfl_sync(mask, r, lane, bound ); -#else - return __shfl(r, lane, bound ); -#endif -#else - return 0.0f; -#endif - } - - template - __inline__ __device__ - ValType parallel_prefix_sum(IdxType n, IdxType *ind, ValType *w) { - IdxType i, j, mn; - ValType v, last; - ValType sum = 0.0; - bool valid; - - //Parallel prefix sum (using __shfl) - mn = (((n + blockDim.x - 1) / blockDim.x) * blockDim.x); //n in multiple of blockDim.x - for (i = threadIdx.x; i < mn; i += blockDim.x) { - //All threads (especially the last one) must always participate - //in the shfl instruction, otherwise their sum will be undefined. - //So, the loop stopping condition is based on multiple of n in loop increments, - //so that all threads enter into the loop and inside we make sure we do not - //read out of bounds memory checking for the actual size n. - - //check if the thread is valid - valid = i < n; - - //Notice that the last thread is used to propagate the prefix sum. - //For all the threads, in the first iteration the last is 0, in the following - //iterations it is the value at the last thread of the previous iterations. - - //get the value of the last thread - last = shfl(sum, blockDim.x - 1, blockDim.x); - - //if you are valid read the value from memory, otherwise set your value to 0 - sum = (valid) ? 
w[ind[i]] : 0.0; - - //do prefix sum (of size warpSize=blockDim.x =< 32) - for (j = 1; j < blockDim.x; j *= 2) { - v = shfl_up(sum, j, blockDim.x); - if (threadIdx.x >= j) - sum += v; - } - //shift by last - sum += last; - //notice that no __threadfence or __syncthreads are needed in this implementation - } - //get the value of the last thread (to all threads) - last = shfl(sum, blockDim.x - 1, blockDim.x); - - return last; - } - -//dot - template - T dot(size_t n, T* x, T* y) { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - T result = thrust::inner_product(thrust::cuda::par(allocator).on(stream), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::device_pointer_cast(y), - 0.0f); - cudaCheckError(); - return result; - } - -//axpy - template - struct axpy_functor: public thrust::binary_function { - const T a; - axpy_functor(T _a) : - a(_a) { - } - __host__ __device__ - T operator()(const T& x, const T& y) const { - return a * x + y; - } - }; - - template - void axpy(size_t n, T a, T* x, T* y) { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - thrust::transform(thrust::cuda::par(allocator).on(stream), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::device_pointer_cast(y), - thrust::device_pointer_cast(y), - axpy_functor(a)); - cudaCheckError(); - } - -//norm - template - struct square { - __host__ __device__ - T operator()(const T& x) const { - return x * x; - } - }; - - template - T nrm2(size_t n, T* x) { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - T init = 0; - T result = std::sqrt(thrust::transform_reduce(thrust::cuda::par(allocator).on(stream), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - square(), - init, - thrust::plus())); - cudaCheckError(); - return result; - } - - template - T nrm1(size_t n, T* x) { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - - T result = thrust::reduce(thrust::cuda::par(allocator).on(stream), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n)); - cudaCheckError(); - return result; - } - - template - void scal(size_t n, T val, T* x) { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - - thrust::transform(thrust::cuda::par(allocator).on(stream), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::make_constant_iterator(val), - thrust::device_pointer_cast(x), - thrust::multiplies()); - cudaCheckError(); - } - - template - void fill(size_t n, T* x, T value) { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - - thrust::fill(thrust::cuda::par(allocator).on(stream), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), value); - cudaCheckError(); - } - - template - void printv(size_t n, T* vec, int offset) { - thrust::device_ptr dev_ptr(vec); - std::cout.precision(15); - std::cout << "sample size = " << n << ", offset = " << offset << std::endl; - thrust::copy(dev_ptr + offset, dev_ptr + offset + n, std::ostream_iterator(std::cout, " ")); //Assume no RMM dependency; TODO: check / test (potential BUG !!!!!) 
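// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch itself): the helpers above obtain
// a thrust execution policy through rmm_temp_allocator, while this patch moves
// converters/renumber.cuh to rmm::exec_policy(stream)->on(stream). A minimal
// helper written against the new pattern, assuming rmm_utils.h exposes
// rmm::exec_policy as it does in the files touched above, would look like this:
// ---------------------------------------------------------------------------
#include <cuda_runtime.h>
#include <thrust/fill.h>
#include <thrust/device_ptr.h>
#include "rmm_utils.h"   // assumed to provide rmm::exec_policy

template <typename T>
void fill_rmm(size_t n, T* x, T value, cudaStream_t stream = nullptr) {
  // Same operation as fill() above, but any temporary storage thrust needs is
  // drawn from the RMM allocator associated with this stream.
  thrust::fill(rmm::exec_policy(stream)->on(stream),
               thrust::device_pointer_cast(x),
               thrust::device_pointer_cast(x + n),
               value);
}
// ---------------------------------------------------------------------------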
- cudaCheckError(); - std::cout << std::endl; - } - - template - void copy(size_t n, T *x, T *res) - { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - - thrust::device_ptr dev_ptr(x); - thrust::device_ptr res_ptr(res); - thrust::copy_n(thrust::cuda::par(allocator).on(stream), dev_ptr, n, res_ptr); - cudaCheckError(); - } - - template - struct is_zero { - __host__ __device__ - bool operator()(const T x) { - return x == 0; - } - }; - - template - struct dangling_functor: public thrust::unary_function { - const T val; - dangling_functor(T _val) : - val(_val) { - } - __host__ __device__ - T operator()(const T& x) const { - return val + x; - } - }; - - template - void update_dangling_nodes(size_t n, T* dangling_nodes, T damping_factor) { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - - thrust::transform_if(thrust::cuda::par(allocator).on(stream), - thrust::device_pointer_cast(dangling_nodes), - thrust::device_pointer_cast(dangling_nodes + n), - thrust::device_pointer_cast(dangling_nodes), - dangling_functor(1.0 - damping_factor), - is_zero()); - cudaCheckError(); - } - -//google matrix kernels - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - degree_coo(const IndexType n, const IndexType e, const IndexType *ind, IndexType *degree) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) - atomicAdd(°ree[ind[i]], 1.0); - } - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - equi_prob(const IndexType n, - const IndexType e, - const IndexType *ind, - ValueType *val, - IndexType *degree) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) - val[i] = 1.0 / degree[ind[i]]; - } - - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - flag_leafs(const IndexType n, IndexType *degree, ValueType *bookmark) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) - if (degree[i] == 0) - bookmark[i] = 1.0; - } - - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - degree_offsets(const IndexType n, const IndexType e, const IndexType *ind, IndexType *degree) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) - degree[i] += ind[i+1]-ind[i]; - } - - -//notice that in the transposed matrix/csc a dangling node is a node without incomming edges -//just swap coo src and dest arrays after that to interpret it as HT - template - void HT_matrix_coo( const IndexType n, - const IndexType e, - const IndexType *src, - ValueType *cooVal, - ValueType *bookmark) { - IndexType *degree { nullptr }; - cudaStream_t stream { nullptr }; - ALLOC_MANAGED_TRY((void** )°ree, sizeof(IndexType) * n, stream); - - cudaMemset(degree, 0, sizeof(IndexType) * n); - - dim3 nthreads, nblocks; - nthreads.x = min(e, CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - degree_coo <<>>(n, e, src, degree); - equi_prob <<>>(n, e, src, cooVal, degree); - ValueType val = 0.0; - fill(n, bookmark, val); - nthreads.x = min(n, CUDA_MAX_KERNEL_THREADS); - nblocks.x = min((n + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - flag_leafs <<>>(n, degree, bookmark); - - //printv(n, degree , 0); - //printv(n, bookmark , 0); - //printv(e, cooVal , 0); - - //this was missing: TODO: check if okay - ALLOC_FREE_TRY(degree, stream); - } 
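// ---------------------------------------------------------------------------
// Host-side sketch (not part of the patch itself) of what HT_matrix_coo above
// computes for a toy COO input: per-vertex degree, uniform transition weights
// 1/degree, and a bookmark vector flagging dangling (degree-zero) vertices.
// ---------------------------------------------------------------------------
#include <vector>

void ht_matrix_coo_host(int n, const std::vector<int>& src,
                        std::vector<float>& cooVal, std::vector<float>& bookmark) {
  std::vector<int> degree(n, 0);
  for (int s : src) ++degree[s];                    // degree_coo
  cooVal.resize(src.size());
  for (std::size_t i = 0; i < src.size(); ++i)
    cooVal[i] = 1.0f / degree[src[i]];              // equi_prob
  bookmark.assign(n, 0.0f);                         // fill(n, bookmark, 0.0)
  for (int v = 0; v < n; ++v)
    if (degree[v] == 0) bookmark[v] = 1.0f;         // flag_leafs
}
// ---------------------------------------------------------------------------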
- - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - equi_prob3( const IndexType n, - const IndexType e, - const IndexType *csrPtr, - const IndexType *csrInd, - ValueType *val, - IndexType *degree) { - int j, row, col; - for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; j < csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - val[j] = 1.0 / degree[col]; - //val[j] = 999; - } - } - } - - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - equi_prob2( const IndexType n, - const IndexType e, - const IndexType *csrPtr, - const IndexType *csrInd, - ValueType *val, - IndexType *degree) { - int row = blockIdx.x * blockDim.x + threadIdx.x; - if (row < n) - { - int row_begin = csrPtr[row]; - int row_end = csrPtr[row + 1]; - int col; - for (int i = row_begin; i < row_end; i++) { - col = csrInd[i]; - val[i] = 1.0 / degree[col]; - } - } - } - -// compute the H^T values for an already transposed adjacency matrix, leveraging coo info - template - void HT_matrix_csc_coo( const IndexType n, - const IndexType e, - const IndexType *csrPtr, - const IndexType *csrInd, - ValueType *val, - ValueType *bookmark) { - IndexType *degree; - cudaStream_t stream { nullptr }; - ALLOC_MANAGED_TRY((void** )°ree, sizeof(IndexType) * n, stream); - cudaMemset(degree, 0, sizeof(IndexType) * n); - - dim3 nthreads, nblocks; - nthreads.x = min(e, CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - degree_coo <<>>(n, e, csrInd, degree); - cudaCheckError(); - - int y = 4; - nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, CUDA_MAX_BLOCKS); //1; - equi_prob3 <<>>(n, e, csrPtr, csrInd, val, degree); - //printv(e, val , 0); - cudaCheckError(); - - ValueType a = 0.0; - fill(n, bookmark, a); - cudaCheckError(); - - nthreads.x = min(n, CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((n + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - flag_leafs <<>>(n, degree, bookmark); - cudaCheckError(); - - //this was missing! TODO: check if okay. - ALLOC_FREE_TRY(degree, stream); - } - - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - permute_vals_kernel(const IndexType e, IndexType *perm, ValueType *in, ValueType *out) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) - out[i] = in[perm[i]]; - } - - template - void permute_vals(const IndexType e, IndexType *perm, ValueType *in, ValueType *out) { - int nthreads = min(e, CUDA_MAX_KERNEL_THREADS); - int nblocks = min((e + nthreads - 1) / nthreads, CUDA_MAX_BLOCKS); - permute_vals_kernel<<>>(e, perm, in, out); - //printv(e, in , 0); - //printv(e, perm , 0); - //printv(e, out , 0); - } - -// This will remove duplicate along with sorting -// This will sort the COO Matrix, row will be sorted and each column of same row will be sorted. 
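// ---------------------------------------------------------------------------
// Simplified host-side sketch (not part of the patch itself) of the
// sort-then-unique deduplication described in the comment above, for an
// unweighted COO edge list; the device version below does the equivalent with
// thrust stable sorts and zip iterators, and also carries the value array.
// ---------------------------------------------------------------------------
#include <algorithm>
#include <utility>
#include <vector>

size_t remove_duplicate_host(std::vector<int>& src, std::vector<int>& dest) {
  std::vector<std::pair<int, int>> edges(src.size());
  for (std::size_t i = 0; i < src.size(); ++i) edges[i] = {src[i], dest[i]};

  std::sort(edges.begin(), edges.end());   // rows sorted, columns sorted per row
  edges.erase(std::unique(edges.begin(), edges.end()), edges.end());

  for (std::size_t i = 0; i < edges.size(); ++i) {
    src[i]  = edges[i].first;
    dest[i] = edges[i].second;
  }
  return edges.size();                     // the new nnz
}
// ---------------------------------------------------------------------------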
- template - void remove_duplicate(IndexType* src, IndexType* dest, ValueType* val, SizeT &nnz) - { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - if (val != NULL) - { - thrust::stable_sort_by_key( thrust::cuda::par(allocator).on(stream), - thrust::raw_pointer_cast(val), - thrust::raw_pointer_cast(val) + nnz, - thrust::make_zip_iterator(thrust::make_tuple( thrust::raw_pointer_cast(src), - thrust::raw_pointer_cast(dest)))); - thrust::stable_sort_by_key( thrust::cuda::par(allocator).on(stream), - thrust::raw_pointer_cast(dest), - thrust::raw_pointer_cast(dest + nnz), - thrust::make_zip_iterator(thrust::make_tuple( thrust::raw_pointer_cast(src), - thrust::raw_pointer_cast(val)))); - thrust::stable_sort_by_key( thrust::cuda::par(allocator).on(stream), - thrust::raw_pointer_cast(src), - thrust::raw_pointer_cast(src + nnz), - thrust::make_zip_iterator(thrust::make_tuple( thrust::raw_pointer_cast(dest), - thrust::raw_pointer_cast(val)))); - - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - typedef thrust::tuple ZipIteratorTuple; - typedef thrust::zip_iterator ZipZipIterator; - - ZipZipIterator newEnd = - thrust::unique( thrust::cuda::par(allocator).on(stream), - thrust::make_zip_iterator(thrust::make_tuple( thrust::raw_pointer_cast(src), - thrust::make_zip_iterator(thrust::make_tuple( thrust::raw_pointer_cast(dest), - thrust::raw_pointer_cast(val))))), - thrust::make_zip_iterator(thrust::make_tuple( thrust::raw_pointer_cast(src - + nnz), - thrust::make_zip_iterator(thrust::make_tuple( dest - + nnz, - val - + nnz))))); - - ZipIteratorTuple endTuple = newEnd.get_iterator_tuple(); - IndexType* row_end = thrust::get<0>(endTuple); - - nnz = ((size_t) row_end - (size_t) src) / sizeof(IndexType); - } - else - { - thrust::stable_sort_by_key( thrust::cuda::par(allocator).on(stream), - thrust::raw_pointer_cast(dest), - thrust::raw_pointer_cast(dest + nnz), - thrust::raw_pointer_cast(src)); - thrust::stable_sort_by_key( thrust::cuda::par(allocator).on(stream), - thrust::raw_pointer_cast(src), - thrust::raw_pointer_cast(src + nnz), - thrust::raw_pointer_cast(dest)); - - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - ZipIterator newEnd = - thrust::unique( thrust::cuda::par(allocator).on(stream), - thrust::make_zip_iterator(thrust::make_tuple( thrust::raw_pointer_cast(src), - thrust::raw_pointer_cast(dest))), - thrust::make_zip_iterator(thrust::make_tuple( thrust::raw_pointer_cast(src - + nnz), - thrust::raw_pointer_cast(dest - + nnz)))); - - IteratorTuple endTuple = newEnd.get_iterator_tuple(); - IndexType* row_end = thrust::get<0>(endTuple); - - nnz = ((size_t) row_end - (size_t) src) / sizeof(IndexType); - } - } - - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) offsets_to_indices_kernel( const IndexType *offsets, - IndexType v, - IndexType *indices) { - - int tid, ctaStart; - tid = threadIdx.x; - ctaStart = blockIdx.x; - - for (int j = ctaStart; j < v; j += gridDim.x) { - IndexType colStart = offsets[j]; - IndexType colEnd = offsets[j + 1]; - IndexType rowNnz = colEnd - colStart; - - for (int i = 0; i < rowNnz; i += blockDim.x) { - if ((colStart + tid + i) < colEnd) { - indices[colStart + tid + i] = j; - } - } - } - } - - template - void offsets_to_indices(const IndexType *offsets, IndexType v, IndexType *indices) - { - int nthreads = min(v, CUDA_MAX_KERNEL_THREADS); - int nblocks = min((v + nthreads - 1) / nthreads, CUDA_MAX_BLOCKS); - 
offsets_to_indices_kernel<<>>(offsets, v, indices); - cudaCheckError(); - } - - template - void sequence(IndexType n, IndexType *vec, IndexType init = 0) - { - thrust::sequence( thrust::device, - thrust::device_pointer_cast(vec), - thrust::device_pointer_cast(vec + n), - init); - cudaCheckError(); - } - -} //namespace cugraph diff --git a/cpp/src/jaccard.cu b/cpp/src/jaccard.cu deleted file mode 100644 index 91d6206a7e6..00000000000 --- a/cpp/src/jaccard.cu +++ /dev/null @@ -1,710 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** ---------------------------------------------------------------------------* - * @brief The cugraph Jaccard core functionality - * - * @file jaccard.cu - * ---------------------------------------------------------------------------**/ - -#include "graph_utils.cuh" -#include "cugraph.h" -#include "rmm_utils.h" -#include "utilities/error_utils.h" - -namespace cugraph { - // Volume of neighboors (*weight_s) - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - jaccard_row_sum(IdxType n, - IdxType *csrPtr, - IdxType *csrInd, - ValType *v, - ValType *work) { - IdxType row, start, end, length; - ValType sum; - for (row = threadIdx.y + blockIdx.y * blockDim.y; - row < n; - row += gridDim.y * blockDim.y) { - start = csrPtr[row]; - end = csrPtr[row + 1]; - length = end - start; - //compute row sums - if (weighted) { - sum = parallel_prefix_sum(length, csrInd + start, v); - if (threadIdx.x == 0) - work[row] = sum; - } - else { - work[row] = (ValType) length; - } - } - } - - // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - jaccard_is(IdxType n, - IdxType *csrPtr, - IdxType *csrInd, - ValType *v, - ValType *work, - ValType *weight_i, - ValType *weight_s) { - IdxType i, j, row, col, Ni, Nj; - IdxType ref, cur, ref_col, cur_col, match; - ValType ref_val; - - for (row = threadIdx.z + blockIdx.z * blockDim.z; - row < n; - row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; - j < csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - //find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - //compute new sum weights - weight_s[j] = work[row] + work[col]; - - //compute new intersection weights - //search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } - else { - ref_val = 1.0; - } - - //binary search (column indices are sorted within each row) - IdxType left = csrPtr[cur]; - IdxType right = csrPtr[cur + 1] - 1; - while (left <= right) { - IdxType middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } - else if (cur_col < ref_col) { - left = middle + 1; - } - else { - match = middle; - break; - } - } - - //if the element with the same column index in the reference row has been found - if (match != -1) { - atomicAdd(&weight_i[j], ref_val); - } - } - } - } - } - - // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) - // Using list of node pairs - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - jaccard_is_pairs(IdxType num_pairs, - IdxType *csrPtr, - IdxType *csrInd, - IdxType *first_pair, - IdxType *second_pair, - ValType *v, - ValType *work, - ValType *weight_i, - ValType *weight_s) { - IdxType i, idx, row, col, Ni, Nj; - IdxType ref, cur, ref_col, cur_col, match; - ValType ref_val; - - for (idx = threadIdx.z + blockIdx.z * blockDim.z; - idx < num_pairs; - idx += gridDim.z * blockDim.z) { - row = first_pair[idx]; - col = second_pair[idx]; - //find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - //compute new sum weights - weight_s[idx] = work[row] + work[col]; - - //compute new intersection weights - //search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; - i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } - else { - ref_val = 1.0; - } - - //binary search (column indices are sorted within each row) - IdxType left = csrPtr[cur]; - IdxType right = csrPtr[cur + 1] - 1; - while (left <= right) { - IdxType middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } - else if (cur_col < ref_col) { - left = middle + 1; - } - else { - match = middle; - break; - } - } - - //if the element with the same column index in the reference row has been found - if (match != -1) { - atomicAdd(&weight_i[idx], ref_val); - } - } - } - } - - //Jaccard weights (*weight) - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - jaccard_jw(IdxType e, - IdxType *csrPtr, - IdxType *csrInd, - ValType *weight_i, - ValType *weight_s, - ValType *weight_j) { - IdxType j; - ValType Wi, Ws, Wu; - - for (j = threadIdx.x + blockIdx.x * blockDim.x; - j < e; - j += gridDim.x * blockDim.x) { - Wi = weight_i[j]; - Ws = weight_s[j]; - Wu = Ws - Wi; - weight_j[j] = (Wi / Wu); - } - } - - template - int jaccard(IdxType n, - IdxType e, - IdxType *csrPtr, - IdxType *csrInd, - ValType *weight_in, - ValType *work, - ValType *weight_i, - ValType *weight_s, - ValType *weight_j) { - dim3 nthreads, nblocks; - int y = 4; - - //setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, (IdxType) CUDA_MAX_BLOCKS); - nblocks.z = 1; - //launch kernel - jaccard_row_sum <<>>(n, - csrPtr, - csrInd, - weight_in, - work); - cudaDeviceSynchronize(); - fill(e, weight_i, (ValType) 0.0); - //setup launch configuration - nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, (IdxType) CUDA_MAX_BLOCKS); //1; - //launch kernel - jaccard_is <<>>(n, - csrPtr, - csrInd, - weight_in, - work, - weight_i, - weight_s); - - //setup launch configuration - nthreads.x = min(e, (IdxType) CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, (IdxType) CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - //launch kernel - jaccard_jw <<>>(e, - csrPtr, - csrInd, - weight_i, - weight_s, - weight_j); - - return 0; - } - - template - int jaccard_pairs(IdxType n, - IdxType num_pairs, - IdxType *csrPtr, - IdxType *csrInd, - IdxType *first_pair, - IdxType *second_pair, - ValType *weight_in, - ValType *work, - ValType *weight_i, - ValType *weight_s, - ValType *weight_j) { - dim3 nthreads, nblocks; - int y = 4; - - //setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, (IdxType) CUDA_MAX_BLOCKS); - nblocks.z = 1; - //launch kernel - jaccard_row_sum <<>>(n, - csrPtr, - csrInd, - weight_in, - work); - cudaDeviceSynchronize(); - fill(num_pairs, weight_i, (ValType) 0.0); - //setup launch configuration - nthreads.x = 32; - nthreads.y = 1; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, (IdxType) CUDA_MAX_BLOCKS); //1; - //launch kernel - 
jaccard_is_pairs <<>>(num_pairs, - csrPtr, - csrInd, - first_pair, - second_pair, - weight_in, - work, - weight_i, - weight_s); - - //setup launch configuration - nthreads.x = min(num_pairs, (IdxType) CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((num_pairs + nthreads.x - 1) / nthreads.x, (IdxType) CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - //launch kernel - jaccard_jw <<>>(num_pairs, - csrPtr, - csrInd, - weight_i, - weight_s, - weight_j); - - return 0; - } -} // End cugraph namespace - -gdf_error gdf_jaccard(gdf_graph *graph, gdf_column *weights, gdf_column *result) { - GDF_REQUIRE(graph != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE((graph->adjList != nullptr) || (graph->edgeList != nullptr), GDF_INVALID_API_CALL); - GDF_REQUIRE(result != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(result->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!result->valid, GDF_VALIDITY_UNSUPPORTED); - - GDF_TRY(gdf_add_adj_list(graph)); - GDF_REQUIRE(graph->adjList != nullptr, GDF_INVALID_API_CALL); - - bool weighted = (weights != nullptr); - - gdf_dtype ValueType = result->dtype; - gdf_dtype IndexType = graph->adjList->offsets->dtype; - - void *csrPtr = graph->adjList->offsets->data; - void *csrInd = graph->adjList->indices->data; - void *weight_i = nullptr; - void *weight_s = nullptr; - void *weight_j = result->data; - void *work = nullptr; - void *weight_in = nullptr; - if (weighted) - weight_in = weights->data; - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::jaccard(n, - e, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && !weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::jaccard(n, - e, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::jaccard(n, - e, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && !weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::jaccard(n, - e, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) 
weight_j); - } - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::jaccard(n, - e, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && !weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::jaccard(n, - e, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::jaccard(n, - e, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && !weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::jaccard(n, - e, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - -// Clean up temp arrays - ALLOC_FREE_TRY(weight_i, nullptr); - ALLOC_FREE_TRY(weight_s, nullptr); - ALLOC_FREE_TRY(work, nullptr); - - return GDF_SUCCESS; -} - -gdf_error gdf_jaccard_list(gdf_graph* graph, - gdf_column* weights, - gdf_column* first, - gdf_column* second, - gdf_column* result) { - GDF_REQUIRE(graph != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE((graph->adjList != nullptr) || (graph->edgeList != nullptr), GDF_INVALID_API_CALL); - GDF_REQUIRE(result != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(result->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!result->valid, GDF_VALIDITY_UNSUPPORTED); - - GDF_REQUIRE(first != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(first->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!first->valid, GDF_VALIDITY_UNSUPPORTED); - - GDF_REQUIRE(second != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(second->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!second->valid, GDF_VALIDITY_UNSUPPORTED); - - GDF_TRY(gdf_add_adj_list(graph)); - GDF_REQUIRE(graph->adjList != nullptr, GDF_INVALID_API_CALL); - - bool weighted = (weights != nullptr); - - gdf_dtype ValueType = result->dtype; - gdf_dtype IndexType = graph->adjList->offsets->dtype; - GDF_REQUIRE(first->dtype == IndexType, GDF_INVALID_API_CALL); - GDF_REQUIRE(second->dtype == IndexType, GDF_INVALID_API_CALL); - - void *first_pair = first->data; - void *second_pair = second->data; - void *csrPtr = 
graph->adjList->offsets->data; - void *csrInd = graph->adjList->indices->data; - void *weight_i = nullptr; - void *weight_s = nullptr; - void *weight_j = result->data; - void *work = nullptr; - void *weight_in = nullptr; - if (weighted) - weight_in = weights->data; - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::jaccard_pairs(n, - num_pairs, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (int32_t*) first_pair, - (int32_t*) second_pair, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && !weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::jaccard_pairs(n, - num_pairs, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (int32_t*) first_pair, - (int32_t*) second_pair, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::jaccard_pairs(n, - num_pairs, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (int32_t*) first_pair, - (int32_t*) second_pair, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && !weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::jaccard_pairs(n, - num_pairs, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (int32_t*) first_pair, - (int32_t*) second_pair, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::jaccard_pairs(n, - num_pairs, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (int64_t*) first_pair, - (int64_t*) second_pair, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && !weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); - 
ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::jaccard_pairs(n, - num_pairs, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (int64_t*) first_pair, - (int64_t*) second_pair, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::jaccard_pairs(n, - num_pairs, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (int64_t*) first_pair, - (int64_t*) second_pair, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && !weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::jaccard_pairs(n, - num_pairs, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (int64_t*) first_pair, - (int64_t*) second_pair, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - - // Clean up temp arrays - ALLOC_FREE_TRY(weight_i, nullptr); - ALLOC_FREE_TRY(weight_s, nullptr); - ALLOC_FREE_TRY(work, nullptr); - - return GDF_SUCCESS; -} - diff --git a/cpp/src/link_analysis/pagerank.cu b/cpp/src/link_analysis/pagerank.cu new file mode 100644 index 00000000000..1943ba9f22b --- /dev/null +++ b/cpp/src/link_analysis/pagerank.cu @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ * + */ + +// Pagerank solver +// Author: Alex Fender afender@nvidia.com + +#include +#include +#include +#include + #include +#include +#include +#include "cub/cub.cuh" +#include +#include + +#include + +#include "utilities/graph_utils.cuh" +#include "utilities/error_utils.h" +#include + +namespace cugraph +{ + +#ifdef DEBUG + #define PR_VERBOSE +#endif +template +bool pagerankIteration(IndexType n, IndexType e, IndexType *cscPtr, IndexType *cscInd,ValueType *cscVal, + ValueType alpha, ValueType *a, ValueType *b, float tolerance, int iter, int max_iter, + ValueType * &tmp, void* cub_d_temp_storage, size_t cub_temp_storage_bytes, + ValueType * &pr, ValueType *residual) { + ValueType dot_res; + CUDA_TRY(cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, cscVal, + cscPtr, cscInd, tmp, pr, n, n, e)); + + scal(n, alpha, pr); + dot_res = dot( n, a, tmp); + axpy(n, dot_res, b, pr); + scal(n, (ValueType)1.0/nrm2(n, pr) , pr); + axpy(n, (ValueType)-1.0, pr, tmp); + *residual = nrm2(n, tmp); + if (*residual < tolerance) + { + scal(n, (ValueType)1.0/nrm1(n,pr), pr); + return true; + } + else + { + if (iter< max_iter) + { + std::swap(pr, tmp); + } + else + { + scal(n, (ValueType)1.0/nrm1(n,pr), pr); + } + return false; + } +} + +template +int pagerank(IndexType n, IndexType e, IndexType *cscPtr, IndexType *cscInd, ValueType *cscVal, + ValueType alpha, ValueType *a, bool has_guess, float tolerance, int max_iter, + ValueType * &pagerank_vector, ValueType * &residual) { + int max_it, i = 0 ; + float tol; + bool converged = false; + ValueType randomProbability = static_cast( 1.0/n); + ValueType *b=0, *tmp=0; + void* cub_d_temp_storage = NULL; + size_t cub_temp_storage_bytes = 0; + + if (max_iter > 0) + max_it = max_iter; + else + max_it = 500; + + if (tolerance == 0.0f) + tol = 1.0E-6f; + else if (tolerance < 1.0f && tolerance > 0.0f) + tol = tolerance; + else + return -1; + + if (alpha <= 0.0f || alpha >= 1.0f) + return -1; + + cudaStream_t stream{nullptr}; + + ALLOC_TRY((void**)&b, sizeof(ValueType) * n, stream); +#if 1/* temporary solution till https://github.com/NVlabs/cub/issues/162 is resolved */ + CUDA_TRY(cudaMalloc((void**)&tmp, sizeof(ValueType) * n)); +#else + ALLOC_TRY((void**)&tmp, sizeof(ValueType) * n, stream); +#endif + cudaCheckError(); + + if (!has_guess) { + fill(n, pagerank_vector, randomProbability); + fill(n, tmp, randomProbability); + } + else { + copy(n, pagerank_vector, tmp); + } + + fill(n, b, randomProbability); + update_dangling_nodes(n, a, alpha); + + CUDA_TRY(cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, cscVal, + cscPtr, cscInd, tmp, pagerank_vector, n, n, e)); + // Allocate temporary storage + ALLOC_TRY ((void**)&cub_d_temp_storage, cub_temp_storage_bytes, stream); + cudaCheckError() +#ifdef PR_VERBOSE + std::stringstream ss; + ss.str(std::string()); + ss <<" ------------------PageRank------------------"<< std::endl; + ss <<" --------------------------------------------"<< std::endl; + ss << std::setw(10) << "Iteration" << std::setw(15) << "Residual" << std::endl; + ss <<" --------------------------------------------"<< std::endl; + std::cout< ( int n, int e, int *cscPtr, int *cscInd,half *cscVal, half alpha, half *a, bool has_guess, float tolerance, int max_iter, half * &pagerank_vector, half * &residual); +template int pagerank ( int n, int e, int *cscPtr, int *cscInd,float *cscVal, float alpha, float *a, bool has_guess, float tolerance, int max_iter, float * &pagerank_vector, float * &residual); +template int pagerank ( int 
n, int e, int *cscPtr, int *cscInd,double *cscVal, double alpha, double *a, bool has_guess, float tolerance, int max_iter, double * &pagerank_vector, double * &residual); + +} //namespace cugraph + +template +gdf_error gdf_pagerank_impl (gdf_graph *graph, + gdf_column *pagerank, float alpha = 0.85, + float tolerance = 1e-4, int max_iter = 200, + bool has_guess = false) { + GDF_REQUIRE( graph->edgeList != nullptr, GDF_VALIDITY_UNSUPPORTED ); + GDF_REQUIRE( graph->edgeList->src_indices->size == graph->edgeList->dest_indices->size, GDF_COLUMN_SIZE_MISMATCH ); + GDF_REQUIRE( graph->edgeList->src_indices->dtype == graph->edgeList->dest_indices->dtype, GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( graph->edgeList->src_indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); + GDF_REQUIRE( graph->edgeList->dest_indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); + GDF_REQUIRE( pagerank != nullptr , GDF_INVALID_API_CALL ); + GDF_REQUIRE( pagerank->data != nullptr , GDF_INVALID_API_CALL ); + GDF_REQUIRE( pagerank->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); + GDF_REQUIRE( pagerank->size > 0 , GDF_INVALID_API_CALL ); + + int m=pagerank->size, nnz = graph->edgeList->src_indices->size, status = 0; + WT *d_pr, *d_val = nullptr, *d_leaf_vector = nullptr; + WT res = 1.0; + WT *residual = &res; + + if (graph->transposedAdjList == nullptr) { + gdf_add_transposed_adj_list(graph); + } + cudaStream_t stream{nullptr}; + ALLOC_TRY((void**)&d_leaf_vector, sizeof(WT) * m, stream); + ALLOC_TRY((void**)&d_val, sizeof(WT) * nnz , stream); +#if 1/* temporary solution till https://github.com/NVlabs/cub/issues/162 is resolved */ + CUDA_TRY(cudaMalloc((void**)&d_pr, sizeof(WT) * m)); +#else + ALLOC_TRY((void**)&d_pr, sizeof(WT) * m, stream); +#endif + + // The templating for HT_matrix_csc_coo assumes that m, nnz and data are all the same type + cugraph::HT_matrix_csc_coo(m, nnz, (int *)graph->transposedAdjList->offsets->data, (int *)graph->transposedAdjList->indices->data, d_val, d_leaf_vector); + + if (has_guess) + { + GDF_REQUIRE( pagerank->data != nullptr, GDF_VALIDITY_UNSUPPORTED ); + cugraph::copy(m, (WT*)pagerank->data, d_pr); + } + + status = cugraph::pagerank( m,nnz, (int*)graph->transposedAdjList->offsets->data, (int*)graph->transposedAdjList->indices->data, + d_val, alpha, d_leaf_vector, false, tolerance, max_iter, d_pr, residual); + + if (status !=0) + switch ( status ) { + case -1: std::cerr<< "Error : bad parameters in Pagerank"<(m, d_pr, (WT*)pagerank->data); + + ALLOC_FREE_TRY(d_val, stream); +#if 1/* temporary solution till https://github.com/NVlabs/cub/issues/162 is resolved */ + CUDA_TRY(cudaFree(d_pr)); +#else + ALLOC_FREE_TRY(d_pr, stream); +#endif + ALLOC_FREE_TRY(d_leaf_vector, stream); + + return GDF_SUCCESS; +} + +gdf_error gdf_pagerank(gdf_graph *graph, gdf_column *pagerank, float alpha, float tolerance, int max_iter, bool has_guess) { + // + // page rank operates on CSR and can't currently support 64-bit integers. + // + // If csr doesn't exist, create it. Then check type to make sure it is 32-bit. 
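+ //
+ // A minimal, illustrative call sketch (hedged; construction of the gdf_graph and
+ // allocation of the `pagerank` column follow the usual gdf_column conventions and
+ // are assumed rather than shown):
+ //
+ //   gdf_column pr;   // preallocated GDF_FLOAT32 column of length |V|
+ //   gdf_error err = gdf_pagerank(&graph, &pr, 0.85f, 1e-4f, 200, false);
+ //   // returns GDF_UNSUPPORTED_DTYPE unless the adjacency offsets/indices are GDF_INT32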
+ // + GDF_REQUIRE(graph->adjList != nullptr || graph->edgeList != nullptr, GDF_INVALID_API_CALL); + gdf_error err = gdf_add_adj_list(graph); + if (err != GDF_SUCCESS) + return err; + + GDF_REQUIRE(graph->adjList->offsets->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); + GDF_REQUIRE(graph->adjList->indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); + + switch (pagerank->dtype) { + case GDF_FLOAT32: return gdf_pagerank_impl(graph, pagerank, alpha, tolerance, max_iter, has_guess); + case GDF_FLOAT64: return gdf_pagerank_impl(graph, pagerank, alpha, tolerance, max_iter, has_guess); + default: return GDF_UNSUPPORTED_DTYPE; + } +} diff --git a/cpp/src/link_prediction/jaccard.cu b/cpp/src/link_prediction/jaccard.cu new file mode 100644 index 00000000000..fe3502e4356 --- /dev/null +++ b/cpp/src/link_prediction/jaccard.cu @@ -0,0 +1,710 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** ---------------------------------------------------------------------------* + * @brief The cugraph Jaccard core functionality + * + * @file jaccard.cu + * ---------------------------------------------------------------------------**/ + +#include "utilities/graph_utils.cuh" +#include "cugraph.h" +#include "rmm_utils.h" +#include "utilities/error_utils.h" + +namespace cugraph { + // Volume of neighboors (*weight_s) + template + __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) + jaccard_row_sum(IdxType n, + IdxType *csrPtr, + IdxType *csrInd, + ValType *v, + ValType *work) { + IdxType row, start, end, length; + ValType sum; + for (row = threadIdx.y + blockIdx.y * blockDim.y; + row < n; + row += gridDim.y * blockDim.y) { + start = csrPtr[row]; + end = csrPtr[row + 1]; + length = end - start; + //compute row sums + if (weighted) { + sum = parallel_prefix_sum(length, csrInd + start, v); + if (threadIdx.x == 0) + work[row] = sum; + } + else { + work[row] = (ValType) length; + } + } + } + + // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) + template + __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) + jaccard_is(IdxType n, + IdxType *csrPtr, + IdxType *csrInd, + ValType *v, + ValType *work, + ValType *weight_i, + ValType *weight_s) { + IdxType i, j, row, col, Ni, Nj; + IdxType ref, cur, ref_col, cur_col, match; + ValType ref_val; + + for (row = threadIdx.z + blockIdx.z * blockDim.z; + row < n; + row += gridDim.z * blockDim.z) { + for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; + j < csrPtr[row + 1]; + j += gridDim.y * blockDim.y) { + col = csrInd[j]; + //find which row has least elements (and call it reference row) + Ni = csrPtr[row + 1] - csrPtr[row]; + Nj = csrPtr[col + 1] - csrPtr[col]; + ref = (Ni < Nj) ? row : col; + cur = (Ni < Nj) ? 
col : row; + + //compute new sum weights + weight_s[j] = work[row] + work[col]; + + //compute new intersection weights + //search for the element with the same column index in the reference row + for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; + i += gridDim.x * blockDim.x) { + match = -1; + ref_col = csrInd[i]; + if (weighted) { + ref_val = v[ref_col]; + } + else { + ref_val = 1.0; + } + + //binary search (column indices are sorted within each row) + IdxType left = csrPtr[cur]; + IdxType right = csrPtr[cur + 1] - 1; + while (left <= right) { + IdxType middle = (left + right) >> 1; + cur_col = csrInd[middle]; + if (cur_col > ref_col) { + right = middle - 1; + } + else if (cur_col < ref_col) { + left = middle + 1; + } + else { + match = middle; + break; + } + } + + //if the element with the same column index in the reference row has been found + if (match != -1) { + atomicAdd(&weight_i[j], ref_val); + } + } + } + } + } + + // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) + // Using list of node pairs + template + __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) + jaccard_is_pairs(IdxType num_pairs, + IdxType *csrPtr, + IdxType *csrInd, + IdxType *first_pair, + IdxType *second_pair, + ValType *v, + ValType *work, + ValType *weight_i, + ValType *weight_s) { + IdxType i, idx, row, col, Ni, Nj; + IdxType ref, cur, ref_col, cur_col, match; + ValType ref_val; + + for (idx = threadIdx.z + blockIdx.z * blockDim.z; + idx < num_pairs; + idx += gridDim.z * blockDim.z) { + row = first_pair[idx]; + col = second_pair[idx]; + //find which row has least elements (and call it reference row) + Ni = csrPtr[row + 1] - csrPtr[row]; + Nj = csrPtr[col + 1] - csrPtr[col]; + ref = (Ni < Nj) ? row : col; + cur = (Ni < Nj) ? 
col : row; + + //compute new sum weights + weight_s[idx] = work[row] + work[col]; + + //compute new intersection weights + //search for the element with the same column index in the reference row + for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; + i < csrPtr[ref + 1]; + i += gridDim.x * blockDim.x) { + match = -1; + ref_col = csrInd[i]; + if (weighted) { + ref_val = v[ref_col]; + } + else { + ref_val = 1.0; + } + + //binary search (column indices are sorted within each row) + IdxType left = csrPtr[cur]; + IdxType right = csrPtr[cur + 1] - 1; + while (left <= right) { + IdxType middle = (left + right) >> 1; + cur_col = csrInd[middle]; + if (cur_col > ref_col) { + right = middle - 1; + } + else if (cur_col < ref_col) { + left = middle + 1; + } + else { + match = middle; + break; + } + } + + //if the element with the same column index in the reference row has been found + if (match != -1) { + atomicAdd(&weight_i[idx], ref_val); + } + } + } + } + + //Jaccard weights (*weight) + template + __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) + jaccard_jw(IdxType e, + IdxType *csrPtr, + IdxType *csrInd, + ValType *weight_i, + ValType *weight_s, + ValType *weight_j) { + IdxType j; + ValType Wi, Ws, Wu; + + for (j = threadIdx.x + blockIdx.x * blockDim.x; + j < e; + j += gridDim.x * blockDim.x) { + Wi = weight_i[j]; + Ws = weight_s[j]; + Wu = Ws - Wi; + weight_j[j] = (Wi / Wu); + } + } + + template + int jaccard(IdxType n, + IdxType e, + IdxType *csrPtr, + IdxType *csrInd, + ValType *weight_in, + ValType *work, + ValType *weight_i, + ValType *weight_s, + ValType *weight_j) { + dim3 nthreads, nblocks; + int y = 4; + + //setup launch configuration + nthreads.x = 32; + nthreads.y = y; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = min((n + nthreads.y - 1) / nthreads.y, (IdxType) CUDA_MAX_BLOCKS); + nblocks.z = 1; + //launch kernel + jaccard_row_sum <<>>(n, + csrPtr, + csrInd, + weight_in, + work); + cudaDeviceSynchronize(); + fill(e, weight_i, (ValType) 0.0); + //setup launch configuration + nthreads.x = 32 / y; + nthreads.y = y; + nthreads.z = 8; + nblocks.x = 1; + nblocks.y = 1; + nblocks.z = min((n + nthreads.z - 1) / nthreads.z, (IdxType) CUDA_MAX_BLOCKS); //1; + //launch kernel + jaccard_is <<>>(n, + csrPtr, + csrInd, + weight_in, + work, + weight_i, + weight_s); + + //setup launch configuration + nthreads.x = min(e, (IdxType) CUDA_MAX_KERNEL_THREADS); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min((e + nthreads.x - 1) / nthreads.x, (IdxType) CUDA_MAX_BLOCKS); + nblocks.y = 1; + nblocks.z = 1; + //launch kernel + jaccard_jw <<>>(e, + csrPtr, + csrInd, + weight_i, + weight_s, + weight_j); + + return 0; + } + + template + int jaccard_pairs(IdxType n, + IdxType num_pairs, + IdxType *csrPtr, + IdxType *csrInd, + IdxType *first_pair, + IdxType *second_pair, + ValType *weight_in, + ValType *work, + ValType *weight_i, + ValType *weight_s, + ValType *weight_j) { + dim3 nthreads, nblocks; + int y = 4; + + //setup launch configuration + nthreads.x = 32; + nthreads.y = y; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = min((n + nthreads.y - 1) / nthreads.y, (IdxType) CUDA_MAX_BLOCKS); + nblocks.z = 1; + //launch kernel + jaccard_row_sum <<>>(n, + csrPtr, + csrInd, + weight_in, + work); + cudaDeviceSynchronize(); + fill(num_pairs, weight_i, (ValType) 0.0); + //setup launch configuration + nthreads.x = 32; + nthreads.y = 1; + nthreads.z = 8; + nblocks.x = 1; + nblocks.y = 1; + nblocks.z = min((n + nthreads.z - 1) / nthreads.z, (IdxType) CUDA_MAX_BLOCKS); //1; + //launch kernel + 
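+ // (grid mapping: the z-dimension of the launch indexes the vertex pairs, the
+ //  x-dimension strides through the shorter "reference" adjacency list, and a
+ //  binary search over the other sorted row accumulates the intersection volume
+ //  into weight_i via atomicAdd)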
jaccard_is_pairs <<>>(num_pairs, + csrPtr, + csrInd, + first_pair, + second_pair, + weight_in, + work, + weight_i, + weight_s); + + //setup launch configuration + nthreads.x = min(num_pairs, (IdxType) CUDA_MAX_KERNEL_THREADS); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min((num_pairs + nthreads.x - 1) / nthreads.x, (IdxType) CUDA_MAX_BLOCKS); + nblocks.y = 1; + nblocks.z = 1; + //launch kernel + jaccard_jw <<>>(num_pairs, + csrPtr, + csrInd, + weight_i, + weight_s, + weight_j); + + return 0; + } +} // End cugraph namespace + +gdf_error gdf_jaccard(gdf_graph *graph, gdf_column *weights, gdf_column *result) { + GDF_REQUIRE(graph != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE((graph->adjList != nullptr) || (graph->edgeList != nullptr), GDF_INVALID_API_CALL); + GDF_REQUIRE(result != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(result->data != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(!result->valid, GDF_VALIDITY_UNSUPPORTED); + + GDF_TRY(gdf_add_adj_list(graph)); + GDF_REQUIRE(graph->adjList != nullptr, GDF_INVALID_API_CALL); + + bool weighted = (weights != nullptr); + + gdf_dtype ValueType = result->dtype; + gdf_dtype IndexType = graph->adjList->offsets->dtype; + + void *csrPtr = graph->adjList->offsets->data; + void *csrInd = graph->adjList->indices->data; + void *weight_i = nullptr; + void *weight_s = nullptr; + void *weight_j = result->data; + void *work = nullptr; + void *weight_in = nullptr; + if (weighted) + weight_in = weights->data; + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(float) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * e, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::jaccard(n, + e, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && !weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(float) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * e, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::jaccard(n, + e, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(double) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * e, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::jaccard(n, + e, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && !weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(double) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * e, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::jaccard(n, + e, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && weighted) { + int64_t 
n = graph->adjList->offsets->size - 1; + int64_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(float) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * e, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::jaccard(n, + e, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && !weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(float) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * e, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::jaccard(n, + e, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(double) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * e, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::jaccard(n, + e, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && !weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(double) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * e, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::jaccard(n, + e, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + +// Clean up temp arrays + ALLOC_FREE_TRY(weight_i, nullptr); + ALLOC_FREE_TRY(weight_s, nullptr); + ALLOC_FREE_TRY(work, nullptr); + + return GDF_SUCCESS; +} + +gdf_error gdf_jaccard_list(gdf_graph* graph, + gdf_column* weights, + gdf_column* first, + gdf_column* second, + gdf_column* result) { + GDF_REQUIRE(graph != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE((graph->adjList != nullptr) || (graph->edgeList != nullptr), GDF_INVALID_API_CALL); + GDF_REQUIRE(result != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(result->data != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(!result->valid, GDF_VALIDITY_UNSUPPORTED); + + GDF_REQUIRE(first != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(first->data != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(!first->valid, GDF_VALIDITY_UNSUPPORTED); + + GDF_REQUIRE(second != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(second->data != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(!second->valid, GDF_VALIDITY_UNSUPPORTED); + + GDF_TRY(gdf_add_adj_list(graph)); + GDF_REQUIRE(graph->adjList != nullptr, GDF_INVALID_API_CALL); + + bool weighted = (weights != nullptr); + + gdf_dtype ValueType = result->dtype; + gdf_dtype IndexType = graph->adjList->offsets->dtype; + GDF_REQUIRE(first->dtype == IndexType, GDF_INVALID_API_CALL); + GDF_REQUIRE(second->dtype == IndexType, GDF_INVALID_API_CALL); + + void *first_pair = first->data; + void *second_pair = second->data; + void *csrPtr = graph->adjList->offsets->data; + void *csrInd = graph->adjList->indices->data; + void *weight_i = nullptr; + void *weight_s = nullptr; + void *weight_j = result->data; + void *work = nullptr; + void 
*weight_in = nullptr; + if (weighted) + weight_in = weights->data; + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::jaccard_pairs(n, + num_pairs, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (int32_t*) first_pair, + (int32_t*) second_pair, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && !weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::jaccard_pairs(n, + num_pairs, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (int32_t*) first_pair, + (int32_t*) second_pair, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::jaccard_pairs(n, + num_pairs, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (int32_t*) first_pair, + (int32_t*) second_pair, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && !weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::jaccard_pairs(n, + num_pairs, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (int32_t*) first_pair, + (int32_t*) second_pair, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::jaccard_pairs(n, + num_pairs, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (int64_t*) first_pair, + (int64_t*) second_pair, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && !weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::jaccard_pairs(n, + num_pairs, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (int64_t*) first_pair, + (int64_t*) second_pair, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && 
weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::jaccard_pairs(n, + num_pairs, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (int64_t*) first_pair, + (int64_t*) second_pair, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && !weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::jaccard_pairs(n, + num_pairs, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (int64_t*) first_pair, + (int64_t*) second_pair, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + + // Clean up temp arrays + ALLOC_FREE_TRY(weight_i, nullptr); + ALLOC_FREE_TRY(weight_s, nullptr); + ALLOC_FREE_TRY(work, nullptr); + + return GDF_SUCCESS; +} + diff --git a/cpp/src/link_prediction/overlap.cu b/cpp/src/link_prediction/overlap.cu new file mode 100644 index 00000000000..cce0ac99752 --- /dev/null +++ b/cpp/src/link_prediction/overlap.cu @@ -0,0 +1,709 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/** ---------------------------------------------------------------------------* + * @brief The cugraph Jaccard core functionality + * + * @file jaccard.cu + * ---------------------------------------------------------------------------**/ + +#include "utilities/graph_utils.cuh" +#include "cugraph.h" +#include "rmm_utils.h" +#include "utilities/error_utils.h" + +namespace cugraph { + // Volume of neighboors (*weight_s) + template + __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) + overlap_row_sum(IdxType n, + IdxType *csrPtr, + IdxType *csrInd, + ValType *v, + ValType *work) { + IdxType row, start, end, length; + ValType sum; + for (row = threadIdx.y + blockIdx.y * blockDim.y; + row < n; + row += gridDim.y * blockDim.y) { + start = csrPtr[row]; + end = csrPtr[row + 1]; + length = end - start; + //compute row sums + if (weighted) { + sum = parallel_prefix_sum(length, csrInd + start, v); + if (threadIdx.x == 0) + work[row] = sum; + } + else { + work[row] = (ValType) length; + } + } + } + + // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) + template + __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) + overlap_is(IdxType n, + IdxType *csrPtr, + IdxType *csrInd, + ValType *v, + ValType *work, + ValType *weight_i, + ValType *weight_s) { + IdxType i, j, row, col, Ni, Nj; + IdxType ref, cur, ref_col, cur_col, match; + ValType ref_val; + + for (row = threadIdx.z + blockIdx.z * blockDim.z; + row < n; + row += gridDim.z * blockDim.z) { + for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; + j < csrPtr[row + 1]; + j += gridDim.y * blockDim.y) { + col = csrInd[j]; + //find which row has least elements (and call it reference row) + Ni = csrPtr[row + 1] - csrPtr[row]; + Nj = csrPtr[col + 1] - csrPtr[col]; + ref = (Ni < Nj) ? row : col; + cur = (Ni < Nj) ? 
col : row; + + //compute new sum weights + weight_s[j] = min(work[row], work[col]); + + //compute new intersection weights + //search for the element with the same column index in the reference row + for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; + i += gridDim.x * blockDim.x) { + match = -1; + ref_col = csrInd[i]; + if (weighted) { + ref_val = v[ref_col]; + } + else { + ref_val = 1.0; + } + + //binary search (column indices are sorted within each row) + IdxType left = csrPtr[cur]; + IdxType right = csrPtr[cur + 1] - 1; + while (left <= right) { + IdxType middle = (left + right) >> 1; + cur_col = csrInd[middle]; + if (cur_col > ref_col) { + right = middle - 1; + } + else if (cur_col < ref_col) { + left = middle + 1; + } + else { + match = middle; + break; + } + } + + //if the element with the same column index in the reference row has been found + if (match != -1) { + atomicAdd(&weight_i[j], ref_val); + } + } + } + } + } + + // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) + // Using list of node pairs + template + __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) + overlap_is_pairs(IdxType num_pairs, + IdxType *csrPtr, + IdxType *csrInd, + IdxType *first_pair, + IdxType *second_pair, + ValType *v, + ValType *work, + ValType *weight_i, + ValType *weight_s) { + IdxType i, idx, row, col, Ni, Nj; + IdxType ref, cur, ref_col, cur_col, match; + ValType ref_val; + + for (idx = threadIdx.z + blockIdx.z * blockDim.z; + idx < num_pairs; + idx += gridDim.z * blockDim.z) { + row = first_pair[idx]; + col = second_pair[idx]; + //find which row has least elements (and call it reference row) + Ni = csrPtr[row + 1] - csrPtr[row]; + Nj = csrPtr[col + 1] - csrPtr[col]; + ref = (Ni < Nj) ? row : col; + cur = (Ni < Nj) ? 
col : row; + + //compute new sum weights + weight_s[idx] = min(work[row], work[col]); + + //compute new intersection weights + //search for the element with the same column index in the reference row + for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; + i < csrPtr[ref + 1]; + i += gridDim.x * blockDim.x) { + match = -1; + ref_col = csrInd[i]; + if (weighted) { + ref_val = v[ref_col]; + } + else { + ref_val = 1.0; + } + + //binary search (column indices are sorted within each row) + IdxType left = csrPtr[cur]; + IdxType right = csrPtr[cur + 1] - 1; + while (left <= right) { + IdxType middle = (left + right) >> 1; + cur_col = csrInd[middle]; + if (cur_col > ref_col) { + right = middle - 1; + } + else if (cur_col < ref_col) { + left = middle + 1; + } + else { + match = middle; + break; + } + } + + //if the element with the same column index in the reference row has been found + if (match != -1) { + atomicAdd(&weight_i[idx], ref_val); + } + } + } + } + + //Jaccard weights (*weight) + template + __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) + overlap_jw(IdxType e, + IdxType *csrPtr, + IdxType *csrInd, + ValType *weight_i, + ValType *weight_s, + ValType *weight_j) { + IdxType j; + ValType Wi, Wu; + + for (j = threadIdx.x + blockIdx.x * blockDim.x; + j < e; + j += gridDim.x * blockDim.x) { + Wi = weight_i[j]; + Wu = weight_s[j]; + weight_j[j] = (Wi / Wu); + } + } + + template + int overlap(IdxType n, + IdxType e, + IdxType *csrPtr, + IdxType *csrInd, + ValType *weight_in, + ValType *work, + ValType *weight_i, + ValType *weight_s, + ValType *weight_j) { + dim3 nthreads, nblocks; + int y = 4; + + //setup launch configuration + nthreads.x = 32; + nthreads.y = y; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = min((n + nthreads.y - 1) / nthreads.y, (IdxType) CUDA_MAX_BLOCKS); + nblocks.z = 1; + //launch kernel + overlap_row_sum <<>>(n, + csrPtr, + csrInd, + weight_in, + work); + cudaDeviceSynchronize(); + fill(e, weight_i, (ValType) 0.0); + //setup launch configuration + nthreads.x = 32 / y; + nthreads.y = y; + nthreads.z = 8; + nblocks.x = 1; + nblocks.y = 1; + nblocks.z = min((n + nthreads.z - 1) / nthreads.z, (IdxType) CUDA_MAX_BLOCKS); //1; + //launch kernel + overlap_is <<>>(n, + csrPtr, + csrInd, + weight_in, + work, + weight_i, + weight_s); + + //setup launch configuration + nthreads.x = min(e, (IdxType) CUDA_MAX_KERNEL_THREADS); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min((e + nthreads.x - 1) / nthreads.x, (IdxType) CUDA_MAX_BLOCKS); + nblocks.y = 1; + nblocks.z = 1; + //launch kernel + overlap_jw <<>>(e, + csrPtr, + csrInd, + weight_i, + weight_s, + weight_j); + + return 0; + } + + template + int overlap_pairs(IdxType n, + IdxType num_pairs, + IdxType *csrPtr, + IdxType *csrInd, + IdxType *first_pair, + IdxType *second_pair, + ValType *weight_in, + ValType *work, + ValType *weight_i, + ValType *weight_s, + ValType *weight_j) { + dim3 nthreads, nblocks; + int y = 4; + + //setup launch configuration + nthreads.x = 32; + nthreads.y = y; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = min((n + nthreads.y - 1) / nthreads.y, (IdxType) CUDA_MAX_BLOCKS); + nblocks.z = 1; + //launch kernel + overlap_row_sum <<>>(n, + csrPtr, + csrInd, + weight_in, + work); + cudaDeviceSynchronize(); + fill(num_pairs, weight_i, (ValType) 0.0); + //setup launch configuration + nthreads.x = 32; + nthreads.y = 1; + nthreads.z = 8; + nblocks.x = 1; + nblocks.y = 1; + nblocks.z = min((n + nthreads.z - 1) / nthreads.z, (IdxType) CUDA_MAX_BLOCKS); //1; + //launch kernel + overlap_is_pairs 
<<>>(num_pairs, + csrPtr, + csrInd, + first_pair, + second_pair, + weight_in, + work, + weight_i, + weight_s); + + //setup launch configuration + nthreads.x = min(num_pairs, (IdxType) CUDA_MAX_KERNEL_THREADS); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min((num_pairs + nthreads.x - 1) / nthreads.x, (IdxType) CUDA_MAX_BLOCKS); + nblocks.y = 1; + nblocks.z = 1; + //launch kernel + overlap_jw <<>>(num_pairs, + csrPtr, + csrInd, + weight_i, + weight_s, + weight_j); + + return 0; + } +} // End cugraph namespace + +gdf_error gdf_overlap(gdf_graph *graph, gdf_column *weights, gdf_column *result) { + GDF_REQUIRE(graph != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE((graph->adjList != nullptr) || (graph->edgeList != nullptr), GDF_INVALID_API_CALL); + GDF_REQUIRE(result != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(result->data != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(!result->valid, GDF_VALIDITY_UNSUPPORTED); + + GDF_TRY(gdf_add_adj_list(graph)); + GDF_REQUIRE(graph->adjList != nullptr, GDF_INVALID_API_CALL); + + bool weighted = (weights != nullptr); + + gdf_dtype ValueType = result->dtype; + gdf_dtype IndexType = graph->adjList->offsets->dtype; + + void *csrPtr = graph->adjList->offsets->data; + void *csrInd = graph->adjList->indices->data; + void *weight_i = nullptr; + void *weight_s = nullptr; + void *weight_j = result->data; + void *work = nullptr; + void *weight_in = nullptr; + if (weighted) + weight_in = weights->data; + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(float) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * e, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::overlap(n, + e, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && !weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(float) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * e, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::overlap(n, + e, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(double) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * e, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::overlap(n, + e, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && !weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(double) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * e, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::overlap(n, + e, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && weighted) { + int64_t n = 
graph->adjList->offsets->size - 1; + int64_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(float) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * e, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::overlap(n, + e, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && !weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(float) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * e, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::overlap(n, + e, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(double) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * e, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::overlap(n, + e, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && !weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(double) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * e, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::overlap(n, + e, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + +// Clean up temp arrays + ALLOC_FREE_TRY(weight_i, nullptr); + ALLOC_FREE_TRY(weight_s, nullptr); + ALLOC_FREE_TRY(work, nullptr); + + return GDF_SUCCESS; +} + +gdf_error gdf_overlap_list(gdf_graph* graph, + gdf_column* weights, + gdf_column* first, + gdf_column* second, + gdf_column* result) { + GDF_REQUIRE(graph != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE((graph->adjList != nullptr) || (graph->edgeList != nullptr), GDF_INVALID_API_CALL); + GDF_REQUIRE(result != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(result->data != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(!result->valid, GDF_VALIDITY_UNSUPPORTED); + + GDF_REQUIRE(first != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(first->data != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(!first->valid, GDF_VALIDITY_UNSUPPORTED); + + GDF_REQUIRE(second != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(second->data != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(!second->valid, GDF_VALIDITY_UNSUPPORTED); + + GDF_TRY(gdf_add_adj_list(graph)); + GDF_REQUIRE(graph->adjList != nullptr, GDF_INVALID_API_CALL); + + bool weighted = (weights != nullptr); + + gdf_dtype ValueType = result->dtype; + gdf_dtype IndexType = graph->adjList->offsets->dtype; + GDF_REQUIRE(first->dtype == IndexType, GDF_INVALID_API_CALL); + GDF_REQUIRE(second->dtype == IndexType, GDF_INVALID_API_CALL); + + void *first_pair = first->data; + void *second_pair = second->data; + void *csrPtr = graph->adjList->offsets->data; + void *csrInd = graph->adjList->indices->data; + void *weight_i = nullptr; + void *weight_s = nullptr; + void *weight_j = result->data; + void *work = nullptr; + void *weight_in 
= nullptr; + if (weighted) + weight_in = weights->data; + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::overlap_pairs(n, + num_pairs, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (int32_t*) first_pair, + (int32_t*) second_pair, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && !weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::overlap_pairs(n, + num_pairs, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (int32_t*) first_pair, + (int32_t*) second_pair, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::overlap_pairs(n, + num_pairs, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (int32_t*) first_pair, + (int32_t*) second_pair, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && !weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::overlap_pairs(n, + num_pairs, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (int32_t*) first_pair, + (int32_t*) second_pair, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::overlap_pairs(n, + num_pairs, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (int64_t*) first_pair, + (int64_t*) second_pair, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && !weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::overlap_pairs(n, + num_pairs, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (int64_t*) first_pair, + (int64_t*) second_pair, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && weighted) { + 
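+ // double-precision overlap coefficients, 64-bit vertex indices, weighted graph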
int64_t n = graph->adjList->offsets->size - 1; + int64_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::overlap_pairs(n, + num_pairs, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (int64_t*) first_pair, + (int64_t*) second_pair, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && !weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::overlap_pairs(n, + num_pairs, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (int64_t*) first_pair, + (int64_t*) second_pair, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + + // Clean up temp arrays + ALLOC_FREE_TRY(weight_i, nullptr); + ALLOC_FREE_TRY(weight_s, nullptr); + ALLOC_FREE_TRY(work, nullptr); + + return GDF_SUCCESS; +} + diff --git a/cpp/src/overlap.cu b/cpp/src/overlap.cu deleted file mode 100644 index 315baf1dac8..00000000000 --- a/cpp/src/overlap.cu +++ /dev/null @@ -1,709 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -/** ---------------------------------------------------------------------------* - * @brief The cugraph Jaccard core functionality - * - * @file jaccard.cu - * ---------------------------------------------------------------------------**/ - -#include "graph_utils.cuh" -#include "cugraph.h" -#include "rmm_utils.h" -#include "utilities/error_utils.h" - -namespace cugraph { - // Volume of neighboors (*weight_s) - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - overlap_row_sum(IdxType n, - IdxType *csrPtr, - IdxType *csrInd, - ValType *v, - ValType *work) { - IdxType row, start, end, length; - ValType sum; - for (row = threadIdx.y + blockIdx.y * blockDim.y; - row < n; - row += gridDim.y * blockDim.y) { - start = csrPtr[row]; - end = csrPtr[row + 1]; - length = end - start; - //compute row sums - if (weighted) { - sum = parallel_prefix_sum(length, csrInd + start, v); - if (threadIdx.x == 0) - work[row] = sum; - } - else { - work[row] = (ValType) length; - } - } - } - - // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - overlap_is(IdxType n, - IdxType *csrPtr, - IdxType *csrInd, - ValType *v, - ValType *work, - ValType *weight_i, - ValType *weight_s) { - IdxType i, j, row, col, Ni, Nj; - IdxType ref, cur, ref_col, cur_col, match; - ValType ref_val; - - for (row = threadIdx.z + blockIdx.z * blockDim.z; - row < n; - row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; - j < csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - //find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - //compute new sum weights - weight_s[j] = min(work[row], work[col]); - - //compute new intersection weights - //search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } - else { - ref_val = 1.0; - } - - //binary search (column indices are sorted within each row) - IdxType left = csrPtr[cur]; - IdxType right = csrPtr[cur + 1] - 1; - while (left <= right) { - IdxType middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } - else if (cur_col < ref_col) { - left = middle + 1; - } - else { - match = middle; - break; - } - } - - //if the element with the same column index in the reference row has been found - if (match != -1) { - atomicAdd(&weight_i[j], ref_val); - } - } - } - } - } - - // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) - // Using list of node pairs - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - overlap_is_pairs(IdxType num_pairs, - IdxType *csrPtr, - IdxType *csrInd, - IdxType *first_pair, - IdxType *second_pair, - ValType *v, - ValType *work, - ValType *weight_i, - ValType *weight_s) { - IdxType i, idx, row, col, Ni, Nj; - IdxType ref, cur, ref_col, cur_col, match; - ValType ref_val; - - for (idx = threadIdx.z + blockIdx.z * blockDim.z; - idx < num_pairs; - idx += gridDim.z * blockDim.z) { - row = first_pair[idx]; - col = second_pair[idx]; - //find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - //compute new sum weights - weight_s[idx] = min(work[row], work[col]); - - //compute new intersection weights - //search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; - i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } - else { - ref_val = 1.0; - } - - //binary search (column indices are sorted within each row) - IdxType left = csrPtr[cur]; - IdxType right = csrPtr[cur + 1] - 1; - while (left <= right) { - IdxType middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } - else if (cur_col < ref_col) { - left = middle + 1; - } - else { - match = middle; - break; - } - } - - //if the element with the same column index in the reference row has been found - if (match != -1) { - atomicAdd(&weight_i[idx], ref_val); - } - } - } - } - - //Jaccard weights (*weight) - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - overlap_jw(IdxType e, - IdxType *csrPtr, - IdxType *csrInd, - ValType *weight_i, - ValType *weight_s, - ValType *weight_j) { - IdxType j; - ValType Wi, Wu; - - for (j = threadIdx.x + blockIdx.x * blockDim.x; - j < e; - j += gridDim.x * blockDim.x) { - Wi = weight_i[j]; - Wu = weight_s[j]; - weight_j[j] = (Wi / Wu); - } - } - - template - int overlap(IdxType n, - IdxType e, - IdxType *csrPtr, - IdxType *csrInd, - ValType *weight_in, - ValType *work, - ValType *weight_i, - ValType *weight_s, - ValType *weight_j) { - dim3 nthreads, nblocks; - int y = 4; - - //setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, (IdxType) CUDA_MAX_BLOCKS); - nblocks.z = 1; - //launch kernel - overlap_row_sum <<>>(n, - csrPtr, - csrInd, - weight_in, - work); - cudaDeviceSynchronize(); - fill(e, weight_i, (ValType) 0.0); - //setup launch configuration - nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, (IdxType) CUDA_MAX_BLOCKS); //1; - //launch kernel - overlap_is <<>>(n, - csrPtr, - csrInd, - weight_in, - work, - weight_i, - weight_s); - - //setup launch configuration - nthreads.x = min(e, (IdxType) CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, (IdxType) CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - //launch kernel - overlap_jw <<>>(e, - csrPtr, - csrInd, - weight_i, - weight_s, - weight_j); - - return 0; - } - - template - int overlap_pairs(IdxType n, - IdxType num_pairs, - IdxType *csrPtr, - IdxType *csrInd, - IdxType *first_pair, - IdxType *second_pair, - ValType *weight_in, - ValType *work, - ValType *weight_i, - ValType *weight_s, - ValType *weight_j) { - dim3 nthreads, nblocks; - int y = 4; - - //setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, (IdxType) CUDA_MAX_BLOCKS); - nblocks.z = 1; - //launch kernel - overlap_row_sum <<>>(n, - csrPtr, - csrInd, - weight_in, - work); - cudaDeviceSynchronize(); - fill(num_pairs, weight_i, (ValType) 0.0); - //setup launch configuration - nthreads.x = 32; - nthreads.y = 1; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, (IdxType) CUDA_MAX_BLOCKS); //1; - //launch kernel - overlap_is_pairs 
<<>>(num_pairs, - csrPtr, - csrInd, - first_pair, - second_pair, - weight_in, - work, - weight_i, - weight_s); - - //setup launch configuration - nthreads.x = min(num_pairs, (IdxType) CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((num_pairs + nthreads.x - 1) / nthreads.x, (IdxType) CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - //launch kernel - overlap_jw <<>>(num_pairs, - csrPtr, - csrInd, - weight_i, - weight_s, - weight_j); - - return 0; - } -} // End cugraph namespace - -gdf_error gdf_overlap(gdf_graph *graph, gdf_column *weights, gdf_column *result) { - GDF_REQUIRE(graph != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE((graph->adjList != nullptr) || (graph->edgeList != nullptr), GDF_INVALID_API_CALL); - GDF_REQUIRE(result != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(result->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!result->valid, GDF_VALIDITY_UNSUPPORTED); - - GDF_TRY(gdf_add_adj_list(graph)); - GDF_REQUIRE(graph->adjList != nullptr, GDF_INVALID_API_CALL); - - bool weighted = (weights != nullptr); - - gdf_dtype ValueType = result->dtype; - gdf_dtype IndexType = graph->adjList->offsets->dtype; - - void *csrPtr = graph->adjList->offsets->data; - void *csrInd = graph->adjList->indices->data; - void *weight_i = nullptr; - void *weight_s = nullptr; - void *weight_j = result->data; - void *work = nullptr; - void *weight_in = nullptr; - if (weighted) - weight_in = weights->data; - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::overlap(n, - e, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && !weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::overlap(n, - e, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::overlap(n, - e, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && !weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::overlap(n, - e, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - 
if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::overlap(n, - e, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && !weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::overlap(n, - e, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::overlap(n, - e, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && !weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::overlap(n, - e, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - -// Clean up temp arrays - ALLOC_FREE_TRY(weight_i, nullptr); - ALLOC_FREE_TRY(weight_s, nullptr); - ALLOC_FREE_TRY(work, nullptr); - - return GDF_SUCCESS; -} - -gdf_error gdf_overlap_list(gdf_graph* graph, - gdf_column* weights, - gdf_column* first, - gdf_column* second, - gdf_column* result) { - GDF_REQUIRE(graph != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE((graph->adjList != nullptr) || (graph->edgeList != nullptr), GDF_INVALID_API_CALL); - GDF_REQUIRE(result != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(result->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!result->valid, GDF_VALIDITY_UNSUPPORTED); - - GDF_REQUIRE(first != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(first->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!first->valid, GDF_VALIDITY_UNSUPPORTED); - - GDF_REQUIRE(second != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(second->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!second->valid, GDF_VALIDITY_UNSUPPORTED); - - GDF_TRY(gdf_add_adj_list(graph)); - GDF_REQUIRE(graph->adjList != nullptr, GDF_INVALID_API_CALL); - - bool weighted = (weights != nullptr); - - gdf_dtype ValueType = result->dtype; - gdf_dtype IndexType = graph->adjList->offsets->dtype; - GDF_REQUIRE(first->dtype == IndexType, GDF_INVALID_API_CALL); - GDF_REQUIRE(second->dtype == IndexType, GDF_INVALID_API_CALL); - - void *first_pair = first->data; - void *second_pair = second->data; - void *csrPtr = 
graph->adjList->offsets->data; - void *csrInd = graph->adjList->indices->data; - void *weight_i = nullptr; - void *weight_s = nullptr; - void *weight_j = result->data; - void *work = nullptr; - void *weight_in = nullptr; - if (weighted) - weight_in = weights->data; - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::overlap_pairs(n, - num_pairs, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (int32_t*) first_pair, - (int32_t*) second_pair, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && !weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::overlap_pairs(n, - num_pairs, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (int32_t*) first_pair, - (int32_t*) second_pair, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::overlap_pairs(n, - num_pairs, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (int32_t*) first_pair, - (int32_t*) second_pair, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && !weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::overlap_pairs(n, - num_pairs, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (int32_t*) first_pair, - (int32_t*) second_pair, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::overlap_pairs(n, - num_pairs, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (int64_t*) first_pair, - (int64_t*) second_pair, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && !weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); - 
ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::overlap_pairs(n, - num_pairs, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (int64_t*) first_pair, - (int64_t*) second_pair, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::overlap_pairs(n, - num_pairs, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (int64_t*) first_pair, - (int64_t*) second_pair, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && !weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::overlap_pairs(n, - num_pairs, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (int64_t*) first_pair, - (int64_t*) second_pair, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - - // Clean up temp arrays - ALLOC_FREE_TRY(weight_i, nullptr); - ALLOC_FREE_TRY(weight_s, nullptr); - ALLOC_FREE_TRY(work, nullptr); - - return GDF_SUCCESS; -} - diff --git a/cpp/src/pagerank.cu b/cpp/src/pagerank.cu deleted file mode 100644 index 668e19d1bf3..00000000000 --- a/cpp/src/pagerank.cu +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. 
- * - */ - -// Pagerank solver -// Author: Alex Fender afender@nvidia.com - -#include -#include -#include -#include - #include -#include -#include -#include "graph_utils.cuh" -#include "pagerank.cuh" -#include "cub/cub.cuh" -#include -#include - -#include - -namespace cugraph -{ - -#ifdef DEBUG - #define PR_VERBOSE -#endif -template -bool pagerankIteration( IndexType n, IndexType e, IndexType *cscPtr, IndexType *cscInd,ValueType *cscVal, - ValueType alpha, ValueType *a, ValueType *b, float tolerance, int iter, int max_iter, - ValueType * &tmp, void* cub_d_temp_storage, size_t cub_temp_storage_bytes, - ValueType * &pr, ValueType *residual) { - - ValueType dot_res; - cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, cscVal, - cscPtr, cscInd, tmp, pr, - n, n, e); - - scal(n, alpha, pr); - dot_res = dot( n, a, tmp); - axpy(n, dot_res, b, pr); - scal(n, (ValueType)1.0/nrm2(n, pr) , pr); - axpy(n, (ValueType)-1.0, pr, tmp); - *residual = nrm2(n, tmp); - if (*residual < tolerance) - { - scal(n, (ValueType)1.0/nrm1(n,pr), pr); - return true; - } - else - { - if (iter< max_iter) - { - std::swap(pr, tmp); - } - else - { - scal(n, (ValueType)1.0/nrm1(n,pr), pr); - } - return false; - } -} - -template -int pagerank ( IndexType n, IndexType e, IndexType *cscPtr, IndexType *cscInd, ValueType *cscVal, - ValueType alpha, ValueType *a, bool has_guess, float tolerance, int max_iter, - ValueType * &pagerank_vector, ValueType * &residual) { - int max_it, i = 0 ; - float tol; - bool converged = false; - ValueType randomProbability = static_cast( 1.0/n); - ValueType *b=0, *tmp=0; - void* cub_d_temp_storage = NULL; - size_t cub_temp_storage_bytes = 0; - - if (max_iter > 0 ) - max_it = max_iter; - else - max_it = 500; - - if (tolerance == 0.0f) - tol = 1.0E-6f; - else if (tolerance < 1.0f && tolerance > 0.0f) - tol = tolerance; - else - return -1; - - if (alpha <= 0.0f || alpha >= 1.0f) - return -1; - - cudaStream_t stream{nullptr}; - - ALLOC_MANAGED_TRY ((void**)&b, sizeof(ValueType) * n, stream); - ALLOC_MANAGED_TRY ((void**)&tmp, sizeof(ValueType) * n, stream); - cudaCheckError(); - - if (!has_guess) { - fill(n, pagerank_vector, randomProbability); - fill(n, tmp, randomProbability); - } - else { - copy(n, pagerank_vector, tmp); - } - - - fill(n, b, randomProbability); - update_dangling_nodes(n, a, alpha); - - cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, cscVal, - cscPtr, cscInd, tmp, pagerank_vector, n, n, e); - // Allocate temporary storage - ALLOC_MANAGED_TRY ((void**)&cub_d_temp_storage, cub_temp_storage_bytes, stream); - cudaCheckError() - #ifdef PR_VERBOSE - std::stringstream ss; - ss.str(std::string()); - ss <<" ------------------PageRank------------------"<< std::endl; - ss <<" --------------------------------------------"<< std::endl; - ss << std::setw(10) << "Iteration" << std::setw(15) << "Residual" << std::endl; - ss <<" --------------------------------------------"<< std::endl; - std::cout< ( int n, int e, int *cscPtr, int *cscInd,half *cscVal, half alpha, half *a, bool has_guess, float tolerance, int max_iter, half * &pagerank_vector, half * &residual); -template int pagerank ( int n, int e, int *cscPtr, int *cscInd,float *cscVal, float alpha, float *a, bool has_guess, float tolerance, int max_iter, float * &pagerank_vector, float * &residual); -template int pagerank ( int n, int e, int *cscPtr, int *cscInd,double *cscVal, double alpha, double *a, bool has_guess, float tolerance, int max_iter, double * &pagerank_vector, double * &residual); - -} //namespace 
cugraph diff --git a/cpp/src/pagerank.cuh b/cpp/src/pagerank.cuh deleted file mode 100644 index d3e1572d3bd..00000000000 --- a/cpp/src/pagerank.cuh +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. - * - */ - -// Pagerank solver -// Author: Alex Fender afender@nvidia.com - -#pragma once -namespace cugraph -{ - -template -int pagerank ( IndexType n, IndexType e, IndexType *cscPtr, IndexType *cscInd,ValueType *cscVal, - ValueType alpha, ValueType *a, bool has_guess, float tolerance, int max_iter, ValueType * &pagerank_vector, ValueType * &residual); - -} //namespace cugraph diff --git a/cpp/src/snmg/blas/spmv.cu b/cpp/src/snmg/blas/spmv.cu new file mode 100644 index 00000000000..c5b369396c7 --- /dev/null +++ b/cpp/src/snmg/blas/spmv.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// snmg spmv +// Author: Alex Fender afender@nvidia.com + +#include "spmv.cuh" + +template +gdf_error gdf_snmg_csrmv_impl (size_t * part_offsets, gdf_column * off, gdf_column * ind, gdf_column * val, gdf_column ** x_cols){ + + GDF_REQUIRE( part_offsets != nullptr, GDF_INVALID_API_CALL ); + GDF_REQUIRE( off != nullptr, GDF_INVALID_API_CALL ); + GDF_REQUIRE( ind != nullptr, GDF_INVALID_API_CALL ); + GDF_REQUIRE( val != nullptr, GDF_INVALID_API_CALL ); + GDF_REQUIRE( x_cols != nullptr, GDF_INVALID_API_CALL ); + GDF_REQUIRE( off->size > 0, GDF_INVALID_API_CALL ); + GDF_REQUIRE( ind->size > 0, GDF_INVALID_API_CALL ); + GDF_REQUIRE( val->size > 0, GDF_INVALID_API_CALL ); + GDF_REQUIRE( ind->size == val->size, GDF_COLUMN_SIZE_MISMATCH ); + GDF_REQUIRE( off->dtype == ind->dtype, GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( off->null_count + ind->null_count + val->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); + + auto p = omp_get_num_threads(); + + val_t* x[p]; + for (auto i = 0; i < p; ++i) + { + GDF_REQUIRE( x_cols[i] != nullptr, GDF_INVALID_API_CALL ); + GDF_REQUIRE( x_cols[i]->size > 0, GDF_INVALID_API_CALL ); + x[i]= static_cast(x_cols[i]->data); + } + cugraph::SNMGinfo snmg_env; + cugraph::SNMGcsrmv spmv_solver(snmg_env, part_offsets, + static_cast(off->data), + static_cast(ind->data), + static_cast(val->data), + x); + spmv_solver.run(x); + return GDF_SUCCESS; +} + +gdf_error gdf_snmg_csrmv (size_t * part_offsets, gdf_column * off, gdf_column * ind, gdf_column * val, gdf_column ** x_cols){ + switch (val->dtype) { + case GDF_FLOAT32: return gdf_snmg_csrmv_impl(part_offsets, off, ind, val, x_cols); + case GDF_FLOAT64: return gdf_snmg_csrmv_impl(part_offsets, off, ind, val, x_cols); + default: return GDF_UNSUPPORTED_DTYPE; + } +} diff --git a/cpp/src/snmg/blas/spmv.cuh b/cpp/src/snmg/blas/spmv.cuh new file mode 100644 index 00000000000..8b7120a8e65 --- /dev/null +++ b/cpp/src/snmg/blas/spmv.cuh @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// snmg spmv +// Author: Alex Fender afender@nvidia.com + +#pragma once +#include "cub/cub.cuh" +#include +#include "utilities/graph_utils.cuh" +#include "snmg/utils.cuh" +//#define SNMG_DEBUG + +namespace cugraph +{ + +template +class SNMGcsrmv +{ + + private: + size_t v_glob; + size_t v_loc; + size_t e_loc; + SNMGinfo env; + size_t* part_off; + int i; + int p; + IndexType * off; + IndexType * ind; + ValueType * val; + ValueType * y_loc; + cudaStream_t stream; + void* cub_d_temp_storage; + size_t cub_temp_storage_bytes; + + public: + SNMGcsrmv(SNMGinfo & env_, size_t* part_off_, + IndexType * off_, IndexType * ind_, ValueType * val_, ValueType ** x) : + env(env_), part_off(part_off_), off(off_), ind(ind_), val(val_) { + sync_all(); + cub_d_temp_storage = NULL; + cub_temp_storage_bytes = 0; + stream = nullptr; + i = env.get_thread_num(); + p = env.get_num_threads(); + v_glob = part_off[p]; + v_loc = part_off[i+1]-part_off[i]; + IndexType tmp; + cudaMemcpy(&tmp, &off[v_loc], sizeof(IndexType),cudaMemcpyDeviceToHost); + cudaCheckError(); + e_loc = tmp; + + // Allocate the local result + ALLOC_TRY ((void**)&y_loc, v_loc*sizeof(ValueType), stream); + + // get temporary storage size for CUB + cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, + val, off, ind, x[i], y_loc, v_loc, v_glob, e_loc); + cudaCheckError(); + // Allocate CUB's temporary storage + ALLOC_TRY ((void**)&cub_d_temp_storage, cub_temp_storage_bytes, stream); + } + + ~SNMGcsrmv() { + ALLOC_FREE_TRY(cub_d_temp_storage, stream); + ALLOC_FREE_TRY(y_loc, stream); + } + + // run the power iteration + void run (ValueType ** x) { + // Local SPMV + cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, + val, off, ind, x[i], y_loc, v_loc, v_glob, e_loc); + cudaCheckError() + sync_all(); + + #ifdef SNMG_DEBUG + print_mem_usage(); + #pragma omp master + {std::cout << omp_get_wtime() - t << " ";} + Wait for all local spmv + t = omp_get_wtime(); + sync_all(); + #pragma omp master + {std::cout << omp_get_wtime() - t << " ";} + Update the output vector +#endif + + allgather (env, part_off, y_loc, x); + } +}; + + +} //namespace cugraph diff --git a/cpp/src/snmg/degree/degree.cu b/cpp/src/snmg/degree/degree.cu new file mode 100644 index 00000000000..514228e7fd2 --- /dev/null +++ b/cpp/src/snmg/degree/degree.cu @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "degree.cuh" + +template +gdf_error gdf_snmg_degree_impl(int x, + size_t* part_offsets, + gdf_column* off, + gdf_column* ind, + gdf_column** x_cols) { + GDF_REQUIRE(off->size > 0, GDF_INVALID_API_CALL); + GDF_REQUIRE(ind->size > 0, GDF_INVALID_API_CALL); + GDF_REQUIRE(off->dtype == ind->dtype, GDF_UNSUPPORTED_DTYPE); + GDF_REQUIRE(off->null_count + ind->null_count == 0, GDF_VALIDITY_UNSUPPORTED); + + gdf_error status; + auto p = omp_get_num_threads(); + + idx_t* degree[p]; + for (auto i = 0; i < p; ++i) { + GDF_REQUIRE(x_cols[i] != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(x_cols[i]->size > 0, GDF_INVALID_API_CALL); + degree[i] = static_cast(x_cols[i]->data); + } + + status = cugraph::snmg_degree(x, + part_offsets, + static_cast(off->data), + static_cast(ind->data), + degree); + return status; +} + +gdf_error gdf_snmg_degree(int x, + size_t* part_offsets, + gdf_column* off, + gdf_column* ind, + gdf_column** x_cols) { + GDF_REQUIRE(part_offsets != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(off != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(ind != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(x_cols != nullptr, GDF_INVALID_API_CALL); + switch (off->dtype) { + case GDF_INT32: + return gdf_snmg_degree_impl(x, part_offsets, off, ind, x_cols); + case GDF_INT64: + return gdf_snmg_degree_impl(x, part_offsets, off, ind, x_cols); + default: + return GDF_INVALID_API_CALL; + } +} diff --git a/cpp/src/snmg/degree/degree.cuh b/cpp/src/snmg/degree/degree.cuh new file mode 100644 index 00000000000..1e22da4ef4b --- /dev/null +++ b/cpp/src/snmg/degree/degree.cuh @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include "utilities/graph_utils.cuh" +#include "snmg/utils.cuh" +#include "rmm_utils.h" + +namespace cugraph { + /** + * Single node multi-GPU method for degree calculation on a partitioned graph. + * @param x Indicates whether to compute in degree, out degree, or the sum of both. + * 0 = in + out degree + * 1 = in-degree + * 2 = out-degree + * @param part_off The vertex partitioning of the global graph + * @param off The offsets array of the local partition + * @param ind The indices array of the local partition + * @param degree Pointer to pointers to memory on each GPU for the result + * @return Error code + */ + template + gdf_error snmg_degree(int x, size_t* part_off, idx_t* off, idx_t* ind, idx_t** degree) { + sync_all(); + SNMGinfo env; + auto i = env.get_thread_num(); + auto p = env.get_num_threads(); + + // Getting the global and local vertices and edges + size_t glob_v = part_off[p]; + size_t loc_v = part_off[i + 1] - part_off[i]; + idx_t tmp; + CUDA_TRY(cudaMemcpy(&tmp, &off[loc_v], sizeof(idx_t), cudaMemcpyDeviceToHost)); + size_t loc_e = tmp; + + // Allocating the local result array, and setting all entries to zero. 
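+  // Note: local_result is sized for the *global* vertex count (glob_v), not just this
+  // GPU's partition, because a local edge partition can reference any destination vertex;
+  // each GPU accumulates partial degrees for every global vertex here, and the partial
+  // results are combined across GPUs below with treeReduce and redistributed with treeBroadcast.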
+ idx_t* local_result; + ALLOC_TRY((void** )&local_result, glob_v * sizeof(idx_t), nullptr); + thrust::fill(rmm::exec_policy(nullptr)->on(nullptr), local_result, local_result + glob_v, 0); + + // In-degree + if (x == 1 || x == 0) { + dim3 nthreads, nblocks; + nthreads.x = min(static_cast(loc_e), static_cast(CUDA_MAX_KERNEL_THREADS)); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min(static_cast((loc_e + nthreads.x - 1) / nthreads.x), + static_cast(env.get_num_sm() * 32)); + nblocks.y = 1; + nblocks.z = 1; + degree_coo <<>>(static_cast(loc_e), + static_cast(loc_e), + ind, + local_result); + cudaCheckError(); + } + + // Out-degree + if (x == 2 || x == 0) { + dim3 nthreads, nblocks; + nthreads.x = min(static_cast(loc_v), static_cast(CUDA_MAX_KERNEL_THREADS)); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min(static_cast((loc_v + nthreads.x - 1) / nthreads.x), + static_cast(env.get_num_sm() * 32)); + nblocks.y = 1; + nblocks.z = 1; + degree_offsets <<>>(static_cast(loc_v), + static_cast(loc_e), + off, + local_result + part_off[i]); + cudaCheckError(); + } + + // Combining the local results into global results + sync_all(); + treeReduce >(env, glob_v, local_result, degree); + + // Broadcasting the global result to all GPUs + treeBroadcast(env, glob_v, local_result, degree); + + return GDF_SUCCESS; + } + + template<> + gdf_error snmg_degree(int x, + size_t* part_off, + int64_t* off, + int64_t* ind, + int64_t** degree) { + sync_all(); + SNMGinfo env; + auto i = env.get_thread_num(); + auto p = env.get_num_threads(); + + // Getting the global and local vertices and edges + size_t glob_v = part_off[p]; + size_t loc_v = part_off[i + 1] - part_off[i]; + int64_t tmp; + CUDA_TRY(cudaMemcpy(&tmp, &off[loc_v], sizeof(int64_t), cudaMemcpyDeviceToHost)); + size_t loc_e = tmp; + + // Allocating the local result array, and setting all entries to zero. 
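+  // int64_t specialization: the shared degree kernels accumulate into this buffer
+  // reinterpreted as double (see the reinterpret_cast below, presumably to reuse the
+  // floating-point atomics), and type_convert turns the totals back into int64_t before
+  // the tree reduction; otherwise the flow matches the generic version above.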
+ int64_t* local_result; + ALLOC_TRY((void** )&local_result, glob_v * sizeof(int64_t), nullptr); + thrust::fill(rmm::exec_policy(nullptr)->on(nullptr), local_result, local_result + glob_v, 0); + + // In-degree + if (x == 1 || x == 0) { + dim3 nthreads, nblocks; + nthreads.x = min(static_cast(loc_e), static_cast(CUDA_MAX_KERNEL_THREADS)); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min(static_cast((loc_e + nthreads.x - 1) / nthreads.x), + static_cast(env.get_num_sm() * 32)); + nblocks.y = 1; + nblocks.z = 1; + degree_coo <<>>(static_cast(loc_e), + static_cast(loc_e), + ind, + reinterpret_cast(local_result)); + cudaCheckError(); + } + + // Out-degree + if (x == 2 || x == 0) { + dim3 nthreads, nblocks; + nthreads.x = min(static_cast(loc_v), static_cast(CUDA_MAX_KERNEL_THREADS)); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min(static_cast((loc_v + nthreads.x - 1) / nthreads.x), + static_cast(env.get_num_sm() * 32)); + nblocks.y = 1; + nblocks.z = 1; + degree_offsets <<>>(static_cast(loc_v), + static_cast(loc_e), + off, + reinterpret_cast(local_result + + part_off[i])); + cudaCheckError(); + } + + // Convert the values written as doubles back to int64: + dim3 nthreads, nblocks; + nthreads.x = min(static_cast(glob_v), static_cast(CUDA_MAX_KERNEL_THREADS)); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min(static_cast((glob_v + nthreads.x - 1) / nthreads.x), + static_cast(env.get_num_sm() * 32)); + nblocks.y = 1; + nblocks.z = 1; + type_convert <<>>(reinterpret_cast(local_result), glob_v); + cudaCheckError(); + + // Combining the local results into global results + treeReduce >(env, glob_v, local_result, degree); + + // Broadcasting the global result to all GPUs + treeBroadcast(env, glob_v, local_result, degree); + + return GDF_SUCCESS; + } +} diff --git a/cpp/src/snmg/link_analysis/pagerank.cuh b/cpp/src/snmg/link_analysis/pagerank.cuh new file mode 100644 index 00000000000..7d2af4491ef --- /dev/null +++ b/cpp/src/snmg/link_analysis/pagerank.cuh @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// snmg pagerank +// Author: Alex Fender afender@nvidia.com + +#pragma once +#include "cub/cub.cuh" +#include +#include "utilities/graph_utils.cuh" +#include "snmg/utils.cuh" +#include "snmg/blas/spmv.cuh" +//#define SNMG_DEBUG + +namespace cugraph +{ + + template +__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) +transition_kernel(const size_t e, + const IndexType *ind, + const IndexType *degree, + ValueType *val) { + for (auto i = threadIdx.x + blockIdx.x * blockDim.x; + i < e; + i += gridDim.x * blockDim.x) + val[i] = 1.0 / degree[ind[i]]; +} + +template +class SNMGpagerank +{ + private: + size_t v_glob; //global number of vertices + size_t v_loc; //local number of vertices + size_t e_loc; //local number of edges + int id; // thread id + int nt; // number of threads + ValueType alpha; // damping factor + SNMGinfo env; //info about the snmg env setup + cudaStream_t stream; + + //Vertex offsets for each partition. + //This information should be available on all threads/devices + //part_offsets[device_id] contains the global ID + //of the first vertex of the partion owned by device_id. + //part_offsets[num_devices] contains the global number of vertices + size_t* part_off; + + // local CSR matrix + IndexType * off; + IndexType * ind; + ValueType * val; + + // vectors of size v_glob + ValueType * bookmark; // constant vector with dangling node info + + bool is_setup; + + public: + SNMGpagerank(SNMGinfo & env_, size_t* part_off_, + IndexType * off_, IndexType * ind_) : + env(env_), part_off(part_off_), off(off_), ind(ind_) { + id = env.get_thread_num(); + nt = env.get_num_threads(); + v_glob = part_off[nt]; + v_loc = part_off[id+1]-part_off[id]; + IndexType tmp_e; + cudaMemcpy(&tmp_e, &off[v_loc], sizeof(IndexType),cudaMemcpyDeviceToHost); + cudaCheckError(); + e_loc = tmp_e; + stream = nullptr; + is_setup = false; + ALLOC_TRY ((void**)&bookmark, sizeof(ValueType) * v_glob, stream); + ALLOC_TRY ((void**)&val, sizeof(ValueType) * e_loc, stream); + } + ~SNMGpagerank() { + ALLOC_FREE_TRY(bookmark, stream); + ALLOC_FREE_TRY(val, stream); + } + + void transition_vals(const IndexType *degree) { + int threads = min(static_cast(e_loc), 256); + int blocks = min(static_cast(32*env.get_num_sm()), CUDA_MAX_BLOCKS); + transition_kernel <<>> (e_loc, ind, degree, val); + cudaCheckError(); + } + + void flag_leafs(const IndexType *degree) { + int threads = min(static_cast(v_glob), 256); + int blocks = min(static_cast(32*env.get_num_sm()), CUDA_MAX_BLOCKS); + flag_leafs_kernel <<>> (v_glob, degree, bookmark); + cudaCheckError(); + } + + + // Artificially create the google matrix by setting val and bookmark + void setup(ValueType _alpha) { + if (!is_setup) { + alpha=_alpha; + ValueType zero = 0.0; + IndexType *degree; + ALLOC_TRY ((void**)°ree, sizeof(IndexType) * v_glob, stream); + + // TODO snmg degree + int nthreads = min(static_cast(e_loc), 256); + int nblocks = min(static_cast(32*env.get_num_sm()), CUDA_MAX_BLOCKS); + degree_coo<<>>(v_glob, e_loc, ind, degree); + + // Update dangling node vector + fill(v_glob, bookmark, zero); + flag_leafs(degree); + update_dangling_nodes(v_glob, bookmark, alpha); + + // Transition matrix + transition_vals(degree); + + //exit + ALLOC_FREE_TRY(degree, stream); + is_setup = true; + } + else + throw std::string("Setup can be called only once"); + } + + // run the power iteration on the google matrix + void solve (int max_iter, ValueType ** pagerank) { + if (is_setup) { + ValueType dot_res; + ValueType one = 1.0; + ValueType *pr = pagerank[id]; + 
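+      // Power iteration on the pre-built transition matrix: start from the uniform
+      // vector 1/V, then each iteration runs the distributed SpMV, scales by the damping
+      // factor alpha, adds the dangling-node contribution (dot of bookmark with the
+      // current ranks), and re-normalizes; a final L1 normalization produces the ranks.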
fill(v_glob, pagerank[id], one/v_glob); + dot_res = dot( v_glob, bookmark, pr); + SNMGcsrmv spmv_solver(env, part_off, off, ind, val, pagerank); + for (auto i = 0; i < max_iter; ++i) { + spmv_solver.run(pagerank); + scal(v_glob, alpha, pr); + addv(v_glob, dot_res * (one/v_glob) , pr); + dot_res = dot( v_glob, bookmark, pr); + scal(v_glob, one/nrm2(v_glob, pr) , pr); + } + scal(v_glob, one/nrm1(v_glob,pr), pr); + } + else { + throw std::string("Solve was called before setup"); + } + } +}; + +} //namespace cugraph diff --git a/cpp/src/snmg/snmg_utils.cuh b/cpp/src/snmg/snmg_utils.cuh deleted file mode 100644 index eea6a43053d..00000000000 --- a/cpp/src/snmg/snmg_utils.cuh +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// snmg utils -// Author: Alex Fender afender@nvidia.com - -#pragma once -#include - -namespace cugraph -{ - -// Wait for all host threads -void sync_all() { - cudaDeviceSynchronize(); - #pragma omp barrier -} - -// enable peer access (all to all) -gdf_error setup_peer_access() { - auto i = omp_get_thread_num(); - auto p = omp_get_num_threads(); - for (int j = 0; j < p; ++j) { - if (i != j) { - int canAccessPeer = 0; - CUDA_TRY(cudaDeviceCanAccessPeer(&canAccessPeer, i, j)); - if (canAccessPeer) { - cudaDeviceEnablePeerAccess(j, 0); - cudaError_t status = cudaGetLastError(); - if (!(status == cudaSuccess || status == cudaErrorPeerAccessAlreadyEnabled)) { - std::cerr << "Could not Enable Peer Access from" << i << " to " << j << std::endl; - return GDF_CUDA_ERROR; - } - } - else { - std::cerr << "P2P access required from " << i << " to " << j << std::endl; - return GDF_CUDA_ERROR; - } - } - } - return GDF_SUCCESS; -} - -// Each GPU copies its x_loc to x_glob[offset[device]] on all GPU -template -gdf_error allgather (size_t* offset, val_t* x_loc, val_t ** x_glob) { - auto i = omp_get_thread_num(); - auto p = omp_get_num_threads(); - size_t n_loc= offset[i+1]-offset[i]; - - GDF_TRY(setup_peer_access()); - // this causes issues with CUB. TODO : verify the impact on performance. 
- - // send the local spmv output (x_loc) to all peers to reconstruct the global vector x_glob - // After this call each peer has a full, updated, copy of x_glob - for (int j = 0; j < p; ++j) - CUDA_TRY(cudaMemcpyPeer(x_glob[j]+offset[i],j, x_loc,i, n_loc*sizeof(val_t))); - //CUDA_TRY(cudaMemcpy(x_glob[j]+offset[i], x_loc, n_loc*sizeof(val_t),cudaMemcpyDeviceToDevice)); - - //Make sure everyone has finished copying before returning - sync_all(); - - return GDF_SUCCESS; -} - -void print_mem_usage() -{ - size_t free,total; - cudaMemGetInfo(&free, &total); - std::cout<< std::endl<< "Mem used: "< -#include "graph_utils.cuh" -#include "snmg_utils.cuh" -//#define SNMG_DEBUG - -namespace cugraph -{ - -template -gdf_error snmg_csrmv (size_t* part_off, idx_t * off, idx_t * ind, val_t * val, val_t ** x) { - sync_all(); - void* cub_d_temp_storage = NULL; - size_t cub_temp_storage_bytes = 0; - cudaStream_t stream{nullptr}; - auto i = omp_get_thread_num(); - auto p = omp_get_num_threads(); - size_t v_glob = part_off[p]; - size_t v_loc = part_off[i+1]-part_off[i]; - idx_t tmp; - CUDA_TRY(cudaMemcpy(&tmp, &off[v_loc], sizeof(idx_t),cudaMemcpyDeviceToHost)); - size_t e_loc = tmp; - val_t* y_loc; - //double t = omp_get_wtime(); - - // Allocate the local result - ALLOC_MANAGED_TRY ((void**)&y_loc, v_loc*sizeof(val_t), stream); - - // get temporary storage size for CUB - CUDA_TRY(cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, - val, off, ind, x[i], y_loc, v_loc, v_glob, e_loc)); - // Allocate CUB's temporary storage - ALLOC_MANAGED_TRY ((void**)&cub_d_temp_storage, cub_temp_storage_bytes, stream); - - // Local SPMV - CUDA_TRY(cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, - val, off, ind, x[i], y_loc, v_loc, v_glob, e_loc)); - print_mem_usage(); - // Free CUB's temporary storage - ALLOC_FREE_TRY(cub_d_temp_storage, stream); - //#pragma omp master - //{std::cout << omp_get_wtime() - t << " ";} - - // Wait for all local spmv - //t = omp_get_wtime(); - sync_all(); - //#pragma omp master - //{std::cout << omp_get_wtime() - t << " ";} - - //Update the output vector - allgather (part_off, y_loc, x); - - return GDF_SUCCESS; -} - -} //namespace cugraph diff --git a/cpp/src/snmg/utils.cu b/cpp/src/snmg/utils.cu new file mode 100644 index 00000000000..ebee5976de5 --- /dev/null +++ b/cpp/src/snmg/utils.cu @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace cugraph { + void sync_all() { + cudaDeviceSynchronize(); + #pragma omp barrier + } + + void print_mem_usage() { + size_t free,total; + cudaMemGetInfo(&free, &total); + std::cout<< std::endl<< "Mem used: "< +#include "rmm_utils.h" + +namespace cugraph +{ + +// basic info about the snmg env setup +class SNMGinfo +{ + private: + int i, p, n_sm; + + public: + SNMGinfo() { + int tmp_p, tmp_i; + //get info from cuda + cudaGetDeviceCount(&tmp_p); + cudaGetDevice(&tmp_i); + + //get info from omp + i = omp_get_thread_num(); + p = omp_get_num_threads(); + + // check that thread_num and num_threads are compatible with the device ID and the number of device + if (tmp_i != i) { + std::cerr << "Thread ID and GPU ID do not match" << std::endl; + } + if (p > tmp_p) { + std::cerr << "More threads than GPUs" << std::endl; + } + // number of SM, usefull for kernels paramters + cudaDeviceGetAttribute(&n_sm, cudaDevAttrMultiProcessorCount, i); + cudaCheckError(); + } + ~SNMGinfo() { } + + int get_thread_num() { + return i; + } + int get_num_threads() { + return p; + } + int get_num_sm() { + return n_sm; + } + // enable peer access (all to all) + void setup_peer_access() { + for (int j = 0; j < p; ++j) { + if (i != j) { + int canAccessPeer = 0; + cudaDeviceCanAccessPeer(&canAccessPeer, i, j); + cudaCheckError(); + if (canAccessPeer) { + cudaDeviceEnablePeerAccess(j, 0); + cudaError_t status = cudaGetLastError(); + if (!(status == cudaSuccess || status == cudaErrorPeerAccessAlreadyEnabled)) { + std::cerr << "Could not Enable Peer Access from" << i << " to " << j << std::endl; + } + } + else { + std::cerr << "P2P access required from " << i << " to " << j << std::endl; + } + } + } + } +}; + +// Wait for all host threads +void sync_all(); + +// Each GPU copies its x_loc to x_glob[offset[device]] on all GPU +template +void allgather (SNMGinfo & env, size_t* offset, val_t* x_loc, val_t ** x_glob) { + auto i = env.get_thread_num(); + auto p = env.get_num_threads(); + size_t n_loc= offset[i+1]-offset[i]; + + env.setup_peer_access(); + // this causes issues with CUB. TODO : verify the impact on performance. + + // send the local spmv output (x_loc) to all peers to reconstruct the global vector x_glob + // After this call each peer has a full, updated, copy of x_glob + for (int j = 0; j < p; ++j) { + cudaMemcpyPeer(x_glob[j]+offset[i],j, x_loc,i, n_loc*sizeof(val_t)); + cudaCheckError(); + } + + //Make sure everyone has finished copying before returning + sync_all(); + +} + +/** + * @tparam val_t The value type + * @tparam func_t The reduce functor type + * @param length The length of each array being combined + * @param x_loc Pointer to the local array + * @param x_glob Pointer to global array pointers + * @return Error code + */ +template +gdf_error treeReduce(SNMGinfo& env, size_t length, val_t* x_loc, val_t** x_glob){ + auto i = env.get_thread_num(); + auto p = env.get_num_threads(); + env.setup_peer_access(); + int rank = 1; + while(rank < p){ + // Copy local data to the receiver's global buffer + if((i - rank) % (rank * 2) == 0){ + int receiver = i - rank; + cudaMemcpyPeer(x_glob[receiver], receiver, x_loc, i, length*sizeof(val_t)); + cudaCheckError(); + } + + // Sync everything now. This shouldn't be required as cudaMemcpyPeer is supposed to synchronize... 
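+  // One round of a binomial-tree reduction: senders are the GPUs sitting at odd multiples
+  // of rank, each receiver (rank positions below its sender) folds the copied buffer into
+  // its own local result after the barrier, and rank doubles each round, so GPU 0 ends up
+  // holding the fully reduced vector.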
+ sync_all(); + + // Reduce the data from the receiver's global buffer with its local one + if(i % (rank * 2) == 0 && i + rank < p){ + func_t op; + thrust::transform(rmm::exec_policy(nullptr)->on(nullptr), + x_glob[i], + x_glob[i] + length, + x_loc, + x_loc, + op); + cudaCheckError(); + } + rank *= 2; + } + + // Thread 0 copies it's local result into it's global space + if (i == 0) { + cudaMemcpy(x_glob[i], x_loc, sizeof(val_t) * length, cudaMemcpyDefault); + cudaCheckError(); + } + + // Sync everything before returning + sync_all(); + + return GDF_SUCCESS; +} + +/** + * @tparam val_t The value type + * @param length The length of the array being broadcast + * @param x_loc The local array for each node + * @param x_glob Pointer to the global array pointers + * @return Error code + */ +template +gdf_error treeBroadcast(SNMGinfo& env, size_t length, val_t* x_loc, val_t** x_glob){ + auto i = env.get_thread_num(); + auto p = env.get_num_threads(); + env.setup_peer_access(); + int rank = 1; + while(rank * 2 < p) + rank *= 2; + for(; rank >= 1; rank /= 2){ + if(i % (rank * 2) == 0 and i + rank < p){ + int receiver = i + rank; + cudaMemcpyPeer(x_glob[receiver], receiver, x_glob[i], i, sizeof(val_t) * length); + cudaCheckError(); + } + } + + // Sync everything before returning + sync_all(); + + return GDF_SUCCESS; +} + +void print_mem_usage(); + +} //namespace cugraph diff --git a/cpp/src/structure/cugraph.cu b/cpp/src/structure/cugraph.cu new file mode 100644 index 00000000000..a5b1dd0e4ab --- /dev/null +++ b/cpp/src/structure/cugraph.cu @@ -0,0 +1,313 @@ +// -*-c++-*- + + /* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + * + */ + +// Graph analytics features +// Author: Alex Fender afender@nvidia.com + +#include +#include "utilities/graph_utils.cuh" +#include "converters/COOtoCSR.cuh" +#include "utilities/error_utils.h" +#include "converters/renumber.cuh" +#include +#include +#include + +#include + +/* + * cudf has gdf_column_free and using this is, in general, better design than + * creating our own, but we will keep this as cudf is planning to remove the + * function. cudf plans to redesign cudf::column to fundamentally solve this + * problem, so once they finished the redesign, we need to update this code to + * use their new features. Until that time, we may rely on this as a temporary + * solution. + */ +void gdf_col_delete(gdf_column* col) { + if (col != nullptr) { + cudaStream_t stream {nullptr}; + if (col->data != nullptr) { + ALLOC_FREE_TRY(col->data, stream); + } + if (col->valid != nullptr) { + ALLOC_FREE_TRY(col->valid, stream); + } +#if 0/* Currently, gdf_column_view does not set col_name, and col_name can have + an arbitrary value, so freeing col_name can lead to freeing a ranodom + address. This problem should be cleaned up once cudf finishes + redesigning cudf::column. 
*/ + if (col->col_name != nullptr) { + free(col->col_name); + } +#endif + delete col; + } +} + +void gdf_col_release(gdf_column* col) { + delete col; +} + +void cpy_column_view(const gdf_column *in, gdf_column *out) { + if (in != nullptr && out !=nullptr) { + gdf_column_view(out, in->data, in->valid, in->size, in->dtype); + } +} + +gdf_error gdf_adj_list_view(gdf_graph *graph, const gdf_column *offsets, + const gdf_column *indices, const gdf_column *edge_data) { + //This function returns an error if this graph object has at least one graph + //representation to prevent a single object storing two different graphs. + GDF_REQUIRE( ((graph->edgeList == nullptr) && (graph->adjList == nullptr) && + (graph->transposedAdjList == nullptr)), GDF_INVALID_API_CALL); + GDF_REQUIRE( offsets->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); + GDF_REQUIRE( indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); + GDF_REQUIRE( (offsets->dtype == indices->dtype), GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( ((offsets->dtype == GDF_INT32) || (offsets->dtype == GDF_INT64)), GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( (offsets->size > 0), GDF_DATASET_EMPTY ); + + graph->adjList = new gdf_adj_list; + graph->adjList->offsets = new gdf_column; + graph->adjList->indices = new gdf_column; + graph->adjList->ownership = 0; + + cpy_column_view(offsets, graph->adjList->offsets); + cpy_column_view(indices, graph->adjList->indices); + if (edge_data) { + GDF_REQUIRE( indices->size == edge_data->size, GDF_COLUMN_SIZE_MISMATCH ); + graph->adjList->edge_data = new gdf_column; + cpy_column_view(edge_data, graph->adjList->edge_data); + } + else { + graph->adjList->edge_data = nullptr; + } + return GDF_SUCCESS; +} + +gdf_error gdf_adj_list::get_vertex_identifiers(gdf_column *identifiers) { + GDF_REQUIRE( offsets != nullptr , GDF_INVALID_API_CALL); + GDF_REQUIRE( offsets->data != nullptr , GDF_INVALID_API_CALL); + cugraph::sequence((int)offsets->size-1, (int*)identifiers->data); + return GDF_SUCCESS; +} + +gdf_error gdf_adj_list::get_source_indices (gdf_column *src_indices) { + GDF_REQUIRE( offsets != nullptr , GDF_INVALID_API_CALL); + GDF_REQUIRE( offsets->data != nullptr , GDF_INVALID_API_CALL); + GDF_REQUIRE( src_indices->size == indices->size, GDF_COLUMN_SIZE_MISMATCH ); + GDF_REQUIRE( src_indices->dtype == indices->dtype, GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( src_indices->size > 0, GDF_DATASET_EMPTY ); + cugraph::offsets_to_indices((int*)offsets->data, offsets->size-1, (int*)src_indices->data); + + return GDF_SUCCESS; +} + +gdf_error gdf_edge_list_view(gdf_graph *graph, const gdf_column *src_indices, + const gdf_column *dest_indices, const gdf_column *edge_data) { + //This function returns an error if this graph object has at least one graph + //representation to prevent a single object storing two different graphs. 
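+  // Illustrative caller sketch (not part of this patch; d_src, d_dst and nnz are
+  // hypothetical names): wrap existing device arrays with gdf_column_view and pass the
+  // views in, e.g.
+  //   gdf_column src, dst;
+  //   gdf_column_view(&src, d_src, nullptr, nnz, GDF_INT32);
+  //   gdf_column_view(&dst, d_dst, nullptr, nnz, GDF_INT32);
+  //   gdf_edge_list_view(&graph, &src, &dst, nullptr);
+  // The graph stores only column views (ownership = 0 below), so the caller retains
+  // ownership of the underlying device memory.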
+ GDF_REQUIRE( ((graph->edgeList == nullptr) && (graph->adjList == nullptr) && + (graph->transposedAdjList == nullptr)), GDF_INVALID_API_CALL); + GDF_REQUIRE( src_indices->size == dest_indices->size, GDF_COLUMN_SIZE_MISMATCH ); + GDF_REQUIRE( src_indices->dtype == dest_indices->dtype, GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( ((src_indices->dtype == GDF_INT32) || (src_indices->dtype == GDF_INT64)), GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( src_indices->size > 0, GDF_DATASET_EMPTY ); + GDF_REQUIRE( src_indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); + GDF_REQUIRE( dest_indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); + + graph->edgeList = new gdf_edge_list; + graph->edgeList->src_indices = new gdf_column; + graph->edgeList->dest_indices = new gdf_column; + graph->edgeList->ownership = 0; + + cpy_column_view(src_indices, graph->edgeList->src_indices); + cpy_column_view(dest_indices, graph->edgeList->dest_indices); + if (edge_data) { + GDF_REQUIRE( src_indices->size == edge_data->size, GDF_COLUMN_SIZE_MISMATCH ); + graph->edgeList->edge_data = new gdf_column; + cpy_column_view(edge_data, graph->edgeList->edge_data); + } + else { + graph->edgeList->edge_data = nullptr; + } + + return GDF_SUCCESS; +} + +template +gdf_error gdf_add_adj_list_impl (gdf_graph *graph) { + if (graph->adjList == nullptr) { + GDF_REQUIRE( graph->edgeList != nullptr , GDF_INVALID_API_CALL); + int nnz = graph->edgeList->src_indices->size, status = 0; + graph->adjList = new gdf_adj_list; + graph->adjList->offsets = new gdf_column; + graph->adjList->indices = new gdf_column; + graph->adjList->ownership = 1; + + if (graph->edgeList->edge_data!= nullptr) { + graph->adjList->edge_data = new gdf_column; + + CSR_Result_Weighted adj_list; + status = ConvertCOOtoCSR_weighted((int*)graph->edgeList->src_indices->data, (int*)graph->edgeList->dest_indices->data, (WT*)graph->edgeList->edge_data->data, nnz, adj_list); + + gdf_column_view(graph->adjList->offsets, adj_list.rowOffsets, + nullptr, adj_list.size+1, graph->edgeList->src_indices->dtype); + gdf_column_view(graph->adjList->indices, adj_list.colIndices, + nullptr, adj_list.nnz, graph->edgeList->src_indices->dtype); + gdf_column_view(graph->adjList->edge_data, adj_list.edgeWeights, + nullptr, adj_list.nnz, graph->edgeList->edge_data->dtype); + } + else { + CSR_Result adj_list; + status = ConvertCOOtoCSR((int*)graph->edgeList->src_indices->data,(int*)graph->edgeList->dest_indices->data, nnz, adj_list); + gdf_column_view(graph->adjList->offsets, adj_list.rowOffsets, + nullptr, adj_list.size+1, graph->edgeList->src_indices->dtype); + gdf_column_view(graph->adjList->indices, adj_list.colIndices, + nullptr, adj_list.nnz, graph->edgeList->src_indices->dtype); + } + if (status !=0) { + std::cerr << "Could not generate the adj_list" << std::endl; + return GDF_CUDA_ERROR; + } + } + return GDF_SUCCESS; +} + +gdf_error gdf_add_edge_list (gdf_graph *graph) { + if (graph->edgeList == nullptr) { + GDF_REQUIRE( graph->adjList != nullptr , GDF_INVALID_API_CALL); + int *d_src; + graph->edgeList = new gdf_edge_list; + graph->edgeList->src_indices = new gdf_column; + graph->edgeList->dest_indices = new gdf_column; + graph->edgeList->ownership = 2; + + cudaStream_t stream{nullptr}; + ALLOC_TRY((void**)&d_src, sizeof(int) * graph->adjList->indices->size, stream); + + cugraph::offsets_to_indices((int*)graph->adjList->offsets->data, + graph->adjList->offsets->size-1, + (int*)d_src); + + gdf_column_view(graph->edgeList->src_indices, d_src, + nullptr, graph->adjList->indices->size, 
graph->adjList->indices->dtype); + cpy_column_view(graph->adjList->indices, graph->edgeList->dest_indices); + + if (graph->adjList->edge_data != nullptr) { + graph->edgeList->edge_data = new gdf_column; + cpy_column_view(graph->adjList->edge_data, graph->edgeList->edge_data); + } + } + return GDF_SUCCESS; +} + + +template +gdf_error gdf_add_transposed_adj_list_impl (gdf_graph *graph) { + if (graph->transposedAdjList == nullptr ) { + GDF_REQUIRE( graph->edgeList != nullptr , GDF_INVALID_API_CALL); + int nnz = graph->edgeList->src_indices->size, status = 0; + graph->transposedAdjList = new gdf_adj_list; + graph->transposedAdjList->offsets = new gdf_column; + graph->transposedAdjList->indices = new gdf_column; + graph->transposedAdjList->ownership = 1; + + if (graph->edgeList->edge_data) { + graph->transposedAdjList->edge_data = new gdf_column; + CSR_Result_Weighted adj_list; + status = ConvertCOOtoCSR_weighted( (int*)graph->edgeList->dest_indices->data, (int*)graph->edgeList->src_indices->data, (WT*)graph->edgeList->edge_data->data, nnz, adj_list); + gdf_column_view(graph->transposedAdjList->offsets, adj_list.rowOffsets, + nullptr, adj_list.size+1, graph->edgeList->src_indices->dtype); + gdf_column_view(graph->transposedAdjList->indices, adj_list.colIndices, + nullptr, adj_list.nnz, graph->edgeList->src_indices->dtype); + gdf_column_view(graph->transposedAdjList->edge_data, adj_list.edgeWeights, + nullptr, adj_list.nnz, graph->edgeList->edge_data->dtype); + } + else { + + CSR_Result adj_list; + status = ConvertCOOtoCSR((int*)graph->edgeList->dest_indices->data, (int*)graph->edgeList->src_indices->data, nnz, adj_list); + gdf_column_view(graph->transposedAdjList->offsets, adj_list.rowOffsets, + nullptr, adj_list.size+1, graph->edgeList->src_indices->dtype); + gdf_column_view(graph->transposedAdjList->indices, adj_list.colIndices, + nullptr, adj_list.nnz, graph->edgeList->src_indices->dtype); + } + if (status !=0) { + std::cerr << "Could not generate the adj_list" << std::endl; + return GDF_CUDA_ERROR; + } + } + return GDF_SUCCESS; +} + +gdf_error gdf_add_adj_list(gdf_graph *graph) { + if (graph->adjList != nullptr) + return GDF_SUCCESS; + + GDF_REQUIRE( graph->edgeList != nullptr , GDF_INVALID_API_CALL); + GDF_REQUIRE( graph->edgeList->src_indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE ); + + if (graph->edgeList->edge_data != nullptr) { + switch (graph->edgeList->edge_data->dtype) { + case GDF_FLOAT32: return gdf_add_adj_list_impl(graph); + case GDF_FLOAT64: return gdf_add_adj_list_impl(graph); + default: return GDF_UNSUPPORTED_DTYPE; + } + } + else { + return gdf_add_adj_list_impl(graph); + } +} + +gdf_error gdf_add_transposed_adj_list(gdf_graph *graph) { + if (graph->edgeList == nullptr) + gdf_add_edge_list(graph); + + GDF_REQUIRE(graph->edgeList->src_indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); + GDF_REQUIRE(graph->edgeList->dest_indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); + + if (graph->edgeList->edge_data != nullptr) { + switch (graph->edgeList->edge_data->dtype) { + case GDF_FLOAT32: return gdf_add_transposed_adj_list_impl(graph); + case GDF_FLOAT64: return gdf_add_transposed_adj_list_impl(graph); + default: return GDF_UNSUPPORTED_DTYPE; + } + } + else { + return gdf_add_transposed_adj_list_impl(graph); + } +} + +gdf_error gdf_delete_adj_list(gdf_graph *graph) { + if (graph->adjList) { + delete graph->adjList; + } + graph->adjList = nullptr; + return GDF_SUCCESS; +} + +gdf_error gdf_delete_edge_list(gdf_graph *graph) { + if (graph->edgeList) { + delete 
graph->edgeList; + } + graph->edgeList = nullptr; + return GDF_SUCCESS; +} + +gdf_error gdf_delete_transposed_adj_list(gdf_graph *graph) { + if (graph->transposedAdjList) { + delete graph->transposedAdjList; + } + graph->transposedAdjList = nullptr; + return GDF_SUCCESS; +} diff --git a/cpp/src/tests/CMakeLists.txt b/cpp/src/tests/CMakeLists.txt index ee3418fa4c8..e2e47ac3e97 100644 --- a/cpp/src/tests/CMakeLists.txt +++ b/cpp/src/tests/CMakeLists.txt @@ -46,7 +46,7 @@ function(configure_test TEST_NAME Tests_SRCS) # message(STATUS "${TEST_NAME} will link against: gdf, cugraph") add_executable(${TEST_NAME} ${Tests_SRCS}) - target_link_libraries(${TEST_NAME} OpenMP::OpenMP_CXX gmock_main gmock GTest::GTest cudart cudf cugraph nvgraph) + target_link_libraries(${TEST_NAME} OpenMP::OpenMP_CXX gmock_main gmock GTest::GTest cugraph nvgraph cudf cudart) set_target_properties(${TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gtests/") @@ -129,13 +129,27 @@ set(RENUMBERING_TEST_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/renumber/renumber_test.cu") configure_test(RENUMBERING_TEST "${RENUMBERING_TEST_SRCS}") - +################################################################################################### +#-SNMG_SPMV tests -------------------------------------------------------------------------------- set(SNMG_SPMV_TEST_SRCS "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" "${CMAKE_CURRENT_SOURCE_DIR}/snmg_spmv/snmg_spmv_test.cu") configure_test(SNMG_SPMV_TEST "${SNMG_SPMV_TEST_SRCS}") +################################################################################################### +#-SNMG_DEGREE tests -------------------------------------------------------------------------------- +set(SNMG_DEGREE_TEST_SRCS + "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" + "${CMAKE_CURRENT_SOURCE_DIR}/snmg_degree/snmg_degree_test.cu") + +configure_test(SNMG_DEGREE_TEST "${SNMG_DEGREE_TEST_SRCS}") + +set(SNMG_PR_TEST_SRCS + "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" + "${CMAKE_CURRENT_SOURCE_DIR}/snmg_pagerank/snmg_pagerank_test.cu") + +configure_test(SNMG_PR_TEST "${SNMG_PR_TEST_SRCS}") message(STATUS "******** Tests are ready ********") diff --git a/cpp/src/tests/nvgraph_plugin/nvgraph_gdf_jaccard.cpp b/cpp/src/tests/nvgraph_plugin/nvgraph_gdf_jaccard.cpp index 28e96f1f1a1..c6c612bbfa5 100644 --- a/cpp/src/tests/nvgraph_plugin/nvgraph_gdf_jaccard.cpp +++ b/cpp/src/tests/nvgraph_plugin/nvgraph_gdf_jaccard.cpp @@ -98,7 +98,7 @@ TEST(nvgraph_jaccard, success) float gamma = 1.0; cudaStream_t stream{nullptr}; - ALLOC_MANAGED_TRY((void**)&weight_j, sizeof(float)*edges, stream); + ALLOC_TRY((void**)&weight_j, sizeof(float)*edges, stream); ASSERT_EQ(nvgraphJaccard (CUDA_R_32I, CUDA_R_32F, no_vertex, edges, (void*)G.adjList->offsets->data, @@ -164,7 +164,7 @@ TEST(nvgraph_jaccard_grmat, success) cudaMemcpy ((void*) &ind_h[0], G.adjList->indices->data, sizeof(int)*edges, cudaMemcpyDeviceToHost); cudaStream_t stream{nullptr}; - ALLOC_MANAGED_TRY((void**)&weight_j, sizeof(float)*edges, stream); + ALLOC_TRY((void**)&weight_j, sizeof(float)*edges, stream); ASSERT_EQ(nvgraphJaccard (CUDA_R_32I, CUDA_R_32F, vertices, edges, (void*)G.adjList->offsets->data, diff --git a/cpp/src/tests/nvgraph_plugin/nvgraph_gdf_louvain.cpp b/cpp/src/tests/nvgraph_plugin/nvgraph_gdf_louvain.cpp index fba7b4b8c6c..932d4d99a4f 100644 --- a/cpp/src/tests/nvgraph_plugin/nvgraph_gdf_louvain.cpp +++ b/cpp/src/tests/nvgraph_plugin/nvgraph_gdf_louvain.cpp @@ -51,7 +51,7 @@ TEST(nvgraph_louvain, success) int* best_cluster_vec = NULL; 
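// Editorial aside (not part of the diff): the allocation pattern these tests
// are switching to. ALLOC_MANAGED_TRY requested managed (unified) memory,
// while ALLOC_TRY hands out device memory from the RMM pool on the given
// stream; a matching ALLOC_FREE_TRY returns it. Minimal sketch, with
// hypothetical names, assuming rmm_utils.h is included:
//   int* buf = nullptr;
//   cudaStream_t stream{nullptr};
//   ALLOC_TRY((void**)&buf, sizeof(int) * n, stream);  // draw from the pool
//   ...                                                // device-side work
//   ALLOC_FREE_TRY(buf, stream);                       // return to the pool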
cudaStream_t stream{nullptr}; - ALLOC_MANAGED_TRY((void**)&best_cluster_vec, sizeof(int) * no_vertex, stream); + ALLOC_TRY((void**)&best_cluster_vec, sizeof(int) * no_vertex, stream); ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, nvgraphLouvain (CUDA_R_32I, CUDA_R_32F, no_vertex, ind_h.size(), G.adjList->offsets->data, G.adjList->indices->data, G.adjList->edge_data->data, weighted, has_init_cluster, nullptr, @@ -94,7 +94,7 @@ TEST(nvgraph_louvain_grmat, success) ASSERT_EQ(gdf_grmat_gen(argv, vertices, edges, &col_src, &col_dest, nullptr), GDF_SUCCESS); cudaStream_t stream{nullptr}; - ALLOC_MANAGED_TRY ((void**)&col_weights.data, sizeof(int) * edges, stream); + ALLOC_TRY ((void**)&col_weights.data, sizeof(int) * edges, stream); col_weights.size = edges; std::vector w_h (edges, (float)1.0); cudaMemcpy (col_weights.data, (void*) &(w_h[0]), sizeof(float)*edges, cudaMemcpyHostToDevice); @@ -110,7 +110,7 @@ TEST(nvgraph_louvain_grmat, success) int num_level = 0; int* best_cluster_vec = NULL; - ALLOC_MANAGED_TRY ((void**)&best_cluster_vec, sizeof(int) * vertices, stream); + ALLOC_TRY ((void**)&best_cluster_vec, sizeof(int) * vertices, stream); ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, nvgraphLouvain (CUDA_R_32I, CUDA_R_32F, vertices, edges, G.adjList->offsets->data, G.adjList->indices->data, G.adjList->edge_data->data, weighted, has_init_cluster, nullptr, (void*) &modularity, (void*) best_cluster_vec, (void *)(&num_level))); diff --git a/cpp/src/tests/pagerank/pagerank_test.cu b/cpp/src/tests/pagerank/pagerank_test.cu index 46c1150f292..5ed111fd5cb 100644 --- a/cpp/src/tests/pagerank/pagerank_test.cu +++ b/cpp/src/tests/pagerank/pagerank_test.cu @@ -106,12 +106,6 @@ class Tests_Pagerank : public ::testing::TestWithParam { // Read ASSERT_EQ( (mm_to_coo(fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], &cooVal[0], NULL)) , 0)<< "could not read matrix data"<< "\n"; ASSERT_EQ(fclose(fpin),0); - - //std::cout<< *std::min_element(cooRowInd.begin(), cooRowInd.end()) < { fclose(fpin); T err; int n_err = 0; - for (int i = 0; i < m; i++) - { - //if(i > (m-10)) - // std::cout << expected_res[i] << " " << calculated_res[i] < tol*1.1) - { - n_err++; + if (err> tol*1.1) { + n_err++; // count the number of mismatches } } - if (n_err) - { - //EXPECT_NEAR(tot_err/n_err, cugraph_Const::tol, cugraph_Const::tol*9.99); // Network x used n*1e-10 for precision + if (n_err) { EXPECT_LE(n_err, 0.001*m); // we tolerate 0.1% of values with a litte difference - //printf("number of incorrect entries: %d\n", n_err); - //if (n_err > 0.001*m) - //{ - // eq(calculated_res,expected_res); - //} } } } diff --git a/cpp/src/tests/renumber/renumber_test.cu b/cpp/src/tests/renumber/renumber_test.cu index cd70e631f3c..c982ec71ec1 100644 --- a/cpp/src/tests/renumber/renumber_test.cu +++ b/cpp/src/tests/renumber/renumber_test.cu @@ -21,7 +21,7 @@ #include "cuda_profiler_api.h" -#include "renumber.cuh" +#include "converters/renumber.cuh" #include "rmm_utils.h" #include @@ -93,8 +93,10 @@ TEST_F(RenumberingTest, SmallFixedVertexList) uint32_t tmp_results[length]; uint32_t tmp_map[2 * length]; - EXPECT_EQ(cudaMalloc(&src_d, sizeof(uint32_t) * length), cudaSuccess); - EXPECT_EQ(cudaMalloc(&dst_d, sizeof(uint32_t) * length), cudaSuccess); + cudaStream_t stream{nullptr}; + + EXPECT_EQ(RMM_ALLOC(&src_d, sizeof(uint32_t) * length, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_ALLOC(&dst_d, sizeof(uint32_t) * length, stream), RMM_SUCCESS); EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint32_t) * length, cudaMemcpyHostToDevice), cudaSuccess); EXPECT_EQ(cudaMemcpy(dst_d, dst_data, 
sizeof(uint32_t) * length, cudaMemcpyHostToDevice), cudaSuccess); @@ -116,8 +118,8 @@ TEST_F(RenumberingTest, SmallFixedVertexList) EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]); } - EXPECT_EQ(cudaFree(src_d), cudaSuccess); - EXPECT_EQ(cudaFree(dst_d), cudaSuccess); + EXPECT_EQ(RMM_FREE(src_d, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_FREE(dst_d, stream), RMM_SUCCESS); EXPECT_EQ(test_free(number_map_d), cudaSuccess); } @@ -138,8 +140,10 @@ TEST_F(RenumberingTest, SmallFixedVertexList64Bit) uint64_t tmp_results[length]; uint64_t tmp_map[2 * length]; - EXPECT_EQ(cudaMalloc(&src_d, sizeof(uint64_t) * length), cudaSuccess); - EXPECT_EQ(cudaMalloc(&dst_d, sizeof(uint64_t) * length), cudaSuccess); + cudaStream_t stream{nullptr}; + + EXPECT_EQ(RMM_ALLOC(&src_d, sizeof(uint64_t) * length, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_ALLOC(&dst_d, sizeof(uint64_t) * length, stream), RMM_SUCCESS); EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice), cudaSuccess); EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice), cudaSuccess); @@ -161,8 +165,8 @@ TEST_F(RenumberingTest, SmallFixedVertexList64Bit) EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]); } - EXPECT_EQ(cudaFree(src_d), cudaSuccess); - EXPECT_EQ(cudaFree(dst_d), cudaSuccess); + EXPECT_EQ(RMM_FREE(src_d, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_FREE(dst_d, stream), RMM_SUCCESS); EXPECT_EQ(test_free(number_map_d), cudaSuccess); } @@ -185,10 +189,12 @@ TEST_F(RenumberingTest, SmallFixedVertexList64BitTo32Bit) uint32_t tmp_results[length]; uint64_t tmp_map[2 * length]; - EXPECT_EQ(cudaMalloc(&src_d, sizeof(uint64_t) * length), cudaSuccess); - EXPECT_EQ(cudaMalloc(&dst_d, sizeof(uint64_t) * length), cudaSuccess); - EXPECT_EQ(cudaMalloc(&src_renumbered_d, sizeof(uint32_t) * length), cudaSuccess); - EXPECT_EQ(cudaMalloc(&dst_renumbered_d, sizeof(uint32_t) * length), cudaSuccess); + cudaStream_t stream{nullptr}; + + EXPECT_EQ(RMM_ALLOC(&src_d, sizeof(uint64_t) * length, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_ALLOC(&dst_d, sizeof(uint64_t) * length, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_ALLOC(&src_renumbered_d, sizeof(uint32_t) * length, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_ALLOC(&dst_renumbered_d, sizeof(uint32_t) * length, stream), RMM_SUCCESS); EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice), cudaSuccess); EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice), cudaSuccess); @@ -210,8 +216,8 @@ TEST_F(RenumberingTest, SmallFixedVertexList64BitTo32Bit) EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]); } - EXPECT_EQ(cudaFree(src_d), cudaSuccess); - EXPECT_EQ(cudaFree(dst_d), cudaSuccess); + EXPECT_EQ(RMM_FREE(src_d, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_FREE(dst_d, stream), RMM_SUCCESS); EXPECT_EQ(test_free(number_map_d), cudaSuccess); } @@ -228,8 +234,10 @@ TEST_F(RenumberingTest, Random100KVertexSet) uint64_t *tmp_results = (uint64_t *) malloc(num_verts * sizeof(uint64_t)); uint64_t *tmp_map = (uint64_t *) malloc(2 * num_verts * sizeof(uint64_t)); - EXPECT_EQ(cudaMalloc(&src_d, sizeof(uint64_t) * num_verts), cudaSuccess); - EXPECT_EQ(cudaMalloc(&dst_d, sizeof(uint64_t) * num_verts), cudaSuccess); + cudaStream_t stream{nullptr}; + + EXPECT_EQ(RMM_ALLOC(&src_d, sizeof(uint64_t) * num_verts, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_ALLOC(&dst_d, sizeof(uint64_t) * num_verts, stream), RMM_SUCCESS); // // Generate random source and vertex values @@ -296,8 +304,8 @@ TEST_F(RenumberingTest, Random100KVertexSet) 
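// Editorial aside (not part of the diff): the invariant every renumbering
// test here checks, written as host pseudocode. The renumbered ids index
// into the returned number_map, which recovers the original vertex ids.
// Variable names below are illustrative, not the tests' actual ones:
//   for (size_t i = 0; i < length; ++i) {
//     assert(number_map[src_renumbered[i]] == src_original[i]);
//     assert(number_map[dst_renumbered[i]] == dst_original[i]);
//   }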
EXPECT_EQ(min_id, 0); EXPECT_EQ(max_id, (unique_verts - 1)); - EXPECT_EQ(cudaFree(src_d), cudaSuccess); - EXPECT_EQ(cudaFree(dst_d), cudaSuccess); + EXPECT_EQ(RMM_FREE(src_d, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_FREE(dst_d, stream), RMM_SUCCESS); EXPECT_EQ(test_free(number_map_d), cudaSuccess); free(src_data); free(dst_data); @@ -317,8 +325,10 @@ TEST_F(RenumberingTest, Random10MVertexSet) uint32_t *dst_d; uint32_t *number_map_d; - EXPECT_EQ(cudaMalloc(&src_d, sizeof(uint32_t) * num_verts), cudaSuccess); - EXPECT_EQ(cudaMalloc(&dst_d, sizeof(uint32_t) * num_verts), cudaSuccess); + cudaStream_t stream{nullptr}; + + EXPECT_EQ(RMM_ALLOC(&src_d, sizeof(uint32_t) * num_verts, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_ALLOC(&dst_d, sizeof(uint32_t) * num_verts, stream), RMM_SUCCESS); // // Init the random number generate @@ -326,7 +336,7 @@ TEST_F(RenumberingTest, Random10MVertexSet) const int num_threads{64}; curandState *state; - EXPECT_EQ(cudaMalloc(&state, sizeof(curandState) * num_threads), cudaSuccess); + EXPECT_EQ(RMM_ALLOC(&state, sizeof(curandState) * num_threads, stream), RMM_SUCCESS); setup_generator<<>>(state); generate_sources<<>>(state, num_verts, src_d); generate_destinations<<>>(state, num_verts, src_d, dst_d); @@ -346,8 +356,8 @@ TEST_F(RenumberingTest, Random10MVertexSet) std::cout << " unique verts = " << unique_verts << std::endl; std::cout << " hash size = " << hash_size << std::endl; - EXPECT_EQ(cudaFree(src_d), cudaSuccess); - EXPECT_EQ(cudaFree(dst_d), cudaSuccess); + EXPECT_EQ(RMM_FREE(src_d, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_FREE(dst_d, stream), RMM_SUCCESS); EXPECT_EQ(test_free(number_map_d), cudaSuccess); } @@ -364,8 +374,10 @@ TEST_F(RenumberingTest, Random100MVertexSet) uint32_t *dst_d; uint32_t *number_map_d; - EXPECT_EQ(cudaMalloc(&src_d, sizeof(uint32_t) * num_verts), cudaSuccess); - EXPECT_EQ(cudaMalloc(&dst_d, sizeof(uint32_t) * num_verts), cudaSuccess); + cudaStream_t stream{nullptr}; + + EXPECT_EQ(RMM_ALLOC(&src_d, sizeof(uint32_t) * num_verts, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_ALLOC(&dst_d, sizeof(uint32_t) * num_verts, stream), RMM_SUCCESS); // // Init the random number generate @@ -373,7 +385,7 @@ TEST_F(RenumberingTest, Random100MVertexSet) const int num_threads{64}; curandState *state; - EXPECT_EQ(cudaMalloc(&state, sizeof(curandState) * num_threads), cudaSuccess); + EXPECT_EQ(RMM_ALLOC(&state, sizeof(curandState) * num_threads, stream), RMM_SUCCESS); setup_generator<<>>(state); generate_sources<<>>(state, num_verts, src_d); generate_destinations<<>>(state, num_verts, src_d, dst_d); @@ -393,7 +405,7 @@ TEST_F(RenumberingTest, Random100MVertexSet) std::cout << " unique verts = " << unique_verts << std::endl; std::cout << " hash size = " << hash_size << std::endl; - EXPECT_EQ(cudaFree(src_d), cudaSuccess); - EXPECT_EQ(cudaFree(dst_d), cudaSuccess); + EXPECT_EQ(RMM_FREE(src_d, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_FREE(dst_d, stream), RMM_SUCCESS); EXPECT_EQ(test_free(number_map_d), cudaSuccess); } diff --git a/cpp/src/tests/snmg_degree/snmg_degree_test.cu b/cpp/src/tests/snmg_degree/snmg_degree_test.cu new file mode 100644 index 00000000000..8612f242be3 --- /dev/null +++ b/cpp/src/tests/snmg_degree/snmg_degree_test.cu @@ -0,0 +1,513 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "gtest/gtest.h" +#include "high_res_clock.h" +#include "cuda_profiler_api.h" +#include +#include +#include "test_utils.h" +#include "snmg_test_utils.h" + +//#define SNMG_VERBOSE + +// ref Degree on the host +template +void ref_degree_h(int x, + std::vector & off_h, + std::vector & ind_h, + std::vector & degree) { + for (auto i = 0; i < degree.size(); i++) + degree[i] = 0; + if (x == 0 || x == 2) { + for (auto i = 0; i < degree.size(); ++i) { + degree[i] += off_h[i + 1] - off_h[i]; + } + } + if (x == 0 || x == 1) { + for (auto i = 0; i < ind_h.size(); i++) + degree[ind_h[i]] += 1; + } +} + +struct MGDegree_Usecase { + std::string matrix_file; + int x; + MGDegree_Usecase(const std::string& a, int _x) { + x = _x; + // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR + // if RAPIDS_DATASET_ROOT_DIR not set, default to "/datasets" + const std::string& rapidsDatasetRootDir = get_rapids_dataset_root_dir(); + if ((a != "") && (a[0] != '/')) { + matrix_file = rapidsDatasetRootDir + "/" + a; + } else { + matrix_file = a; + } + } + MGDegree_Usecase& operator=(const MGDegree_Usecase& rhs) { + matrix_file = rhs.matrix_file; + return *this; + } +}; + +class Tests_MGDegree: public ::testing::TestWithParam { +public: + Tests_MGDegree() { + } + static void SetupTestCase() { + } + static void TearDownTestCase() { + } + virtual void SetUp() { + } + virtual void TearDown() { + } + + static std::vector mgspmv_time; + + template + void run_current_test(const MGDegree_Usecase& param) { + const ::testing::TestInfo* const test_info = + ::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + + std::string(test_info->name()) + std::string("_") + getFileName(param.matrix_file) + + std::string("_") + ss.str().c_str(); + std::cout << test_id << "\n"; + int m, k, nnz, n_gpus; + MM_typecode mc; + gdf_error status; + + double t; + + FILE* fpin = fopen(param.matrix_file.c_str(), "r"); + + if (!fpin) { + std::cout << "Could not open file: " << param.matrix_file << "\n"; + FAIL(); + } + + ASSERT_EQ(mm_properties(fpin, 1, &mc, &m, &k, &nnz),0)<< "could not read Matrix Market file properties"<< "\n"; + ASSERT_TRUE(mm_is_matrix(mc)); + ASSERT_TRUE(mm_is_coordinate(mc)); + ASSERT_FALSE(mm_is_complex(mc)); + ASSERT_FALSE(mm_is_skew(mc)); + + // Allocate memory on host + std::vector cooRowInd(nnz), cooColInd(nnz), csrColInd(nnz), csrRowPtr(m + 1); + std::vector degree_h(m, 0.0), degree_ref(m, 0.0), csrVal(nnz); + + // Read + ASSERT_EQ( (mm_to_coo(fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], NULL, NULL)) , 0)<< "could not read matrix data"<< "\n"; + ASSERT_EQ(fclose(fpin), 0); + //ASSERT_EQ( (coo_to_csr (m, m, nnz, &cooRowInd[0], &cooColInd[0], NULL, NULL, &csrRowPtr[0], NULL, NULL, NULL)), 0) << "could not covert COO to CSR "<< "\n"; + coo2csr(cooRowInd, cooColInd, csrRowPtr, csrColInd); + + CUDA_RT_CALL(cudaGetDeviceCount(&n_gpus)); + std::vector v_loc(n_gpus), e_loc(n_gpus), part_offset(n_gpus + 1); + gdf_column *col_x[n_gpus]; + //reference result + t = omp_get_wtime(); + 
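// Editorial aside (not part of the diff): ref_degree_h above encodes the
// x convention exercised here -- x == 2 sums CSR row lengths, x == 1 counts
// occurrences in the column indices, and x == 0 adds both. With rows built
// from the COO sources, these correspond to out-degree, in-degree, and their
// sum. A small host check of that reading (values are illustrative):
//   std::vector<int> off{0, 2, 2, 3}, ind{1, 2, 1};  // edges 0->1, 0->2, 2->1
//   std::vector<int> deg(3);
//   ref_degree_h(2, off, ind, deg);  // out-degree -> {2, 0, 1}
//   ref_degree_h(1, off, ind, deg);  // in-degree  -> {0, 2, 1}
//   ref_degree_h(0, off, ind, deg);  // both       -> {2, 2, 2}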
ref_degree_h(param.x, csrRowPtr, csrColInd, degree_ref); + std::cout << "CPU time: " << omp_get_wtime() - t << "\n"; + if (nnz < 1200000000) + { +#pragma omp parallel num_threads(1) + { + //omp_set_num_threads(n_gpus); + auto i = omp_get_thread_num(); + auto p = omp_get_num_threads(); + CUDA_RT_CALL(cudaSetDevice(i)); + +#ifdef SNMG_VERBOSE +#pragma omp master + { + std::cout << "Number of GPUs : "<< n_gpus <dtype << "," << col_ind->dtype << "\n"; + } + EXPECT_EQ(status, 0); +#pragma omp master + { + std::cout << "GPU time: " << omp_get_wtime() - t << "\n"; + } + +#pragma omp master + { + //printv(m, (val_t *)col_x[0]->data, 0); + CUDA_RT_CALL(cudaMemcpy(°ree_h[0], + col_x[0]->data, + sizeof(idx_t) * m, + cudaMemcpyDeviceToHost)); + + for (auto j = 0; j < degree_h.size(); ++j) + EXPECT_EQ(degree_ref[j], degree_h[j]); + } + + gdf_col_delete(col_off); + gdf_col_delete(col_ind); + gdf_col_delete(col_val); + gdf_col_delete(col_x[i]); + } + } + if (n_gpus > 1) + { + // Only using the 4 fully connected GPUs on DGX1 + if (n_gpus == 8) + n_gpus = 4; + +#pragma omp parallel num_threads(n_gpus) + { + //omp_set_num_threads(n_gpus); + auto i = omp_get_thread_num(); + auto p = omp_get_num_threads(); + CUDA_RT_CALL(cudaSetDevice(i)); + +#ifdef SNMG_VERBOSE +#pragma omp master + { + std::cout << "Number of GPUs : "<< n_gpus <dtype << "," << col_ind->dtype << "\n"; + } + EXPECT_EQ(status, 0); +#pragma omp master + { + std::cout << "multi-GPU time: " << omp_get_wtime() - t << "\n"; + } + +#pragma omp master + { + //printv(m, (val_t *)col_x[0]->data, 0); + CUDA_RT_CALL(cudaMemcpy(°ree_h[0], + col_x[0]->data, + sizeof(idx_t) * m, + cudaMemcpyDeviceToHost)); + + for (auto j = 0; j < degree_h.size(); ++j) + EXPECT_EQ(degree_ref[j], degree_h[j]); + } + + gdf_col_delete(col_off); + gdf_col_delete(col_ind); + gdf_col_delete(col_val); + gdf_col_delete(col_x[i]); + } + } + std::cout << std::endl; + } +}; + +TEST_P(Tests_MGDegree, CheckInt32_mtx) { + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P(mtx_test, Tests_MGDegree, + ::testing::Values(MGDegree_Usecase("test/datasets/karate.mtx", 0) + , + MGDegree_Usecase("test/datasets/karate.mtx", 1) + , + MGDegree_Usecase("test/datasets/karate.mtx", 2) + , + MGDegree_Usecase("test/datasets/netscience.mtx", 0) + , + MGDegree_Usecase("test/datasets/netscience.mtx", 1) + , + MGDegree_Usecase("test/datasets/netscience.mtx", 2) + , + MGDegree_Usecase("test/datasets/cit-Patents.mtx", 0) + , + MGDegree_Usecase("test/datasets/cit-Patents.mtx", 1) + , + MGDegree_Usecase("test/datasets/cit-Patents.mtx", 2) + , + MGDegree_Usecase("test/datasets/webbase-1M.mtx", 0) + , + MGDegree_Usecase("test/datasets/webbase-1M.mtx", 1) + , + MGDegree_Usecase("test/datasets/webbase-1M.mtx", 2) + , + MGDegree_Usecase("test/datasets/web-Google.mtx", 0) + , + MGDegree_Usecase("test/datasets/web-Google.mtx", 1) + , + MGDegree_Usecase("test/datasets/web-Google.mtx", 2) + , + MGDegree_Usecase("test/datasets/wiki-Talk.mtx", 0) + , + MGDegree_Usecase("test/datasets/wiki-Talk.mtx", 1) + , + MGDegree_Usecase("test/datasets/wiki-Talk.mtx", 2) + ) + ); + +class Tests_MGDegree_hibench: public ::testing::TestWithParam { +public: + Tests_MGDegree_hibench() { + } + static void SetupTestCase() { + } + static void TearDownTestCase() { + } + virtual void SetUp() { + } + virtual void TearDown() { + } + + static std::vector mgspmv_time; + + template + void run_current_test(const MGDegree_Usecase& param) { + const ::testing::TestInfo* const test_info = + 
::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + + std::string(test_info->name()) + std::string("_") + getFileName(param.matrix_file) + + std::string("_") + ss.str().c_str(); + std::cout << "Filename: " << param.matrix_file << ", x=" << param.x << "\n"; + int m, nnz, n_gpus; + gdf_error status; + std::vector cooRowInd, cooColInd; + double t; + + ASSERT_EQ(read_single_file(param.matrix_file.c_str(), cooRowInd, cooColInd), 0); + nnz = cooRowInd.size(); + m = std::max(*(std::max_element(cooRowInd.begin(), cooRowInd.end())), + *(std::max_element(cooColInd.begin(), cooColInd.end()))); + m += 1; + + // Allocate memory on host + std::vector csrColInd(nnz), csrRowPtr(m + 1), degree_ref(m), degree_h(m), csrVal(nnz); + coo2csr(cooRowInd, cooColInd, csrRowPtr, csrColInd); + CUDA_RT_CALL(cudaGetDeviceCount(&n_gpus)); + std::vector v_loc(n_gpus), e_loc(n_gpus), part_offset(n_gpus + 1); + gdf_column *col_x[n_gpus]; + //reference result + t = omp_get_wtime(); + ref_degree_h(param.x, csrRowPtr, csrColInd, degree_ref); + std::cout << "CPU time: " << omp_get_wtime() - t << "\n"; + + if (nnz < 1200000000) { +#pragma omp parallel num_threads(1) + { + //omp_set_num_threads(n_gpus); + auto i = omp_get_thread_num(); + auto p = omp_get_num_threads(); + CUDA_RT_CALL(cudaSetDevice(i)); + +#ifdef SNMG_VERBOSE +#pragma omp master + { + std::cout << "Number of GPUs : "<< n_gpus <size,(float*)col_val->data,0); + t = omp_get_wtime(); + status = gdf_snmg_degree(param.x, &part_offset[0], col_off, col_ind, col_x); + if (status != 0){ + std::cout << "Call to gdf_snmg_degree failed: " << gdf_error_get_name(status) << "\n"; + std::cout << "Dtypes: " << col_off->dtype << "," << col_ind->dtype << "\n"; + } + EXPECT_EQ(status, 0); +#pragma omp master + { + std::cout << "GPU time: " << omp_get_wtime() - t << "\n"; + } + +#pragma omp master + { + //printv(m, (val_t *)col_x[0]->data, 0); + CUDA_RT_CALL(cudaMemcpy(°ree_h[0], + col_x[0]->data, + sizeof(idx_t) * m, + cudaMemcpyDeviceToHost)); + + for (auto j = 0; j < degree_ref.size(); ++j) + EXPECT_EQ(degree_ref[j], degree_h[j]); + } + + gdf_col_delete(col_off); + gdf_col_delete(col_ind); + gdf_col_delete(col_val); + gdf_col_delete(col_x[i]); + } + } + if (n_gpus > 1) { + // Only using the 4 fully connected GPUs on DGX1 + if (n_gpus == 8) + n_gpus = 4; + +#pragma omp parallel num_threads(n_gpus) + { + auto i = omp_get_thread_num(); + auto p = omp_get_num_threads(); + CUDA_RT_CALL(cudaSetDevice(i)); + +#ifdef SNMG_VERBOSE +#pragma omp master + { + std::cout << "Number of GPUs : "<< n_gpus <size,(float*)col_val->data,0); + t = omp_get_wtime(); + status = gdf_snmg_degree(param.x, &part_offset[0], col_off, col_ind, col_x); + if (status != 0){ + std::cout << "Call to gdf_snmg_degree failed: " << gdf_error_get_name(status) << "\n"; + std::cout << "Dtypes: " << col_off->dtype << "," << col_ind->dtype << "\n"; + } + EXPECT_EQ(status, 0); +#pragma omp master + { + std::cout << "multi-GPU time: " << omp_get_wtime() - t << "\n"; + } + +#pragma omp master + { + //printv(m, (val_t *)col_x[0]->data, 0); + CUDA_RT_CALL(cudaMemcpy(°ree_h[0], + col_x[0]->data, + sizeof(idx_t) * m, + cudaMemcpyDeviceToHost)); + + for (auto j = 0; j < degree_h.size(); ++j) + EXPECT_EQ(degree_ref[j], degree_h[j]); + } + + gdf_col_delete(col_off); + gdf_col_delete(col_ind); + gdf_col_delete(col_val); + gdf_col_delete(col_x[i]); + } + } + std::cout << std::endl; + } +}; + +TEST_P(Tests_MGDegree_hibench, 
CheckFP32_hibench) { + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P(hibench_test, + Tests_MGDegree_hibench, + ::testing::Values(MGDegree_Usecase("benchmark/hibench/1/Input-small/edges/part-00000", + 0) + , + MGDegree_Usecase("benchmark/hibench/1/Input-small/edges/part-00000", + 1) + , + MGDegree_Usecase("benchmark/hibench/1/Input-small/edges/part-00000", + 2) + , + MGDegree_Usecase("benchmark/hibench/1/Input-large/edges/part-00000", + 0) + , + MGDegree_Usecase("benchmark/hibench/1/Input-large/edges/part-00000", + 1) + , + MGDegree_Usecase("benchmark/hibench/1/Input-large/edges/part-00000", + 2) + , + MGDegree_Usecase("benchmark/hibench/1/Input-huge/edges/part-00000", + 0) + , + MGDegree_Usecase("benchmark/hibench/1/Input-huge/edges/part-00000", + 1) + , + MGDegree_Usecase("benchmark/hibench/1/Input-huge/edges/part-00000", + 2) + ) + ); + +int main(int argc, char **argv) { + srand(42); + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} + diff --git a/cpp/src/tests/snmg_pagerank/snmg_pagerank_test.cu b/cpp/src/tests/snmg_pagerank/snmg_pagerank_test.cu new file mode 100644 index 00000000000..e65e4267600 --- /dev/null +++ b/cpp/src/tests/snmg_pagerank/snmg_pagerank_test.cu @@ -0,0 +1,369 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "gtest/gtest.h" +#include "high_res_clock.h" +#include "cuda_profiler_api.h" +#include +#include +#include "test_utils.h" +#include "snmg_test_utils.h" +#include "snmg/link_analysis/pagerank.cuh" + +//#define SNMG_VERBOSE + +typedef struct MGPagerank_Usecase_t { + std::string matrix_file; + std::string result_file; + + MGPagerank_Usecase_t(const std::string& a, const std::string& b) { + // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR + // if RAPIDS_DATASET_ROOT_DIR not set, default to "/datasets" + const std::string& rapidsDatasetRootDir = get_rapids_dataset_root_dir(); + if ((a != "") && (a[0] != '/')) { + matrix_file = rapidsDatasetRootDir + "/" + a; + } else { + matrix_file = a; + } + if ((b != "") && (b[0] != '/')) { + result_file = rapidsDatasetRootDir + "/" + b; + } else { + result_file = b; + } + } + MGPagerank_Usecase_t& operator=(const MGPagerank_Usecase_t& rhs) { + matrix_file = rhs.matrix_file; + result_file = rhs.result_file; + return *this; + } +} MGPagerank_Usecase; + +template +void verify_pr(gdf_column* col_pagerank, const MGPagerank_Usecase& param){ + // Check vs golden data + if (param.result_file.length()>0) + { + int m = col_pagerank->size; + std::vector calculated_res(m); + CUDA_RT_CALL(cudaMemcpy(&calculated_res[0], col_pagerank->data, sizeof(val_t) * m, cudaMemcpyDeviceToHost)); + std::sort(calculated_res.begin(), calculated_res.end()); + FILE* fpin = fopen(param.result_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << " Cannot read file with reference data: " << param.result_file << std::endl; + std::vector expected_res(m); + ASSERT_EQ(read_binary_vector(fpin, m, expected_res), 0); + fclose(fpin); + val_t err; + int n_err = 0; + for (int i = 0; i < m; i++) { + err = fabs(expected_res[i] - calculated_res[i]); + if (err> 1e-5) { + n_err++; // count the number of mismatches + } + } + if (n_err) { + EXPECT_LE(n_err, 0.001*m); // tolerate 0.1% of values with a litte difference + } + } +} + +class Tests_MGPagerank : public ::testing::TestWithParam { + public: + Tests_MGPagerank() { } + static void SetupTestCase() { } + static void TearDownTestCase() { } + virtual void SetUp() { } + virtual void TearDown() { } + + static std::vector mgpr_time; + + template + void run_current_test(const MGPagerank_Usecase& param) { + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + std::string("_") + getFileName(param.matrix_file)+ std::string("_") + ss.str().c_str(); + + int m, k, nnz, n_gpus, max_iter=50; + val_t alpha = 0.85; + MM_typecode mc; + + double t; + + FILE* fpin = fopen(param.matrix_file.c_str(),"r"); + + ASSERT_EQ(mm_properties(fpin, 1, &mc, &m, &k, &nnz),0) << "could not read Matrix Market file properties"<< "\n"; + ASSERT_TRUE(mm_is_matrix(mc)); + ASSERT_TRUE(mm_is_coordinate(mc)); + ASSERT_FALSE(mm_is_complex(mc)); + ASSERT_FALSE(mm_is_skew(mc)); + + // Allocate memory on host + std::vector cooRowInd(nnz), cooColInd(nnz), csrColInd(nnz), csrRowPtr(m+1); + std::vector cooVal(nnz), csrVal(nnz), pagerank_h(m, 1.0/m); + + // Read + ASSERT_EQ( (mm_to_coo(fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], NULL, NULL)) , 0)<< "could not read matrix data"<< "\n"; + ASSERT_EQ(fclose(fpin),0); + + // WARNING transpose happening here + coo2csr(cooColInd, cooRowInd, csrRowPtr, csrColInd); + + CUDA_RT_CALL(cudaGetDeviceCount(&n_gpus)); + std::vector v_loc(n_gpus), 
e_loc(n_gpus), part_offset(n_gpus+1); + random_vals(csrVal); + gdf_column *col_pagerank[n_gpus]; + + if (nnz<1200000000) + { + #pragma omp parallel num_threads(1) + { + auto i = omp_get_thread_num(); + auto p = omp_get_num_threads(); + CUDA_RT_CALL(cudaSetDevice(i)); + + #ifdef SNMG_VERBOSE + #pragma omp master + { + std::cout << "Number of GPUs : "<< n_gpus < pr_solver(env, &part_offset[0], static_cast(col_off->data), static_cast(col_ind->data)); + pr_solver.setup(alpha); + + val_t* pagerank[p]; + for (auto i = 0; i < p; ++i) + pagerank[i]= static_cast(col_pagerank[i]->data); + + pr_solver.solve(max_iter, pagerank); + #pragma omp master + {std::cout << omp_get_wtime() - t << " ";} + + verify_pr(col_pagerank[i], param); + + gdf_col_delete(col_off); + gdf_col_delete(col_ind); + gdf_col_delete(col_val); + gdf_col_delete(col_pagerank[i]); + } + } +// TODO Enable when degree function is present +#if 0 + if (n_gpus > 1) + { + // Only using the 4 fully connected GPUs on DGX1 + if (n_gpus == 8) + n_gpus = 4; + + #pragma omp parallel num_threads(n_gpus) + { + auto i = omp_get_thread_num(); + auto p = omp_get_num_threads(); + CUDA_RT_CALL(cudaSetDevice(i)); + + #ifdef SNMG_VERBOSE + #pragma omp master + { + std::cout << "Number of GPUs : "<< n_gpus < pr_solver(env, &part_offset[0], static_cast(col_off->data), static_cast(col_ind->data)); + pr_solver.setup(alpha); + + val_t* pagerank[p]; + for (auto i = 0; i < p; ++i) + pagerank[i]= static_cast(col_pagerank[i]->data); + + pr_solver.solve(max_iter, pagerank); + #pragma omp master + {std::cout << omp_get_wtime() - t << " ";} + + verify_pr(col_pagerank[i], param); + + gdf_col_delete(col_off); + gdf_col_delete(col_ind); + gdf_col_delete(col_val); + gdf_col_delete(col_pagerank[i]); + + + } + } +#endif + std::cout << std::endl; + } + +}; + +class Tests_MGPR_hibench : public ::testing::TestWithParam { + public: + Tests_MGPR_hibench() { } + static void SetupTestCase() { } + static void TearDownTestCase() { } + virtual void SetUp() { } + virtual void TearDown() { } + + static std::vector mgspmv_time; + + template + void run_current_test(const MGPagerank_Usecase& param) { + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + std::string("_") + getFileName(param.matrix_file)+ std::string("_") + ss.str().c_str(); + + int m, nnz, n_gpus, max_iter=50; + val_t alpha = 0.85; + std::vector cooRowInd, cooColInd; + double t; + + ASSERT_EQ(read_single_file(param.matrix_file.c_str(),cooRowInd,cooColInd),0); + nnz = cooRowInd.size(); + m = std::max( *(std::max_element(cooRowInd.begin(), cooRowInd.end())), + *(std::max_element(cooColInd.begin(), cooColInd.end()))); + + // Allocate memory on host + std::vector csrColInd(nnz), csrRowPtr(m+1); + std::vector cooVal(nnz), csrVal(nnz), pagerank_h(m, 1.0/m); + coo2csr(cooRowInd, cooColInd, csrRowPtr, csrColInd); + CUDA_RT_CALL(cudaGetDeviceCount(&n_gpus)); + std::vector v_loc(n_gpus), e_loc(n_gpus), part_offset(n_gpus+1); + random_vals(csrVal); + gdf_column *col_pagerank[n_gpus]; + + if (n_gpus > 1) + { + // Only using the 4 fully connected GPUs on DGX1 + if (n_gpus == 8) + n_gpus = 4; + + #pragma omp parallel num_threads(n_gpus) + { + auto i = omp_get_thread_num(); + auto p = omp_get_num_threads(); + CUDA_RT_CALL(cudaSetDevice(i)); + + #ifdef SNMG_VERBOSE + #pragma omp master + { + std::cout << "Number of GPUs : "<< n_gpus < pr_solver(env, 
&part_offset[0], static_cast(col_off->data), static_cast(col_ind->data)); + pr_solver.setup(alpha); + + val_t* pagerank[p]; + for (auto i = 0; i < p; ++i) + pagerank[i]= static_cast(col_pagerank[i]->data); + + pr_solver.solve(max_iter, pagerank); + #pragma omp master + {std::cout << omp_get_wtime() - t << " ";} + + verify_pr(col_pagerank[i], param); + + gdf_col_delete(col_off); + gdf_col_delete(col_ind); + gdf_col_delete(col_val); + gdf_col_delete(col_pagerank[i]); + } + } + std::cout << std::endl; + } +}; + + +TEST_P(Tests_MGPagerank, CheckFP32_mtx) { + run_current_test(GetParam()); +} +TEST_P(Tests_MGPagerank, CheckFP64) { + run_current_test(GetParam()); +} +TEST_P(Tests_MGPR_hibench, CheckFP32_hibench) { + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P(mtx_test, Tests_MGPagerank, + ::testing::Values( MGPagerank_Usecase("test/datasets/karate.mtx", "") + ,MGPagerank_Usecase("test/datasets/web-BerkStan.mtx", "test/ref/pagerank/web-BerkStan.pagerank_val_0.85.bin") + ,MGPagerank_Usecase("test/datasets/web-Google.mtx", "test/ref/pagerank/web-Google.pagerank_val_0.85.bin") + ,MGPagerank_Usecase("test/datasets/wiki-Talk.mtx", "test/ref/pagerank/wiki-Talk.pagerank_val_0.85.bin") + ,MGPagerank_Usecase("test/datasets/cit-Patents.mtx", "test/ref/pagerank/cit-Patents.pagerank_val_0.85.bin") + ,MGPagerank_Usecase("test/datasets/ljournal-2008.mtx","test/ref/pagerank/ljournal-2008.pagerank_val_0.85.bin") + ,MGPagerank_Usecase("test/datasets/webbase-1M.mtx", "test/ref/pagerank/webbase-1M.pagerank_val_0.85.bin") + ) + ); + +INSTANTIATE_TEST_CASE_P(hibench_test, Tests_MGPR_hibench, + ::testing::Values( MGPagerank_Usecase("benchmark/hibench/1/Input-small/edges/part-00000", "") + ,MGPagerank_Usecase("benchmark/hibench/1/Input-large/edges/part-00000", "") + ,MGPagerank_Usecase("benchmark/hibench/1/Input-huge/edges/part-00000", "") + ) + ); + + + +int main(int argc, char **argv) { + srand(42); + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} + + diff --git a/cpp/src/tests/test_utils.h b/cpp/src/tests/test_utils.h index dc87d7403ea..0c82a4e1b43 100644 --- a/cpp/src/tests/test_utils.h +++ b/cpp/src/tests/test_utils.h @@ -222,8 +222,8 @@ void printCsrMatI(int m, int n, int nnz,std::vector & csrRowPtr, std::vecto */ template int mm_properties(FILE * f, int tg, MM_typecode * t, - IndexType_ * m, IndexType_ * n, - IndexType_ * nnz) { + IndexType_ * m, IndexType_ * n, + IndexType_ * nnz) { // Read matrix properties from file int mint, nint, nnzint; @@ -279,7 +279,7 @@ int mm_properties(FILE * f, int tg, MM_typecode * t, // Check if entry is diagonal if(row == col) - --(*nnz); + --(*nnz); } } @@ -310,8 +310,8 @@ int mm_properties(FILE * f, int tg, MM_typecode * t, */ template int mm_to_coo(FILE *f, int tg, IndexType_ nnz, - IndexType_ * cooRowInd, IndexType_ * cooColInd, - ValueType_ * cooRVal , ValueType_ * cooIVal) { + IndexType_ * cooRowInd, IndexType_ * cooColInd, + ValueType_ * cooRVal , ValueType_ * cooIVal) { // Read matrix properties from file MM_typecode t; @@ -381,20 +381,20 @@ int mm_to_coo(FILE *f, int tg, IndexType_ nnz, // Modify entry value if matrix is skew symmetric or Hermitian if(mm_is_skew(t)) { - rval = -rval; - ival = -ival; + rval = -rval; + ival = -ival; } else if(mm_is_hermitian(t)) { - ival = -ival; + ival = -ival; } // Record entry cooRowInd[j] = col; cooColInd[j] = row; if(cooRVal != NULL) - cooRVal[j] = rval; + cooRVal[j] = rval; if(cooIVal != NULL) - cooIVal[j] = ival; + cooIVal[j] = ival; ++j; } @@ -435,10 +435,10 @@ class lesser_tuple { */ template 
void coo_sort(IndexType_ nnz, int sort_by_row, - IndexType_ * cooRowInd, - IndexType_ * cooColInd, - ValueType_ * cooRVal, - ValueType_ * cooIVal) { + IndexType_ * cooRowInd, + IndexType_ * cooColInd, + ValueType_ * cooRVal, + ValueType_ * cooIVal) { // Determine whether to sort by row or by column int i; @@ -451,21 +451,21 @@ void coo_sort(IndexType_ nnz, int sort_by_row, using namespace thrust; if((cooRVal==NULL) && (cooIVal==NULL)) stable_sort(make_zip_iterator(make_tuple(cooRowInd,cooColInd)), - make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz)), - lesser_tuple(i)); + make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz)), + lesser_tuple(i)); else if((cooRVal==NULL) && (cooIVal!=NULL)) stable_sort(make_zip_iterator(make_tuple(cooRowInd,cooColInd,cooIVal)), - make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz,cooIVal+nnz)), - lesser_tuple(i)); + make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz,cooIVal+nnz)), + lesser_tuple(i)); else if((cooRVal!=NULL) && (cooIVal==NULL)) stable_sort(make_zip_iterator(make_tuple(cooRowInd,cooColInd,cooRVal)), - make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz,cooRVal+nnz)), - lesser_tuple(i)); + make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz,cooRVal+nnz)), + lesser_tuple(i)); else stable_sort(make_zip_iterator(make_tuple(cooRowInd,cooColInd,cooRVal,cooIVal)), - make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz, - cooRVal+nnz,cooIVal+nnz)), - lesser_tuple(i)); + make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz, + cooRVal+nnz,cooIVal+nnz)), + lesser_tuple(i)); } template @@ -632,7 +632,7 @@ gdf_column_ptr create_gdf_column(std::vector const & host_vector) // Allocate device storage for gdf_column and copy contents from host_vector const size_t input_size_bytes = host_vector.size() * sizeof(col_type); cudaStream_t stream{nullptr}; - ALLOC_MANAGED_TRY((void**)&(the_column->data), input_size_bytes, stream); + ALLOC_TRY((void**)&(the_column->data), input_size_bytes, stream); cudaMemcpy(the_column->data, host_vector.data(), input_size_bytes, cudaMemcpyHostToDevice); // Deduce the type and set the gdf_dtype accordingly @@ -666,7 +666,7 @@ void create_gdf_column(std::vector const & host_vector, gdf_column * t // Allocate device storage for gdf_column and copy contents from host_vector const size_t input_size_bytes = host_vector.size() * sizeof(col_type); cudaStream_t stream{nullptr}; - ALLOC_MANAGED_TRY((void**)&(the_column->data), input_size_bytes, stream); + ALLOC_TRY((void**)&(the_column->data), input_size_bytes, stream); cudaMemcpy(the_column->data, host_vector.data(), input_size_bytes, cudaMemcpyHostToDevice); // Deduce the type and set the gdf_dtype accordingly diff --git a/cpp/src/traversal/bfs.cu b/cpp/src/traversal/bfs.cu new file mode 100644 index 00000000000..c42be78943c --- /dev/null +++ b/cpp/src/traversal/bfs.cu @@ -0,0 +1,498 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ *
+ */
+
+#include
+#include
+#include
+#include "bfs.cuh"
+#include
+#include "rmm_utils.h"
+
+#include "utilities/graph_utils.cuh"
+#include "bfs_kernels.cuh"
+
+using namespace bfs_kernels;
+
+namespace cugraph {
+  enum BFS_ALGO_STATE {
+    TOPDOWN, BOTTOMUP
+  };
+
+  template <typename IndexType>
+  void Bfs<IndexType>::setup() {
+
+    // Determinism flag, false by default
+    deterministic = false;
+    //Working data
+    //Each vertex can be in the frontier at most once
+    ALLOC_TRY(&frontier, n * sizeof(IndexType), nullptr);
+
+    //We will update frontier during the execution
+    //We need the original pointer to reset frontier, or to ALLOC_FREE_TRY it
+    original_frontier = frontier;
+
+    //size of bitmaps for vertices
+    vertices_bmap_size = (n / (8 * sizeof(int)) + 1);
+    //ith bit of visited_bmap is set <=> ith vertex is visited
+    ALLOC_TRY(&visited_bmap, sizeof(int) * vertices_bmap_size, nullptr);
+
+    //ith bit of isolated_bmap is set <=> degree of ith vertex = 0
+    ALLOC_TRY(&isolated_bmap, sizeof(int) * vertices_bmap_size, nullptr);
+
+    //vertex_degree[i] = degree of vertex i
+    ALLOC_TRY(&vertex_degree, sizeof(IndexType) * n, nullptr);
+
+    //Cub working data
+    cub_exclusive_sum_alloc(n + 1, d_cub_exclusive_sum_storage, cub_exclusive_sum_storage_bytes);
+
+    //We will need (n+1)-int buffers for two different things (bottom up or top down) - sharing them since those uses are mutually exclusive
+    ALLOC_TRY(&buffer_np1_1, (n + 1) * sizeof(IndexType), nullptr);
+    ALLOC_TRY(&buffer_np1_2, (n + 1) * sizeof(IndexType), nullptr);
+
+    //Using buffers : top down
+
+    //frontier_vertex_degree[i] is the degree of vertex frontier[i]
+    frontier_vertex_degree = buffer_np1_1;
+    //exclusive sum of frontier_vertex_degree
+    exclusive_sum_frontier_vertex_degree = buffer_np1_2;
+
+    //Using buffers : bottom up
+    //contains the list of unvisited vertices
+    unvisited_queue = buffer_np1_1;
+    //size of the "last" unvisited queue : size_last_unvisited_queue
+    //refers to the size of unvisited_queue
+    //which may not be up to date (the queue may contain vertices that are now visited)
+
+    //We may leave vertices unvisited after the bottom up main kernels - storing them here
+    left_unvisited_queue = buffer_np1_2;
+
+    //We use buckets of edges (32 edges per bucket for now, see exact macro in bfs_kernels). 
frontier_vertex_degree_buckets_offsets[i] is the index k such as frontier[k] is the source of the first edge of the bucket + //See top down kernels for more details + ALLOC_TRY(&exclusive_sum_frontier_vertex_buckets_offsets, + ((nnz / TOP_DOWN_EXPAND_DIMX + 1) * NBUCKETS_PER_BLOCK + 2) * sizeof(IndexType), nullptr); + + //Init device-side counters + //Those counters must be/can be reset at each bfs iteration + //Keeping them adjacent in memory allow use call only one cudaMemset - launch latency is the current bottleneck + ALLOC_TRY(&d_counters_pad, 4 * sizeof(IndexType), nullptr); + + d_new_frontier_cnt = &d_counters_pad[0]; + d_mu = &d_counters_pad[1]; + d_unvisited_cnt = &d_counters_pad[2]; + d_left_unvisited_cnt = &d_counters_pad[3]; + + //Lets use this int* for the next 3 lines + //Its dereferenced value is not initialized - so we dont care about what we put in it + IndexType * d_nisolated = d_new_frontier_cnt; + cudaMemsetAsync(d_nisolated, 0, sizeof(IndexType), stream); + + //Computing isolated_bmap + //Only dependent on graph - not source vertex - done once + flag_isolated_vertices(n, isolated_bmap, row_offsets, vertex_degree, d_nisolated, stream); + cudaMemcpyAsync(&nisolated, d_nisolated, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); + + //We need nisolated to be ready to use + cudaStreamSynchronize(stream); + } + + template + void Bfs::configure(IndexType *_distances, + IndexType *_predecessors, + int *_edge_mask) + { + distances = _distances; + predecessors = _predecessors; + edge_mask = _edge_mask; + + useEdgeMask = (edge_mask != NULL); + computeDistances = (distances != NULL); + computePredecessors = (predecessors != NULL); + + //We need distances to use bottom up + if (directed && !computeDistances) + ALLOC_TRY(&distances, n * sizeof(IndexType), nullptr); + } + + template + void Bfs::traverse(IndexType source_vertex) { + + //Init visited_bmap + //If the graph is undirected, we not that + //we will never discover isolated vertices (in degree = out degree = 0) + //we avoid a lot of work by flagging them now + //in g500 graphs they represent ~25% of total vertices + //more than that for wiki and twitter graphs + + if (directed) { + cudaMemsetAsync(visited_bmap, 0, vertices_bmap_size * sizeof(int), stream); + } + else { + cudaMemcpyAsync(visited_bmap, + isolated_bmap, + vertices_bmap_size * sizeof(int), + cudaMemcpyDeviceToDevice, + stream); + } + + //If needed, setting all vertices as undiscovered (inf distance) + //We dont use computeDistances here + //if the graph is undirected, we may need distances even if + //computeDistances is false + if (distances) + fill_vec(distances, n, vec_t::max, stream); + + //If needed, setting all predecessors to non-existent (-1) + if (computePredecessors) { + cudaMemsetAsync(predecessors, -1, n * sizeof(IndexType), stream); + } + + // + //Initial frontier + // + + frontier = original_frontier; + + if (distances) { + cudaMemsetAsync(&distances[source_vertex], 0, sizeof(IndexType), stream); + } + + //Setting source_vertex as visited + //There may be bit already set on that bmap (isolated vertices) - if the graph is undirected + int current_visited_bmap_source_vert = 0; + + if (!directed) { + cudaMemcpyAsync(¤t_visited_bmap_source_vert, + &visited_bmap[source_vertex / INT_SIZE], + sizeof(int), + cudaMemcpyDeviceToHost); + //We need current_visited_bmap_source_vert + cudaStreamSynchronize(stream); + } + + int m = (1 << (source_vertex % INT_SIZE)); + + //In that case, source is isolated, done now + if (!directed && (m & 
current_visited_bmap_source_vert)) {
+      //Init of distances and predecessors is already done (cf. stream sync in the previous if)
+      return;
+    }
+
+    m |= current_visited_bmap_source_vert;
+
+    cudaMemcpyAsync(&visited_bmap[source_vertex / INT_SIZE],
+                    &m,
+                    sizeof(int),
+                    cudaMemcpyHostToDevice,
+                    stream);
+
+    //Adding source_vertex to the initial frontier
+    cudaMemcpyAsync(&frontier[0],
+                    &source_vertex,
+                    sizeof(IndexType),
+                    cudaMemcpyHostToDevice,
+                    stream);
+
+    //mf : edges in frontier
+    //nf : vertices in frontier
+    //mu : edges undiscovered
+    //nu : nodes undiscovered
+    //lvl : current frontier's depth
+    IndexType mf, nf, mu, nu;
+    bool growing;
+    IndexType lvl = 1;
+
+    //Frontier has one vertex
+    nf = 1;
+
+    //all edges are undiscovered (by definition, isolated vertices have 0 edges)
+    mu = nnz;
+
+    //all non isolated vertices are undiscovered (except the source vertex, which is in the frontier)
+    //That number is wrong if source_vertex is also isolated - but it's not important
+    nu = n - nisolated - nf;
+
+    //Last frontier was 0, now it is 1
+    growing = true;
+
+    IndexType size_last_left_unvisited_queue = n; //we just need value > 0
+    IndexType size_last_unvisited_queue = 0; //queue empty
+
+    //Typical pre-top down workflow. set_frontier_degree + exclusive-scan
+    set_frontier_degree(frontier_vertex_degree, frontier, vertex_degree, nf, stream);
+    exclusive_sum(d_cub_exclusive_sum_storage,
+                  cub_exclusive_sum_storage_bytes,
+                  frontier_vertex_degree,
+                  exclusive_sum_frontier_vertex_degree,
+                  nf + 1,
+                  stream);
+
+    cudaMemcpyAsync(&mf,
+                    &exclusive_sum_frontier_vertex_degree[nf],
+                    sizeof(IndexType),
+                    cudaMemcpyDeviceToHost,
+                    stream);
+
+    //We need mf
+    cudaStreamSynchronize(stream);
+
+    //At first we know we have to use top down
+    BFS_ALGO_STATE algo_state = TOPDOWN;
+
+    //useDistances : we check if a vertex is a parent using distances in bottom up - distances become working data
+    //undirected graph : need parents to be in children's neighbors
+    bool can_use_bottom_up = !directed && distances;
+
+    while (nf > 0) {
+      //Each vertex can appear only once in the frontier array - we know it will fit
+      new_frontier = frontier + nf;
+      IndexType old_nf = nf;
+      resetDevicePointers();
+
+      if (can_use_bottom_up) {
+        //Choosing algo
+        //Finite state machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf
+
+        switch (algo_state) {
+          case TOPDOWN:
+            if (mf > mu / alpha)
+              algo_state = BOTTOMUP;
+            break;
+          case BOTTOMUP:
+            if (!growing && nf < n / beta) {
+
+              //We need to prepare the switch back to top down
+              //We couldn't keep track of mu during bottom up - because we don't know what mf is. Computing mu here
+              count_unvisited_edges(unvisited_queue,
+                                    size_last_unvisited_queue,
+                                    visited_bmap,
+                                    vertex_degree,
+                                    d_mu,
+                                    stream);
+
+              //Typical pre-top down workflow. 
set_frontier_degree + exclusive-scan + set_frontier_degree(frontier_vertex_degree, + frontier, + vertex_degree, + nf, + stream); + exclusive_sum(d_cub_exclusive_sum_storage, + cub_exclusive_sum_storage_bytes, + frontier_vertex_degree, + exclusive_sum_frontier_vertex_degree, + nf + 1, + stream); + + cudaMemcpyAsync(&mf, + &exclusive_sum_frontier_vertex_degree[nf], + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + + cudaMemcpyAsync(&mu, d_mu, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); + + //We will need mf and mu + cudaStreamSynchronize(stream); + algo_state = TOPDOWN; + } + break; + } + } + + //Executing algo + + switch (algo_state) { + case TOPDOWN: + compute_bucket_offsets(exclusive_sum_frontier_vertex_degree, + exclusive_sum_frontier_vertex_buckets_offsets, + nf, + mf, + stream); + frontier_expand(row_offsets, + col_indices, + frontier, + nf, + mf, + lvl, + new_frontier, + d_new_frontier_cnt, + exclusive_sum_frontier_vertex_degree, + exclusive_sum_frontier_vertex_buckets_offsets, + visited_bmap, + distances, + predecessors, + edge_mask, + isolated_bmap, + directed, + stream, + deterministic); + + mu -= mf; + + cudaMemcpyAsync(&nf, + d_new_frontier_cnt, + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError(); + + //We need nf + cudaStreamSynchronize(stream); + + if (nf) { + //Typical pre-top down workflow. set_frontier_degree + exclusive-scan + set_frontier_degree(frontier_vertex_degree, + new_frontier, + vertex_degree, + nf, + stream); + exclusive_sum(d_cub_exclusive_sum_storage, + cub_exclusive_sum_storage_bytes, + frontier_vertex_degree, + exclusive_sum_frontier_vertex_degree, + nf + 1, + stream); + cudaMemcpyAsync(&mf, + &exclusive_sum_frontier_vertex_degree[nf], + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + + //We need mf + cudaStreamSynchronize(stream); + } + break; + + case BOTTOMUP: + fill_unvisited_queue(visited_bmap, + vertices_bmap_size, + n, + unvisited_queue, + d_unvisited_cnt, + stream, + deterministic); + + size_last_unvisited_queue = nu; + + bottom_up_main(unvisited_queue, + size_last_unvisited_queue, + left_unvisited_queue, + d_left_unvisited_cnt, + visited_bmap, + row_offsets, + col_indices, + lvl, + new_frontier, + d_new_frontier_cnt, + distances, + predecessors, + edge_mask, + stream, + deterministic); + + //The number of vertices left unvisited decreases + //If it wasnt necessary last time, it wont be this time + if (size_last_left_unvisited_queue) { + cudaMemcpyAsync(&size_last_left_unvisited_queue, + d_left_unvisited_cnt, + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + //We need last_left_unvisited_size + cudaStreamSynchronize(stream); + bottom_up_large(left_unvisited_queue, + size_last_left_unvisited_queue, + visited_bmap, + row_offsets, + col_indices, + lvl, + new_frontier, + d_new_frontier_cnt, + distances, + predecessors, + edge_mask, + stream, + deterministic); + } + cudaMemcpyAsync(&nf, + d_new_frontier_cnt, + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + + //We will need nf + cudaStreamSynchronize(stream); + break; + } + + //Updating undiscovered edges count + nu -= nf; + + //Using new frontier + frontier = new_frontier; + growing = (nf > old_nf); + + ++lvl; + } + } + + template + void Bfs::resetDevicePointers() { + cudaMemsetAsync(d_counters_pad, 0, 4 * sizeof(IndexType), stream); + } + + template + void Bfs::clean() { + //the vectors have a destructor that takes care of cleaning + ALLOC_FREE_TRY(original_frontier, nullptr); + 
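// Editorial aside (not part of the diff): every buffer released in clean()
// was obtained with ALLOC_TRY in setup()/configure(); note that buffer_np1_1
// and buffer_np1_2 are freed once each even though they back both the
// top-down arrays (frontier_vertex_degree, exclusive sum) and the bottom-up
// queues (unvisited_queue, left_unvisited_queue), since those aliases share
// the same storage.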
ALLOC_FREE_TRY(visited_bmap, nullptr); + ALLOC_FREE_TRY(isolated_bmap, nullptr); + ALLOC_FREE_TRY(vertex_degree, nullptr); + ALLOC_FREE_TRY(d_cub_exclusive_sum_storage, nullptr); + ALLOC_FREE_TRY(buffer_np1_1, nullptr); + ALLOC_FREE_TRY(buffer_np1_2, nullptr); + ALLOC_FREE_TRY(exclusive_sum_frontier_vertex_buckets_offsets, nullptr); + ALLOC_FREE_TRY(d_counters_pad, nullptr); + + //In that case, distances is a working data + if (directed && !computeDistances) + ALLOC_FREE_TRY(distances, nullptr); + } + + template class Bfs ; +} // end namespace cugraph + +gdf_error gdf_bfs(gdf_graph *graph, gdf_column *distances, gdf_column *predecessors, int start_vertex, bool directed) { + GDF_REQUIRE(graph->adjList != nullptr || graph->edgeList != nullptr, GDF_INVALID_API_CALL); + gdf_error err = gdf_add_adj_list(graph); + if (err != GDF_SUCCESS) + return err; + GDF_REQUIRE(graph->adjList->offsets->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); + GDF_REQUIRE(graph->adjList->indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); + GDF_REQUIRE(distances->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); + GDF_REQUIRE(predecessors->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); + + int n = graph->adjList->offsets->size - 1; + int e = graph->adjList->indices->size; + int* offsets_ptr = (int*)graph->adjList->offsets->data; + int* indices_ptr = (int*)graph->adjList->indices->data; + int* distances_ptr = (int*)distances->data; + int* predecessors_ptr = (int*)predecessors->data; + int alpha = 15; + int beta = 18; + + cugraph::Bfs bfs(n, e, offsets_ptr, indices_ptr, directed, alpha, beta); + bfs.configure(distances_ptr, predecessors_ptr, nullptr); + bfs.traverse(start_vertex); + return GDF_SUCCESS; +} + diff --git a/cpp/src/bfs.cuh b/cpp/src/traversal/bfs.cuh old mode 100755 new mode 100644 similarity index 98% rename from cpp/src/bfs.cuh rename to cpp/src/traversal/bfs.cuh index c665aabb6e3..a35b9b4bea4 --- a/cpp/src/bfs.cuh +++ b/cpp/src/traversal/bfs.cuh @@ -13,8 +13,6 @@ #include -//Used in nvgraph.h - #define TRAVERSAL_DEFAULT_ALPHA 15 #define TRAVERSAL_DEFAULT_BETA 18 @@ -97,5 +95,5 @@ namespace cugraph { void traverse(IndexType source_vertex); }; -} // end namespace nvgraph +} // end namespace cugraph diff --git a/cpp/src/traversal/bfs_kernels.cuh b/cpp/src/traversal/bfs_kernels.cuh new file mode 100644 index 00000000000..d4b31887b74 --- /dev/null +++ b/cpp/src/traversal/bfs_kernels.cuh @@ -0,0 +1,1566 @@ +/* + * Copyright (c) 2018 NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include + +#include +#include + +#define MAXBLOCKS 65535 +#define WARP_SIZE 32 +#define INT_SIZE 32 + +// +// Bottom up macros +// + +#define FILL_UNVISITED_QUEUE_DIMX 256 + +#define COUNT_UNVISITED_EDGES_DIMX 256 + +#define MAIN_BOTTOMUP_DIMX 256 +#define MAIN_BOTTOMUP_NWARPS (MAIN_BOTTOMUP_DIMX/WARP_SIZE) + +#define LARGE_BOTTOMUP_DIMX 256 + +//Number of edges processed in the main bottom up kernel +#define MAIN_BOTTOMUP_MAX_EDGES 6 + +//Power of 2 < 32 (strict <) +#define BOTTOM_UP_LOGICAL_WARP_SIZE 4 + +// +// Top down macros +// + +// We will precompute the results the binsearch_maxle every TOP_DOWN_BUCKET_SIZE edges +#define TOP_DOWN_BUCKET_SIZE 32 + +// DimX of the kernel +#define TOP_DOWN_EXPAND_DIMX 256 + +// TOP_DOWN_EXPAND_DIMX edges -> NBUCKETS_PER_BLOCK buckets +#define NBUCKETS_PER_BLOCK (TOP_DOWN_EXPAND_DIMX/TOP_DOWN_BUCKET_SIZE) + +// How many items_per_thread we can process with one bucket_offset loading +// the -1 is here because we need the +1 offset +#define MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD (TOP_DOWN_BUCKET_SIZE - 1) + +// instruction parallelism +// for how many edges will we create instruction parallelism +#define TOP_DOWN_BATCH_SIZE 2 + +#define COMPUTE_BUCKET_OFFSETS_DIMX 512 + +//Other macros + +#define FLAG_ISOLATED_VERTICES_DIMX 128 + +//Number of vertices handled by one thread +//Must be power of 2, lower than 32 +#define FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD 4 + +//Number of threads involved in the "construction" of one int in the bitset +#define FLAG_ISOLATED_VERTICES_THREADS_PER_INT (INT_SIZE/FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD) + +// +// Parameters of the heuristic to switch between bottomup/topdown +//Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf +// + +using namespace cugraph; + +namespace bfs_kernels { + // + // gives the equivalent vectors from a type + // for the max val, would be better to use numeric_limits<>::max() once + // cpp11 is allowed in nvgraph + // + + template + struct vec_t { + typedef int4 vec4; + typedef int2 vec2; + }; + + template<> + struct vec_t { + typedef int4 vec4; + typedef int2 vec2; + static const int max = INT_MAX; + }; + + template<> + struct vec_t { + typedef longlong4 vec4; + typedef longlong2 vec2; + static const long long int max = LLONG_MAX; + }; + + // + // ------------------------- Helper device functions ------------------- + // + + __forceinline__ __device__ int getMaskNRightmostBitSet(int n) { + if (n == INT_SIZE) + return (~0); + int mask = (1 << n) - 1; + return mask; + } + + __forceinline__ __device__ int getMaskNLeftmostBitSet(int n) { + if (n == 0) + return 0; + int mask = ~((1 << (INT_SIZE - n)) - 1); + return mask; + } + + __forceinline__ __device__ int getNextZeroBit(int& val) { + int ibit = __ffs(~val) - 1; + val |= (1 << ibit); + + return ibit; + } + + struct BitwiseAnd + { + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return (a & b); + } + }; + + struct BitwiseOr + { + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return (a | b); + } + }; + + template + __device__ IndexType binsearch_maxle(const IndexType *vec, + const IndexType val, + IndexType low, + IndexType high) { + while (true) { + if (low == high) + return low; //we know it exists + if ((low + 1) == high) + return (vec[high] <= val) ? 
high : low; + + IndexType mid = low + (high - low) / 2; + + if (vec[mid] > val) + high = mid - 1; + else + low = mid; + + } + } + + // + // ------------------------- Bottom up ------------------------- + // + + // + // fill_unvisited_queue_kernel + // + // Finding unvisited vertices in the visited_bmap, and putting them in the queue + // Vertices represented by the same int in the bitmap are adjacent in the queue, and sorted + // For instance, the queue can look like this : + // 34 38 45 58 61 4 18 24 29 71 84 85 90 + // Because they are represented by those ints in the bitmap : + // [34 38 45 58 61] [4 18 24 29] [71 84 85 90] + + //visited_bmap_nints = the visited_bmap is made of that number of ints + + template + __global__ void fill_unvisited_queue_kernel(int *visited_bmap, + IndexType visited_bmap_nints, + IndexType n, + IndexType *unvisited, + IndexType *unvisited_cnt) { + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage scan_temp_storage; + + //When filling the "unvisited" queue, we use "unvisited_cnt" to know where to write in the queue (equivalent of int off = atomicAddd(unvisited_cnt, 1) ) + //We will actually do only one atomicAdd per block - we first do a scan, then call one atomicAdd, and store the common offset for the block in + //unvisited_common_block_offset + __shared__ IndexType unvisited_common_block_offset; + + //We don't want threads divergence in the loop (we're going to call __syncthreads) + //Using a block-only dependent in the condition of the loop + for (IndexType block_v_idx = blockIdx.x * blockDim.x; + block_v_idx < visited_bmap_nints; + block_v_idx += blockDim.x * gridDim.x) { + + //Index of visited_bmap that this thread will compute + IndexType v_idx = block_v_idx + threadIdx.x; + + int thread_visited_int = (v_idx < visited_bmap_nints) + ? 
visited_bmap[v_idx] + : + (~0); //will be neutral in the next lines (virtual vertices all visited) + + //The last int can only be partially valid + //If we are indeed taking care of the last visited int in this thread, + //We need to first disable (ie set as "visited") the inactive bits (vertices >= n) + if (v_idx == (visited_bmap_nints - 1)) { + int active_bits = n - (INT_SIZE * v_idx); + int inactive_bits = INT_SIZE - active_bits; + int mask = getMaskNLeftmostBitSet(inactive_bits); + thread_visited_int |= mask; //Setting inactive bits as visited + } + + //Counting number of unvisited vertices represented by this int + int n_unvisited_in_int = __popc(~thread_visited_int); + int unvisited_thread_offset; + + //We will need to write n_unvisited_in_int unvisited vertices to the unvisited queue + //We ask for that space when computing the block scan, that will tell where to write those + //vertices in the queue, using the common offset of the block (see below) + BlockScan(scan_temp_storage).ExclusiveSum(n_unvisited_in_int, unvisited_thread_offset); + + //Last thread knows how many vertices will be written to the queue by this block + //Asking for that space in the queue using the global count, and saving the common offset + if (threadIdx.x == (FILL_UNVISITED_QUEUE_DIMX - 1)) { + IndexType total = unvisited_thread_offset + n_unvisited_in_int; + unvisited_common_block_offset = atomicAdd(unvisited_cnt, total); + } + + //syncthreads for two reasons : + // - we need to broadcast unvisited_common_block_offset + // - we will reuse scan_temp_storage (cf CUB doc) + __syncthreads(); + + IndexType current_unvisited_index = unvisited_common_block_offset + + unvisited_thread_offset; + int nvertices_to_write = n_unvisited_in_int; + + // getNextZeroBit uses __ffs, which gives least significant bit set + // which means that as long as n_unvisited_in_int is valid, + // we will use valid bits + + while (nvertices_to_write > 0) { + if (nvertices_to_write >= 4 && (current_unvisited_index % 4) == 0) { + typename vec_t::vec4 vec_v; + + vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.z = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.w = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + + typename vec_t::vec4 *unvisited_i4 = reinterpret_cast::vec4*>(&unvisited[current_unvisited_index]); + *unvisited_i4 = vec_v; + + current_unvisited_index += 4; + nvertices_to_write -= 4; + } + else if (nvertices_to_write >= 2 && (current_unvisited_index % 2) == 0) { + typename vec_t::vec2 vec_v; + + vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + + typename vec_t::vec2 *unvisited_i2 = reinterpret_cast::vec2*>(&unvisited[current_unvisited_index]); + *unvisited_i2 = vec_v; + + current_unvisited_index += 2; + nvertices_to_write -= 2; + } else { + IndexType v = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + + unvisited[current_unvisited_index] = v; + + current_unvisited_index += 1; + nvertices_to_write -= 1; + } + + } + } + } + + //Wrapper + template + void fill_unvisited_queue(int *visited_bmap, + IndexType visited_bmap_nints, + IndexType n, + IndexType *unvisited, + IndexType *unvisited_cnt, + cudaStream_t m_stream, + bool deterministic) { + dim3 grid, block; + block.x = FILL_UNVISITED_QUEUE_DIMX; + + grid.x = min((IndexType) MAXBLOCKS, (visited_bmap_nints + block.x - 1) / block.x); + + 
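// Note (editorial): grid.x is capped at MAXBLOCKS; fill_unvisited_queue_kernel grid-strides
// over visited_bmap_nints, so a single launch still covers every bitmap word when the cap is hit.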
fill_unvisited_queue_kernel<<>>(visited_bmap, + visited_bmap_nints, + n, + unvisited, + unvisited_cnt); + cudaCheckError(); + } + + // + // count_unvisited_edges_kernel + // Couting the total number of unvisited edges in the graph - using an potentially unvisited queue + // We need the current unvisited vertices to be in the unvisited queue + // But visited vertices can be in the potentially_unvisited queue + // We first check if the vertex is still unvisited before using it + // Useful when switching from "Bottom up" to "Top down" + // + + template + __global__ void count_unvisited_edges_kernel(const IndexType *potentially_unvisited, + const IndexType potentially_unvisited_size, + const int *visited_bmap, + IndexType *degree_vertices, + IndexType *mu) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage reduce_temp_storage; + + //number of undiscovered edges counted by this thread + IndexType thread_unvisited_edges_count = 0; + + for (IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < potentially_unvisited_size; + idx += blockDim.x * gridDim.x) { + + IndexType u = potentially_unvisited[idx]; + int u_visited_bmap = visited_bmap[u / INT_SIZE]; + int is_visited = u_visited_bmap & (1 << (u % INT_SIZE)); + + if (!is_visited) + thread_unvisited_edges_count += degree_vertices[u]; + + } + + //We need all thread_unvisited_edges_count to be ready before reducing + __syncthreads(); + + IndexType block_unvisited_edges_count = + BlockReduce(reduce_temp_storage).Sum(thread_unvisited_edges_count); + + //block_unvisited_edges_count is only defined is th.x == 0 + if (threadIdx.x == 0) + atomicAdd(mu, block_unvisited_edges_count); + } + + //Wrapper + template + void count_unvisited_edges(const IndexType *potentially_unvisited, + const IndexType potentially_unvisited_size, + const int *visited_bmap, + IndexType *node_degree, + IndexType *mu, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = COUNT_UNVISITED_EDGES_DIMX; + grid.x = min((IndexType) MAXBLOCKS, (potentially_unvisited_size + block.x - 1) / block.x); + + count_unvisited_edges_kernel<<>>(potentially_unvisited, + potentially_unvisited_size, + visited_bmap, + node_degree, + mu); + cudaCheckError(); + } + + // + // Main Bottom Up kernel + // Here we will start to process unvisited vertices in the unvisited queue + // We will only consider the first MAIN_BOTTOMUP_MAX_EDGES edges + // If it's not possible to define a valid parent using only those edges, + // add it to the "left_unvisited_queue" + // + + // + // We will use the "vertices represented by the same int in the visited bmap are adjacents and sorted in the unvisited queue" property + // It is used to do a reduction locally and fully build the new visited_bmap + // + + template + __global__ void main_bottomup_kernel(const IndexType *unvisited, + const IndexType unvisited_size, + IndexType *left_unvisited, + IndexType *left_unvisited_cnt, + int *visited_bmap, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + IndexType *distances, + IndexType *predecessors, + int *edge_mask) { + typedef cub::BlockDiscontinuity BlockDiscontinuity; + typedef cub::WarpReduce WarpReduce; + typedef cub::BlockScan BlockScan; + + __shared__ typename BlockDiscontinuity::TempStorage discontinuity_temp_storage; + __shared__ typename WarpReduce::TempStorage reduce_temp_storage; + __shared__ typename BlockScan::TempStorage scan_temp_storage; + + //To write vertices in the frontier, + //We 
will use a block scan to locally compute the offsets + //frontier_common_block_offset contains the common offset for the block + __shared__ IndexType frontier_common_block_offset; + + // When building the new visited_bmap, we reduce (using a bitwise and) the visited_bmap ints + // from the vertices represented by the same int (for instance vertices 1, 5, 9, 13, 23) + // vertices represented by the same int will be designed as part of the same "group" + // To detect the deliminations between those groups, we use BlockDiscontinuity + // Then we need to create the new "visited_bmap" within those group. + // We use a warp reduction that takes into account limits between groups to do it + // But a group can be cut in two different warps : in that case, the second warp + // put the result of its local reduction in local_visited_bmap_warp_head + // the first warp will then read it and finish the reduction + + __shared__ int local_visited_bmap_warp_head[MAIN_BOTTOMUP_NWARPS]; + + const int warpid = threadIdx.x / WARP_SIZE; + const int laneid = threadIdx.x % WARP_SIZE; + + // we will call __syncthreads inside the loop + // we need to keep complete block active + for (IndexType block_off = blockIdx.x * blockDim.x; + block_off < unvisited_size; + block_off += blockDim.x * gridDim.x) + { + IndexType idx = block_off + threadIdx.x; + + // This thread will take care of unvisited_vertex + // in the visited_bmap, it is represented by the int at index + // visited_bmap_index = unvisited_vertex/INT_SIZE + // it will be used by BlockDiscontinuity + // to flag the separation between groups of vertices (vertices represented by different in in visited_bmap) + IndexType visited_bmap_index[1]; //this is an array of size 1 because CUB needs one + visited_bmap_index[0] = -1; + IndexType unvisited_vertex = -1; + + // local_visited_bmap gives info on the visited bit of unvisited_vertex + // + // By default, everything is visited + // This is because we only take care of unvisited vertices here, + // The other are by default unvisited + // If a vertex remain unvisited, we will notice it here + // That's why by default we consider everything visited ( ie ~0 ) + // If we fail to assign one parent to an unvisited vertex, we will + // explicitly unset the bit + int local_visited_bmap = (~0); + int found = 0; + int more_to_visit = 0; + IndexType valid_parent; + IndexType left_unvisited_off; + + if (idx < unvisited_size) + { + //Processing first STPV edges of unvisited v + //If bigger than that, push to left_unvisited queue + unvisited_vertex = unvisited[idx]; + + IndexType edge_begin = row_ptr[unvisited_vertex]; + IndexType edge_end = row_ptr[unvisited_vertex + 1]; + + visited_bmap_index[0] = unvisited_vertex / INT_SIZE; + + IndexType degree = edge_end - edge_begin; + + for (IndexType edge = edge_begin; + edge < min(edge_end, edge_begin + MAIN_BOTTOMUP_MAX_EDGES); ++edge) + { + if (edge_mask && !edge_mask[edge]) + continue; + + IndexType parent_candidate = col_ind[edge]; + + if (distances[parent_candidate] == (lvl - 1)) + { + found = 1; + valid_parent = parent_candidate; + break; + } + } + + // This vertex will remain unvisited at the end of this kernel + // Explicitly say it + if (!found) + local_visited_bmap &= ~(1 << (unvisited_vertex % INT_SIZE)); //let this one unvisited + else + { + if (distances) + distances[unvisited_vertex] = lvl; + if (predecessors) + predecessors[unvisited_vertex] = valid_parent; + } + + //If we haven't found a parent and there's more edge to check + if (!found && degree > 
MAIN_BOTTOMUP_MAX_EDGES) + { + left_unvisited_off = atomicAdd(left_unvisited_cnt, (IndexType) 1); + more_to_visit = 1; + } + + } + + // + // We will separate vertices in group + // Two vertices are in the same group if represented by same int in visited_bmap + // ie u and v in same group <=> u/32 == v/32 + // + // We will now flag the head of those group (first element of each group) + // + // 1) All vertices within the same group are adjacent in the queue (cf fill_unvisited_queue) + // 2) A group is of size <= 32, so a warp will contain at least one head, and a group will be contained + // at most by two warps + + int is_head_a[1]; //CUB need an array + BlockDiscontinuity(discontinuity_temp_storage).FlagHeads(is_head_a, + visited_bmap_index, + cub::Inequality()); + int is_head = is_head_a[0]; + + // Computing the warp reduce within group + // This primitive uses the is_head flags to know where the limits of the groups are + // We use bitwise and as operator, because of the fact that 1 is the default value + // If a vertex is unvisited, we have to explicitly ask for it + int local_bmap_agg = + WarpReduce(reduce_temp_storage).HeadSegmentedReduce(local_visited_bmap, + is_head, + BitwiseAnd()); + + // We need to take care of the groups cut in two in two different warps + // Saving second part of the reduce here, then applying it on the first part bellow + // Corner case : if the first thread of the warp is a head, then this group is not cut in two + // and then we have to be neutral (for an bitwise and, it's an ~0) + if (laneid == 0) + { + local_visited_bmap_warp_head[warpid] = (is_head) ? (~0) : local_bmap_agg; + } + + //broadcasting local_visited_bmap_warp_head + __syncthreads(); + + int head_ballot = cugraph::utils::ballot(is_head); + + //As long as idx < unvisited_size, we know there's at least one head per warp + int laneid_last_head_in_warp = INT_SIZE - 1 - __clz(head_ballot); + + int is_last_head_in_warp = (laneid == laneid_last_head_in_warp); + + // if laneid == 0 && is_last_head_in_warp, it's a special case where + // a group of size 32 starts exactly at lane 0 + // in that case, nothing to do (this group is not cut by a warp delimitation) + // we also have to make sure that a warp actually exists after this one (this corner case is handled after) + if (laneid != 0 && (is_last_head_in_warp & (warpid + 1) < MAIN_BOTTOMUP_NWARPS)) + { + local_bmap_agg &= local_visited_bmap_warp_head[warpid + 1]; + } + + //Three cases : + // -> This is the first group of the block - it may be cut in two (with previous block) + // -> This is the last group of the block - same thing + // -> This group is completely contained in this block + + if (warpid == 0 && laneid == 0) + { + //The first elt of this group considered in this block is unvisited_vertex + //We know that's the case because elts are sorted in a group, and we are at laneid == 0 + //We will do an atomicOr - we have to be neutral about elts < unvisited_vertex + int iv = unvisited_vertex % INT_SIZE; // we know that this unvisited_vertex is valid + int mask = getMaskNLeftmostBitSet(INT_SIZE - iv); + local_bmap_agg &= mask; //we have to be neutral for elts < unvisited_vertex + atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); + } + else if (warpid == (MAIN_BOTTOMUP_NWARPS - 1) && + laneid >= laneid_last_head_in_warp && // We need the other ones to go in else case + idx < unvisited_size //we could be out + ) + { + //Last head of the block + //We don't know if this group is complete + + //last_v is the last unvisited_vertex of 
the group IN THIS block + //we dont know about the rest - we have to be neutral about elts > last_v + + //the destination thread of the __shfl is active + int laneid_max = min((IndexType) (WARP_SIZE - 1), + (unvisited_size - (block_off + 32 * warpid))); + IndexType last_v = cugraph::utils::shfl(unvisited_vertex, + laneid_max, + WARP_SIZE, + __activemask()); + + if (is_last_head_in_warp) + { + int ilast_v = last_v % INT_SIZE + 1; + int mask = getMaskNRightmostBitSet(ilast_v); + local_bmap_agg &= mask; //we have to be neutral for elts > last_unvisited_vertex + atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); + } + } + else + { + //group completely in block + if (is_head && idx < unvisited_size) { + visited_bmap[unvisited_vertex / INT_SIZE] = local_bmap_agg; //no atomics needed, we know everything about this int + } + } + + //Saving in frontier + + int thread_frontier_offset; + BlockScan(scan_temp_storage).ExclusiveSum(found, thread_frontier_offset); + IndexType inclusive_sum = thread_frontier_offset + found; + if (threadIdx.x == (MAIN_BOTTOMUP_DIMX - 1) && inclusive_sum) + { + frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); + } + + //1) Broadcasting frontier_common_block_offset + //2) we want to reuse the *_temp_storage + __syncthreads(); + + if (found) + new_frontier[frontier_common_block_offset + thread_frontier_offset] = unvisited_vertex; + if (more_to_visit) + left_unvisited[left_unvisited_off] = unvisited_vertex; + + } + } + + template + void bottom_up_main(IndexType *unvisited, + IndexType unvisited_size, + IndexType *left_unvisited, + IndexType *d_left_unvisited_idx, + int *visited, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_idx, + IndexType *distances, + IndexType *predecessors, + int *edge_mask, + cudaStream_t m_stream, + bool deterministic) { + dim3 grid, block; + block.x = MAIN_BOTTOMUP_DIMX; + + grid.x = min((IndexType) MAXBLOCKS, ((unvisited_size + block.x - 1)) / block.x); + + main_bottomup_kernel<<>>(unvisited, + unvisited_size, + left_unvisited, + d_left_unvisited_idx, + visited, + row_ptr, + col_ind, + lvl, + new_frontier, + new_frontier_idx, + distances, + predecessors, + edge_mask); + cudaCheckError(); + } + + // + // bottom_up_large_degree_kernel + // finishing the work started in main_bottomup_kernel for vertex with degree > MAIN_BOTTOMUP_MAX_EDGES && no parent found + // + template + __global__ void bottom_up_large_degree_kernel(IndexType *left_unvisited, + IndexType left_unvisited_size, + int *visited, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + IndexType *distances, + IndexType *predecessors, + int *edge_mask) { + + int logical_lane_id = threadIdx.x % BOTTOM_UP_LOGICAL_WARP_SIZE; + int logical_warp_id = threadIdx.x / BOTTOM_UP_LOGICAL_WARP_SIZE; + int logical_warps_per_block = blockDim.x / BOTTOM_UP_LOGICAL_WARP_SIZE; + + //Inactive threads are not a pb for __ballot (known behaviour) + for (IndexType idx = logical_warps_per_block * blockIdx.x + logical_warp_id; + idx < left_unvisited_size; + idx += gridDim.x * logical_warps_per_block) { + + //Unvisited vertices - potentially in the next frontier + IndexType v = left_unvisited[idx]; + + //Used only with symmetric graphs + //Parents are included in v's neighbors + IndexType first_i_edge = row_ptr[v] + MAIN_BOTTOMUP_MAX_EDGES; //we already have checked the first MAIN_BOTTOMUP_MAX_EDGES edges in 
find_unvisited + + IndexType end_i_edge = row_ptr[v + 1]; + + //We can have warp divergence in the next loop + //It's not a pb because the behaviour of __ballot + //is know with inactive threads + for (IndexType i_edge = first_i_edge + logical_lane_id; + i_edge < end_i_edge; + i_edge += BOTTOM_UP_LOGICAL_WARP_SIZE) { + + IndexType valid_parent = -1; + + if (!edge_mask || edge_mask[i_edge]) { + IndexType u = col_ind[i_edge]; + IndexType lvl_u = distances[u]; + + if (lvl_u == (lvl - 1)) { + valid_parent = u; + } + } + + unsigned int warp_valid_p_ballot = cugraph::utils::ballot((valid_parent != -1)); + + int logical_warp_id_in_warp = (threadIdx.x % WARP_SIZE) / BOTTOM_UP_LOGICAL_WARP_SIZE; + unsigned int mask = (1 << BOTTOM_UP_LOGICAL_WARP_SIZE) - 1; + unsigned int logical_warp_valid_p_ballot = warp_valid_p_ballot + >> (BOTTOM_UP_LOGICAL_WARP_SIZE * logical_warp_id_in_warp); + logical_warp_valid_p_ballot &= mask; + + int chosen_thread = __ffs(logical_warp_valid_p_ballot) - 1; + + if (chosen_thread == logical_lane_id) { + //Using only one valid parent (reduce bw) + IndexType off = atomicAdd(new_frontier_cnt, (IndexType) 1); + int m = 1 << (v % INT_SIZE); + atomicOr(&visited[v / INT_SIZE], m); + distances[v] = lvl; + + if (predecessors) + predecessors[v] = valid_parent; + + new_frontier[off] = v; + } + + if (logical_warp_valid_p_ballot) { + break; + } + } + + } + } + + template + void bottom_up_large(IndexType *left_unvisited, + IndexType left_unvisited_size, + int *visited, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_idx, + IndexType *distances, + IndexType *predecessors, + int *edge_mask, + cudaStream_t m_stream, + bool deterministic) { + dim3 grid, block; + block.x = LARGE_BOTTOMUP_DIMX; + grid.x = min( (IndexType) MAXBLOCKS, + ((left_unvisited_size + block.x - 1) * BOTTOM_UP_LOGICAL_WARP_SIZE) / block.x); + + bottom_up_large_degree_kernel<<>>(left_unvisited, + left_unvisited_size, + visited, + row_ptr, + col_ind, + lvl, + new_frontier, + new_frontier_idx, + distances, + predecessors, + edge_mask); + cudaCheckError(); + } + + // + // + // ------------------------------ Top down ------------------------------ + // + // + + // + // compute_bucket_offsets_kernel + // simply compute the position in the frontier corresponding all valid edges with index=TOP_DOWN_BUCKET_SIZE * k, k integer + // + + template + __global__ void compute_bucket_offsets_kernel(const IndexType *frontier_degrees_exclusive_sum, + IndexType *bucket_offsets, + const IndexType frontier_size, + IndexType total_degree) { + IndexType end = ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX + * NBUCKETS_PER_BLOCK + 1); + + for (IndexType bid = blockIdx.x * blockDim.x + threadIdx.x; + bid <= end; + bid += gridDim.x * blockDim.x) { + + IndexType eid = min(bid * TOP_DOWN_BUCKET_SIZE, total_degree - 1); + + bucket_offsets[bid] = binsearch_maxle(frontier_degrees_exclusive_sum, + eid, + (IndexType) 0, + frontier_size - 1); + + } + } + + template + void compute_bucket_offsets(IndexType *cumul, + IndexType *bucket_offsets, + IndexType frontier_size, + IndexType total_degree, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = COMPUTE_BUCKET_OFFSETS_DIMX; + + grid.x = min( (IndexType) MAXBLOCKS, + ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX + * NBUCKETS_PER_BLOCK + 1 + block.x - 1) / block.x); + + compute_bucket_offsets_kernel<<>>(cumul, + bucket_offsets, + frontier_size, + total_degree); + cudaCheckError(); + } + + 
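  // ---------------------------------------------------------------------------------------
  // Editorial sketch (not part of the original patch): the bucket offsets computed above
  // cache, once every TOP_DOWN_BUCKET_SIZE edges, the answer to "largest k such that
  // frontier_degrees_exclusive_sum[k] <= edge_index", so topdown_expand_kernel only has to
  // binary-search inside one bucket per edge. The hypothetical host helper below documents
  // that invariant, assuming exclusive_sum[0] == 0; it is illustrative only.
  template<typename IndexType>
  IndexType host_bucket_offset(const IndexType *exclusive_sum, // nf + 1 entries, exclusive_sum[0] == 0
                               IndexType frontier_size,        // nf
                               IndexType total_degree,         // mf
                               IndexType bucket) {
    IndexType eid = bucket * (IndexType) TOP_DOWN_BUCKET_SIZE;
    if (eid > total_degree - 1)
      eid = total_degree - 1;                                  // same clamp as the kernel
    IndexType low = 0, high = frontier_size - 1;               // same search range as the kernel
    while (low < high) {                                       // max k with exclusive_sum[k] <= eid
      IndexType mid = low + (high - low + 1) / 2;
      if (exclusive_sum[mid] <= eid)
        low = mid;
      else
        high = mid - 1;
    }
    return low;                                                // mirrors bucket_offsets[bucket]
  }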
// + // topdown_expand_kernel + // Read current frontier and compute new one with top down paradigm + // One thread = One edge + // To know origin of edge, we have to find where is index_edge in the values of frontier_degrees_exclusive_sum (using a binary search, max less or equal than) + // This index k will give us the origin of this edge, which is frontier[k] + // This thread will then process the (linear_idx_thread - frontier_degrees_exclusive_sum[k])-ith edge of vertex frontier[k] + // + // To process blockDim.x = TOP_DOWN_EXPAND_DIMX edges, we need to first load NBUCKETS_PER_BLOCK bucket offsets - those will help us do the binary searches + // We can load up to TOP_DOWN_EXPAND_DIMX of those bucket offsets - that way we prepare for the next MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x edges + // + // Once we have those offsets, we may still need a few values from frontier_degrees_exclusive_sum to compute exact index k + // To be able to do it, we will load the values that we need from frontier_degrees_exclusive_sum in shared memory + // We know that it will fit because we never add node with degree == 0 in the frontier, so we have an upper bound on the number of value to load (see below) + // + // We will then look which vertices are not visited yet : + // 1) if the unvisited vertex is isolated (=> degree == 0), we mark it as visited, update distances and predecessors, and move on + // 2) if the unvisited vertex has degree > 0, we add it to the "frontier_candidates" queue + // + // We then treat the candidates queue using the threadIdx.x < ncandidates + // If we are indeed the first thread to discover that vertex (result of atomicOr(visited)) + // We add it to the new frontier + // + + template + __global__ void topdown_expand_kernel(const IndexType *row_ptr, + const IndexType *col_ind, + const IndexType *frontier, + const IndexType frontier_size, + const IndexType totaldegree, + const IndexType max_items_per_thread, + const IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + const IndexType *frontier_degrees_exclusive_sum, + const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, + int *bmap, + IndexType *distances, + IndexType *predecessors, + const int *edge_mask, + const int *isolated_bmap, + bool directed) { + //BlockScan + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage scan_storage; + + // We will do a scan to know where to write in frontier + // This will contain the common offset of the block + __shared__ IndexType frontier_common_block_offset; + + __shared__ IndexType shared_buckets_offsets[TOP_DOWN_EXPAND_DIMX - NBUCKETS_PER_BLOCK + 1]; + __shared__ IndexType shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX + 1]; + + // + // Frontier candidates local queue + // We process TOP_DOWN_BATCH_SIZE vertices in parallel, so we need to be able to store everything + // We also save the predecessors here, because we will not be able to retrieve it after + // + __shared__ IndexType shared_local_new_frontier_candidates[TOP_DOWN_BATCH_SIZE + * TOP_DOWN_EXPAND_DIMX]; + __shared__ IndexType shared_local_new_frontier_predecessors[TOP_DOWN_BATCH_SIZE + * TOP_DOWN_EXPAND_DIMX]; + __shared__ IndexType block_n_frontier_candidates; + + IndexType block_offset = (blockDim.x * blockIdx.x) * max_items_per_thread; + IndexType n_items_per_thread_left = (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1) + / TOP_DOWN_EXPAND_DIMX; + + n_items_per_thread_left = min(max_items_per_thread, n_items_per_thread_left); + + for (; 
+ (n_items_per_thread_left > 0) && (block_offset < totaldegree); + + block_offset += MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x, + n_items_per_thread_left -= MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD) { + + // In this loop, we will process batch_set_size batches + IndexType nitems_per_thread = min( n_items_per_thread_left, + (IndexType) MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD); + + // Loading buckets offset (see compute_bucket_offsets_kernel) + + if (threadIdx.x < (nitems_per_thread * NBUCKETS_PER_BLOCK + 1)) + shared_buckets_offsets[threadIdx.x] = + frontier_degrees_exclusive_sum_buckets_offsets[block_offset / TOP_DOWN_BUCKET_SIZE + + threadIdx.x]; + + // We will use shared_buckets_offsets + __syncthreads(); + + // + // shared_buckets_offsets gives us a range of the possible indexes + // for edge of linear_threadx, we are looking for the value k such as + // k is the max value such as frontier_degrees_exclusive_sum[k] <= linear_threadx + // + // we have 0 <= k < frontier_size + // but we also have : + // + // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE] + // <= k + // <= frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE + 1] + // + // To find the exact value in that range, we need a few values from frontier_degrees_exclusive_sum (see below) + // We will load them here + // We will load as much as we can - if it doesn't fit we will make multiple iteration of the next loop + // Because all vertices in frontier have degree > 0, we know it will fits if left + 1 = right (see below) + + //We're going to load values in frontier_degrees_exclusive_sum for batch [left; right[ + //If it doesn't fit, --right until it does, then loop + //It is excepted to fit on the first try, that's why we start right = nitems_per_thread + + IndexType left = 0; + IndexType right = nitems_per_thread; + + while (left < nitems_per_thread) { + // + // Values that are necessary to compute the local binary searches + // We only need those with indexes between extremes indexes of buckets_offsets + // We need the next val for the binary search, hence the +1 + // + + IndexType nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] + - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; + + //If left = right + 1 we are sure to have nvalues_to_load < TOP_DOWN_EXPAND_DIMX+1 + while (nvalues_to_load > (TOP_DOWN_EXPAND_DIMX + 1)) { + --right; + + nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] + - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; + } + + IndexType nitems_per_thread_for_this_load = right - left; + + IndexType frontier_degrees_exclusive_sum_block_offset = shared_buckets_offsets[left + * NBUCKETS_PER_BLOCK]; + + if (threadIdx.x < nvalues_to_load) { + shared_frontier_degrees_exclusive_sum[threadIdx.x] = + frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + + threadIdx.x]; + } + + if (nvalues_to_load == (TOP_DOWN_EXPAND_DIMX + 1) && threadIdx.x == 0) { + shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX] = + frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + + TOP_DOWN_EXPAND_DIMX]; + } + + //shared_frontier_degrees_exclusive_sum is in shared mem, we will use it, sync + __syncthreads(); + + // Now we will process the edges + // Here each thread will process nitems_per_thread_for_this_load + for (IndexType item_index = 0; + item_index < nitems_per_thread_for_this_load; + item_index += TOP_DOWN_BATCH_SIZE) { + + // We process TOP_DOWN_BATCH_SIZE 
edge in parallel (instruction parallism) + // Reduces latency + + IndexType current_max_edge_index = min(block_offset + + (left + + nitems_per_thread_for_this_load) + * blockDim.x, + totaldegree); + + //We will need vec_u (source of the edge) until the end if we need to save the predecessors + //For others informations, we will reuse pointers on the go (nvcc does not color well the registers in that case) + + IndexType vec_u[TOP_DOWN_BATCH_SIZE]; + IndexType local_buf1[TOP_DOWN_BATCH_SIZE]; + IndexType local_buf2[TOP_DOWN_BATCH_SIZE]; + + IndexType *vec_frontier_degrees_exclusive_sum_index = &local_buf2[0]; + +#pragma unroll + for (IndexType iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + + IndexType ibatch = left + item_index + iv; + IndexType gid = block_offset + ibatch * blockDim.x + threadIdx.x; + + if (gid < current_max_edge_index) { + IndexType start_off_idx = (ibatch * blockDim.x + threadIdx.x) + / TOP_DOWN_BUCKET_SIZE; + IndexType bucket_start = shared_buckets_offsets[start_off_idx] + - frontier_degrees_exclusive_sum_block_offset; + IndexType bucket_end = shared_buckets_offsets[start_off_idx + 1] + - frontier_degrees_exclusive_sum_block_offset; + + IndexType k = binsearch_maxle(shared_frontier_degrees_exclusive_sum, + gid, + bucket_start, + bucket_end) + + frontier_degrees_exclusive_sum_block_offset; + vec_u[iv] = frontier[k]; // origin of this edge + vec_frontier_degrees_exclusive_sum_index[iv] = + frontier_degrees_exclusive_sum[k]; + } else { + vec_u[iv] = -1; + vec_frontier_degrees_exclusive_sum_index[iv] = -1; + } + + } + + IndexType *vec_row_ptr_u = &local_buf1[0]; +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType u = vec_u[iv]; + //row_ptr for this vertex origin u + vec_row_ptr_u[iv] = (u != -1) + ? row_ptr[u] + : + -1; + } + + //We won't need row_ptr after that, reusing pointer + IndexType *vec_dest_v = vec_row_ptr_u; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType thread_item_index = left + item_index + iv; + IndexType gid = block_offset + thread_item_index * blockDim.x + threadIdx.x; + + IndexType row_ptr_u = vec_row_ptr_u[iv]; + IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv]; + + if (edge_mask && !edge_mask[edge]) + row_ptr_u = -1; //disabling edge + + //Destination of this edge + vec_dest_v[iv] = (row_ptr_u != -1) + ? col_ind[edge] + : + -1; + } + + //We don't need vec_frontier_degrees_exclusive_sum_index anymore + IndexType *vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index; +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_dest_v[iv]; + vec_v_visited_bmap[iv] = (v != -1) + ? bmap[v / INT_SIZE] + : + (~0); //will look visited + } + + // From now on we will consider v as a frontier candidate + // If for some reason vec_candidate[iv] should be put in the new_frontier + // Then set vec_candidate[iv] = -1 + IndexType *vec_frontier_candidate = vec_dest_v; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + int m = 1 << (v % INT_SIZE); + + int is_visited = vec_v_visited_bmap[iv] & m; + + if (is_visited) + vec_frontier_candidate[iv] = -1; + } + + if (directed) { + //vec_v_visited_bmap is available + + IndexType *vec_is_isolated_bmap = vec_v_visited_bmap; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + vec_is_isolated_bmap[iv] = (v != -1) + ? 
isolated_bmap[v / INT_SIZE] + : + -1; + } + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + int m = 1 << (v % INT_SIZE); + int is_isolated = vec_is_isolated_bmap[iv] & m; + + //If v is isolated, we will not add it to the frontier (it's not a frontier candidate) + // 1st reason : it's useless + // 2nd reason : it will make top down algo fail + // we need each node in frontier to have a degree > 0 + // If it is isolated, we just need to mark it as visited, and save distance and predecessor here. Not need to check return value of atomicOr + + if (is_isolated && v != -1) { + int m = 1 << (v % INT_SIZE); + atomicOr(&bmap[v / INT_SIZE], m); + if (distances) + distances[v] = lvl; + + if (predecessors) + predecessors[v] = vec_u[iv]; + + //This is no longer a candidate, neutralize it + vec_frontier_candidate[iv] = -1; + } + + } + } + + //Number of successor candidate hold by this thread + IndexType thread_n_frontier_candidates = 0; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + if (v != -1) + ++thread_n_frontier_candidates; + } + + // We need to have all nfrontier_candidates to be ready before doing the scan + __syncthreads(); + + // We will put the frontier candidates in a local queue + // Computing offsets + IndexType thread_frontier_candidate_offset = 0; //offset inside block + BlockScan(scan_storage).ExclusiveSum(thread_n_frontier_candidates, + thread_frontier_candidate_offset); + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + //May have bank conflicts + IndexType frontier_candidate = vec_frontier_candidate[iv]; + + if (frontier_candidate != -1) { + shared_local_new_frontier_candidates[thread_frontier_candidate_offset] = + frontier_candidate; + shared_local_new_frontier_predecessors[thread_frontier_candidate_offset] = + vec_u[iv]; + ++thread_frontier_candidate_offset; + } + } + + if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { + //No need to add nsuccessor_candidate, even if its an + //exclusive sum + //We incremented the thread_frontier_candidate_offset + block_n_frontier_candidates = thread_frontier_candidate_offset; + } + + //broadcast block_n_frontier_candidates + __syncthreads(); + + IndexType naccepted_vertices = 0; + //We won't need vec_frontier_candidate after that + IndexType *vec_frontier_accepted_vertex = vec_frontier_candidate; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + const int idx_shared = iv * blockDim.x + threadIdx.x; + vec_frontier_accepted_vertex[iv] = -1; + + if (idx_shared < block_n_frontier_candidates) { + IndexType v = shared_local_new_frontier_candidates[idx_shared]; //popping queue + int m = 1 << (v % INT_SIZE); + int q = atomicOr(&bmap[v / INT_SIZE], m); //atomicOr returns old + + if (!(m & q)) { //if this thread was the first to discover this node + if (distances) + distances[v] = lvl; + + if (predecessors) { + IndexType pred = shared_local_new_frontier_predecessors[idx_shared]; + predecessors[v] = pred; + } + + vec_frontier_accepted_vertex[iv] = v; + ++naccepted_vertices; + } + } + + } + + //We need naccepted_vertices to be ready + __syncthreads(); + + IndexType thread_new_frontier_offset; + + BlockScan(scan_storage).ExclusiveSum(naccepted_vertices, thread_new_frontier_offset); + + if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { + + IndexType inclusive_sum = thread_new_frontier_offset + naccepted_vertices; + //for this thread, thread_new_frontier_offset + has_successor 
(exclusive sum) + if (inclusive_sum) + frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); + } + + //Broadcasting frontier_common_block_offset + __syncthreads(); + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + const int idx_shared = iv * blockDim.x + threadIdx.x; + if (idx_shared < block_n_frontier_candidates) { + + IndexType new_frontier_vertex = vec_frontier_accepted_vertex[iv]; + + if (new_frontier_vertex != -1) { + IndexType off = frontier_common_block_offset + thread_new_frontier_offset++; + new_frontier[off] = new_frontier_vertex; + } + } + } + + } + + //We need to keep shared_frontier_degrees_exclusive_sum coherent + __syncthreads(); + + //Preparing for next load + left = right; + right = nitems_per_thread; + } + + //we need to keep shared_buckets_offsets coherent + __syncthreads(); + } + + } + + template + void frontier_expand(const IndexType *row_ptr, + const IndexType *col_ind, + const IndexType *frontier, + const IndexType frontier_size, + const IndexType totaldegree, + const IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + const IndexType *frontier_degrees_exclusive_sum, + const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, + int *visited_bmap, + IndexType *distances, + IndexType *predecessors, + const int *edge_mask, + const int *isolated_bmap, + bool directed, + cudaStream_t m_stream, + bool deterministic) { + if (!totaldegree) + return; + + dim3 block; + block.x = TOP_DOWN_EXPAND_DIMX; + + IndexType max_items_per_thread = (totaldegree + MAXBLOCKS * block.x - 1) + / (MAXBLOCKS * block.x); + + dim3 grid; + grid.x = min( (totaldegree + max_items_per_thread * block.x - 1) + / (max_items_per_thread * block.x), + (IndexType) MAXBLOCKS); + + topdown_expand_kernel<<>>(row_ptr, + col_ind, + frontier, + frontier_size, + totaldegree, + max_items_per_thread, + lvl, + new_frontier, + new_frontier_cnt, + frontier_degrees_exclusive_sum, + frontier_degrees_exclusive_sum_buckets_offsets, + visited_bmap, + distances, + predecessors, + edge_mask, + isolated_bmap, + directed); + cudaCheckError(); + } + + template + __global__ void flag_isolated_vertices_kernel(IndexType n, + int *isolated_bmap, + const IndexType *row_ptr, + IndexType *degrees, + IndexType *nisolated) { + typedef cub::BlockLoad BlockLoad; + typedef cub::BlockStore BlockStore; + typedef cub::BlockReduce BlockReduce; + typedef cub::WarpReduce WarpReduce; + + __shared__ typename BlockLoad::TempStorage load_temp_storage; + __shared__ typename BlockStore::TempStorage store_temp_storage; + __shared__ typename BlockReduce::TempStorage block_reduce_temp_storage; + + __shared__ typename WarpReduce::TempStorage warp_reduce_temp_storage[FLAG_ISOLATED_VERTICES_DIMX + / FLAG_ISOLATED_VERTICES_THREADS_PER_INT]; + + __shared__ IndexType row_ptr_tail[FLAG_ISOLATED_VERTICES_DIMX]; + + for (IndexType block_off = FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + * (blockDim.x * blockIdx.x); + block_off < n; + block_off += FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * gridDim.x)) { + + IndexType thread_off = block_off + + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * threadIdx.x; + IndexType last_node_thread = thread_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1; + + IndexType thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; + IndexType block_valid_items = n - block_off + 1; //+1, we need row_ptr[last_node+1] + + BlockLoad(load_temp_storage).Load(row_ptr + block_off, + thread_row_ptr, + block_valid_items, + -1); + + //To compute 4 
degrees, we need 5 values of row_ptr + //Saving the "5th" value in shared memory for previous thread to use + if (threadIdx.x > 0) { + row_ptr_tail[threadIdx.x - 1] = thread_row_ptr[0]; + } + + //If this is the last thread, it needs to load its row ptr tail value + if (threadIdx.x == (FLAG_ISOLATED_VERTICES_DIMX - 1) && last_node_thread < n) { + row_ptr_tail[threadIdx.x] = row_ptr[last_node_thread + 1]; + + } + __syncthreads(); // we may reuse temp_storage + + int local_isolated_bmap = 0; + + IndexType imax = (n - thread_off); + + IndexType local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; + +#pragma unroll + for (int i = 0; i < (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1); ++i) { + IndexType degree = local_degree[i] = thread_row_ptr[i + 1] - thread_row_ptr[i]; + + if (i < imax) + local_isolated_bmap |= ((degree == 0) << i); + } + + if (last_node_thread < n) { + IndexType degree = local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1] = + row_ptr_tail[threadIdx.x] + - thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1]; + + local_isolated_bmap |= ((degree == 0) + << (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1)); + + } + + local_isolated_bmap <<= (thread_off % INT_SIZE); + + IndexType local_nisolated = __popc(local_isolated_bmap); + + //We need local_nisolated and local_isolated_bmap to be ready for next steps + __syncthreads(); + + IndexType total_nisolated = BlockReduce(block_reduce_temp_storage).Sum(local_nisolated); + + if (threadIdx.x == 0 && total_nisolated) { + atomicAdd(nisolated, total_nisolated); + } + + int logicalwarpid = threadIdx.x / FLAG_ISOLATED_VERTICES_THREADS_PER_INT; + + //Building int for bmap + int int_aggregate_isolated_bmap = + WarpReduce(warp_reduce_temp_storage[logicalwarpid]).Reduce(local_isolated_bmap, + BitwiseOr()); + + int is_head_of_visited_int = + ((threadIdx.x % (FLAG_ISOLATED_VERTICES_THREADS_PER_INT)) == 0); + if (is_head_of_visited_int) { + isolated_bmap[thread_off / INT_SIZE] = int_aggregate_isolated_bmap; + } + + BlockStore(store_temp_storage).Store(degrees + block_off, local_degree, block_valid_items); + } + } + + template + void flag_isolated_vertices(IndexType n, + int *isolated_bmap, + const IndexType *row_ptr, + IndexType *degrees, + IndexType *nisolated, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = FLAG_ISOLATED_VERTICES_DIMX; + + grid.x = min( (IndexType) MAXBLOCKS, + (n / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + 1 + block.x - 1) / block.x); + + flag_isolated_vertices_kernel<<>>(n, + isolated_bmap, + row_ptr, + degrees, + nisolated); + cudaCheckError(); + } + + // + // + // + // Some utils functions + // + // + + //Creates CUB data for graph size n + template + void cub_exclusive_sum_alloc(IndexType n, void*& d_temp_storage, size_t &temp_storage_bytes) { + // Determine temporary device storage requirements for exclusive prefix scan + d_temp_storage = NULL; + temp_storage_bytes = 0; + IndexType *d_in = NULL, *d_out = NULL; + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, n); + // Allocate temporary storage for exclusive prefix scan + cudaStream_t stream{nullptr}; + ALLOC_TRY(&d_temp_storage, temp_storage_bytes, stream); + } + + template + __global__ void fill_kernel(IndexType *vec, IndexType n, IndexType val) { + for (IndexType u = blockDim.x * blockIdx.x + threadIdx.x; + u < n; + u += gridDim.x * blockDim.x) + vec[u] = val; + + } + + template + void fill(IndexType *vec, IndexType n, IndexType val, cudaStream_t m_stream) { + dim3 grid, block; + block.x = 256; + 
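// Note (editorial): 256 threads per block; grid.x is clamped to MAXBLOCKS below and
// fill_kernel grid-strides over n, so vectors larger than one grid are still fully covered.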
grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); + fill_kernel<<>>(vec, n, val); + cudaCheckError(); + } + + template + __global__ void set_frontier_degree_kernel(IndexType *frontier_degree, + IndexType *frontier, + const IndexType *degree, + IndexType n) { + for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x; + idx < n; + idx += gridDim.x * blockDim.x) { + IndexType u = frontier[idx]; + frontier_degree[idx] = degree[u]; + } + } + + template + void set_frontier_degree(IndexType *frontier_degree, + IndexType *frontier, + const IndexType *degree, + IndexType n, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = 256; + grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); + set_frontier_degree_kernel<<>>(frontier_degree, + frontier, + degree, + n); + cudaCheckError(); + } + + template + void exclusive_sum(void *d_temp_storage, + size_t temp_storage_bytes, + IndexType *d_in, + IndexType *d_out, + IndexType num_items, + cudaStream_t m_stream) { + if (num_items <= 1) + return; //DeviceScan fails if n==1 + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + m_stream); + } + + template + __global__ void fill_vec_kernel(T *vec, T n, T val) { + for (T idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < n; + idx += blockDim.x * gridDim.x) + vec[idx] = val; + } + + template + void fill_vec(T *vec, T n, T val, cudaStream_t stream) { + dim3 grid, block; + block.x = 256; + grid.x = (n + block.x - 1) / block.x; + + fill_vec_kernel<<>>(vec, n, val); + cudaCheckError(); + } +} +// diff --git a/cpp/src/traversal/nvgraph_sssp.cu b/cpp/src/traversal/nvgraph_sssp.cu new file mode 100644 index 00000000000..fdccfa23c91 --- /dev/null +++ b/cpp/src/traversal/nvgraph_sssp.cu @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+/** ---------------------------------------------------------------------------*
+ * @brief Wrapper functions for Nvgraph sssp
+ *
+ * @file nvgraph_sssp.cu
+ * ---------------------------------------------------------------------------**/
+
+#include
+#include
+#include
+#include "utilities/error_utils.h"
+#include "converters/nvgraph.cuh"
+#include
+
+gdf_error gdf_sssp_nvgraph(gdf_graph *gdf_G,
+                           const int *source_vert,
+                           gdf_column *sssp_distances) {
+  GDF_REQUIRE(gdf_G != nullptr, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(source_vert != nullptr, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(sssp_distances != nullptr, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(sssp_distances->data != nullptr, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(!sssp_distances->valid, GDF_VALIDITY_UNSUPPORTED);
+  GDF_REQUIRE(sssp_distances->size > 0, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(*source_vert >= 0, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(*source_vert < sssp_distances->size, GDF_INVALID_API_CALL);
+
+  // init nvgraph
+  // TODO : time this call
+  nvgraphHandle_t nvg_handle = 0;
+  nvgraphGraphDescr_t nvgraph_G = 0;
+  cudaDataType_t settype;
+
+  NVG_TRY(nvgraphCreate(&nvg_handle));
+  GDF_TRY(gdf_createGraph_nvgraph(nvg_handle, gdf_G, &nvgraph_G, true));
+
+  int sssp_index = 0;
+  int weight_index = 0;
+  rmm::device_vector<float> d_val;
+
+  cudaStream_t stream{nullptr};
+
+  if (gdf_G->transposedAdjList->edge_data == nullptr) {
+    // use a fp32 vector [1,...,1]
+    settype = CUDA_R_32F;
+    d_val.resize(gdf_G->transposedAdjList->indices->size);
+    thrust::fill(rmm::exec_policy(stream)->on(stream), d_val.begin(), d_val.end(), 1.0);
+    NVG_TRY(nvgraphAttachEdgeData(nvg_handle,
+                                  nvgraph_G,
+                                  weight_index,
+                                  settype,
+                                  (void *) thrust::raw_pointer_cast(d_val.data())));
+  }
+  else {
+    switch (gdf_G->transposedAdjList->edge_data->dtype) {
+      case GDF_FLOAT32:
+        settype = CUDA_R_32F;
+        break;
+      case GDF_FLOAT64:
+        settype = CUDA_R_64F;
+        break;
+      default:
+        return GDF_UNSUPPORTED_DTYPE;
+    }
+  }
+
+  NVG_TRY(nvgraphAttachVertexData(nvg_handle, nvgraph_G, 0, settype, sssp_distances->data));
+
+  NVG_TRY(nvgraphSssp(nvg_handle, nvgraph_G, weight_index, source_vert, sssp_index));
+
+  NVG_TRY(nvgraphDestroyGraphDescr(nvg_handle, nvgraph_G));
+  NVG_TRY(nvgraphDestroy(nvg_handle));
+
+  return GDF_SUCCESS;
+}
diff --git a/cpp/src/two_hop_neighbors.cu b/cpp/src/traversal/two_hop_neighbors.cu
similarity index 87%
rename from cpp/src/two_hop_neighbors.cu
rename to cpp/src/traversal/two_hop_neighbors.cu
index 6a38d46504b..de8bd9bfb0c 100644
--- a/cpp/src/two_hop_neighbors.cu
+++ b/cpp/src/traversal/two_hop_neighbors.cu
@@ -22,6 +22,7 @@
 #include "two_hop_neighbors.cuh"
 #include "utilities/error_utils.h"
 #include
+#include
 #include
 #include
 
@@ -38,27 +39,28 @@ gdf_error gdf_get_two_hop_neighbors_impl(IndexType num_verts,
     IndexType num_edges;
     cudaMemcpy(&num_edges, &offsets[num_verts], sizeof(IndexType), cudaMemcpyDefault);
 
+    cudaStream_t stream {nullptr};
+
     // Allocate memory for temporary stuff
     IndexType *exsum_degree = nullptr;
     IndexType *first_pair = nullptr;
     IndexType *second_pair = nullptr;
     IndexType *block_bucket_offsets = nullptr;
 
-    ALLOC_MANAGED_TRY(&exsum_degree, sizeof(IndexType) * (num_edges + 1), nullptr);
+    ALLOC_TRY(&exsum_degree, sizeof(IndexType) * (num_edges + 1), stream);
 
     // Find the degree of the out vertex of each edge
     degree_iterator<IndexType> deg_it(offsets);
     deref_functor<degree_iterator<IndexType>, IndexType> deref(deg_it);
-    rmm_temp_allocator allocator(nullptr);
-    thrust::fill(thrust::cuda::par(allocator).on(nullptr), exsum_degree, exsum_degree + 1, 0);
-    thrust::transform(thrust::cuda::par(allocator).on(nullptr),
+    thrust::fill(rmm::exec_policy(stream)->on(stream), exsum_degree, exsum_degree + 1, 0);
+    thrust::transform(rmm::exec_policy(stream)->on(stream),
                       indices,
                       indices + num_edges,
                       exsum_degree + 1,
                       deref);
 
     // Take the inclusive sum of the degrees
-    thrust::inclusive_scan(thrust::cuda::par(allocator).on(nullptr),
+    thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream),
                            exsum_degree + 1,
                            exsum_degree + num_edges + 1,
                            exsum_degree + 1);
@@ -68,12 +70,12 @@ gdf_error gdf_get_two_hop_neighbors_impl(IndexType num_verts,
     cudaMemcpy(&output_size, &exsum_degree[num_edges], sizeof(IndexType), cudaMemcpyDefault);
 
     // Allocate memory for the scattered output
-    ALLOC_MANAGED_TRY(&second_pair, sizeof(IndexType) * output_size, nullptr);
-    ALLOC_MANAGED_TRY(&first_pair, sizeof(IndexType) * output_size, nullptr);
+    ALLOC_TRY(&second_pair, sizeof(IndexType) * output_size, stream);
+    ALLOC_TRY(&first_pair, sizeof(IndexType) * output_size, stream);
 
     // Figure out number of blocks and allocate memory for block bucket offsets
     IndexType num_blocks = (output_size + TWO_HOP_BLOCK_SIZE - 1) / TWO_HOP_BLOCK_SIZE;
-    ALLOC_MANAGED_TRY(&block_bucket_offsets, sizeof(IndexType) * (num_blocks + 1), nullptr);
+    ALLOC_TRY(&block_bucket_offsets, sizeof(IndexType) * (num_blocks + 1), stream);
 
     // Compute the block bucket offsets
     dim3 grid, block;
@@ -100,18 +102,18 @@ gdf_error gdf_get_two_hop_neighbors_impl(IndexType num_verts,
     // Remove duplicates and self pairings
     auto tuple_start = thrust::make_zip_iterator(thrust::make_tuple(first_pair, second_pair));
     auto tuple_end = tuple_start + output_size;
-    thrust::sort(thrust::cuda::par(allocator).on(nullptr), tuple_start, tuple_end);
-    tuple_end = thrust::copy_if(thrust::cuda::par(allocator).on(nullptr),
+    thrust::sort(rmm::exec_policy(stream)->on(stream), tuple_start, tuple_end);
+    tuple_end = thrust::copy_if(rmm::exec_policy(stream)->on(stream),
                                 tuple_start,
                                 tuple_end,
                                 tuple_start,
                                 self_loop_flagger<IndexType>());
-    tuple_end = thrust::unique(thrust::cuda::par(allocator).on(nullptr), tuple_start, tuple_end);
+    tuple_end = thrust::unique(rmm::exec_policy(stream)->on(stream), tuple_start, tuple_end);
 
     // Get things ready to return
     outputSize = tuple_end - tuple_start;
-    ALLOC_MANAGED_TRY(first, sizeof(IndexType) * outputSize, nullptr);
-    ALLOC_MANAGED_TRY(second, sizeof(IndexType) * outputSize, nullptr);
+    ALLOC_TRY(first, sizeof(IndexType) * outputSize, nullptr);
+    ALLOC_TRY(second, sizeof(IndexType) * outputSize, nullptr);
     cudaMemcpy(*first, first_pair, sizeof(IndexType) * outputSize, cudaMemcpyDefault);
     cudaMemcpy(*second, second_pair, sizeof(IndexType) * outputSize, cudaMemcpyDefault);
diff --git a/cpp/src/two_hop_neighbors.cuh b/cpp/src/traversal/two_hop_neighbors.cuh
similarity index 100%
rename from cpp/src/two_hop_neighbors.cuh
rename to cpp/src/traversal/two_hop_neighbors.cuh
diff --git a/cpp/src/utilities/degree.cu b/cpp/src/utilities/degree.cu
new file mode 100644
index 00000000000..5f84f68feab
--- /dev/null
+++ b/cpp/src/utilities/degree.cu
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include
+#include "utilities/error_utils.h"
+#include "utilities/graph_utils.cuh"
+
+gdf_error gdf_degree_impl(int n, int e, gdf_column* col_ptr, gdf_column* degree, bool offsets) {
+  if (offsets == true) {
+    dim3 nthreads, nblocks;
+    nthreads.x = min(n, CUDA_MAX_KERNEL_THREADS);
+    nthreads.y = 1;
+    nthreads.z = 1;
+    nblocks.x = min((n + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS);
+    nblocks.y = 1;
+    nblocks.z = 1;
+
+    switch (col_ptr->dtype) {
+      case GDF_INT32: cugraph::degree_offsets<int, int><<<nblocks, nthreads>>>(n, e, static_cast<int*>(col_ptr->data), static_cast<int*>(degree->data)); break;
+      default: return GDF_UNSUPPORTED_DTYPE;
+    }
+  }
+  else {
+    dim3 nthreads, nblocks;
+    nthreads.x = min(e, CUDA_MAX_KERNEL_THREADS);
+    nthreads.y = 1;
+    nthreads.z = 1;
+    nblocks.x = min((e + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS);
+    nblocks.y = 1;
+    nblocks.z = 1;
+
+    switch (col_ptr->dtype) {
+      case GDF_INT32: cugraph::degree_coo<int, int><<<nblocks, nthreads>>>(n, e, static_cast<int*>(col_ptr->data), static_cast<int*>(degree->data)); break;
+      default: return GDF_UNSUPPORTED_DTYPE;
+    }
+  }
+  return GDF_SUCCESS;
+}
+
+
+gdf_error gdf_degree(gdf_graph *graph, gdf_column *degree, int x) {
+  // Calculates the degree of all vertices of the graph
+  // x = 0: in+out degree
+  // x = 1: in-degree
+  // x = 2: out-degree
+  GDF_REQUIRE(graph->adjList != nullptr || graph->transposedAdjList != nullptr, GDF_INVALID_API_CALL);
+  int n;
+  int e;
+  if (graph->adjList != nullptr) {
+    n = graph->adjList->offsets->size - 1;
+    e = graph->adjList->indices->size;
+  }
+  else {
+    n = graph->transposedAdjList->offsets->size - 1;
+    e = graph->transposedAdjList->indices->size;
+  }
+
+  if (x != 1) {
+    // Computes out-degree for x=0 and x=2
+    if (graph->adjList)
+      gdf_degree_impl(n, e, graph->adjList->offsets, degree, true);
+    else
+      gdf_degree_impl(n, e, graph->transposedAdjList->indices, degree, false);
+  }
+
+  if (x != 2) {
+    // Computes in-degree for x=0 and x=1
+    if (graph->adjList)
+      gdf_degree_impl(n, e, graph->adjList->indices, degree, false);
+    else
+      gdf_degree_impl(n, e, graph->transposedAdjList->offsets, degree, true);
+  }
+  return GDF_SUCCESS;
+}
diff --git a/cpp/src/utilities/error_utils.h b/cpp/src/utilities/error_utils.h
index 6b8416da844..c50feca3a12 100644
--- a/cpp/src/utilities/error_utils.h
+++ b/cpp/src/utilities/error_utils.h
@@ -23,9 +23,20 @@ #define GDF_ERRORUTILS_H
 #include
+
 #include
 #include
+#include
+#include "nvgraph_error_utils.h"
+
+#define cudaCheckError() {                                                          \
+  cudaError_t e = cudaGetLastError();                                               \
+  if (e != cudaSuccess) {                                                           \
+    std::cerr << "Cuda failure: " << cudaGetErrorString(e) << " at: " << __FILE__ << ':' << __LINE__ << std::endl; \
+  }                                                                                 \
+}
+
 #define CUDA_TRY( call )                                                \
 {                                                                       \
   cudaError_t cudaStatus = call;                                        \
diff --git a/cpp/src/utilities/graph_utils.cuh b/cpp/src/utilities/graph_utils.cuh
new file mode 100644
index 00000000000..a5331ef6bb4
--- /dev/null
+++ b/cpp/src/utilities/graph_utils.cuh
@@ -0,0 +1,510 @@
+/*
+ * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ */
+
+// Internal helper functions
+// Author: Alex Fender afender@nvidia.com
+#pragma once
+
+#include
+#include
+//#include
+//#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include "utilities/error_utils.h"
+
+#define USE_CG 1
+//#define DEBUG 1
+
+namespace cugraph
+{
+
+#define CUDA_MAX_BLOCKS 65535
+#define CUDA_MAX_KERNEL_THREADS 256 //kernel will launch at most 256 threads per block
+#define DEFAULT_MASK 0xffffffff
+#define US
+
+  template <typename T>
+  static __device__ __forceinline__ T shfl_up(T r, int offset, int bound = 32, int mask = DEFAULT_MASK) {
+#if __CUDA_ARCH__ >= 300
+#if USE_CG
+    return __shfl_up_sync(mask, r, offset, bound);
+#else
+    return __shfl_up(r, offset, bound);
+#endif
+#else
+    return 0.0f;
+#endif
+  }
+
+  template <typename T>
+  static __device__ __forceinline__ T shfl(T r, int lane, int bound = 32, int mask = DEFAULT_MASK) {
+#if __CUDA_ARCH__ >= 300
+#if USE_CG
+    return __shfl_sync(mask, r, lane, bound);
+#else
+    return __shfl(r, lane, bound);
+#endif
+#else
+    return 0.0f;
+#endif
+  }
+
+  template <typename IdxType, typename ValType>
+  __inline__ __device__
+  ValType parallel_prefix_sum(IdxType n, IdxType *ind, ValType *w) {
+    IdxType i, j, mn;
+    ValType v, last;
+    ValType sum = 0.0;
+    bool valid;
+
+    //Parallel prefix sum (using __shfl)
+    mn = (((n + blockDim.x - 1) / blockDim.x) * blockDim.x); //n in multiple of blockDim.x
+    for (i = threadIdx.x; i < mn; i += blockDim.x) {
+      //All threads (especially the last one) must always participate
+      //in the shfl instruction, otherwise their sum will be undefined.
+      //So, the loop stopping condition is based on multiple of n in loop increments,
+      //so that all threads enter into the loop and inside we make sure we do not
+      //read out of bounds memory checking for the actual size n.
+
+      //check if the thread is valid
+      valid = i < n;
+
+      //Notice that the last thread is used to propagate the prefix sum.
+      //For all the threads, in the first iteration the last is 0, in the following
+      //iterations it is the value at the last thread of the previous iterations.
+
+      //get the value of the last thread
+      last = shfl(sum, blockDim.x - 1, blockDim.x);
+
+      //if you are valid read the value from memory, otherwise set your value to 0
+      sum = (valid) ? w[ind[i]] : 0.0;
+
+      //do prefix sum (of size warpSize=blockDim.x =< 32)
+      for (j = 1; j < blockDim.x; j *= 2) {
+        v = shfl_up(sum, j, blockDim.x);
+        if (threadIdx.x >= j)
+          sum += v;
+      }
+      //shift by last
+      sum += last;
+      //notice that no __threadfence or __syncthreads are needed in this implementation
+    }
+    //get the value of the last thread (to all threads)
+    last = shfl(sum, blockDim.x - 1, blockDim.x);
+
+    return last;
+  }
+
+//dot
+  template <typename T>
+  T dot(size_t n, T* x, T* y) {
+    cudaStream_t stream {nullptr};
+    T result = thrust::inner_product(rmm::exec_policy(stream)->on(stream),
+                                     thrust::device_pointer_cast(x),
+                                     thrust::device_pointer_cast(x + n),
+                                     thrust::device_pointer_cast(y),
+                                     0.0f);
+    cudaCheckError();
+    return result;
+  }
+
+//axpy
+  template <typename T>
+  struct axpy_functor : public thrust::binary_function<T, T, T> {
+    const T a;
+    axpy_functor(T _a) :
+        a(_a) {
+    }
+    __host__ __device__
+    T operator()(const T& x, const T& y) const {
+      return a * x + y;
+    }
+  };
+
+  template <typename T>
+  void axpy(size_t n, T a, T* x, T* y) {
+    cudaStream_t stream {nullptr};
+    thrust::transform(rmm::exec_policy(stream)->on(stream),
+                      thrust::device_pointer_cast(x),
+                      thrust::device_pointer_cast(x + n),
+                      thrust::device_pointer_cast(y),
+                      thrust::device_pointer_cast(y),
+                      axpy_functor<T>(a));
+    cudaCheckError();
+  }
+
+//norm
+  template <typename T>
+  struct square {
+    __host__ __device__
+    T operator()(const T& x) const {
+      return x * x;
+    }
+  };
+
+  template <typename T>
+  T nrm2(size_t n, T* x) {
+    cudaStream_t stream {nullptr};
+    T init = 0;
+    T result = std::sqrt(thrust::transform_reduce(rmm::exec_policy(stream)->on(stream),
+                                                  thrust::device_pointer_cast(x),
+                                                  thrust::device_pointer_cast(x + n),
+                                                  square<T>(),
+                                                  init,
+                                                  thrust::plus<T>()));
+    cudaCheckError();
+    return result;
+  }
+
+  template <typename T>
+  T nrm1(size_t n, T* x) {
+    cudaStream_t stream {nullptr};
+    T result = thrust::reduce(rmm::exec_policy(stream)->on(stream),
+                              thrust::device_pointer_cast(x),
+                              thrust::device_pointer_cast(x + n));
+    cudaCheckError();
+    return result;
+  }
+
+  template <typename T>
+  void scal(size_t n, T val, T* x) {
+    cudaStream_t stream {nullptr};
+    thrust::transform(rmm::exec_policy(stream)->on(stream),
+                      thrust::device_pointer_cast(x),
+                      thrust::device_pointer_cast(x + n),
+                      thrust::make_constant_iterator(val),
+                      thrust::device_pointer_cast(x),
+                      thrust::multiplies<T>());
+    cudaCheckError();
+  }
+
+  template <typename T>
+  void addv(size_t n, T val, T* x) {
+    cudaStream_t stream {nullptr};
+    thrust::transform(rmm::exec_policy(stream)->on(stream),
+                      thrust::device_pointer_cast(x),
+                      thrust::device_pointer_cast(x + n),
+                      thrust::make_constant_iterator(val),
+                      thrust::device_pointer_cast(x),
+                      thrust::plus<T>());
+    cudaCheckError();
+  }
+
+  template <typename T>
+  void fill(size_t n, T* x, T value) {
+    cudaStream_t stream {nullptr};
+    thrust::fill(rmm::exec_policy(stream)->on(stream),
+                 thrust::device_pointer_cast(x),
+                 thrust::device_pointer_cast(x + n), value);
+    cudaCheckError();
+  }
+
+  template <typename T>
+  void printv(size_t n, T* vec, int offset) {
+    thrust::device_ptr<T> dev_ptr(vec);
+    std::cout.precision(15);
+    std::cout << "sample size = " << n << ", offset = " << offset << std::endl;
+    thrust::copy(dev_ptr + offset, dev_ptr + offset + n, std::ostream_iterator<T>(std::cout, " ")); //Assume no RMM dependency; TODO: check / test (potential BUG !!!!!)
+    cudaCheckError();
+    std::cout << std::endl;
+  }
+
+  template <typename T>
+  void copy(size_t n, T *x, T *res) {
+    thrust::device_ptr<T> dev_ptr(x);
+    thrust::device_ptr<T> res_ptr(res);
+    cudaStream_t stream {nullptr};
+    thrust::copy_n(rmm::exec_policy(stream)->on(stream), dev_ptr, n, res_ptr);
+    cudaCheckError();
+  }
+
+  template <typename T>
+  struct is_zero {
+    __host__ __device__
+    bool operator()(const T x) {
+      return x == 0;
+    }
+  };
+
+  template <typename T>
+  struct dangling_functor : public thrust::unary_function<T, T> {
+    const T val;
+    dangling_functor(T _val) :
+        val(_val) {
+    }
+    __host__ __device__
+    T operator()(const T& x) const {
+      return val + x;
+    }
+  };
+
+  template <typename T>
+  void update_dangling_nodes(size_t n, T* dangling_nodes, T damping_factor) {
+    cudaStream_t stream {nullptr};
+    thrust::transform_if(rmm::exec_policy(stream)->on(stream),
+                         thrust::device_pointer_cast(dangling_nodes),
+                         thrust::device_pointer_cast(dangling_nodes + n),
+                         thrust::device_pointer_cast(dangling_nodes),
+                         dangling_functor<T>(1.0 - damping_factor),
+                         is_zero<T>());
+    cudaCheckError();
+  }
+
+//google matrix kernels
+  template <typename IndexType, typename ValueType>
+  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
+  degree_coo(const IndexType n, const IndexType e, const IndexType *ind, ValueType *degree) {
+    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x)
+      atomicAdd(&degree[ind[i]], (ValueType)1.0);
+  }
+
+  template <typename IndexType, typename ValueType>
+  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
+  flag_leafs_kernel(const size_t n, const IndexType *degree, ValueType *bookmark) {
+    for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
+      if (degree[i] == 0)
+        bookmark[i] = 1.0;
+  }
+
+  template <typename IndexType, typename ValueType>
+  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
+  degree_offsets(const IndexType n, const IndexType e, const IndexType *ind, ValueType *degree) {
+    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
+      degree[i] += ind[i+1] - ind[i];
+  }
+
+  template <typename FromType, typename ToType>
+  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
+  type_convert(FromType* array, int n) {
+    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) {
+      ToType val = array[i];
+      ToType* vals = (ToType*)array;
+      vals[i] = val;
+    }
+  }
+
+  template <typename IndexType, typename ValueType>
+  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
+  equi_prob3(const IndexType n,
+             const IndexType e,
+             const IndexType *csrPtr,
+             const IndexType *csrInd,
+             ValueType *val,
+             IndexType *degree) {
+    int j, row, col;
+    for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) {
+      for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; j < csrPtr[row + 1];
+          j += gridDim.y * blockDim.y) {
+        col = csrInd[j];
+        val[j] = 1.0 / degree[col];
+        //val[j] = 999;
+      }
+    }
+  }
+
+  template <typename IndexType, typename ValueType>
+  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
+  equi_prob2(const IndexType n,
+             const IndexType e,
+             const IndexType *csrPtr,
+             const IndexType *csrInd,
+             ValueType *val,
+             IndexType *degree) {
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
+    if (row < n) {
+      int row_begin = csrPtr[row];
+      int row_end = csrPtr[row + 1];
+      int col;
+      for (int i = row_begin; i < row_end; i++) {
+        col = csrInd[i];
+        val[i] = 1.0 / degree[col];
+      }
+    }
+  }
+
+// compute the H^T values for an already transposed adjacency matrix, leveraging coo info
+  template <typename IndexType, typename ValueType>
+  void HT_matrix_csc_coo(const IndexType n,
+                         const IndexType e,
+                         const IndexType *csrPtr,
+                         const IndexType *csrInd,
+                         ValueType *val,
+                         ValueType *bookmark) {
+    IndexType *degree;
+    cudaStream_t stream { nullptr };
+    ALLOC_TRY((void**)&degree, sizeof(IndexType) * n, stream);
+    cudaMemset(degree, 0, sizeof(IndexType) * n);
+
+    dim3 nthreads, nblocks;
+    nthreads.x = min(e, CUDA_MAX_KERNEL_THREADS);
+    nthreads.y = 1;
+    nthreads.z = 1;
+    nblocks.x = min((e + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS);
+    nblocks.y = 1;
+    nblocks.z = 1;
+    degree_coo<IndexType, IndexType><<<nblocks, nthreads>>>(n, e, csrInd, degree);
+    cudaCheckError();
+
+    int y = 4;
+    nthreads.x = 32 / y;
+    nthreads.y = y;
+    nthreads.z = 8;
+    nblocks.x = 1;
+    nblocks.y = 1;
+    nblocks.z = min((n + nthreads.z - 1) / nthreads.z, CUDA_MAX_BLOCKS); //1;
+    equi_prob3<IndexType, ValueType><<<nblocks, nthreads>>>(n, e, csrPtr, csrInd, val, degree);
+    cudaCheckError();
+
+    ValueType a = 0.0;
+    fill(n, bookmark, a);
+    cudaCheckError();
+
+    nthreads.x = min(n, CUDA_MAX_KERNEL_THREADS);
+    nthreads.y = 1;
+    nthreads.z = 1;
+    nblocks.x = min((n + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS);
+    nblocks.y = 1;
+    nblocks.z = 1;
+    flag_leafs_kernel<IndexType, ValueType><<<nblocks, nthreads>>>(n, degree, bookmark);
+    cudaCheckError();
+    ALLOC_FREE_TRY(degree, stream);
+  }
+
+  template <typename IndexType, typename ValueType>
+  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
+  permute_vals_kernel(const IndexType e, IndexType *perm, ValueType *in, ValueType *out) {
+    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x)
+      out[i] = in[perm[i]];
+  }
+
+  template <typename IndexType, typename ValueType>
+  void permute_vals(const IndexType e, IndexType *perm, ValueType *in, ValueType *out) {
+    int nthreads = min(e, CUDA_MAX_KERNEL_THREADS);
+    int nblocks = min((e + nthreads - 1) / nthreads, CUDA_MAX_BLOCKS);
+    permute_vals_kernel<<<nblocks, nthreads>>>(e, perm, in, out);
+  }
+
+// This will remove duplicate along with sorting
+// This will sort the COO Matrix, row will be sorted and each column of same row will be sorted.
+  template <typename IndexType, typename ValueType, typename SizeT>
+  void remove_duplicate(IndexType* src, IndexType* dest, ValueType* val, SizeT &nnz) {
+    cudaStream_t stream {nullptr};
+    if (val != NULL) {
+      thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream),
+                                 thrust::raw_pointer_cast(val),
+                                 thrust::raw_pointer_cast(val) + nnz,
+                                 thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(src),
+                                                                              thrust::raw_pointer_cast(dest))));
+      thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream),
+                                 thrust::raw_pointer_cast(dest),
+                                 thrust::raw_pointer_cast(dest + nnz),
+                                 thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(src),
+                                                                              thrust::raw_pointer_cast(val))));
+      thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream),
+                                 thrust::raw_pointer_cast(src),
+                                 thrust::raw_pointer_cast(src + nnz),
+                                 thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(dest),
+                                                                              thrust::raw_pointer_cast(val))));
+
+      typedef thrust::tuple<IndexType*, ValueType*> IteratorTuple;
+      typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+      typedef thrust::tuple<IndexType*, ZipIterator> ZipIteratorTuple;
+      typedef thrust::zip_iterator<ZipIteratorTuple> ZipZipIterator;
+
+      ZipZipIterator newEnd =
+          thrust::unique(rmm::exec_policy(stream)->on(stream),
+                         thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(src),
+                                                                      thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(dest),
+                                                                                                                   thrust::raw_pointer_cast(val))))),
+                         thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(src + nnz),
+                                                                      thrust::make_zip_iterator(thrust::make_tuple(dest + nnz,
+                                                                                                                   val + nnz)))));
+
+      ZipIteratorTuple endTuple = newEnd.get_iterator_tuple();
+      IndexType* row_end = thrust::get<0>(endTuple);
+
+      nnz = ((size_t) row_end - (size_t) src) / sizeof(IndexType);
+    }
+    else
+    {
+      thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream),
+                                 thrust::raw_pointer_cast(dest),
+                                 thrust::raw_pointer_cast(dest + nnz),
+                                 thrust::raw_pointer_cast(src));
+      thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream),
+                                 thrust::raw_pointer_cast(src),
+                                 thrust::raw_pointer_cast(src + nnz),
+                                 thrust::raw_pointer_cast(dest));
+
+      typedef thrust::tuple<IndexType*, IndexType*> IteratorTuple;
+      typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+
+      ZipIterator newEnd =
+          thrust::unique(rmm::exec_policy(stream)->on(stream),
+                         thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(src),
+                                                                      thrust::raw_pointer_cast(dest))),
+                         thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(src + nnz),
+                                                                      thrust::raw_pointer_cast(dest + nnz))));
+
+      IteratorTuple endTuple = newEnd.get_iterator_tuple();
+      IndexType* row_end = thrust::get<0>(endTuple);
+
+      nnz = ((size_t) row_end - (size_t) src) / sizeof(IndexType);
+    }
+  }
+
+  template <typename IndexType>
+  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) offsets_to_indices_kernel(const IndexType *offsets,
+                                                                                        IndexType v,
+                                                                                        IndexType *indices) {
+    int tid, ctaStart;
+    tid = threadIdx.x;
+    ctaStart = blockIdx.x;
+
+    for (int j = ctaStart; j < v; j += gridDim.x) {
+      IndexType colStart = offsets[j];
+      IndexType colEnd = offsets[j + 1];
+      IndexType rowNnz = colEnd - colStart;
+
+      for (int i = 0; i < rowNnz; i += blockDim.x) {
+        if ((colStart + tid + i) < colEnd) {
+          indices[colStart + tid + i] = j;
+        }
+      }
+    }
+  }
+
+  template <typename IndexType>
+  void offsets_to_indices(const IndexType *offsets, IndexType v, IndexType *indices) {
+    int nthreads = min(v, CUDA_MAX_KERNEL_THREADS);
+    int nblocks = min((v + nthreads - 1) / nthreads, CUDA_MAX_BLOCKS);
+    offsets_to_indices_kernel<<<nblocks, nthreads>>>(offsets, v, indices);
+    cudaCheckError();
+  }
+
+  template <typename IndexType>
+  void sequence(IndexType n, IndexType *vec, IndexType init = 0) {
+    thrust::sequence(thrust::device,
+                     thrust::device_pointer_cast(vec),
+                     thrust::device_pointer_cast(vec + n),
+                     init);
+    cudaCheckError();
+  }
+
+} //namespace cugraph
diff --git a/cpp/src/grmat.cu b/cpp/src/utilities/grmat.cu
similarity index 96%
rename from cpp/src/grmat.cu
rename to cpp/src/utilities/grmat.cu
index f0b9b79b456..8b5a50aacd7 100644
--- a/cpp/src/grmat.cu
+++ b/cpp/src/utilities/grmat.cu
@@ -176,15 +176,13 @@ gdf_error main_(gdf_column *src, gdf_column *dest, gdf_column *val, CommandLine
     if (util::SetDevice(gpu_idx[0]))
         return GDF_CUDA_ERROR;
 
-    //RMM:
-    //
-    cudaStream_t stream{nullptr};
-    rmm_temp_allocator allocator(stream);
-    ALLOC_MANAGED_TRY((void**)&coo.row, sizeof(VertexId) * rmat_all_edges, stream);
-    ALLOC_MANAGED_TRY((void**)&coo.col, sizeof(VertexId) * rmat_all_edges, stream);
+
+    cudaStream_t stream {nullptr};
+    ALLOC_TRY((void**)&coo.row, sizeof(VertexId) * rmat_all_edges, stream);
+    ALLOC_TRY((void**)&coo.col, sizeof(VertexId) * rmat_all_edges, stream);
     if (val != nullptr)
     {
-        ALLOC_MANAGED_TRY((void**)&coo.val, sizeof(Value) * rmat_all_edges, stream);
+        ALLOC_TRY((void**)&coo.val, sizeof(Value) * rmat_all_edges, stream);
     }
     if ((coo.row == NULL) ||(coo.col == NULL))
     {
@@ -247,7 +245,7 @@
     cudaMemcpy((void*)&nodes_row, (void*)&(coo.row[rmat_all_edges-1]), sizeof(VertexId), cudaMemcpyDeviceToHost);
-    tmp = thrust::max_element(thrust::cuda::par(allocator).on(stream),
+    tmp = thrust::max_element(rmm::exec_policy(stream)->on(stream),
                               thrust::device_pointer_cast((VertexId*)(coo.col)),
                               thrust::device_pointer_cast((VertexId*)(coo.col + rmat_all_edges)));
     nodes_col = tmp[0];
@@ -348,7 +346,7 @@ gdf_error gdf_grmat_gen (const char* argv, size_t&
vertices, size_t& edges, gdf_ { status = main_ (src, dest, val, &args, vertices, edges); } - else + else { status = main_ (src, dest, val, &args, vertices, edges); } diff --git a/cpp/src/heap.cuh b/cpp/src/utilities/heap.cuh similarity index 100% rename from cpp/src/heap.cuh rename to cpp/src/utilities/heap.cuh diff --git a/cpp/src/utilities/nvgraph_error_utils.h b/cpp/src/utilities/nvgraph_error_utils.h new file mode 100644 index 00000000000..8ece5630d43 --- /dev/null +++ b/cpp/src/utilities/nvgraph_error_utils.h @@ -0,0 +1,71 @@ +#ifndef NVGRAPH_ERRORUTILS_H +#define NVGRAPH_ERRORUTILS_H + +#include + +#ifdef VERBOSE +#define NVG_TRY(call) \ +{ \ + nvgraphStatus_t err_code = (call); \ + if (err_code != NVGRAPH_STATUS_SUCCESS) { \ + switch (err_code) { \ + case NVGRAPH_STATUS_SUCCESS: \ + return GDF_SUCCESS; \ + case NVGRAPH_STATUS_NOT_INITIALIZED: \ + return GDF_INVALID_API_CALL; \ + case NVGRAPH_STATUS_INVALID_VALUE: \ + return GDF_INVALID_API_CALL; \ + case NVGRAPH_STATUS_TYPE_NOT_SUPPORTED: \ + return GDF_UNSUPPORTED_DTYPE; \ + case NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED: \ + return GDF_INVALID_API_CALL; \ + default: \ + return GDF_CUDA_ERROR; \ + } \ + } \ +} +#else +#define NVG_TRY(call) \ +{ \ + nvgraphStatus_t err_code = (call); \ + if (err_code != NVGRAPH_STATUS_SUCCESS) { \ + switch (err_code) { \ + case NVGRAPH_STATUS_NOT_INITIALIZED: \ + std::cerr << "nvGRAPH not initialized"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_ALLOC_FAILED: \ + std::cerr << "nvGRAPH alloc failed"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_INVALID_VALUE: \ + std::cerr << "nvGRAPH invalid value"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_ARCH_MISMATCH: \ + std::cerr << "nvGRAPH arch mismatch"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_MAPPING_ERROR: \ + std::cerr << "nvGRAPH mapping error"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_EXECUTION_FAILED: \ + std::cerr << "nvGRAPH execution failed"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_INTERNAL_ERROR: \ + std::cerr << "nvGRAPH internal error"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_TYPE_NOT_SUPPORTED: \ + std::cerr << "nvGRAPH type not supported"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_NOT_CONVERGED: \ + std::cerr << "nvGRAPH algorithm failed to converge"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED: \ + std::cerr << "nvGRAPH graph type not supported"; \ + return GDF_CUDA_ERROR; \ + default: \ + std::cerr << "Unknown nvGRAPH Status"; \ + return GDF_CUDA_ERROR; \ + } \ + } \ +} +#endif + +#endif diff --git a/datasets/get_test_data.sh b/datasets/get_test_data.sh index a416baf7256..aa59d5d4c20 100644 --- a/datasets/get_test_data.sh +++ b/datasets/get_test_data.sh @@ -6,15 +6,22 @@ cd tmp wget https://s3.us-east-2.amazonaws.com/rapidsai-data/cugraph/test/datasets.tgz wget https://s3.us-east-2.amazonaws.com/rapidsai-data/cugraph/test/ref/pagerank.tgz wget https://s3.us-east-2.amazonaws.com/rapidsai-data/cugraph/test/ref/sssp.tgz +wget https://s3.us-east-2.amazonaws.com/rapidsai-data/cugraph/benchmark/hibench/hibench_1_huge.tgz +wget https://s3.us-east-2.amazonaws.com/rapidsai-data/cugraph/benchmark/hibench/hibench_1_large.tgz +wget https://s3.us-east-2.amazonaws.com/rapidsai-data/cugraph/benchmark/hibench/hibench_1_small.tgz cd .. mkdir test mkdir test/ref +mkdir benchmark echo Decompressing ... 
tar xvzf tmp/datasets.tgz -C test tar xvzf tmp/pagerank.tgz -C test/ref tar xvzf tmp/sssp.tgz -C test/ref +tar xvzf tmp/hibench_1_huge.tgz -C benchmark +tar xvzf tmp/hibench_1_large.tgz -C benchmark +tar xvzf tmp/hibench_1_small.tgz -C benchmark rm -rf tmp diff --git a/docs/source/conf.py b/docs/source/conf.py index affc1c0ec6e..1551147de15 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -64,9 +64,9 @@ # built documents. # # The short X.Y version. -version = '0.7' +version = '0.8' # The full version, including alpha/beta/rc tags. -release = '0.7.0.dev0' +release = '0.8.0a' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages.
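
Note on the memory-management hunks above: every converted file follows the same shape, replacing the managed ALLOC_MANAGED_TRY / rmm_temp_allocator / thrust::cuda::par pattern with a stream-ordered ALLOC_TRY allocation, Thrust calls driven by rmm::exec_policy(stream)->on(stream), and ALLOC_FREE_TRY. The sketch below is not part of the patch; it is a minimal, hypothetical example of that pattern under stated assumptions. In particular, the header names (cugraph.h, rmm_utils.h, rmm/thrust_rmm_allocator.h, and the Thrust headers) and the example function name are assumptions for illustration only; the macros, types, and calls themselves are the ones used in the hunks above.

// Hypothetical sketch of the allocation + execution-policy pattern used in this patch.
// Assumed headers: cugraph.h (gdf_error), rmm_utils.h (ALLOC_TRY / ALLOC_FREE_TRY),
// rmm/thrust_rmm_allocator.h (rmm::exec_policy). These names are not taken from this diff.
#include <cugraph.h>
#include <rmm_utils.h>
#include <rmm/thrust_rmm_allocator.h>
#include <thrust/fill.h>
#include <thrust/reduce.h>

gdf_error sum_of_n_ones(size_t n, double *result) {
  // The converted files all run on the default (null) stream and thread the same
  // cudaStream_t through both the ALLOC_* macros and the Thrust execution policy.
  cudaStream_t stream{nullptr};
  double *buf = nullptr;

  // Stream-ordered allocation from the RMM pool (replaces ALLOC_MANAGED_TRY).
  ALLOC_TRY((void**)&buf, sizeof(double) * n, stream);

  // Running Thrust through rmm::exec_policy routes its temporary storage through RMM
  // as well (replaces thrust::cuda::par(allocator).on(stream)).
  thrust::fill(rmm::exec_policy(stream)->on(stream), buf, buf + n, 1.0);
  *result = thrust::reduce(rmm::exec_policy(stream)->on(stream), buf, buf + n);  // == n

  ALLOC_FREE_TRY(buf, stream);
  return GDF_SUCCESS;
}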