diff --git a/.gitmodules b/.gitmodules index 605fac63cc4..ecd69af403b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,3 @@ -[submodule "cpp/nvgraph/cpp/thirdparty/cnmem"] - path = cpp/nvgraph/cpp/thirdparty/cnmem - url = https://github.com/NVIDIA/cnmem.git - branch = master [submodule "cpp/nvgraph/cpp/thirdparty/cub"] path = cpp/nvgraph/cpp/thirdparty/cub url = https://github.com/NVlabs/cub.git diff --git a/CHANGELOG.md b/CHANGELOG.md index 05bb075c7b7..c5d81c232ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,19 @@ +# cuGraph 0.8.0 (Date TBD) + +## New Features +- PR #287 SNMG power iteration step1 +- PR #297 SNMG degree calculation + +## Improvements +- PR #291 nvGraph is updated to use RMM instead of directly invoking cnmem functions. +- PR #286 Reorganized cugraph source directory + + +## Bug Fixes +- PR #283 Automerge fix +- PR #291 Fixed a RMM memory allocation failure due to duplicate copies of cnmem.o +- PR #291 Fixed a cub CsrMV call error when RMM pool allocator is used. + # cuGraph 0.7.0 (Date TBD) ## New Features @@ -54,6 +70,7 @@ - PR #262 Removed networkx conda dependency for both build and runtime - PR #271 Removed nvgraph conda dependency - PR #276 Removed libgdf_cffi import from bindings +- PR #288 Add boost as a conda dependency # cuGraph 0.6.0 (22 Mar 2019) diff --git a/README.md b/README.md index 85183d35f27..d2f699ab7b6 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ These limitations are being addressed and will be fixed future versions. ## Getting cuGraph ### Intro -There are 4 ways to get cuGraph : +There are 3 ways to get cuGraph : 1. [Quick start with Docker Demo Repo](#quick) 1. [Conda Installation](#conda) 1. [Build from Source](#source) @@ -133,5 +133,4 @@ The RAPIDS suite of open source software libraries aim to enable execution of en ### Apache Arrow on GPU -The GPU version of [Apache Arrow](https://arrow.apache.org/) is a common API that enables efficient interchange of tabular data between processes running on the GPU. End-to-end computation on the GPU avoids unnecessary copying and converting of data off the GPU, reducing compute time and cost for high-performance analytics common in artificial intelligence workloads. As the name implies, cuDF uses the Apache Arrow columnar data format on the GPU. Currently, a subset of the features in Apache Arrow are supported. - +The GPU version of [Apache Arrow](https://arrow.apache.org/) is a common API that enables efficient interchange of tabular data between processes running on the GPU. End-to-end computation on the GPU avoids unnecessary copying and converting of data off the GPU, reducing compute time and cost for high-performance analytics common in artificial intelligence workloads. As the name implies, cuDF uses the Apache Arrow columnar data format on the GPU. Currently, a subset of the features in Apache Arrow are supported. 
\ No newline at end of file diff --git a/conda/recipes/cugraph/meta.yaml b/conda/recipes/cugraph/meta.yaml index 25a7d426bd2..156db3777a3 100644 --- a/conda/recipes/cugraph/meta.yaml +++ b/conda/recipes/cugraph/meta.yaml @@ -26,11 +26,11 @@ requirements: build: - python x.x - libcugraph={{ version }} - - cudf=0.7* + - cudf=0.8* run: - python x.x - libcugraph={{ version }} - - cudf=0.7* + - cudf=0.8* #test: # commands: diff --git a/conda/recipes/libcugraph/meta.yaml b/conda/recipes/libcugraph/meta.yaml index 0111fef13c8..18bb757eaf9 100644 --- a/conda/recipes/libcugraph/meta.yaml +++ b/conda/recipes/libcugraph/meta.yaml @@ -25,15 +25,15 @@ build: requirements: build: - cmake>=3.12.4 - - libcudf=0.7* + - libcudf=0.8* - cython - cudatoolkit {{ cuda_version }}.* - - boost-cpp + - boost run: - - libcudf=0.7* + - libcudf=0.8* - cython - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} - - boost-cpp + - boost #test: # commands: diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 464fed08342..f37b0024489 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -16,7 +16,7 @@ #============================================================================= cmake_minimum_required(VERSION 3.12 FATAL_ERROR) -project(cuGraph VERSION 0.6.0 LANGUAGES C CXX CUDA) +project(cuGraph VERSION 0.8.0 LANGUAGES C CXX CUDA) ################################################################################################### # - cmake modules --------------------------------------------------------------------------------- @@ -136,7 +136,7 @@ include(ConfigureNvgraph) ################################################################################################### # - Find and add different modules and supporting repos ------------------------------------------- -find_package(Boost 1.45.0 COMPONENTS system) +find_package(Boost REQUIRED) find_package(OpenMP) if (OPENMP_FOUND) @@ -229,14 +229,21 @@ link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT ################################################################################################### # - library targets ------------------------------------------------------------------------------- add_library(cugraph SHARED - src/grmat.cu - src/cugraph.cu - src/pagerank.cu - src/bfs.cu - src/jaccard.cu - src/overlap.cu - src/nvgraph_gdf.cu - src/two_hop_neighbors.cu + src/utilities/grmat.cu + src/utilities/degree.cu + src/structure/cugraph.cu + src/link_analysis/pagerank.cu + src/traversal/bfs.cu + src/link_prediction/jaccard.cu + src/link_prediction/overlap.cu + src/converters/nvgraph.cu + src/converters/renumber.cu + src/community/nvgraph_gdf.cu + src/traversal/nvgraph_sssp.cu + src/traversal/two_hop_neighbors.cu + src/snmg/blas/spmv.cu + src/snmg/degree/degree.cu + src/snmg/utils.cu ${CMAKE_CURRENT_BINARY_DIR}/gunrock/gunrock/util/test_utils.cu ${CMAKE_CURRENT_BINARY_DIR}/gunrock/gunrock/util/error_utils.cu ${CMAKE_CURRENT_BINARY_DIR}/gunrock/gunrock/util/misc_utils.cu diff --git a/cpp/include/algorithms.h b/cpp/include/algorithms.h index 77fbc6525f5..277f2599512 100644 --- a/cpp/include/algorithms.h +++ b/cpp/include/algorithms.h @@ -170,3 +170,19 @@ gdf_error gdf_louvain(gdf_graph *graph, void *final_modularity, void *num_level, gdf_column *louvain_parts); + +/** + * Computes the in-degree, out-degree, or the sum of both (determined by x) for the given graph. This is + * a multi-gpu operation operating on a partitioned graph. 
+ * @param x 0 for in+out, 1 for in, 2 for out + * @param part_offsets Contains the start/end of each partitions vertex id range + * @param off The local partition offsets + * @param ind The local partition indices + * @param x_cols The results (located on each GPU) + * @return Error code + */ +gdf_error gdf_snmg_degree(int x, + size_t* part_offsets, + gdf_column* off, + gdf_column* ind, + gdf_column** x_cols); diff --git a/cpp/include/rmm_utils.h b/cpp/include/rmm_utils.h index 12b1b988fb6..d5ca2b3c346 100755 --- a/cpp/include/rmm_utils.h +++ b/cpp/include/rmm_utils.h @@ -14,7 +14,6 @@ * limitations under the License. */ #pragma once -///#define DEBUG_NO_RMM #include #include @@ -27,58 +26,10 @@ throw std::runtime_error(ss.str()); \ } -#ifdef DEBUG_NO_RMM - -#include -#include -#include -#include - -template -//using rmm_allocator = thrust::device_malloc_allocator; -class rmm_allocator : public thrust::device_malloc_allocator -{ - public: - using value_type = T; - - rmm_allocator(cudaStream_t stream = 0) : stream(stream) {} - ~rmm_allocator() {} - -private: - cudaStream_t stream; -}; - -using rmm_temp_allocator = rmm_allocator; // Use this alias for thrust::cuda::par(allocator).on(stream) - -#define ALLOC_TRY(ptr, sz, stream){ \ - if (stream == nullptr) ; \ - cudaMalloc((ptr), (sz)); \ -} - -#define ALLOC_MANAGED_TRY(ptr, sz, stream){ \ - if (stream == nullptr) ; \ - cudaMallocManaged((ptr), (sz)); \ -} - - //#define REALLOC_TRY(ptr, new_sz, stream) - -#define ALLOC_FREE_TRY(ptr, stream){ \ - if (stream == nullptr) ; \ - cudaFree( (ptr) ); \ -} -#else - #include #include -using rmm_temp_allocator = rmm_allocator; - -#define ALLOC_TRY( ptr, sz, stream ){ \ - RMM_TRY_THROW( RMM_ALLOC((ptr), (sz), (stream)) ) \ - } - -//TODO: change this when RMM alloc managed will be available !!!!! 
-#define ALLOC_MANAGED_TRY(ptr, sz, stream){ \ +#define ALLOC_TRY( ptr, sz, stream ){ \ RMM_TRY_THROW( RMM_ALLOC((ptr), (sz), (stream)) ) \ } @@ -86,9 +37,6 @@ using rmm_temp_allocator = rmm_allocator; RMM_TRY_THROW( RMM_REALLOC((ptr), (sz), (stream)) ) \ } -#define ALLOC_FREE_TRY(ptr, stream){ \ +#define ALLOC_FREE_TRY(ptr, stream){ \ RMM_TRY_THROW( RMM_FREE( (ptr), (stream) ) ) \ } - -#endif - diff --git a/cpp/nvgraph/cpp/CMakeLists.txt b/cpp/nvgraph/cpp/CMakeLists.txt index 42d365400e6..27342aeea70 100644 --- a/cpp/nvgraph/cpp/CMakeLists.txt +++ b/cpp/nvgraph/cpp/CMakeLists.txt @@ -20,12 +20,12 @@ project(NV_GRAPH VERSION 0.4.0 LANGUAGES C CXX CUDA) ################################################################################################### # - compiler options ------------------------------------------------------------------------------ -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 14) set(CMAKE_C_COMPILER $ENV{CC}) set(CMAKE_CXX_COMPILER $ENV{CXX}) set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_CUDA_STANDARD 11) +set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_STANDARD_REQUIRED ON) if(CMAKE_COMPILER_IS_GNUCXX) @@ -47,7 +47,7 @@ option(BUILD_TESTS "Configure CMake to build tests" if(CMAKE_COMPILER_IS_GNUCXX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") - option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" OFF) + option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" ON) if(CMAKE_CXX11_ABI) message(STATUS "nvGraph: Enabling the GLIBCXX11 ABI") else() @@ -67,6 +67,25 @@ include(FeatureSummary) include(CheckIncludeFiles) include(CheckLibraryExists) +################################################################################################### +# - add rmm -------------------------------------------------------------------------------------- +find_path(RMM_INCLUDE "rmm" + HINTS "$ENV{RMM_ROOT}/include" + "$ENV{CONDA_PREFIX}/include/rmm" + "$ENV{CONDA_PREFIX}/include") + +find_library(RMM_LIBRARY "rmm" + HINTS "$ENV{RMM_ROOT}/lib" + "$ENV{CONDA_PREFIX}/lib") + +message(STATUS "RMM: RMM_LIBRARY set to ${RMM_LIBRARY}") +message(STATUS "RMM: RMM_INCLUDE set to ${RMM_INCLUDE}") + +add_library(rmm SHARED IMPORTED ${RMM_LIBRARY}) +if (RMM_INCLUDE AND RMM_LIBRARY) + set_target_properties(rmm PROPERTIES IMPORTED_LOCATION ${RMM_LIBRARY}) +endif (RMM_INCLUDE AND RMM_LIBRARY) + ################################################################################################### # - add gtest ------------------------------------------------------------------------------------- @@ -90,9 +109,9 @@ include_directories( "${CMAKE_BINARY_DIR}/include" "${CMAKE_SOURCE_DIR}/include" "${CMAKE_SOURCE_DIR}/thirdparty/cub" - "${CMAKE_SOURCE_DIR}/thirdparty/cnmem/include" "${CMAKE_SOURCE_DIR}/../external" "${CMAKE_SOURCE_DIR}/../external/cusp" + "${RMM_INCLUDE}" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" ) @@ -101,13 +120,13 @@ include_directories( link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link directories for nvcc "${CMAKE_BINARY_DIR}/lib" - "${GTEST_LIBRARY_DIR}") + "${GTEST_LIBRARY_DIR}" + "${RMM_LIBRARY}") ################################################################################################### # - library targets ------------------------------------------------------------------------------- if(NVGRAPH_LIGHT MATCHES True) add_library(nvgraph_rapids SHARED - thirdparty/cnmem/src/cnmem.cpp src/arnoldi.cu src/bfs.cu src/bfs2d.cu @@ -141,7 +160,6 @@ if(NVGRAPH_LIGHT MATCHES True) ) 
else(NVGRAPH_LIGHT MATCHES True) add_library(nvgraph_rapids SHARED - thirdparty/cnmem/src/cnmem.cpp src/arnoldi.cu src/bfs.cu src/bfs2d.cu @@ -203,7 +221,7 @@ endif(NVGRAPH_LIGHT MATCHES True) ################################################################################################### # - link libraries -------------------------------------------------------------------------------- -target_link_libraries(nvgraph_rapids cublas cusparse curand cusolver cudart ) +target_link_libraries(nvgraph_rapids cublas cusparse curand cusolver rmm cudart cuda) ################################################################################################### # - install targets ------------------------------------------------------------------------------- diff --git a/cpp/nvgraph/cpp/include/2d_partitioning.h b/cpp/nvgraph/cpp/include/2d_partitioning.h index c344990db12..ca2be7a8b1f 100644 --- a/cpp/nvgraph/cpp/include/2d_partitioning.h +++ b/cpp/nvgraph/cpp/include/2d_partitioning.h @@ -42,1335 +42,1344 @@ namespace nvgraph { - template - struct CSR_Result_Weighted { - int64_t size; - int64_t nnz; - T* rowOffsets; - T* colIndices; - W* edgeWeights; - - CSR_Result_Weighted() : - size(0), nnz(0), rowOffsets(NULL), colIndices(NULL), edgeWeights(NULL) { - } - - void Destroy() { - if (rowOffsets) - cudaFree(rowOffsets); - if (colIndices) - cudaFree(colIndices); - if (edgeWeights) - cudaFree(edgeWeights); - } - }; - - // Define kernel for copying run length encoded values into offset slots. - template - __global__ void offsetsKernel(T runCounts, T* unique, T* counts, T* offsets) { - for (int32_t idx = blockDim.x * blockIdx.x + threadIdx.x; - idx < runCounts; - idx += gridDim.x * blockDim.x) { - offsets[unique[idx]] = counts[idx]; - } - } - - /** - * Method for converting COO to CSR format - * @param sources The array of source indices - * @param destinations The array of destination indices - * @param edgeWeights The array of edge weights - * @param nnz The number of non zero values - * @param maxId The largest id contained in the matrix - * @param result The result is stored here. 
- */ - template - void ConvertCOOtoCSR_weighted(T* sources, - T* destinations, - W* edgeWeights, - int64_t nnz, - T maxId, - CSR_Result_Weighted& result) { - // Sort source and destination columns by source - // Allocate local memory for operating on - T* srcs, *dests; - W* weights = NULL; - cudaMalloc(&srcs, sizeof(T) * nnz); - cudaMalloc(&dests, sizeof(T) * nnz); - if (edgeWeights) - cudaMalloc(&weights, sizeof(W) * nnz); - cudaMemcpy(srcs, sources, sizeof(T) * nnz, cudaMemcpyDefault); - cudaMemcpy(dests, destinations, sizeof(T) * nnz, cudaMemcpyDefault); - if (edgeWeights) - cudaMemcpy(weights, edgeWeights, sizeof(W) * nnz, cudaMemcpyDefault); - - // Call Thrust::sort_by_key to sort the arrays with srcs as keys: - if (edgeWeights) - thrust::sort_by_key(thrust::device, - srcs, - srcs + nnz, - thrust::make_zip_iterator(thrust::make_tuple(dests, weights))); - else - thrust::sort_by_key(thrust::device, srcs, srcs + nnz, dests); - - result.size = maxId + 1; - - // Allocate offsets array - cudaMalloc(&result.rowOffsets, (maxId + 2) * sizeof(T)); - - // Set all values in offsets array to zeros - cudaMemset(result.rowOffsets, 0, (maxId + 2) * sizeof(T)); - - // Allocate temporary arrays same size as sources array, and single value to get run counts - T* unique, *counts, *runCount; - cudaMalloc(&unique, (maxId + 1) * sizeof(T)); - cudaMalloc(&counts, (maxId + 1) * sizeof(T)); - cudaMalloc(&runCount, sizeof(T)); - - // Use CUB run length encoding to get unique values and run lengths - void *tmpStorage = NULL; - size_t tmpBytes = 0; - cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); - cudaMalloc(&tmpStorage, tmpBytes); - cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); - cudaFree(tmpStorage); - - // Set offsets to run sizes for each index - T runCount_h; - cudaMemcpy(&runCount_h, runCount, sizeof(T), cudaMemcpyDefault); - int threadsPerBlock = 1024; - int numBlocks = min(65535, (runCount_h + threadsPerBlock - 1) / threadsPerBlock); - offsetsKernel<<>>(runCount_h, unique, counts, result.rowOffsets); - - // Scan offsets to get final offsets - thrust::exclusive_scan(thrust::device, - result.rowOffsets, - result.rowOffsets + maxId + 2, - result.rowOffsets); - - // Clean up temporary allocations - result.nnz = nnz; - result.colIndices = dests; - result.edgeWeights = weights; - cudaFree(srcs); - cudaFree(unique); - cudaFree(counts); - cudaFree(runCount); - } - - /** - * Describes the 2D decomposition of a partitioned matrix. - */ - template - class MatrixDecompositionDescription { - protected: - GlobalType numRows; // Global number of rows in matrix - GlobalType numCols; // Global number of columns in matrix - GlobalType nnz; // Global number of non-zeroes in matrix - GlobalType blockRows; // Number of rows of blocks in the decomposition - GlobalType blockCols; // Number of columns of rows in the decomposition - LocalType offset; - // Offsets-like arrays for rows and columns defining the start/end of the - // sections of the global id space belonging to each row and column. 
- std::vector rowOffsets; - std::vector colOffsets; - // Array of integers one for each block, defining the device it is assigned to - std::vector deviceAssignments; - std::vector blockStreams; - public: - - MatrixDecompositionDescription() : - numRows(0), numCols(0), nnz(0), blockRows(0), blockCols(0) { - rowOffsets.push_back(0); - colOffsets.push_back(0); - deviceAssignments.push_back(0); - } - - // Basic constructor, just takes in the values of its members. - MatrixDecompositionDescription(GlobalType numRows, - GlobalType numCols, - GlobalType nnz, - GlobalType blockRows, - GlobalType blockCols, - std::vector rowOffsets, - std::vector colOffsets, - std::vector deviceAssignments) : - numRows(numRows), numCols(numCols), nnz(nnz), blockRows(blockRows), - blockCols(blockCols), rowOffsets(rowOffsets), colOffsets(colOffsets), - deviceAssignments(deviceAssignments) { - } - - // Constructs a MatrixDecompositionDescription for a square matrix given the - // number of rows in the matrix and number of rows of blocks. - MatrixDecompositionDescription(GlobalType numRows, - GlobalType numBlockRows, - GlobalType nnz, - std::vector devices) : - numRows(numRows), - numCols(numRows), - blockRows(numBlockRows), - blockCols(numBlockRows), - nnz(nnz) { - // Tracking the current set device to change back - int currentDevice; - cudaGetDevice(¤tDevice); - - // Setting up the row and col offsets into equally sized chunks - GlobalType remainder = numRows % blockRows; - if (remainder != 0) - offset = (numRows + blockRows - remainder) / blockRows; - else - offset = numRows / blockRows; - - rowOffsets.resize(blockRows + 1); - colOffsets.resize(blockRows + 1); - for (int i = 0; i < blockRows; i++) { - rowOffsets[i] = i * offset; - colOffsets[i] = i * offset; - } - rowOffsets.back() = blockRows * offset; - colOffsets.back() = blockCols * offset; - - // Setting up the device assignments using the given device ids and also - // setting up the stream associated with each block. - deviceAssignments.resize(getNumBlocks()); - blockStreams.resize(getNumBlocks()); - for (int i = 0; i < getNumBlocks(); i++) { - int device = devices[i % devices.size()]; - deviceAssignments[i] = device; - cudaSetDevice(device); - cudaStream_t stream; - cudaStreamCreate(&stream); - blockStreams[i] = stream; - } - - // Restoring to current device when called - cudaSetDevice(currentDevice); - } - - // Gets the row id for the block containing the given global row id - int32_t getRowId(GlobalType val) const { - return std::upper_bound(rowOffsets.begin(), rowOffsets.end(), val) - rowOffsets.begin() - 1; - } - - // Gets the column id for the block containing the given global column id - int32_t getColId(GlobalType val) const { - return std::upper_bound(colOffsets.begin(), colOffsets.end(), val) - colOffsets.begin() - 1; - } - - // Gets the number of blocks in the decomposition: - int32_t getNumBlocks() const { - return blockRows * blockCols; - } - - // Getter for offset - LocalType getOffset() const { - return offset; - } - - // Getter for deviceAssignments - const std::vector& getDeviceAssignments() const { - return deviceAssignments; - } - - /** - * Getter for vector of streams for each block. 
- * @return Reference to vector of streams for each block - */ - const std::vector& getBlockStreams() const { - return blockStreams; - } - - /** - * Getter for nnz - * @return The global number of non-zero elements - */ - GlobalType getNnz() const { - return nnz; - } - - /** - * Getter method for numRows - * @return The number of global rows in the matrix - */ - GlobalType getNumRows() const { - return numRows; - } - - /** - * Getter for BlockRows - * @return The number of blocks in a row in the decomposition. - */ - GlobalType getBlockRows() const { - return blockRows; - } - - /** - * Getter for BlockCols - * @return The number of blocks in a column in the decomposition. - */ - GlobalType getBlockCols() const { - return blockCols; - } - - /** - * Given a block id, returns the row which that block is in. - * @param bId The block ID - * @return The row number - */ - int32_t getBlockRow(int32_t bId) const { - return bId / blockCols; - } - - /** - * Given a block id, returns the column which that block is in. - * @param bId The block ID - * @return The column number - */ - int32_t getBlockCol(int32_t bId) const { - return bId % blockCols; - } - - /** - * Takes a COO global row and produces the COO local row and the block to which it belongs. - * @param globalRow The global row ID - * @param globalCol The global column ID - * @param localRow The block local row ID (return) - * @param localCol The block local column ID (return) - * @param blockId The block ID (return) - */ - void convertGlobaltoLocalRow(GlobalType globalRow, - GlobalType globalCol, - LocalType& localRow, - LocalType& localCol, - int32_t& blockId) const { - int32_t rowId = getRowId(globalRow); - int32_t colId = getColId(globalCol); - blockId = rowId * blockCols + colId; - localRow = globalRow - rowOffsets[rowId]; - localCol = globalCol - colOffsets[colId]; - } - - /** - * Takes in a row ID and column ID and returns the corresponding block ID - * @param rowId The row ID - * @param colId The column ID - * @return The ID of the corresponding block - */ - int32_t getBlockId(int32_t rowId, int32_t colId) const { - return rowId * blockCols + colId; - } - - /** - * Helper method to synchronize all streams after operations are issued. - */ - void syncAllStreams() const { - int32_t numBlocks = getNumBlocks(); - int32_t current_device; - cudaGetDevice(¤t_device); - for (int32_t i = 0; i < numBlocks; i++) { - cudaSetDevice(deviceAssignments[i]); - cudaStreamSynchronize(blockStreams[i]); - } - cudaSetDevice(current_device); - } - - /** - * This method is only for testing and debugging use. - * @return A human readable string representation of the object - */ - std::string toString() const { - std::stringstream ss; - ss << "Global Info:\n\tnumRows: " << numRows << ", numCols: " << numCols << ", nnz: " - << nnz; - ss << "\n"; - ss << "Block Info:\n\tblockRows: " << blockRows << ", blockCols: " << blockCols; - ss << "\n"; - ss << "rowOffsets: ["; - for (int i = 0; i < (int) rowOffsets.size(); i++) - ss << rowOffsets[i] << (i == (int) rowOffsets.size() - 1 ? "]\n" : ", "); - ss << "colOffsets: ["; - for (int i = 0; i < (int) colOffsets.size(); i++) - ss << colOffsets[i] << (i == (int) colOffsets.size() - 1 ? "]\n" : ", "); - ss << "deviceAssignments: ["; - for (int i = 0; i < (int) deviceAssignments.size(); i++) - ss << deviceAssignments[i] << (i == (int) deviceAssignments.size() - 1 ? 
"]\n" : ", "); - return ss.str(); - } - }; - - template - class Matrix2d { - protected: - // Description of the matrix decomposition - MatrixDecompositionDescription description; - - // Array of block matrices forming the decomposition - std::vector*> blocks; - public: - Matrix2d() { - } - Matrix2d(MatrixDecompositionDescription descr, - std::vector*> blocks) : - description(descr), blocks(blocks) { - } - - const MatrixDecompositionDescription& getMatrixDecompositionDescription() { - return description; - } - - MultiValuedCsrGraph* getBlockMatrix(int32_t bId) { - return blocks[bId]; - } - - std::string toString() { - std::stringstream ss; - ss << "MatrixDecompositionDescription:\n" << description.toString(); - for (int i = 0; i < (int) blocks.size(); i++) { - ss << "Block " << i << ":\n"; - size_t numVerts = blocks[i]->get_num_vertices(); - size_t numEdges = blocks[i]->get_num_edges(); - size_t numValues = blocks[i]->getNumValues(); - ss << "numVerts: " << numVerts << ", numEdges: " << numEdges << "\n"; - LocalType* rowOffsets = (LocalType*) malloc((numVerts + 1) * sizeof(LocalType)); - LocalType* colIndices = (LocalType*) malloc(numEdges * sizeof(LocalType)); - ValueType* values = NULL; - if (numValues > 0) - values = (ValueType*) malloc(numEdges * sizeof(ValueType)); - cudaMemcpy(rowOffsets, - blocks[i]->get_raw_row_offsets(), - (numVerts + 1) * sizeof(LocalType), - cudaMemcpyDefault); - cudaMemcpy(colIndices, - blocks[i]->get_raw_column_indices(), - numEdges * sizeof(LocalType), - cudaMemcpyDefault); - if (values) - cudaMemcpy(values, - blocks[i]->get_raw_edge_dim(0), - numEdges * sizeof(ValueType), - cudaMemcpyDefault); - int idxCount = numEdges >= (numVerts + 1) ? numEdges : (numVerts + 1); - ss << "Idx\tOffset\tColInd\tValue\n"; - for (int j = 0; j < idxCount; j++) { - if (j < (int) numVerts + 1 && j < (int) numEdges) - ss << j << ":\t" << rowOffsets[j] << "\t" << colIndices[j] << "\t" - << (values ? values[j] : 0) - << "\n"; - else if (j < (int) numVerts + 1 && j >= (int) numEdges) - ss << j << ":\t" << rowOffsets[j] << "\n"; - else if (j >= (int) numVerts + 1 && j < (int) numEdges) - ss << j << ":\t" << "\t" << colIndices[j] << "\t" << (values ? values[j] : 0) - << "\n"; - } - free(rowOffsets); - free(colIndices); - free(values); - } - return ss.str(); - } - }; - - template - class VertexData2D { - const MatrixDecompositionDescription* description; - int32_t n; - std::vector > values; - public: - /** - * Creates a VertexData2D object given a pointer to a MatrixDecompositionDescription - * object which describes the matrix the data is attached to. Data buffers are - * allocated for each block using the offset from the description to size the - * buffers, and to locate the buffers on the same GPU as the matrix block. 
- */ - VertexData2D(const MatrixDecompositionDescription* descr) : - description(descr) { - // Resize the values array to be the same size as number of blocks - values.resize(descr->getNumBlocks()); - - // Grab the current device id to switch back after allocations are done - int current_device; - cudaGetDevice(¤t_device); - LocalType allocSize = descr->getOffset(); - n = allocSize; - // Allocate the data for each block - for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { - int device = descr->getDeviceAssignments()[i]; - cudaSetDevice(device); - ValueType* d_current, *d_alternate; - cudaMalloc(&d_current, sizeof(ValueType) * n); - cudaMalloc(&d_alternate, sizeof(ValueType) * n); - values[i].d_buffers[0] = d_current; - values[i].d_buffers[1] = d_alternate; - } - - // Set the device back to what it was initially - cudaSetDevice(current_device); - } - - /** - * Creates a VertexData2D object given a pointer to a MatrixDecompositionDescription - * object, which describes the matrix the data is attached to, and an integer which indicates - * how many data elements should be allocated for each block. Data buffers are allocated - * for each block using the offset from the description to size the buffers, and to locate - * the buffers on the same GPU as the matrix block. - */ - VertexData2D(const MatrixDecompositionDescription* descr, size_t _n) : - description(descr) { - // Resize the values array to be the same size as number of blocks - values.resize(descr->getNumBlocks()); - - // Grab the current device id to switch back after allocations are done - int current_device; - cudaGetDevice(¤t_device); - LocalType allocSize = _n; - n = allocSize; - // Allocate the data for each block - for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { - int device = descr->getDeviceAssignments()[i]; - cudaSetDevice(device); - ValueType* d_current, *d_alternate; - cudaMalloc(&d_current, sizeof(ValueType) * n); - cudaMalloc(&d_alternate, sizeof(ValueType) * n); - values[i].d_buffers[0] = d_current; - values[i].d_buffers[1] = d_alternate; - } - - // Set the device back to what it was initially - cudaSetDevice(current_device); - } - - ~VertexData2D() { - for (size_t i = 0; i < values.size(); i++) { - if (values[i].Current()) - cudaFree(values[i].Current()); - if (values[i].Alternate()) - cudaFree(values[i].Alternate()); - } - } - - /** - * Getter for n the size of each block's allocation in elements. - * @return The value of n - */ - int32_t getN() { - return n; - } - - /** - * Getter for the MatrixDecompositionDescription associated with this VertexData2D - * @return Pointer to the MatrixDecompositionDescription for this VertexData2D - */ - const MatrixDecompositionDescription* getDescription() { - return description; - } - - /** - * Gets the current buffer corresponding to the given block ID - */ - ValueType* getCurrent(int bId) { - return values[bId].Current(); - } - - /** - * Gets the alternate buffer corresponding to the given block ID - */ - ValueType* getAlternate(int bId) { - return values[bId].Alternate(); - } - - /** - * Swaps the current and alternate buffers for all block IDs - */ - void swapBuffers() { - for (size_t i = 0; i < values.size(); i++) - values[i].selector ^= 1; - } - - /** - * Sets an element in the global array, assuming that the data is currently - * valid and in the diagonal blocks. After calling this method either columnScatter - * or rowScatter should be called to propagate the change to all blocks. 
- */ - void setElement(GlobalType globalIndex, ValueType val) { - LocalType blockId = globalIndex / n; - LocalType blockOffset = globalIndex % n; - int32_t bId = description->getBlockId(blockId, blockId); - ValueType* copyTo = values[bId].Current() + blockOffset; - cudaMemcpy(copyTo, &val, sizeof(ValueType), cudaMemcpyDefault); - } - - /** - * Sets the elements of the global array, using the provided array of values. The values - * are set in the blocks of the diagonal, columnScatter or rowScatter should be called - * to propogate to all blocks. - * @param vals Pointer to an array with the values to be set. - */ - void setElements(ValueType* vals) { - LocalType offset = description->getOffset(); - int32_t numRows = description->getBlockRows(); - for (int i = 0; i < numRows; i++) { - int32_t id = description->getBlockId(i, i); - cudaStream_t stream = description->getBlockStreams()[id]; - ValueType* copyFrom = vals + i * n; - ValueType* copyTo = values[id].Current(); - cudaMemcpyAsync(copyTo, copyFrom, sizeof(ValueType) * n, cudaMemcpyDefault, stream); - } - description->syncAllStreams(); - } - - /** - * Fills the elements of the data array with the given value. - * The elements on the diagonal are filled with the given value. After filling, - * either rowScatter or columnScatter will copy the values across the blocks in - * either the rows or columns depending on the use. - * @param val The value to fill the array with - */ - void fillElements(ValueType val) { - int current_device; - cudaGetDevice(¤t_device); - int32_t numRows = description->getBlockRows(); - for (int32_t i = 0; i < numRows; i++) { - int32_t blockId = description->getBlockId(i, i); - ValueType* vals = getCurrent(blockId); - int deviceId = description->getDeviceAssignments()[blockId]; - cudaStream_t stream = description->getBlockStreams()[blockId]; - cudaSetDevice(deviceId); - thrust::fill(thrust::cuda::par.on(stream), vals, vals + n, val); - } - description->syncAllStreams(); - cudaSetDevice(current_device); - } - - /** - * Copies the values of the diagonal blocks in this VertexData2D into the - * VertexData2D specified. - * @param other Pointer to the VertexData2D to copy into - */ - void copyTo(VertexData2D* other) { - const MatrixDecompositionDescription* otherDescr = - other->getDescription(); - // Do a quick check that the sizes of both block arrays are the same. - if (description->getBlockRows() == otherDescr->getBlockRows() && n == other->getN()) { - // Issue asynchronous copies for each block's data - for (int i = 0; i < description->getBlockRows(); i++) { - int32_t bId = description->getBlockId(i, i); - ValueType* copyFrom = getCurrent(bId); - ValueType* copyTo = other->getCurrent(bId); - cudaStream_t stream = description->getBlockStreams()[bId]; - cudaMemcpyAsync(copyTo, copyFrom, n * sizeof(ValueType), cudaMemcpyDefault, stream); - } - // Synchronize the streams after the copies are done - for (int i = 0; i < description->getBlockRows(); i++) { - int32_t bId = description->getBlockId(i, i); - cudaStream_t stream = description->getBlockStreams()[bId]; - cudaStreamSynchronize(stream); - } - } - } - - /** - * This method implements a row-wise reduction of each blocks data into a - * single array for each row. The block on the diagonal will have the result. 
- */ - template - void rowReduce() { - int current_device; - cudaGetDevice(¤t_device); - Operator op; - - // For each row in the decomposition: - int32_t numRows = description->getBlockRows(); - std::vector blockIds; - for (int32_t i = 0; i < numRows; i++) { - // Put all the block ids for the row into a vector, with the ID of the diagonal block - // at index 0. - std::vector blockIds; - blockIds.push_back(-1); - for (int32_t j = 0; j < numRows; j++) { - if (i == j) { - blockIds[0] = description->getBlockId(i, j); - } - else { - blockIds.push_back(description->getBlockId(i, j)); - } - } - - // Do a binary tree reduction. At each step the primary buffer of the sender is - // copied into the secondary buffer of the receiver. After the copy is done - // each receiver performs the reduction operator and stores the result in it's - // primary buffer. - for (int32_t j = 2; (j / 2) < numRows; j *= 2) { - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id] is the receiver - int32_t receiverId = blockIds[id]; - - // blockIds[id + j/2] is the sender - int32_t senderId = blockIds[id + j / 2]; - - // Get the stream associated with the receiver's block id - cudaStream_t stream = description->getBlockStreams()[receiverId]; - - // Copy from the sender to the receiver (use stream associated with receiver) - cudaMemcpyAsync(values[receiverId].Alternate(), - values[senderId].Current(), - sizeof(ValueType) * n, - cudaMemcpyDefault, - stream); - - // Invoke the reduction operator on the receiver's GPU and values arrays. - cudaSetDevice(description->getDeviceAssignments()[receiverId]); - ValueType* input1 = values[receiverId].Alternate(); - ValueType* input2 = values[receiverId].Current(); - thrust::transform(thrust::cuda::par.on(stream), - input1, - input1 + n, - input2, - input2, - op); - } - } - // Sync all active streams before next step - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id] is the receiver - int32_t receiverId = blockIds[id]; - - // Set the device to the receiver and sync the stream - cudaSetDevice(description->getDeviceAssignments()[receiverId]); - cudaStreamSynchronize(description->getBlockStreams()[receiverId]); - } - } - } - } - - cudaSetDevice(current_device); - } - - /** - * This method implements a column-wise reduction of each blocks data into a - * single array for each column. The block on the diagonal will have the result. - */ - template - void columnReduce() { - int current_device; - cudaGetDevice(¤t_device); - Operator op; - - // For each column in the decomposition: - int32_t numRows = description->getBlockRows(); - std::vector blockIds; - for (int32_t i = 0; i < numRows; i++) { - // Put all the block ids for the row into a vector, with the ID of the diagonal block - // at index 0. - std::vector blockIds; - blockIds.push_back(-1); - for (int32_t j = 0; j < numRows; j++) { - if (i == j) { - blockIds[0] = description->getBlockId(j, i); - } - else { - blockIds.push_back(description->getBlockId(j, i)); - } - } - - // Do a binary tree reduction. At each step the primary buffer of the sender is - // copied into the secondary buffer of the receiver. After the copy is done - // each receiver performs the reduction operator and stores the result in it's - // primary buffer. 
- for (int32_t j = 2; (j / 2) < numRows; j *= 2) { - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id] is the receiver - int32_t receiverId = blockIds[id]; - - // blockIds[id + j/2] is the sender - int32_t senderId = blockIds[id + j / 2]; - - // Get the stream associated with the receiver's block id - cudaStream_t stream = description->getBlockStreams()[receiverId]; - - // Copy from the sender to the receiver (use stream associated with receiver) - cudaMemcpyAsync(values[receiverId].Alternate(), - values[senderId].Current(), - sizeof(ValueType) * n, - cudaMemcpyDefault, - stream); - - // Invoke the reduction operator on the receiver's GPU and values arrays. - cudaSetDevice(description->getDeviceAssignments()[receiverId]); - ValueType* input1 = values[receiverId].Alternate(); - ValueType* input2 = values[receiverId].Current(); - thrust::transform(thrust::cuda::par.on(stream), - input1, - input1 + n, - input2, - input2, - op); - } - } - // Sync all active streams before next step - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id] is the receiver - int32_t receiverId = blockIds[id]; - - // Set the device to the receiver and sync the stream - cudaSetDevice(description->getDeviceAssignments()[receiverId]); - cudaStreamSynchronize(description->getBlockStreams()[receiverId]); - } - } - } - } - - cudaSetDevice(current_device); - } - - /** - * This implements a column-wise scatter of the global data from the corresponding - * row. i.e. The data reduced from row 1 is broadcast to all blocks in - * column 1. It is assumed that the data to broadcast is located in the block on - * the diagonal. - */ - void columnScatter() { - int current_device; - cudaGetDevice(¤t_device); - - // For each column in the decomposition: - int32_t numRows = description->getBlockRows(); - std::vector blockIds; - for (int32_t i = 0; i < numRows; i++) { - // Put all the block ids for the column into a vector, with the ID of the diagonal block - // at index 0. - std::vector blockIds; - blockIds.push_back(-1); - for (int32_t j = 0; j < numRows; j++) { - if (i == j) { - blockIds[0] = description->getBlockId(j, i); - } - else { - blockIds.push_back(description->getBlockId(j, i)); - } - } - - // Do a binary tree scatter. At each step the primary buffer of the sender is - // copied into the primary buffer of the receiver. - int32_t max2pow = 2; - while (max2pow < numRows) { - max2pow *= 2; - } - for (int32_t j = max2pow; j >= 2; j /= 2) { - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id] is the sender - int32_t senderId = blockIds[id]; - - // blockIds[id + j/2] is the sender - int32_t receiverId = blockIds[id + j / 2]; - - // Get the stream associated with the receiver's block id - cudaStream_t stream = description->getBlockStreams()[receiverId]; - - // Copy from the sender to the receiver (use stream associated with receiver) - cudaMemcpyAsync(values[receiverId].Current(), - values[senderId].Current(), - sizeof(ValueType) * n, - cudaMemcpyDefault, - stream); - } - } - // Synchronize all the active streams before next step. 
- for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id + j/2] is the sender - int32_t receiverId = blockIds[id + j / 2]; - - // Set device and sync receiver's stream - cudaSetDevice(description->getDeviceAssignments()[receiverId]); - cudaStreamSynchronize(description->getBlockStreams()[receiverId]); - } - } - } - } - - cudaSetDevice(current_device); - } - - /** - * This implements a row-wise scatter of the global data from the corresponding - * column. i.e. The data reduced from column 1 is broadcast to all blocks in - * row 1. It is assumed that the data to broadcast is located in the block on - * the diagonal. - */ - void rowScatter() { - int current_device; - cudaGetDevice(¤t_device); - - // For each row in the decomposition: - int32_t numRows = description->getBlockRows(); - std::vector blockIds; - for (int32_t i = 0; i < numRows; i++) { - // Put all the block ids for the column into a vector, with the ID of the diagonal block - // at index 0. - std::vector blockIds; - blockIds.push_back(-1); - for (int32_t j = 0; j < numRows; j++) { - if (i == j) { - blockIds[0] = description->getBlockId(i, j); - } - else { - blockIds.push_back(description->getBlockId(i, j)); - } - } - - // Do a binary tree scatter. At each step the primary buffer of the sender is - // copied into the primary buffer of the receiver. - int32_t max2pow = 2; - while (max2pow < numRows) { - max2pow *= 2; - } - for (int32_t j = max2pow; j >= 2; j /= 2) { - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id] is the sender - int32_t senderId = blockIds[id]; - - // blockIds[id + j/2] is the receiver - int32_t receiverId = blockIds[id + j / 2]; - - // Get the stream associated with the receiver's block id - cudaStream_t stream = description->getBlockStreams()[receiverId]; - - // Copy from the sender to the receiver (use stream associated with receiver) - cudaMemcpyAsync(values[receiverId].Current(), - values[senderId].Current(), - sizeof(ValueType) * n, - cudaMemcpyDefault, - stream); - } - } - // Sync all the active streams before next step - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id + j/2] is the receiver - int32_t receiverId = blockIds[id + j / 2]; - - // Set device and sync receiver's stream - cudaSetDevice(description->getDeviceAssignments()[receiverId]); - cudaStreamSynchronize(description->getBlockStreams()[receiverId]); - } - } - } - } - - cudaSetDevice(current_device); - } - - /** - * Outputs a human readable string representation of this Vertex2d object. This is only - * intended to be used for de-bugging. 
- * @return Human readable string representation - */ - std::string toString() { - std::stringstream ss; - ValueType* c = (ValueType*) malloc(sizeof(ValueType) * n); - ValueType* a = (ValueType*) malloc(sizeof(ValueType) * n); - - int32_t numBlocks = description->getNumBlocks(); - - ss << "Vertex2d:\n"; - for (int32_t i = 0; i < numBlocks; i++) { - ss << "Block " << i << ":\n"; - ss << "Idx\tCur\tAlt\n"; - cudaMemcpy(c, values[i].Current(), sizeof(ValueType) * n, cudaMemcpyDefault); - cudaMemcpy(a, values[i].Alternate(), sizeof(ValueType) * n, cudaMemcpyDefault); - for (int32_t j = 0; j < n; j++) { - ss << j << ":\t" << c[j] << "\t" << a[j] << "\n"; - } - } - - free(c); - free(a); - - return ss.str(); - } - }; - - template - class VertexData2D_Unbuffered { - const MatrixDecompositionDescription* description; - int32_t n; - std::vector values; - - public: - /** - * Sets up a VertexData2D_Unbuffered object with an element allocated for each vertex - * in each block. - * @param descr Pointer to a MatrixDecompositionDescription object describing the layout - * of the 2D blocks. - */ - VertexData2D_Unbuffered(const MatrixDecompositionDescription* descr) : - description(descr) { - // Resize the values array to be the same size as number of blocks - values.resize(descr->getNumBlocks()); - - // Grab the current device id to switch back after allocations are done - int current_device; - cudaGetDevice(¤t_device); - LocalType allocSize = descr->getOffset(); - n = allocSize; - // Allocate the data for each block - for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { - int device = descr->getDeviceAssignments()[i]; - cudaSetDevice(device); - cudaMalloc(&(values[i]), sizeof(ValueType) * n); - } - - // Set the device back to what it was initially - cudaSetDevice(current_device); - } - - /** - * Sets up a VertexData2D_Unbuffered object with _n elements allocated per block. - * @param descr Pointer to a MatrixDecompositionDescription object describing the layout - * of the 2D blocks. - * @param _n The number of elements to allocate per block. - */ - VertexData2D_Unbuffered(const MatrixDecompositionDescription* descr, - size_t _n) : - description(descr), n(_n) { - // Resize the values array to be the same size as number of blocks - values.resize(descr->getNumBlocks()); - - // Grab the current device id to switch back after allocations are done - int current_device; - cudaGetDevice(¤t_device); - // Allocate the data for each block - for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { - int device = descr->getDeviceAssignments()[i]; - cudaSetDevice(device); - cudaMalloc(&(values[i]), sizeof(ValueType) * n); - } - - // Set the device back to what it was initially - cudaSetDevice(current_device); - } - - /** - * Destructor. Frees all allocated memory. - */ - ~VertexData2D_Unbuffered() { - for (size_t i = 0; i < values.size(); i++) { - if (values[i]) { - cudaFree(values[i]); - } - } - } - - /** - * Fills the elements of the data array with the given value. - * The elements on the diagonal are filled with the given value. After filling, - * either rowScatter or columnScatter will copy the values across the blocks in - * either the rows or columns depending on the use. 
- * @param val The value to fill the array with - */ - void fillElements(ValueType val) { - int current_device; - cudaGetDevice(¤t_device); - int32_t numRows = description->getBlockRows(); - for (int32_t i = 0; i < numRows; i++) { - int32_t blockId = description->getBlockId(i, i); - ValueType* vals = get(blockId); - int deviceId = description->getDeviceAssignments()[blockId]; - cudaStream_t stream = description->getBlockStreams()[blockId]; - cudaSetDevice(deviceId); - thrust::fill(thrust::cuda::par.on(stream), vals, vals + n, val); - } - description->syncAllStreams(); - cudaSetDevice(current_device); - } - - /** - * This implements a column-wise scatter of the global data from the corresponding - * row. i.e. The data reduced from row 1 is broadcast to all blocks in - * column 1. It is assumed that the data to broadcast is located in the block on - * the diagonal. - */ - void columnScatter() { - int current_device; - cudaGetDevice(¤t_device); - - // For each column in the decomposition: - int32_t numRows = description->getBlockRows(); - std::vector blockIds; - for (int32_t i = 0; i < numRows; i++) { - // Put all the block ids for the column into a vector, with the ID of the diagonal block - // at index 0. - std::vector blockIds; - blockIds.push_back(-1); - for (int32_t j = 0; j < numRows; j++) { - if (i == j) { - blockIds[0] = description->getBlockId(j, i); - } - else { - blockIds.push_back(description->getBlockId(j, i)); - } - } - - // Do a binary tree scatter. At each step the primary buffer of the sender is - // copied into the primary buffer of the receiver. - int32_t max2pow = 2; - while (max2pow < numRows) { - max2pow *= 2; - } - for (int32_t j = max2pow; j >= 2; j /= 2) { - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id] is the sender - int32_t senderId = blockIds[id]; - - // blockIds[id + j/2] is the sender - int32_t receiverId = blockIds[id + j / 2]; - - // Get the stream associated with the receiver's block id - cudaStream_t stream = description->getBlockStreams()[receiverId]; - - // Copy from the sender to the receiver (use stream associated with receiver) - cudaMemcpyAsync(values[receiverId], - values[senderId], - sizeof(ValueType) * n, - cudaMemcpyDefault, - stream); - } - } - // Synchronize all the active streams before next step. - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id + j/2] is the sender - int32_t receiverId = blockIds[id + j / 2]; - - // Set device and sync receiver's stream - cudaSetDevice(description->getDeviceAssignments()[receiverId]); - cudaStreamSynchronize(description->getBlockStreams()[receiverId]); - } - } - } - } - - cudaSetDevice(current_device); - } - - /** - * This implements a row-wise scatter of the global data from the corresponding - * column. i.e. The data reduced from column 1 is broadcast to all blocks in - * row 1. It is assumed that the data to broadcast is located in the block on - * the diagonal. - */ - void rowScatter() { - int current_device; - cudaGetDevice(¤t_device); - - // For each row in the decomposition: - int32_t numRows = description->getBlockRows(); - std::vector blockIds; - for (int32_t i = 0; i < numRows; i++) { - // Put all the block ids for the column into a vector, with the ID of the diagonal block - // at index 0. 
- std::vector blockIds; - blockIds.push_back(-1); - for (int32_t j = 0; j < numRows; j++) { - if (i == j) { - blockIds[0] = description->getBlockId(i, j); - } - else { - blockIds.push_back(description->getBlockId(i, j)); - } - } - - // Do a binary tree scatter. At each step the primary buffer of the sender is - // copied into the primary buffer of the receiver. - int32_t max2pow = 2; - while (max2pow < numRows) { - max2pow *= 2; - } - for (int32_t j = max2pow; j >= 2; j /= 2) { - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id] is the sender - int32_t senderId = blockIds[id]; - - // blockIds[id + j/2] is the receiver - int32_t receiverId = blockIds[id + j / 2]; - - // Get the stream associated with the receiver's block id - cudaStream_t stream = description->getBlockStreams()[receiverId]; - - // Copy from the sender to the receiver (use stream associated with receiver) - cudaMemcpyAsync(values[receiverId], - values[senderId], - sizeof(ValueType) * n, - cudaMemcpyDefault, - stream); - } - } - // Sync all the active streams before next step - for (int32_t id = 0; id < numRows; id++) { - if (id % j == 0 && id + j / 2 < numRows) { - // blockIds[id + j/2] is the receiver - int32_t receiverId = blockIds[id + j / 2]; - - // Set device and sync receiver's stream - cudaSetDevice(description->getDeviceAssignments()[receiverId]); - cudaStreamSynchronize(description->getBlockStreams()[receiverId]); - } - } - } - } - - cudaSetDevice(current_device); - } - - /** - * Getter for n - * @return The value of n - */ - int32_t getN() { - return n; - } - - /** - * Gets the pointer to the allocated memory for a specified block. - * @param bId The block id to get the memory for. - * @return A pointer to the allocated memory for the given block. - */ - ValueType* get(int32_t bId) { - return values[bId]; - } - }; - - /** - * This method takes in COO format matrix data and a MatrixDecompositionDescription and - * returns a Matrix2d object containing the given data. 
- */ - template - Matrix2d COOto2d(MatrixDecompositionDescription descr, - GlobalType* rowIds, - GlobalType* colIds, - ValueType* values) { - // Grab the current device id to switch back after allocations are done - int current_device; - cudaGetDevice(¤t_device); - - int32_t blockCount = descr.getNumBlocks(); - - // Allocate array of size global nnz to hold the block labels - int32_t* blockLabels = (int32_t*) malloc(descr.getNnz() * sizeof(int32_t)); - - // Allocate array to contain row counts for each block and initialize to zero - // Allocate array to contain position offsets for writing each blocks data - LocalType* blockCounts = (LocalType*) malloc(blockCount * sizeof(LocalType)); - LocalType* blockPos = (LocalType*) malloc(blockCount * sizeof(LocalType)); - for (int i = 0; i < blockCount; i++) { - blockCounts[i] = 0; - blockPos[i] = 0; - } - - // For each edge mark in the array the id of the block to which it will belong - int32_t blockId; - LocalType localRow; - LocalType localCol; - for (int i = 0; i < descr.getNnz(); i++) { - descr.convertGlobaltoLocalRow(rowIds[i], colIds[i], localRow, localCol, blockId); - blockLabels[i] = blockId; - blockCounts[blockId]++; - } - - // Allocate arrays for putting each blocks data into - LocalType** blockRowIds = (LocalType**) malloc(blockCount * sizeof(LocalType*)); - LocalType** blockColIds = (LocalType**) malloc(blockCount * sizeof(LocalType*)); - ValueType** blockValues = NULL; - if (values) - blockValues = (ValueType**) malloc(blockCount * sizeof(ValueType*)); - for (int i = 0; i < blockCount; i++) { - blockRowIds[i] = (LocalType*) malloc(blockCounts[i] * sizeof(LocalType)); - blockColIds[i] = (LocalType*) malloc(blockCounts[i] * sizeof(LocalType)); - if (values) - blockValues[i] = (ValueType*) malloc(blockCounts[i] * sizeof(ValueType)); - } - - // Convert each blocks global rows to local ids and copy into block arrays - for (int i = 0; i < descr.getNnz(); i++) { - descr.convertGlobaltoLocalRow(rowIds[i], colIds[i], localRow, localCol, blockId); - blockRowIds[blockId][blockPos[blockId]] = localRow; - blockColIds[blockId][blockPos[blockId]] = localCol; - if (values) - blockValues[blockId][blockPos[blockId]] = values[i]; - blockPos[blockId]++; - } - - // Allocate the result blocks vector - std::vector*> blockVector(blockCount); - - // Convert each blocks COO rows into CSR and create it's graph object. - for (int i = 0; i < blockCount; i++) { - // Set the device as indicated so the data ends up on the right GPU - cudaSetDevice(descr.getDeviceAssignments()[i]); - cudaStream_t stream = descr.getBlockStreams()[i]; - - if (blockCounts[i] > 0) { - CSR_Result_Weighted result; - ConvertCOOtoCSR_weighted(blockRowIds[i], - blockColIds[i], - values ? 
blockValues[i] : NULL, - (int64_t) blockCounts[i], - (descr.getOffset() - 1), - result); - MultiValuedCsrGraph* csrGraph = new MultiValuedCsrGraph((size_t) result.size, (size_t) result.nnz, stream); - if (values) - csrGraph->allocateEdgeData(1, NULL); - cudaMemcpy(csrGraph->get_raw_row_offsets(), - result.rowOffsets, - (result.size + 1) * sizeof(LocalType), - cudaMemcpyDefault); - cudaMemcpy(csrGraph->get_raw_column_indices(), - result.colIndices, - result.nnz * sizeof(LocalType), - cudaMemcpyDefault); - if (values) - cudaMemcpy(csrGraph->get_raw_edge_dim(0), - result.edgeWeights, - result.nnz * sizeof(LocalType), - cudaMemcpyDefault); - blockVector[i] = csrGraph; - result.Destroy(); - } - else { - MultiValuedCsrGraph* csrGraph = new MultiValuedCsrGraph((size_t) descr.getOffset(), (size_t) 0, stream); - cudaMemset( csrGraph->get_raw_row_offsets(), - 0, - sizeof(LocalType) * (descr.getOffset() + 1)); - blockVector[i] = csrGraph; - } - } - - // Free temporary memory - for (int i = 0; i < blockCount; i++) { - free(blockRowIds[i]); - free(blockColIds[i]); - if (values) - free(blockValues[i]); - } - free(blockRowIds); - free(blockColIds); - if (values) - free(blockValues); - - cudaSetDevice(current_device); - - // Put it all together into a Matrix2d object for return - return Matrix2d(descr, blockVector); - } + template + struct CSR_Result_Weighted { + int64_t size; + int64_t nnz; + T* rowOffsets; + T* colIndices; + W* edgeWeights; + + CSR_Result_Weighted() : + size(0), nnz(0), rowOffsets(NULL), colIndices(NULL), edgeWeights(NULL) { + } + + void Destroy() { + cudaStream_t stream{nullptr}; + if (rowOffsets) + RMM_FREE(rowOffsets, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + if (colIndices) + RMM_FREE(colIndices, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + if (edgeWeights) + RMM_FREE(edgeWeights, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + } + }; + + // Define kernel for copying run length encoded values into offset slots. + template + __global__ void offsetsKernel(T runCounts, T* unique, T* counts, T* offsets) { + for (int32_t idx = blockDim.x * blockIdx.x + threadIdx.x; + idx < runCounts; + idx += gridDim.x * blockDim.x) { + offsets[unique[idx]] = counts[idx]; + } + } + + /** + * Method for converting COO to CSR format + * @param sources The array of source indices + * @param destinations The array of destination indices + * @param edgeWeights The array of edge weights + * @param nnz The number of non zero values + * @param maxId The largest id contained in the matrix + * @param result The result is stored here. + */ + template + void ConvertCOOtoCSR_weighted(T* sources, + T* destinations, + W* edgeWeights, + int64_t nnz, + T maxId, + CSR_Result_Weighted& result) { + // Sort source and destination columns by source + // Allocate local memory for operating on + T* srcs, *dests; + W* weights = NULL; + cudaStream_t stream{nullptr}; + + RMM_ALLOC(&srcs, sizeof(T) * nnz, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. 
+ RMM_ALLOC(&dests, sizeof(T) * nnz, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + if (edgeWeights) + RMM_ALLOC(&weights, sizeof(W) * nnz, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + cudaMemcpy(srcs, sources, sizeof(T) * nnz, cudaMemcpyDefault); + cudaMemcpy(dests, destinations, sizeof(T) * nnz, cudaMemcpyDefault); + if (edgeWeights) + cudaMemcpy(weights, edgeWeights, sizeof(W) * nnz, cudaMemcpyDefault); + + // Call Thrust::sort_by_key to sort the arrays with srcs as keys: + if (edgeWeights) + thrust::sort_by_key(thrust::device, + srcs, + srcs + nnz, + thrust::make_zip_iterator(thrust::make_tuple(dests, weights))); + else + thrust::sort_by_key(thrust::device, srcs, srcs + nnz, dests); + + result.size = maxId + 1; + + // Allocate offsets array + RMM_ALLOC(&result.rowOffsets, (maxId + 2) * sizeof(T), stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + + // Set all values in offsets array to zeros + cudaMemset(result.rowOffsets, 0, (maxId + 2) * sizeof(T)); + + // Allocate temporary arrays same size as sources array, and single value to get run counts + T* unique, *counts, *runCount; + RMM_ALLOC(&unique, (maxId + 1) * sizeof(T), stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_ALLOC(&counts, (maxId + 1) * sizeof(T), stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_ALLOC(&runCount, sizeof(T), stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + + // Use CUB run length encoding to get unique values and run lengths + void *tmpStorage = NULL; + size_t tmpBytes = 0; + cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); + RMM_ALLOC(&tmpStorage, tmpBytes, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); + RMM_FREE(tmpStorage, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + + // Set offsets to run sizes for each index + T runCount_h; + cudaMemcpy(&runCount_h, runCount, sizeof(T), cudaMemcpyDefault); + int threadsPerBlock = 1024; + int numBlocks = min(65535, (runCount_h + threadsPerBlock - 1) / threadsPerBlock); + offsetsKernel<<>>(runCount_h, unique, counts, result.rowOffsets); + + // Scan offsets to get final offsets + thrust::exclusive_scan(thrust::device, + result.rowOffsets, + result.rowOffsets + maxId + 2, + result.rowOffsets); + + // Clean up temporary allocations + result.nnz = nnz; + result.colIndices = dests; + result.edgeWeights = weights; + RMM_FREE(srcs, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. 
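As a concrete trace of the conversion above (sort by source, run-length encode, scatter the run counts, exclusive scan), the short host-side sketch below reproduces the row-offset logic with std::vector. It is illustrative only and not part of the patch; the device path gets the same numbers from cub::DeviceRunLengthEncode::Encode, offsetsKernel and thrust::exclusive_scan.

#include <numeric>
#include <vector>

int main() {
  // Sorted COO sources for a 4-vertex graph (maxId = 3), nnz = 5.
  std::vector<int> srcs = {0, 0, 2, 2, 2};
  std::vector<int> rowOffsets(3 + 2, 0);   // maxId + 2 entries, zero-initialised

  // Run-length encode srcs and scatter each run length to rowOffsets[value]
  // (the role of cub::DeviceRunLengthEncode::Encode plus offsetsKernel).
  for (std::size_t i = 0; i < srcs.size();) {
    std::size_t run = 1;
    while (i + run < srcs.size() && srcs[i + run] == srcs[i]) ++run;
    rowOffsets[srcs[i]] = static_cast<int>(run);
    i += run;
  }
  // rowOffsets is now {2, 0, 3, 0, 0}

  // Exclusive scan turns per-row counts into CSR row offsets.
  std::exclusive_scan(rowOffsets.begin(), rowOffsets.end(), rowOffsets.begin(), 0);
  // rowOffsets is now {0, 2, 2, 5, 5}: row i's entries live in
  // dests[rowOffsets[i]] .. dests[rowOffsets[i + 1]).
  return 0;
}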
+ RMM_FREE(unique, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(counts, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(runCount, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + } + + /** + * Describes the 2D decomposition of a partitioned matrix. + */ + template + class MatrixDecompositionDescription { + protected: + GlobalType numRows; // Global number of rows in matrix + GlobalType numCols; // Global number of columns in matrix + GlobalType nnz; // Global number of non-zeroes in matrix + GlobalType blockRows; // Number of rows of blocks in the decomposition + GlobalType blockCols; // Number of columns of rows in the decomposition + LocalType offset; + // Offsets-like arrays for rows and columns defining the start/end of the + // sections of the global id space belonging to each row and column. + std::vector rowOffsets; + std::vector colOffsets; + // Array of integers one for each block, defining the device it is assigned to + std::vector deviceAssignments; + std::vector blockStreams; + public: + + MatrixDecompositionDescription() : + numRows(0), numCols(0), nnz(0), blockRows(0), blockCols(0) { + rowOffsets.push_back(0); + colOffsets.push_back(0); + deviceAssignments.push_back(0); + } + + // Basic constructor, just takes in the values of its members. + MatrixDecompositionDescription(GlobalType numRows, + GlobalType numCols, + GlobalType nnz, + GlobalType blockRows, + GlobalType blockCols, + std::vector rowOffsets, + std::vector colOffsets, + std::vector deviceAssignments) : + numRows(numRows), numCols(numCols), nnz(nnz), blockRows(blockRows), + blockCols(blockCols), rowOffsets(rowOffsets), colOffsets(colOffsets), + deviceAssignments(deviceAssignments) { + } + + // Constructs a MatrixDecompositionDescription for a square matrix given the + // number of rows in the matrix and number of rows of blocks. + MatrixDecompositionDescription(GlobalType numRows, + GlobalType numBlockRows, + GlobalType nnz, + std::vector devices) : + numRows(numRows), + numCols(numRows), + blockRows(numBlockRows), + blockCols(numBlockRows), + nnz(nnz) { + // Tracking the current set device to change back + int currentDevice; + cudaGetDevice(¤tDevice); + + // Setting up the row and col offsets into equally sized chunks + GlobalType remainder = numRows % blockRows; + if (remainder != 0) + offset = (numRows + blockRows - remainder) / blockRows; + else + offset = numRows / blockRows; + + rowOffsets.resize(blockRows + 1); + colOffsets.resize(blockRows + 1); + for (int i = 0; i < blockRows; i++) { + rowOffsets[i] = i * offset; + colOffsets[i] = i * offset; + } + rowOffsets.back() = blockRows * offset; + colOffsets.back() = blockCols * offset; + + // Setting up the device assignments using the given device ids and also + // setting up the stream associated with each block. 
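// For illustration only: with numRows = 10 and blockRows = 3, remainder = 1,
// so offset = (10 + 3 - 1) / 3 = 4 and rowOffsets = colOffsets = {0, 4, 8, 12};
// the last row/column of blocks covers the padded id range [8, 12) even though
// only global ids 8 and 9 exist. With devices = {0, 1} (hypothetical), the
// nine blocks below are assigned round-robin: 0, 1, 0, 1, 0, 1, 0, 1, 0.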
+ deviceAssignments.resize(getNumBlocks()); + blockStreams.resize(getNumBlocks()); + for (int i = 0; i < getNumBlocks(); i++) { + int device = devices[i % devices.size()]; + deviceAssignments[i] = device; + cudaSetDevice(device); + cudaStream_t stream; + cudaStreamCreate(&stream); + blockStreams[i] = stream; + } + + // Restoring to current device when called + cudaSetDevice(currentDevice); + } + + // Gets the row id for the block containing the given global row id + int32_t getRowId(GlobalType val) const { + return std::upper_bound(rowOffsets.begin(), rowOffsets.end(), val) - rowOffsets.begin() - 1; + } + + // Gets the column id for the block containing the given global column id + int32_t getColId(GlobalType val) const { + return std::upper_bound(colOffsets.begin(), colOffsets.end(), val) - colOffsets.begin() - 1; + } + + // Gets the number of blocks in the decomposition: + int32_t getNumBlocks() const { + return blockRows * blockCols; + } + + // Getter for offset + LocalType getOffset() const { + return offset; + } + + // Getter for deviceAssignments + const std::vector& getDeviceAssignments() const { + return deviceAssignments; + } + + /** + * Getter for vector of streams for each block. + * @return Reference to vector of streams for each block + */ + const std::vector& getBlockStreams() const { + return blockStreams; + } + + /** + * Getter for nnz + * @return The global number of non-zero elements + */ + GlobalType getNnz() const { + return nnz; + } + + /** + * Getter method for numRows + * @return The number of global rows in the matrix + */ + GlobalType getNumRows() const { + return numRows; + } + + /** + * Getter for BlockRows + * @return The number of blocks in a row in the decomposition. + */ + GlobalType getBlockRows() const { + return blockRows; + } + + /** + * Getter for BlockCols + * @return The number of blocks in a column in the decomposition. + */ + GlobalType getBlockCols() const { + return blockCols; + } + + /** + * Given a block id, returns the row which that block is in. + * @param bId The block ID + * @return The row number + */ + int32_t getBlockRow(int32_t bId) const { + return bId / blockCols; + } + + /** + * Given a block id, returns the column which that block is in. + * @param bId The block ID + * @return The column number + */ + int32_t getBlockCol(int32_t bId) const { + return bId % blockCols; + } + + /** + * Takes a COO global row and produces the COO local row and the block to which it belongs. + * @param globalRow The global row ID + * @param globalCol The global column ID + * @param localRow The block local row ID (return) + * @param localCol The block local column ID (return) + * @param blockId The block ID (return) + */ + void convertGlobaltoLocalRow(GlobalType globalRow, + GlobalType globalCol, + LocalType& localRow, + LocalType& localCol, + int32_t& blockId) const { + int32_t rowId = getRowId(globalRow); + int32_t colId = getColId(globalCol); + blockId = rowId * blockCols + colId; + localRow = globalRow - rowOffsets[rowId]; + localCol = globalCol - colOffsets[colId]; + } + + /** + * Takes in a row ID and column ID and returns the corresponding block ID + * @param rowId The row ID + * @param colId The column ID + * @return The ID of the corresponding block + */ + int32_t getBlockId(int32_t rowId, int32_t colId) const { + return rowId * blockCols + colId; + } + + /** + * Helper method to synchronize all streams after operations are issued. 
+ */ + void syncAllStreams() const { + int32_t numBlocks = getNumBlocks(); + int32_t current_device; + cudaGetDevice(¤t_device); + for (int32_t i = 0; i < numBlocks; i++) { + cudaSetDevice(deviceAssignments[i]); + cudaStreamSynchronize(blockStreams[i]); + } + cudaSetDevice(current_device); + } + + /** + * This method is only for testing and debugging use. + * @return A human readable string representation of the object + */ + std::string toString() const { + std::stringstream ss; + ss << "Global Info:\n\tnumRows: " << numRows << ", numCols: " << numCols << ", nnz: " + << nnz; + ss << "\n"; + ss << "Block Info:\n\tblockRows: " << blockRows << ", blockCols: " << blockCols; + ss << "\n"; + ss << "rowOffsets: ["; + for (int i = 0; i < (int) rowOffsets.size(); i++) + ss << rowOffsets[i] << (i == (int) rowOffsets.size() - 1 ? "]\n" : ", "); + ss << "colOffsets: ["; + for (int i = 0; i < (int) colOffsets.size(); i++) + ss << colOffsets[i] << (i == (int) colOffsets.size() - 1 ? "]\n" : ", "); + ss << "deviceAssignments: ["; + for (int i = 0; i < (int) deviceAssignments.size(); i++) + ss << deviceAssignments[i] << (i == (int) deviceAssignments.size() - 1 ? "]\n" : ", "); + return ss.str(); + } + }; + + template + class Matrix2d { + protected: + // Description of the matrix decomposition + MatrixDecompositionDescription description; + + // Array of block matrices forming the decomposition + std::vector*> blocks; + public: + Matrix2d() { + } + Matrix2d(MatrixDecompositionDescription descr, + std::vector*> blocks) : + description(descr), blocks(blocks) { + } + + const MatrixDecompositionDescription& getMatrixDecompositionDescription() { + return description; + } + + MultiValuedCsrGraph* getBlockMatrix(int32_t bId) { + return blocks[bId]; + } + + std::string toString() { + std::stringstream ss; + ss << "MatrixDecompositionDescription:\n" << description.toString(); + for (int i = 0; i < (int) blocks.size(); i++) { + ss << "Block " << i << ":\n"; + size_t numVerts = blocks[i]->get_num_vertices(); + size_t numEdges = blocks[i]->get_num_edges(); + size_t numValues = blocks[i]->getNumValues(); + ss << "numVerts: " << numVerts << ", numEdges: " << numEdges << "\n"; + LocalType* rowOffsets = (LocalType*) malloc((numVerts + 1) * sizeof(LocalType)); + LocalType* colIndices = (LocalType*) malloc(numEdges * sizeof(LocalType)); + ValueType* values = NULL; + if (numValues > 0) + values = (ValueType*) malloc(numEdges * sizeof(ValueType)); + cudaMemcpy(rowOffsets, + blocks[i]->get_raw_row_offsets(), + (numVerts + 1) * sizeof(LocalType), + cudaMemcpyDefault); + cudaMemcpy(colIndices, + blocks[i]->get_raw_column_indices(), + numEdges * sizeof(LocalType), + cudaMemcpyDefault); + if (values) + cudaMemcpy(values, + blocks[i]->get_raw_edge_dim(0), + numEdges * sizeof(ValueType), + cudaMemcpyDefault); + int idxCount = numEdges >= (numVerts + 1) ? numEdges : (numVerts + 1); + ss << "Idx\tOffset\tColInd\tValue\n"; + for (int j = 0; j < idxCount; j++) { + if (j < (int) numVerts + 1 && j < (int) numEdges) + ss << j << ":\t" << rowOffsets[j] << "\t" << colIndices[j] << "\t" + << (values ? values[j] : 0) + << "\n"; + else if (j < (int) numVerts + 1 && j >= (int) numEdges) + ss << j << ":\t" << rowOffsets[j] << "\n"; + else if (j >= (int) numVerts + 1 && j < (int) numEdges) + ss << j << ":\t" << "\t" << colIndices[j] << "\t" << (values ? 
values[j] : 0) + << "\n"; + } + free(rowOffsets); + free(colIndices); + free(values); + } + return ss.str(); + } + }; + + template + class VertexData2D { + const MatrixDecompositionDescription* description; + int32_t n; + std::vector > values; + public: + /** + * Creates a VertexData2D object given a pointer to a MatrixDecompositionDescription + * object which describes the matrix the data is attached to. Data buffers are + * allocated for each block using the offset from the description to size the + * buffers, and to locate the buffers on the same GPU as the matrix block. + */ + VertexData2D(const MatrixDecompositionDescription* descr) : + description(descr) { + // Resize the values array to be the same size as number of blocks + values.resize(descr->getNumBlocks()); + + // Grab the current device id to switch back after allocations are done + int current_device; + cudaGetDevice(¤t_device); + LocalType allocSize = descr->getOffset(); + n = allocSize; + // Allocate the data for each block + cudaStream_t stream{nullptr}; + for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { + int device = descr->getDeviceAssignments()[i]; + cudaSetDevice(device); + ValueType* d_current, *d_alternate; + RMM_ALLOC(&d_current, sizeof(ValueType) * n, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_ALLOC(&d_alternate, sizeof(ValueType) * n, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + values[i].d_buffers[0] = d_current; + values[i].d_buffers[1] = d_alternate; + } + + // Set the device back to what it was initially + cudaSetDevice(current_device); + } + + /** + * Creates a VertexData2D object given a pointer to a MatrixDecompositionDescription + * object, which describes the matrix the data is attached to, and an integer which indicates + * how many data elements should be allocated for each block. Data buffers are allocated + * for each block using the offset from the description to size the buffers, and to locate + * the buffers on the same GPU as the matrix block. + */ + VertexData2D(const MatrixDecompositionDescription* descr, size_t _n) : + description(descr) { + // Resize the values array to be the same size as number of blocks + values.resize(descr->getNumBlocks()); + + // Grab the current device id to switch back after allocations are done + int current_device; + cudaGetDevice(¤t_device); + LocalType allocSize = _n; + n = allocSize; + // Allocate the data for each block + cudaStream_t stream{nullptr}; + for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { + int device = descr->getDeviceAssignments()[i]; + cudaSetDevice(device); + ValueType* d_current, *d_alternate; + RMM_ALLOC(&d_current, sizeof(ValueType) * n, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_ALLOC(&d_alternate, sizeof(ValueType) * n, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. 
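// For illustration only: each values[i] entry behaves like a cub::DoubleBuffer,
// i.e. it holds d_buffers[2] plus an integer selector, and Current() /
// Alternate() return d_buffers[selector] and d_buffers[selector ^ 1]. The
// swapBuffers() method further down therefore just flips the selector rather
// than copying data between the two per-block allocations.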
+ values[i].d_buffers[0] = d_current; + values[i].d_buffers[1] = d_alternate; + } + + // Set the device back to what it was initially + cudaSetDevice(current_device); + } + + ~VertexData2D() { + cudaStream_t stream{nullptr}; + for (size_t i = 0; i < values.size(); i++) { + if (values[i].Current()) + RMM_FREE(values[i].Current(), stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + if (values[i].Alternate()) + RMM_FREE(values[i].Alternate(), stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + } + } + + /** + * Getter for n the size of each block's allocation in elements. + * @return The value of n + */ + int32_t getN() { + return n; + } + + /** + * Getter for the MatrixDecompositionDescription associated with this VertexData2D + * @return Pointer to the MatrixDecompositionDescription for this VertexData2D + */ + const MatrixDecompositionDescription* getDescription() { + return description; + } + + /** + * Gets the current buffer corresponding to the given block ID + */ + ValueType* getCurrent(int bId) { + return values[bId].Current(); + } + + /** + * Gets the alternate buffer corresponding to the given block ID + */ + ValueType* getAlternate(int bId) { + return values[bId].Alternate(); + } + + /** + * Swaps the current and alternate buffers for all block IDs + */ + void swapBuffers() { + for (size_t i = 0; i < values.size(); i++) + values[i].selector ^= 1; + } + + /** + * Sets an element in the global array, assuming that the data is currently + * valid and in the diagonal blocks. After calling this method either columnScatter + * or rowScatter should be called to propagate the change to all blocks. + */ + void setElement(GlobalType globalIndex, ValueType val) { + LocalType blockId = globalIndex / n; + LocalType blockOffset = globalIndex % n; + int32_t bId = description->getBlockId(blockId, blockId); + ValueType* copyTo = values[bId].Current() + blockOffset; + cudaMemcpy(copyTo, &val, sizeof(ValueType), cudaMemcpyDefault); + } + + /** + * Sets the elements of the global array, using the provided array of values. The values + * are set in the blocks of the diagonal, columnScatter or rowScatter should be called + * to propogate to all blocks. + * @param vals Pointer to an array with the values to be set. + */ + void setElements(ValueType* vals) { + LocalType offset = description->getOffset(); + int32_t numRows = description->getBlockRows(); + for (int i = 0; i < numRows; i++) { + int32_t id = description->getBlockId(i, i); + cudaStream_t stream = description->getBlockStreams()[id]; + ValueType* copyFrom = vals + i * n; + ValueType* copyTo = values[id].Current(); + cudaMemcpyAsync(copyTo, copyFrom, sizeof(ValueType) * n, cudaMemcpyDefault, stream); + } + description->syncAllStreams(); + } + + /** + * Fills the elements of the data array with the given value. + * The elements on the diagonal are filled with the given value. After filling, + * either rowScatter or columnScatter will copy the values across the blocks in + * either the rows or columns depending on the use. 
+ * @param val The value to fill the array with + */ + void fillElements(ValueType val) { + int current_device; + cudaGetDevice(¤t_device); + int32_t numRows = description->getBlockRows(); + for (int32_t i = 0; i < numRows; i++) { + int32_t blockId = description->getBlockId(i, i); + ValueType* vals = getCurrent(blockId); + int deviceId = description->getDeviceAssignments()[blockId]; + cudaStream_t stream = description->getBlockStreams()[blockId]; + cudaSetDevice(deviceId); + thrust::fill(thrust::cuda::par.on(stream), vals, vals + n, val); + } + description->syncAllStreams(); + cudaSetDevice(current_device); + } + + /** + * Copies the values of the diagonal blocks in this VertexData2D into the + * VertexData2D specified. + * @param other Pointer to the VertexData2D to copy into + */ + void copyTo(VertexData2D* other) { + const MatrixDecompositionDescription* otherDescr = + other->getDescription(); + // Do a quick check that the sizes of both block arrays are the same. + if (description->getBlockRows() == otherDescr->getBlockRows() && n == other->getN()) { + // Issue asynchronous copies for each block's data + for (int i = 0; i < description->getBlockRows(); i++) { + int32_t bId = description->getBlockId(i, i); + ValueType* copyFrom = getCurrent(bId); + ValueType* copyTo = other->getCurrent(bId); + cudaStream_t stream = description->getBlockStreams()[bId]; + cudaMemcpyAsync(copyTo, copyFrom, n * sizeof(ValueType), cudaMemcpyDefault, stream); + } + // Synchronize the streams after the copies are done + for (int i = 0; i < description->getBlockRows(); i++) { + int32_t bId = description->getBlockId(i, i); + cudaStream_t stream = description->getBlockStreams()[bId]; + cudaStreamSynchronize(stream); + } + } + } + + /** + * This method implements a row-wise reduction of each blocks data into a + * single array for each row. The block on the diagonal will have the result. + */ + template + void rowReduce() { + int current_device; + cudaGetDevice(¤t_device); + Operator op; + + // For each row in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the row into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(i, j); + } + else { + blockIds.push_back(description->getBlockId(i, j)); + } + } + + // Do a binary tree reduction. At each step the primary buffer of the sender is + // copied into the secondary buffer of the receiver. After the copy is done + // each receiver performs the reduction operator and stores the result in it's + // primary buffer. + for (int32_t j = 2; (j / 2) < numRows; j *= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the receiver + int32_t receiverId = blockIds[id]; + + // blockIds[id + j/2] is the sender + int32_t senderId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId].Alternate(), + values[senderId].Current(), + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + + // Invoke the reduction operator on the receiver's GPU and values arrays. 
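// For illustration only, assuming a row of four blocks and
// Operator = thrust::plus<ValueType>, the tree runs two rounds:
//   j = 2 : blockIds[1] -> blockIds[0],  blockIds[3] -> blockIds[2]
//   j = 4 : blockIds[2] -> blockIds[0]
// Each arrow is the cudaMemcpyAsync above into the receiver's Alternate()
// buffer; the thrust::transform below then combines Alternate() with Current()
// element-wise, so the diagonal block (blockIds[0]) ends up holding the full
// row reduction in its Current() buffer.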
+ cudaSetDevice(description->getDeviceAssignments()[receiverId]); + ValueType* input1 = values[receiverId].Alternate(); + ValueType* input2 = values[receiverId].Current(); + thrust::transform(thrust::cuda::par.on(stream), + input1, + input1 + n, + input2, + input2, + op); + } + } + // Sync all active streams before next step + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the receiver + int32_t receiverId = blockIds[id]; + + // Set the device to the receiver and sync the stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * This method implements a column-wise reduction of each blocks data into a + * single array for each column. The block on the diagonal will have the result. + */ + template + void columnReduce() { + int current_device; + cudaGetDevice(¤t_device); + Operator op; + + // For each column in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the row into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(j, i); + } + else { + blockIds.push_back(description->getBlockId(j, i)); + } + } + + // Do a binary tree reduction. At each step the primary buffer of the sender is + // copied into the secondary buffer of the receiver. After the copy is done + // each receiver performs the reduction operator and stores the result in it's + // primary buffer. + for (int32_t j = 2; (j / 2) < numRows; j *= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the receiver + int32_t receiverId = blockIds[id]; + + // blockIds[id + j/2] is the sender + int32_t senderId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId].Alternate(), + values[senderId].Current(), + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + + // Invoke the reduction operator on the receiver's GPU and values arrays. + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + ValueType* input1 = values[receiverId].Alternate(); + ValueType* input2 = values[receiverId].Current(); + thrust::transform(thrust::cuda::par.on(stream), + input1, + input1 + n, + input2, + input2, + op); + } + } + // Sync all active streams before next step + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the receiver + int32_t receiverId = blockIds[id]; + + // Set the device to the receiver and sync the stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * This implements a column-wise scatter of the global data from the corresponding + * row. i.e. The data reduced from row 1 is broadcast to all blocks in + * column 1. It is assumed that the data to broadcast is located in the block on + * the diagonal. 
+ */ + void columnScatter() { + int current_device; + cudaGetDevice(¤t_device); + + // For each column in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the column into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(j, i); + } + else { + blockIds.push_back(description->getBlockId(j, i)); + } + } + + // Do a binary tree scatter. At each step the primary buffer of the sender is + // copied into the primary buffer of the receiver. + int32_t max2pow = 2; + while (max2pow < numRows) { + max2pow *= 2; + } + for (int32_t j = max2pow; j >= 2; j /= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the sender + int32_t senderId = blockIds[id]; + + // blockIds[id + j/2] is the sender + int32_t receiverId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId].Current(), + values[senderId].Current(), + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + } + } + // Synchronize all the active streams before next step. + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id + j/2] is the sender + int32_t receiverId = blockIds[id + j / 2]; + + // Set device and sync receiver's stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * This implements a row-wise scatter of the global data from the corresponding + * column. i.e. The data reduced from column 1 is broadcast to all blocks in + * row 1. It is assumed that the data to broadcast is located in the block on + * the diagonal. + */ + void rowScatter() { + int current_device; + cudaGetDevice(¤t_device); + + // For each row in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the column into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(i, j); + } + else { + blockIds.push_back(description->getBlockId(i, j)); + } + } + + // Do a binary tree scatter. At each step the primary buffer of the sender is + // copied into the primary buffer of the receiver. 
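// For illustration only, with four blocks in the row: max2pow becomes 4, so
// the broadcast runs the reduction tree in reverse order:
//   j = 4 : blockIds[0] -> blockIds[2]
//   j = 2 : blockIds[0] -> blockIds[1],  blockIds[2] -> blockIds[3]
// i.e. the diagonal block's Current() buffer fans out to every block in the
// row in log2(blockRows) rounds of cudaMemcpyAsync copies.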
+ int32_t max2pow = 2; + while (max2pow < numRows) { + max2pow *= 2; + } + for (int32_t j = max2pow; j >= 2; j /= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the sender + int32_t senderId = blockIds[id]; + + // blockIds[id + j/2] is the receiver + int32_t receiverId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId].Current(), + values[senderId].Current(), + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + } + } + // Sync all the active streams before next step + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id + j/2] is the receiver + int32_t receiverId = blockIds[id + j / 2]; + + // Set device and sync receiver's stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * Outputs a human readable string representation of this Vertex2d object. This is only + * intended to be used for de-bugging. + * @return Human readable string representation + */ + std::string toString() { + std::stringstream ss; + ValueType* c = (ValueType*) malloc(sizeof(ValueType) * n); + ValueType* a = (ValueType*) malloc(sizeof(ValueType) * n); + + int32_t numBlocks = description->getNumBlocks(); + + ss << "Vertex2d:\n"; + for (int32_t i = 0; i < numBlocks; i++) { + ss << "Block " << i << ":\n"; + ss << "Idx\tCur\tAlt\n"; + cudaMemcpy(c, values[i].Current(), sizeof(ValueType) * n, cudaMemcpyDefault); + cudaMemcpy(a, values[i].Alternate(), sizeof(ValueType) * n, cudaMemcpyDefault); + for (int32_t j = 0; j < n; j++) { + ss << j << ":\t" << c[j] << "\t" << a[j] << "\n"; + } + } + + free(c); + free(a); + + return ss.str(); + } + }; + + template + class VertexData2D_Unbuffered { + const MatrixDecompositionDescription* description; + int32_t n; + std::vector values; + + public: + /** + * Sets up a VertexData2D_Unbuffered object with an element allocated for each vertex + * in each block. + * @param descr Pointer to a MatrixDecompositionDescription object describing the layout + * of the 2D blocks. + */ + VertexData2D_Unbuffered(const MatrixDecompositionDescription* descr) : + description(descr) { + // Resize the values array to be the same size as number of blocks + values.resize(descr->getNumBlocks()); + + // Grab the current device id to switch back after allocations are done + int current_device; + cudaGetDevice(¤t_device); + LocalType allocSize = descr->getOffset(); + n = allocSize; + // Allocate the data for each block + cudaStream_t stream{nullptr}; + for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { + int device = descr->getDeviceAssignments()[i]; + cudaSetDevice(device); + RMM_ALLOC(&(values[i]), sizeof(ValueType) * n, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + } + + // Set the device back to what it was initially + cudaSetDevice(current_device); + } + + /** + * Sets up a VertexData2D_Unbuffered object with _n elements allocated per block. + * @param descr Pointer to a MatrixDecompositionDescription object describing the layout + * of the 2D blocks. 
+ * @param _n The number of elements to allocate per block. + */ + VertexData2D_Unbuffered(const MatrixDecompositionDescription* descr, + size_t _n) : + description(descr), n(_n) { + // Resize the values array to be the same size as number of blocks + values.resize(descr->getNumBlocks()); + + // Grab the current device id to switch back after allocations are done + int current_device; + cudaGetDevice(¤t_device); + // Allocate the data for each block + cudaStream_t stream{nullptr}; + for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) { + int device = descr->getDeviceAssignments()[i]; + cudaSetDevice(device); + RMM_ALLOC(&(values[i]), sizeof(ValueType) * n, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + } + + // Set the device back to what it was initially + cudaSetDevice(current_device); + } + + /** + * Destructor. Frees all allocated memory. + */ + ~VertexData2D_Unbuffered() { + cudaStream_t stream{nullptr}; + for (size_t i = 0; i < values.size(); i++) { + if (values[i]) { + RMM_FREE(values[i], stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + } + } + } + + /** + * Fills the elements of the data array with the given value. + * The elements on the diagonal are filled with the given value. After filling, + * either rowScatter or columnScatter will copy the values across the blocks in + * either the rows or columns depending on the use. + * @param val The value to fill the array with + */ + void fillElements(ValueType val) { + int current_device; + cudaGetDevice(¤t_device); + int32_t numRows = description->getBlockRows(); + for (int32_t i = 0; i < numRows; i++) { + int32_t blockId = description->getBlockId(i, i); + ValueType* vals = get(blockId); + int deviceId = description->getDeviceAssignments()[blockId]; + cudaStream_t stream = description->getBlockStreams()[blockId]; + cudaSetDevice(deviceId); + thrust::fill(thrust::cuda::par.on(stream), vals, vals + n, val); + } + description->syncAllStreams(); + cudaSetDevice(current_device); + } + + /** + * This implements a column-wise scatter of the global data from the corresponding + * row. i.e. The data reduced from row 1 is broadcast to all blocks in + * column 1. It is assumed that the data to broadcast is located in the block on + * the diagonal. + */ + void columnScatter() { + int current_device; + cudaGetDevice(¤t_device); + + // For each column in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the column into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(j, i); + } + else { + blockIds.push_back(description->getBlockId(j, i)); + } + } + + // Do a binary tree scatter. At each step the primary buffer of the sender is + // copied into the primary buffer of the receiver. 
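// Same broadcast schedule as VertexData2D::columnScatter above; the only
// difference is that values[senderId] and values[receiverId] are plain
// per-block ValueType* allocations here, since this unbuffered variant keeps a
// single buffer per block instead of a Current()/Alternate() pair.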
+ int32_t max2pow = 2; + while (max2pow < numRows) { + max2pow *= 2; + } + for (int32_t j = max2pow; j >= 2; j /= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the sender + int32_t senderId = blockIds[id]; + + // blockIds[id + j/2] is the sender + int32_t receiverId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId], + values[senderId], + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + } + } + // Synchronize all the active streams before next step. + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id + j/2] is the sender + int32_t receiverId = blockIds[id + j / 2]; + + // Set device and sync receiver's stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * This implements a row-wise scatter of the global data from the corresponding + * column. i.e. The data reduced from column 1 is broadcast to all blocks in + * row 1. It is assumed that the data to broadcast is located in the block on + * the diagonal. + */ + void rowScatter() { + int current_device; + cudaGetDevice(¤t_device); + + // For each row in the decomposition: + int32_t numRows = description->getBlockRows(); + std::vector blockIds; + for (int32_t i = 0; i < numRows; i++) { + // Put all the block ids for the column into a vector, with the ID of the diagonal block + // at index 0. + std::vector blockIds; + blockIds.push_back(-1); + for (int32_t j = 0; j < numRows; j++) { + if (i == j) { + blockIds[0] = description->getBlockId(i, j); + } + else { + blockIds.push_back(description->getBlockId(i, j)); + } + } + + // Do a binary tree scatter. At each step the primary buffer of the sender is + // copied into the primary buffer of the receiver. + int32_t max2pow = 2; + while (max2pow < numRows) { + max2pow *= 2; + } + for (int32_t j = max2pow; j >= 2; j /= 2) { + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id] is the sender + int32_t senderId = blockIds[id]; + + // blockIds[id + j/2] is the receiver + int32_t receiverId = blockIds[id + j / 2]; + + // Get the stream associated with the receiver's block id + cudaStream_t stream = description->getBlockStreams()[receiverId]; + + // Copy from the sender to the receiver (use stream associated with receiver) + cudaMemcpyAsync(values[receiverId], + values[senderId], + sizeof(ValueType) * n, + cudaMemcpyDefault, + stream); + } + } + // Sync all the active streams before next step + for (int32_t id = 0; id < numRows; id++) { + if (id % j == 0 && id + j / 2 < numRows) { + // blockIds[id + j/2] is the receiver + int32_t receiverId = blockIds[id + j / 2]; + + // Set device and sync receiver's stream + cudaSetDevice(description->getDeviceAssignments()[receiverId]); + cudaStreamSynchronize(description->getBlockStreams()[receiverId]); + } + } + } + } + + cudaSetDevice(current_device); + } + + /** + * Getter for n + * @return The value of n + */ + int32_t getN() { + return n; + } + + /** + * Gets the pointer to the allocated memory for a specified block. + * @param bId The block id to get the memory for. 
+ * @return A pointer to the allocated memory for the given block. + */ + ValueType* get(int32_t bId) { + return values[bId]; + } + }; + + /** + * This method takes in COO format matrix data and a MatrixDecompositionDescription and + * returns a Matrix2d object containing the given data. + */ + template + Matrix2d COOto2d(MatrixDecompositionDescription descr, + GlobalType* rowIds, + GlobalType* colIds, + ValueType* values) { + // Grab the current device id to switch back after allocations are done + int current_device; + cudaGetDevice(¤t_device); + + int32_t blockCount = descr.getNumBlocks(); + + // Allocate array of size global nnz to hold the block labels + int32_t* blockLabels = (int32_t*) malloc(descr.getNnz() * sizeof(int32_t)); + + // Allocate array to contain row counts for each block and initialize to zero + // Allocate array to contain position offsets for writing each blocks data + LocalType* blockCounts = (LocalType*) malloc(blockCount * sizeof(LocalType)); + LocalType* blockPos = (LocalType*) malloc(blockCount * sizeof(LocalType)); + for (int i = 0; i < blockCount; i++) { + blockCounts[i] = 0; + blockPos[i] = 0; + } + + // For each edge mark in the array the id of the block to which it will belong + int32_t blockId; + LocalType localRow; + LocalType localCol; + for (int i = 0; i < descr.getNnz(); i++) { + descr.convertGlobaltoLocalRow(rowIds[i], colIds[i], localRow, localCol, blockId); + blockLabels[i] = blockId; + blockCounts[blockId]++; + } + + // Allocate arrays for putting each blocks data into + LocalType** blockRowIds = (LocalType**) malloc(blockCount * sizeof(LocalType*)); + LocalType** blockColIds = (LocalType**) malloc(blockCount * sizeof(LocalType*)); + ValueType** blockValues = NULL; + if (values) + blockValues = (ValueType**) malloc(blockCount * sizeof(ValueType*)); + for (int i = 0; i < blockCount; i++) { + blockRowIds[i] = (LocalType*) malloc(blockCounts[i] * sizeof(LocalType)); + blockColIds[i] = (LocalType*) malloc(blockCounts[i] * sizeof(LocalType)); + if (values) + blockValues[i] = (ValueType*) malloc(blockCounts[i] * sizeof(ValueType)); + } + + // Convert each blocks global rows to local ids and copy into block arrays + for (int i = 0; i < descr.getNnz(); i++) { + descr.convertGlobaltoLocalRow(rowIds[i], colIds[i], localRow, localCol, blockId); + blockRowIds[blockId][blockPos[blockId]] = localRow; + blockColIds[blockId][blockPos[blockId]] = localCol; + if (values) + blockValues[blockId][blockPos[blockId]] = values[i]; + blockPos[blockId]++; + } + + // Allocate the result blocks vector + std::vector*> blockVector(blockCount); + + // Convert each blocks COO rows into CSR and create it's graph object. + for (int i = 0; i < blockCount; i++) { + // Set the device as indicated so the data ends up on the right GPU + cudaSetDevice(descr.getDeviceAssignments()[i]); + cudaStream_t stream = descr.getBlockStreams()[i]; + + if (blockCounts[i] > 0) { + CSR_Result_Weighted result; + ConvertCOOtoCSR_weighted(blockRowIds[i], + blockColIds[i], + values ? 
blockValues[i] : NULL, + (int64_t) blockCounts[i], + (descr.getOffset() - 1), + result); + MultiValuedCsrGraph* csrGraph = new MultiValuedCsrGraph((size_t) result.size, (size_t) result.nnz, stream); + if (values) + csrGraph->allocateEdgeData(1, NULL); + cudaMemcpy(csrGraph->get_raw_row_offsets(), + result.rowOffsets, + (result.size + 1) * sizeof(LocalType), + cudaMemcpyDefault); + cudaMemcpy(csrGraph->get_raw_column_indices(), + result.colIndices, + result.nnz * sizeof(LocalType), + cudaMemcpyDefault); + if (values) + cudaMemcpy(csrGraph->get_raw_edge_dim(0), + result.edgeWeights, + result.nnz * sizeof(LocalType), + cudaMemcpyDefault); + blockVector[i] = csrGraph; + result.Destroy(); + } + else { + MultiValuedCsrGraph* csrGraph = new MultiValuedCsrGraph((size_t) descr.getOffset(), (size_t) 0, stream); + cudaMemset( csrGraph->get_raw_row_offsets(), + 0, + sizeof(LocalType) * (descr.getOffset() + 1)); + blockVector[i] = csrGraph; + } + } + + // Free temporary memory + for (int i = 0; i < blockCount; i++) { + free(blockRowIds[i]); + free(blockColIds[i]); + if (values) + free(blockValues[i]); + } + free(blockRowIds); + free(blockColIds); + if (values) + free(blockValues); + + cudaSetDevice(current_device); + + // Put it all together into a Matrix2d object for return + return Matrix2d(descr, blockVector); + } } diff --git a/cpp/nvgraph/cpp/include/app/nvlouvain_app.cu b/cpp/nvgraph/cpp/include/app_to_be_removed/nvlouvain_app.cu similarity index 100% rename from cpp/nvgraph/cpp/include/app/nvlouvain_app.cu rename to cpp/nvgraph/cpp/include/app_to_be_removed/nvlouvain_app.cu diff --git a/cpp/nvgraph/cpp/include/app/nvlouvain_app_hierarchy.cu b/cpp/nvgraph/cpp/include/app_to_be_removed/nvlouvain_app_hierarchy.cu similarity index 100% rename from cpp/nvgraph/cpp/include/app/nvlouvain_app_hierarchy.cu rename to cpp/nvgraph/cpp/include/app_to_be_removed/nvlouvain_app_hierarchy.cu diff --git a/cpp/nvgraph/cpp/include/app/nvlouvain_sample.cu b/cpp/nvgraph/cpp/include/app_to_be_removed/nvlouvain_sample.cu similarity index 100% rename from cpp/nvgraph/cpp/include/app/nvlouvain_sample.cu rename to cpp/nvgraph/cpp/include/app_to_be_removed/nvlouvain_sample.cu diff --git a/cpp/nvgraph/cpp/include/app/nvlouvain_sample_hierarchy.cu b/cpp/nvgraph/cpp/include/app_to_be_removed/nvlouvain_sample_hierarchy.cu similarity index 100% rename from cpp/nvgraph/cpp/include/app/nvlouvain_sample_hierarchy.cu rename to cpp/nvgraph/cpp/include/app_to_be_removed/nvlouvain_sample_hierarchy.cu diff --git a/cpp/nvgraph/cpp/include/csr_graph.hxx b/cpp/nvgraph/cpp/include/csr_graph.hxx index db77baed371..3abd3adc71b 100644 --- a/cpp/nvgraph/cpp/include/csr_graph.hxx +++ b/cpp/nvgraph/cpp/include/csr_graph.hxx @@ -17,7 +17,7 @@ #pragma once #include "graph.hxx" -#include // interface with CuMem (memory pool lib) for shared ptr +#include "rmm_shared_ptr.hxx" namespace nvgraph { @@ -41,11 +41,11 @@ protected: /*! Storage for the row offsets of the CSR data structure. Also called the "row pointer" array. */ - SHARED_PREFIX::shared_ptr row_offsets; + std::shared_ptr row_offsets; /*! Storage for the column indices of the CSR data structure. 
*/ - SHARED_PREFIX::shared_ptr column_indices; + std::shared_ptr column_indices; public: @@ -109,8 +109,30 @@ public: } inline IndexType* get_raw_row_offsets() { return row_offsets.get(); } inline IndexType* get_raw_column_indices() { return column_indices.get(); } - inline void set_raw_row_offsets(IndexType* ptr) { row_offsets = attachDevicePtr(ptr, stream_); } - inline void set_raw_column_indices(IndexType* ptr) {column_indices = attachDevicePtr(ptr, stream_); } + + inline void set_raw_row_offsets(IndexType* ptr) { + // This abuses std::shared_ptr. In this context, row_offsets does not + // participate in ownership (attachDevicePtr returns std::shared_ptr + // with a dummy deleter). row_offsets just work as a raw pointer, and + // this can be very misleading. However, to properly fix this, we need + // to modify gdf_column and gdf_graph as well, and we do not know yet + // how cudf people will modify gdf_column to address currently broken + // memory ownership model. So, we may leave this as is, but htis needs + // to be revisited, later. + row_offsets = attachDevicePtr(ptr, stream_); + } + + inline void set_raw_column_indices(IndexType* ptr) { + // This abuses std::shared_ptr. In this context, column_indices does not + // participate in ownership (attachDevicePtr returns std::shared_ptr + // with a dummy deleter). column_indices just work as a raw pointer, and + // this can be very misleading. However, to properly fix this, we need + // to modify gdf_column and gdf_graph as well, and we do not know yet + // how cudf people will modify gdf_column to address currently broken + // memory ownership model. So, we may leave this as is, but htis needs + column_indices = attachDevicePtr(ptr, stream_); + } + inline const IndexType* get_raw_row_offsets() const { return row_offsets.get(); } inline const IndexType* get_raw_column_indices() const { return column_indices.get(); } inline cudaStream_t get_stream() const { return stream_; } diff --git a/cpp/nvgraph/cpp/include/delta_modularity.cuh b/cpp/nvgraph/cpp/include/delta_modularity.cuh index b396757b30b..e7ad9466dd2 100644 --- a/cpp/nvgraph/cpp/include/delta_modularity.cuh +++ b/cpp/nvgraph/cpp/include/delta_modularity.cuh @@ -22,14 +22,16 @@ #include #include #include +#include + +#include +#include #include "util.cuh" #include "graph_utils.cuh" #include "functor.cuh" //#include "block_delta_modularity.cuh" -#include - namespace nvlouvain{ @@ -371,11 +373,11 @@ max_delta_modularity_vec(const int n_vertex, // Not used template void build_delta_modularity_vector_old(const int n_vertex, const int c_size, ValType m2, bool updated, - thrust::device_vector& csr_ptr_d, thrust::device_vector& csr_ind_d, thrust::device_vector& csr_val_d, - thrust::device_vector& cluster_d, + rmm::device_vector& csr_ptr_d, rmm::device_vector& csr_ind_d, rmm::device_vector& csr_val_d, + rmm::device_vector& cluster_d, IdxType* cluster_inv_ptr_ptr, IdxType* cluster_inv_ind_ptr, // precompute cluster inverse ValType* k_vec_ptr, // precompute ki's - thrust::device_vector& temp_vec, // temp global memory with size n_vertex + rmm::device_vector& temp_vec, // temp global memory with size n_vertex ValType* cluster_sum_vec_ptr, ValType* delta_Q_arr_ptr){ @@ -425,8 +427,8 @@ void build_delta_modularity_vector_old(const int n_vertex, const int c_size, Val // template void build_delta_modularity_vector(cusparseHandle_t cusp_handle, const int n_vertex, const int c_size, ValType m2, bool updated, - thrust::device_vector& csr_ptr_d, thrust::device_vector& csr_ind_d, 
thrust::device_vector& csr_val_d, - thrust::device_vector& cluster_d, + rmm::device_vector& csr_ptr_d, rmm::device_vector& csr_ind_d, rmm::device_vector& csr_val_d, + rmm::device_vector& cluster_d, IdxType* cluster_inv_ptr_ptr, IdxType* cluster_inv_ind_ptr, // precompute cluster inverse ValType* k_vec_ptr, // precompute ki's ValType* cluster_sum_vec_ptr, @@ -449,7 +451,7 @@ void build_delta_modularity_vector(cusparseHandle_t cusp_handle, const int n_ver IdxType *cluster_ptr = thrust::raw_pointer_cast(cluster_d.data()); // pre compute coo row indices using cusparse - thrust::device_vector coo_row_ind(n_edges); + rmm::device_vector coo_row_ind(n_edges); IdxType* coo_row_ind_ptr = thrust::raw_pointer_cast(coo_row_ind.data()); cusparseXcsr2coo(cusp_handle, csr_ptr_ptr, n_edges, n_vertex, coo_row_ind_ptr, diff --git a/cpp/nvgraph/cpp/include/graph_concrete_visitors.hxx b/cpp/nvgraph/cpp/include/graph_concrete_visitors.hxx index 4b2222422fe..cb51ff8b9de 100644 --- a/cpp/nvgraph/cpp/include/graph_concrete_visitors.hxx +++ b/cpp/nvgraph/cpp/include/graph_concrete_visitors.hxx @@ -1420,8 +1420,8 @@ namespace nvgraph CsrGraph* extract_from_vertex_subset(CsrGraph& graph, IndexT* pV, size_t n, cudaStream_t stream) { - typedef thrust::device_vector VectorI; - typedef thrust::device_vector VectorV; + typedef rmm::device_vector VectorI; + typedef rmm::device_vector VectorV; VectorI vSub(pV, pV+n); validate_input(vSub, graph.get_num_vertices()); @@ -1435,8 +1435,8 @@ namespace nvgraph CsrGraph* extract_from_edge_subset(CsrGraph& graph, IndexT* pV, size_t n, cudaStream_t stream) { - typedef thrust::device_vector VectorI; - typedef thrust::device_vector VectorV; + typedef rmm::device_vector VectorI; + typedef rmm::device_vector VectorV; VectorI vSub(pV, pV+n); validate_input(vSub, graph.get_num_edges()); diff --git a/cpp/nvgraph/cpp/include/graph_contracting_structs.hxx b/cpp/nvgraph/cpp/include/graph_contracting_structs.hxx index 36d3fced642..cacd7746e03 100644 --- a/cpp/nvgraph/cpp/include/graph_contracting_structs.hxx +++ b/cpp/nvgraph/cpp/include/graph_contracting_structs.hxx @@ -1692,13 +1692,13 @@ namespace nvgraph // The size of the GMEM buffers (number of elements). size_t m_gmem_size; // The status: OK if count_non_zeroes succeeded, FAILED otherwise. - SHARED_PREFIX::shared_ptr m_status; + std::shared_ptr m_status; // The work queue for dynamic load balancing in the kernels. - SHARED_PREFIX::shared_ptr m_work_queue; + std::shared_ptr m_work_queue; // The buffer to store keys in GMEM. - SHARED_PREFIX::shared_ptr m_keys; + std::shared_ptr m_keys; // The buffer to store values in GMEM. - SHARED_PREFIX::shared_ptr m_vals; + std::shared_ptr m_vals; public: // Create a workspace. @@ -2198,8 +2198,8 @@ namespace nvgraph //AMGX uses pool allocator thrust::global_thread_handle::cudaMallocHost(), here... // - SHARED_PREFIX::shared_ptr h_status(new IndexT); - SHARED_PREFIX::shared_ptr h_work_offset(new IndexT); + std::shared_ptr h_status(new IndexT); + std::shared_ptr h_work_offset(new IndexT); cudaStream_t stream = 0; // for now... diff --git a/cpp/nvgraph/cpp/include/graph_contracting_visitor.hxx b/cpp/nvgraph/cpp/include/graph_contracting_visitor.hxx index e958a27ed0c..3b06c1cd567 100644 --- a/cpp/nvgraph/cpp/include/graph_contracting_visitor.hxx +++ b/cpp/nvgraph/cpp/include/graph_contracting_visitor.hxx @@ -42,6 +42,9 @@ #include #include // +#include +#include + //debugging only: #include @@ -1624,8 +1627,8 @@ namespace{ //unnamed.. 
const SemiRingFunctorTypes& eCombine, const SemiRingFunctorTypes& eReduce) { - typedef thrust::device_vector VectorI; - typedef thrust::device_vector VectorV; + typedef rmm::device_vector VectorI; + typedef rmm::device_vector VectorV; VectorI aggregates(p_aggregates, p_aggregates+n); @@ -1664,8 +1667,8 @@ namespace{ //unnamed.. const SemiRingFunctorTypes& eCombine, const SemiRingFunctorTypes& eReduce) { - typedef thrust::device_vector VectorI; - typedef thrust::device_vector VectorV; + typedef rmm::device_vector VectorI; + typedef rmm::device_vector VectorV; VectorI aggregates(p_aggregates, p_aggregates+n); diff --git a/cpp/nvgraph/cpp/include/graph_utils.cuh b/cpp/nvgraph/cpp/include/graph_utils.cuh index 29350213dcf..f57d0322fcb 100644 --- a/cpp/nvgraph/cpp/include/graph_utils.cuh +++ b/cpp/nvgraph/cpp/include/graph_utils.cuh @@ -31,6 +31,9 @@ #include #include +#include +#include + #define USE_CG 1 #define DEBUG 1 @@ -59,6 +62,20 @@ namespace nvlouvain #define WHERE "" #endif +// This is a gap filler, and should be replaced with a RAPIDS-wise error handling mechanism. +#undef rmmCheckError +#ifdef DEBUG + #define WHERE " at: " << __FILE__ << ':' << __LINE__ + #define rmmCheckError(e) { \ + if(e != RMM_SUCCESS) { \ + std::cerr << "RMM failure: " << WHERE << std::endl; \ + } \ + } +#else + #define rmmCheckError(e) + #define WHERE "" +#endif + template static __device__ __forceinline__ T shfl_up(T r, int offset, int bound = 32, int mask = DEFAULT_MASK) { @@ -279,7 +296,7 @@ flag_leafs ( const IndexType n, IndexType *degree, ValueType *bookmark) { //notice that in the transposed matrix/csc a dangling node is a node without incomming edges template void google_matrix ( const IndexType n, const IndexType e, const IndexType *cooColInd, ValueType *cooVal, ValueType *bookmark) { - thrust::device_vector degree(n,0); + rmm::device_vector degree(n,0); dim3 nthreads, nblocks; nthreads.x = min(e,CUDA_MAX_KERNEL_THREADS); nthreads.y = 1; diff --git a/cpp/nvgraph/cpp/include/modularity.cuh b/cpp/nvgraph/cpp/include/modularity.cuh index 49917ce30d7..cc58771d04b 100644 --- a/cpp/nvgraph/cpp/include/modularity.cuh +++ b/cpp/nvgraph/cpp/include/modularity.cuh @@ -24,6 +24,9 @@ #include #include +#include +#include + #include "util.cuh" #include "graph_utils.cuh" #include "functor.cuh" @@ -226,8 +229,8 @@ template void generate_cluster_inv(const int n_vertex, const int c_size, IdxIter cluster_iter, - thrust::device_vector& cluster_inv_ptr, - thrust::device_vector& cluster_inv_ind){ + rmm::device_vector& cluster_inv_ptr, + rmm::device_vector& cluster_inv_ind){ int nthreads = min(n_vertex,CUDA_MAX_KERNEL_THREADS); int nblocks = min((n_vertex + nthreads - 1)/nthreads,CUDA_MAX_BLOCKS); diff --git a/cpp/nvgraph/cpp/include/multi_valued_csr_graph.hxx b/cpp/nvgraph/cpp/include/multi_valued_csr_graph.hxx index 2af20f252af..55a63c1295b 100644 --- a/cpp/nvgraph/cpp/include/multi_valued_csr_graph.hxx +++ b/cpp/nvgraph/cpp/include/multi_valued_csr_graph.hxx @@ -38,8 +38,8 @@ protected: //std::vector *> values_dim; //std::vector *> vertex_dim; - std::vector > > values_dim; - std::vector > > vertex_dim; + std::vector > > values_dim; + std::vector > > vertex_dim; public: /*! 
Storage for the nonzero entries of the Multi-CSR data structure.*/ @@ -78,28 +78,28 @@ public: { vertex_dim.resize(v_dim); for (size_t i = 0; i < vertex_dim.size(); ++i) - vertex_dim[i] = SHARED_PREFIX::shared_ptr >(new Vector(this->num_vertices, stream)); + vertex_dim[i] = std::shared_ptr >(new Vector(this->num_vertices, stream)); } inline void allocateEdgeData(size_t edges_dim, cudaStream_t stream) { values_dim.resize(edges_dim); for (size_t i = 0; i < values_dim.size(); ++i) - values_dim[i] = SHARED_PREFIX::shared_ptr >(new Vector(this->num_edges, stream)); + values_dim[i] = std::shared_ptr >(new Vector(this->num_edges, stream)); } inline void attachVertexData(size_t i, ValueType* data, cudaStream_t stream) { if (vertex_dim.size() <= i) vertex_dim.resize(i+1); - vertex_dim[i] = SHARED_PREFIX::shared_ptr >(new Vector(this->num_vertices, data, stream)); + vertex_dim[i] = std::shared_ptr >(new Vector(this->num_vertices, data, stream)); } inline void attachEdgeData(size_t i, ValueType* data, cudaStream_t stream) { if (values_dim.size() <= i) values_dim.resize(i+1); - values_dim[i] = SHARED_PREFIX::shared_ptr >(new Vector(this->num_edges, data, stream)); + values_dim[i] = std::shared_ptr >(new Vector(this->num_edges, data, stream)); } inline size_t getNumValues() { @@ -124,7 +124,7 @@ public: //ValuedCsrGraph *v = new ValuedCsrGraph(static_cast >(*this), *values_dim[dim_index]); //return *v; - //SHARED_PREFIX::shared_ptr > svcsr = SHARED_PREFIX::shared_ptr >(new ValuedCsrGraph(static_cast >(*this), *values_dim[dim_index])); + //std::shared_ptr > svcsr = std::shared_ptr >(new ValuedCsrGraph(static_cast >(*this), *values_dim[dim_index])); //return svcsr; //segfaults ///return ValuedCsrGraph(static_cast >(*this), *values_dim[dim_index]);//segfaults diff --git a/cpp/nvgraph/cpp/include/nvgraph.h b/cpp/nvgraph/cpp/include/nvgraph.h index f51daf68b0a..479c3faa51d 100644 --- a/cpp/nvgraph/cpp/include/nvgraph.h +++ b/cpp/nvgraph/cpp/include/nvgraph.h @@ -17,8 +17,10 @@ #ifndef _NVGRAPH_H_ #define _NVGRAPH_H_ -#include "stddef.h" -#include "stdint.h" +#include +#include + +#include #include "library_types.h" @@ -26,7 +28,13 @@ #define NVG_CUDA_TRY(T) {\ if (T != cudaSuccess)\ return NVGRAPH_STATUS_ALLOC_FAILED;\ - } + } + +// This is a gap filler, and should be replaced with a RAPIDS-wise error handling mechanism. 
+#define NVG_RMM_TRY(T) {\ + if (T != RMM_SUCCESS)\ + return NVGRAPH_STATUS_ALLOC_FAILED;\ + } #ifndef NVGRAPH_API #ifdef _WIN32 @@ -40,478 +48,477 @@ extern "C" { #endif - /* nvGRAPH status type returns */ - typedef enum - { - NVGRAPH_STATUS_SUCCESS = 0, - NVGRAPH_STATUS_NOT_INITIALIZED = 1, - NVGRAPH_STATUS_ALLOC_FAILED = 2, - NVGRAPH_STATUS_INVALID_VALUE = 3, - NVGRAPH_STATUS_ARCH_MISMATCH = 4, - NVGRAPH_STATUS_MAPPING_ERROR = 5, - NVGRAPH_STATUS_EXECUTION_FAILED = 6, - NVGRAPH_STATUS_INTERNAL_ERROR = 7, - NVGRAPH_STATUS_TYPE_NOT_SUPPORTED = 8, - NVGRAPH_STATUS_NOT_CONVERGED = 9, - NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED = 10 - - } nvgraphStatus_t; - - const char* nvgraphStatusGetString(nvgraphStatus_t status); - - /* Opaque structure holding nvGRAPH library context */ - struct nvgraphContext; - typedef struct nvgraphContext *nvgraphHandle_t; - - /* Opaque structure holding the graph descriptor */ - struct nvgraphGraphDescr; - typedef struct nvgraphGraphDescr *nvgraphGraphDescr_t; - - /* Semi-ring types */ - typedef enum - { - NVGRAPH_PLUS_TIMES_SR = 0, - NVGRAPH_MIN_PLUS_SR = 1, - NVGRAPH_MAX_MIN_SR = 2, - NVGRAPH_OR_AND_SR = 3, - } nvgraphSemiring_t; - - /* Topology types */ - typedef enum - { - NVGRAPH_CSR_32 = 0, - NVGRAPH_CSC_32 = 1, - NVGRAPH_COO_32 = 2, - NVGRAPH_2D_32I_32I = 3, - NVGRAPH_2D_64I_32I = 4 - } nvgraphTopologyType_t; - - typedef enum - { - NVGRAPH_DEFAULT = 0, // Default is unsorted. - NVGRAPH_UNSORTED = 1, // - NVGRAPH_SORTED_BY_SOURCE = 2, // CSR - NVGRAPH_SORTED_BY_DESTINATION = 3 // CSC - } nvgraphTag_t; - - typedef enum - { - NVGRAPH_MULTIPLY = 0, - NVGRAPH_SUM = 1, - NVGRAPH_MIN = 2, - NVGRAPH_MAX = 3 - } nvgraphSemiringOps_t; - - typedef enum - { - NVGRAPH_MODULARITY_MAXIMIZATION = 0, //maximize modularity with Lanczos solver - NVGRAPH_BALANCED_CUT_LANCZOS = 1, //minimize balanced cut with Lanczos solver - NVGRAPH_BALANCED_CUT_LOBPCG = 2 //minimize balanced cut with LOPCG solver - } nvgraphSpectralClusteringType_t; - - struct SpectralClusteringParameter { - int n_clusters; //number of clusters - int n_eig_vects; // //number of eigenvectors - nvgraphSpectralClusteringType_t algorithm; // algorithm to use - float evs_tolerance; // tolerance of the eigensolver - int evs_max_iter; // maximum number of iterations of the eigensolver - float kmean_tolerance; // tolerance of kmeans - int kmean_max_iter; // maximum number of iterations of kemeans - void * opt; // optional parameter that can be used for preconditioning in the future - }; - - typedef enum - { - NVGRAPH_MODULARITY, // clustering score telling how good the clustering is compared to random assignment. - NVGRAPH_EDGE_CUT, // total number of edges between clusters. 
- NVGRAPH_RATIO_CUT // sum for all clusters of the number of edges going outside of the cluster divided by the number of vertex inside the cluster - } nvgraphClusteringMetric_t; - - struct nvgraphCSRTopology32I_st { - int nvertices; // n+1 - int nedges; // nnz - int *source_offsets; // rowPtr - int *destination_indices; // colInd - }; - typedef struct nvgraphCSRTopology32I_st *nvgraphCSRTopology32I_t; - - struct nvgraphCSCTopology32I_st { - int nvertices; // n+1 - int nedges; // nnz - int *destination_offsets; // colPtr - int *source_indices; // rowInd - }; - typedef struct nvgraphCSCTopology32I_st *nvgraphCSCTopology32I_t; - - struct nvgraphCOOTopology32I_st { - int nvertices; // n+1 - int nedges; // nnz - int *source_indices; // rowInd - int *destination_indices; // colInd - nvgraphTag_t tag; - }; - typedef struct nvgraphCOOTopology32I_st *nvgraphCOOTopology32I_t; - - struct nvgraph2dCOOTopology32I_st { - int nvertices; - int nedges; - int *source_indices; // Row Indices - int *destination_indices; // Column Indices - cudaDataType_t valueType; // The type of values being given. - void *values; // Pointer to array of values. - int numDevices; // Gives the number of devices to be used. - int *devices; // Array of device IDs to use. - int blockN; // Specifies the value of n for an n x n matrix decomposition. - nvgraphTag_t tag; - }; - typedef struct nvgraph2dCOOTopology32I_st *nvgraph2dCOOTopology32I_t; - - /* Return properties values for the nvGraph library, such as library version */ - nvgraphStatus_t NVGRAPH_API nvgraphGetProperty(libraryPropertyType type, int *value); - - /* Open the library and create the handle */ - nvgraphStatus_t NVGRAPH_API nvgraphCreate(nvgraphHandle_t *handle); - nvgraphStatus_t NVGRAPH_API nvgraphCreateMulti( nvgraphHandle_t *handle, - int numDevices, - int* devices); - - /* Close the library and destroy the handle */ - nvgraphStatus_t NVGRAPH_API nvgraphDestroy(nvgraphHandle_t handle); - - /* Create an empty graph descriptor */ - nvgraphStatus_t NVGRAPH_API nvgraphCreateGraphDescr( nvgraphHandle_t handle, - nvgraphGraphDescr_t *descrG); - - /* Destroy a graph descriptor */ - nvgraphStatus_t NVGRAPH_API nvgraphDestroyGraphDescr( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG); - - /* Set size, topology data in the graph descriptor */ - nvgraphStatus_t NVGRAPH_API nvgraphSetGraphStructure( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t TType); - - /* Query size and topology information from the graph descriptor */ - nvgraphStatus_t NVGRAPH_API nvgraphGetGraphStructure( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t* TType); - - /* Allocate numsets vectors of size V representing Vertex Data and attached them the graph. - * settypes[i] is the type of vector #i, currently all Vertex and Edge data should have the same type */ - nvgraphStatus_t NVGRAPH_API nvgraphAllocateVertexData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t numsets, - cudaDataType_t *settypes); - - /* Allocate numsets vectors of size E representing Edge Data and attached them the graph. 
- * settypes[i] is the type of vector #i, currently all Vertex and Edge data should have the same type */ - nvgraphStatus_t NVGRAPH_API nvgraphAllocateEdgeData( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t numsets, - cudaDataType_t *settypes); - - /* Update the vertex set #setnum with the data in *vertexData, sets have 0-based index - * Conversions are not supported so nvgraphTopologyType_t should match the graph structure */ - nvgraphStatus_t NVGRAPH_API nvgraphSetVertexData( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *vertexData, - size_t setnum); - - /* Copy the edge set #setnum in *edgeData, sets have 0-based index - * Conversions are not supported so nvgraphTopologyType_t should match the graph structure */ - nvgraphStatus_t NVGRAPH_API nvgraphGetVertexData( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *vertexData, - size_t setnum); - - /* Convert the edge data to another topology - */ - nvgraphStatus_t NVGRAPH_API nvgraphConvertTopology(nvgraphHandle_t handle, - nvgraphTopologyType_t srcTType, - void *srcTopology, - void *srcEdgeData, - cudaDataType_t *dataType, - nvgraphTopologyType_t dstTType, - void *dstTopology, - void *dstEdgeData); - - /* Convert graph to another structure - */ - nvgraphStatus_t NVGRAPH_API nvgraphConvertGraph(nvgraphHandle_t handle, - nvgraphGraphDescr_t srcDescrG, - nvgraphGraphDescr_t dstDescrG, - nvgraphTopologyType_t dstTType); - - /* Update the edge set #setnum with the data in *edgeData, sets have 0-based index - */ - nvgraphStatus_t NVGRAPH_API nvgraphSetEdgeData( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *edgeData, - size_t setnum); - - /* Copy the edge set #setnum in *edgeData, sets have 0-based index - */ - nvgraphStatus_t NVGRAPH_API nvgraphGetEdgeData( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *edgeData, - size_t setnum); - - /* create a new graph by extracting a subgraph given a list of vertices - */ - nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByVertex( nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t subdescrG, - int *subvertices, - size_t numvertices); - /* create a new graph by extracting a subgraph given a list of edges - */ - nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByEdge(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t subdescrG, - int *subedges, - size_t numedges); - - /* nvGRAPH Semi-ring sparse matrix vector multiplication - */ - nvgraphStatus_t NVGRAPH_API nvgraphSrSpmv(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const void *alpha, - const size_t x_index, - const void *beta, - const size_t y_index, - const nvgraphSemiring_t SR); - - /* Helper struct for Traversal parameters - */ - typedef struct { - size_t pad[128]; - } nvgraphTraversalParameter_t; - - /* Initializes traversal parameters with default values - */ - nvgraphStatus_t NVGRAPH_API nvgraphTraversalParameterInit(nvgraphTraversalParameter_t *param); - - /* Stores/retrieves index of a vertex data where target distances will be stored - */ - nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetDistancesIndex( nvgraphTraversalParameter_t *param, - const size_t value); - - nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetDistancesIndex( const nvgraphTraversalParameter_t param, - size_t *value); - - /* Stores/retrieves index of a vertex data where path predecessors will be stored - */ - nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetPredecessorsIndex( nvgraphTraversalParameter_t *param, 
- const size_t value); - - nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetPredecessorsIndex( const nvgraphTraversalParameter_t param, - size_t *value); - - /* Stores/retrieves index of an edge data which tells traversal algorithm whether path can go through an edge or not - */ - nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetEdgeMaskIndex( nvgraphTraversalParameter_t *param, - const size_t value); - - nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetEdgeMaskIndex( const nvgraphTraversalParameter_t param, - size_t *value); - - /* Stores/retrieves flag that tells an algorithm whether the graph is directed or not - */ - nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetUndirectedFlag( nvgraphTraversalParameter_t *param, - const size_t value); - - nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetUndirectedFlag( const nvgraphTraversalParameter_t param, - size_t *value); - - /* Stores/retrieves 'alpha' and 'beta' parameters for BFS traversal algorithm - */ - nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetAlpha( nvgraphTraversalParameter_t *param, - const size_t value); - - nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetAlpha( const nvgraphTraversalParameter_t param, - size_t *value); - - nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetBeta( nvgraphTraversalParameter_t *param, - const size_t value); - - nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetBeta( const nvgraphTraversalParameter_t param, - size_t *value); + /* nvGRAPH status type returns */ + typedef enum + { + NVGRAPH_STATUS_SUCCESS = 0, + NVGRAPH_STATUS_NOT_INITIALIZED = 1, + NVGRAPH_STATUS_ALLOC_FAILED = 2, + NVGRAPH_STATUS_INVALID_VALUE = 3, + NVGRAPH_STATUS_ARCH_MISMATCH = 4, + NVGRAPH_STATUS_MAPPING_ERROR = 5, + NVGRAPH_STATUS_EXECUTION_FAILED = 6, + NVGRAPH_STATUS_INTERNAL_ERROR = 7, + NVGRAPH_STATUS_TYPE_NOT_SUPPORTED = 8, + NVGRAPH_STATUS_NOT_CONVERGED = 9, + NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED = 10 + + } nvgraphStatus_t; + + const char* nvgraphStatusGetString(nvgraphStatus_t status); + + /* Opaque structure holding nvGRAPH library context */ + struct nvgraphContext; + typedef struct nvgraphContext *nvgraphHandle_t; + + /* Opaque structure holding the graph descriptor */ + struct nvgraphGraphDescr; + typedef struct nvgraphGraphDescr *nvgraphGraphDescr_t; + + /* Semi-ring types */ + typedef enum + { + NVGRAPH_PLUS_TIMES_SR = 0, + NVGRAPH_MIN_PLUS_SR = 1, + NVGRAPH_MAX_MIN_SR = 2, + NVGRAPH_OR_AND_SR = 3, + } nvgraphSemiring_t; + + /* Topology types */ + typedef enum + { + NVGRAPH_CSR_32 = 0, + NVGRAPH_CSC_32 = 1, + NVGRAPH_COO_32 = 2, + NVGRAPH_2D_32I_32I = 3, + NVGRAPH_2D_64I_32I = 4 + } nvgraphTopologyType_t; + + typedef enum + { + NVGRAPH_DEFAULT = 0, // Default is unsorted. 
+ NVGRAPH_UNSORTED = 1, // + NVGRAPH_SORTED_BY_SOURCE = 2, // CSR + NVGRAPH_SORTED_BY_DESTINATION = 3 // CSC + } nvgraphTag_t; + + typedef enum + { + NVGRAPH_MULTIPLY = 0, + NVGRAPH_SUM = 1, + NVGRAPH_MIN = 2, + NVGRAPH_MAX = 3 + } nvgraphSemiringOps_t; + + typedef enum + { + NVGRAPH_MODULARITY_MAXIMIZATION = 0, //maximize modularity with Lanczos solver + NVGRAPH_BALANCED_CUT_LANCZOS = 1, //minimize balanced cut with Lanczos solver + NVGRAPH_BALANCED_CUT_LOBPCG = 2 //minimize balanced cut with LOPCG solver + } nvgraphSpectralClusteringType_t; + + struct SpectralClusteringParameter { + int n_clusters; //number of clusters + int n_eig_vects; // //number of eigenvectors + nvgraphSpectralClusteringType_t algorithm; // algorithm to use + float evs_tolerance; // tolerance of the eigensolver + int evs_max_iter; // maximum number of iterations of the eigensolver + float kmean_tolerance; // tolerance of kmeans + int kmean_max_iter; // maximum number of iterations of kemeans + void * opt; // optional parameter that can be used for preconditioning in the future + }; + + typedef enum + { + NVGRAPH_MODULARITY, // clustering score telling how good the clustering is compared to random assignment. + NVGRAPH_EDGE_CUT, // total number of edges between clusters. + NVGRAPH_RATIO_CUT // sum for all clusters of the number of edges going outside of the cluster divided by the number of vertex inside the cluster + } nvgraphClusteringMetric_t; + + struct nvgraphCSRTopology32I_st { + int nvertices; // n+1 + int nedges; // nnz + int *source_offsets; // rowPtr + int *destination_indices; // colInd + }; + typedef struct nvgraphCSRTopology32I_st *nvgraphCSRTopology32I_t; + + struct nvgraphCSCTopology32I_st { + int nvertices; // n+1 + int nedges; // nnz + int *destination_offsets; // colPtr + int *source_indices; // rowInd + }; + typedef struct nvgraphCSCTopology32I_st *nvgraphCSCTopology32I_t; + + struct nvgraphCOOTopology32I_st { + int nvertices; // n+1 + int nedges; // nnz + int *source_indices; // rowInd + int *destination_indices; // colInd + nvgraphTag_t tag; + }; + typedef struct nvgraphCOOTopology32I_st *nvgraphCOOTopology32I_t; + + struct nvgraph2dCOOTopology32I_st { + int nvertices; + int nedges; + int *source_indices; // Row Indices + int *destination_indices; // Column Indices + cudaDataType_t valueType; // The type of values being given. + void *values; // Pointer to array of values. + int numDevices; // Gives the number of devices to be used. + int *devices; // Array of device IDs to use. + int blockN; // Specifies the value of n for an n x n matrix decomposition. 
+ nvgraphTag_t tag; + }; + typedef struct nvgraph2dCOOTopology32I_st *nvgraph2dCOOTopology32I_t; + + /* Return properties values for the nvGraph library, such as library version */ + nvgraphStatus_t NVGRAPH_API nvgraphGetProperty(libraryPropertyType type, int *value); + + /* Open the library and create the handle */ + nvgraphStatus_t NVGRAPH_API nvgraphCreate(nvgraphHandle_t *handle); + nvgraphStatus_t NVGRAPH_API nvgraphCreateMulti(nvgraphHandle_t *handle, + int numDevices, + int* devices); + + /* Close the library and destroy the handle */ + nvgraphStatus_t NVGRAPH_API nvgraphDestroy(nvgraphHandle_t handle); + + /* Create an empty graph descriptor */ + nvgraphStatus_t NVGRAPH_API nvgraphCreateGraphDescr(nvgraphHandle_t handle, + nvgraphGraphDescr_t *descrG); + + /* Destroy a graph descriptor */ + nvgraphStatus_t NVGRAPH_API nvgraphDestroyGraphDescr(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG); + + /* Set size, topology data in the graph descriptor */ + nvgraphStatus_t NVGRAPH_API nvgraphSetGraphStructure(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t TType); + + /* Query size and topology information from the graph descriptor */ + nvgraphStatus_t NVGRAPH_API nvgraphGetGraphStructure(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t* TType); + + /* Allocate numsets vectors of size V representing Vertex Data and attached them the graph. + * settypes[i] is the type of vector #i, currently all Vertex and Edge data should have the same type */ + nvgraphStatus_t NVGRAPH_API nvgraphAllocateVertexData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes); + + /* Allocate numsets vectors of size E representing Edge Data and attached them the graph. 
+ * settypes[i] is the type of vector #i, currently all Vertex and Edge data should have the same type */ + nvgraphStatus_t NVGRAPH_API nvgraphAllocateEdgeData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes); + + /* Update the vertex set #setnum with the data in *vertexData, sets have 0-based index + * Conversions are not supported so nvgraphTopologyType_t should match the graph structure */ + nvgraphStatus_t NVGRAPH_API nvgraphSetVertexData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum); + + /* Copy the edge set #setnum in *edgeData, sets have 0-based index + * Conversions are not supported so nvgraphTopologyType_t should match the graph structure */ + nvgraphStatus_t NVGRAPH_API nvgraphGetVertexData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum); + + /* Convert the edge data to another topology + */ + nvgraphStatus_t NVGRAPH_API nvgraphConvertTopology(nvgraphHandle_t handle, + nvgraphTopologyType_t srcTType, + void *srcTopology, + void *srcEdgeData, + cudaDataType_t *dataType, + nvgraphTopologyType_t dstTType, + void *dstTopology, + void *dstEdgeData); + + /* Convert graph to another structure + */ + nvgraphStatus_t NVGRAPH_API nvgraphConvertGraph(nvgraphHandle_t handle, + nvgraphGraphDescr_t srcDescrG, + nvgraphGraphDescr_t dstDescrG, + nvgraphTopologyType_t dstTType); + + /* Update the edge set #setnum with the data in *edgeData, sets have 0-based index + */ + nvgraphStatus_t NVGRAPH_API nvgraphSetEdgeData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum); + + /* Copy the edge set #setnum in *edgeData, sets have 0-based index + */ + nvgraphStatus_t NVGRAPH_API nvgraphGetEdgeData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum); + + /* create a new graph by extracting a subgraph given a list of vertices + */ + nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByVertex(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subvertices, + size_t numvertices); + /* create a new graph by extracting a subgraph given a list of edges + */ + nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByEdge(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subedges, + size_t numedges); + + /* nvGRAPH Semi-ring sparse matrix vector multiplication + */ + nvgraphStatus_t NVGRAPH_API nvgraphSrSpmv(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t x_index, + const void *beta, + const size_t y_index, + const nvgraphSemiring_t SR); + + /* Helper struct for Traversal parameters + */ + typedef struct { + size_t pad[128]; + } nvgraphTraversalParameter_t; + + /* Initializes traversal parameters with default values + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalParameterInit(nvgraphTraversalParameter_t *param); + + /* Stores/retrieves index of a vertex data where target distances will be stored + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetDistancesIndex(nvgraphTraversalParameter_t *param, + const size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetDistancesIndex(const nvgraphTraversalParameter_t param, + size_t *value); + + /* Stores/retrieves index of a vertex data where path predecessors will be stored + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetPredecessorsIndex(nvgraphTraversalParameter_t *param, + const 
size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetPredecessorsIndex(const nvgraphTraversalParameter_t param, + size_t *value); + + /* Stores/retrieves index of an edge data which tells traversal algorithm whether path can go through an edge or not + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetEdgeMaskIndex(nvgraphTraversalParameter_t *param, + const size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetEdgeMaskIndex(const nvgraphTraversalParameter_t param, + size_t *value); + + /* Stores/retrieves flag that tells an algorithm whether the graph is directed or not + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetUndirectedFlag(nvgraphTraversalParameter_t *param, + const size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetUndirectedFlag(const nvgraphTraversalParameter_t param, + size_t *value); + + /* Stores/retrieves 'alpha' and 'beta' parameters for BFS traversal algorithm + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetAlpha(nvgraphTraversalParameter_t *param, + const size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetAlpha(const nvgraphTraversalParameter_t param, + size_t *value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetBeta(nvgraphTraversalParameter_t *param, + const size_t value); + + nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetBeta(const nvgraphTraversalParameter_t param, + size_t *value); //Traversal available - typedef enum { - NVGRAPH_TRAVERSAL_BFS = 0 - } nvgraphTraversal_t; - - /* nvGRAPH Traversal API - * Compute a traversal of the graph from a single vertex using algorithm specified by traversalT parameter - */ - nvgraphStatus_t NVGRAPH_API nvgraphTraversal(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const nvgraphTraversal_t traversalT, - const int *source_vert, - const nvgraphTraversalParameter_t params); - - /** - * CAPI Method for calling 2d BFS algorithm. - * @param handle Nvgraph context handle. - * @param descrG Graph handle (must be 2D partitioned) - * @param source_vert The source vertex ID - * @param distances Pointer to memory allocated to store the distances. - * @param predecessors Pointer to memory allocated to store the predecessors - * @return Status code. - */ - nvgraphStatus_t NVGRAPH_API nvgraph2dBfs( nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const int32_t source_vert, - int32_t* distances, - int32_t* predecessors); - - /* nvGRAPH Single Source Shortest Path (SSSP) - * Calculate the shortest path distance from a single vertex in the graph to all other vertices. - */ - nvgraphStatus_t NVGRAPH_API nvgraphSssp(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int *source_vert, - const size_t sssp_index); - - /* nvGRAPH WidestPath - * Find widest path potential from source_index to every other vertices. - */ - nvgraphStatus_t NVGRAPH_API nvgraphWidestPath(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int *source_vert, - const size_t widest_path_index); - - /* nvGRAPH PageRank - * Find PageRank for each vertex of a graph with a given transition probabilities, a bookmark vector of dangling vertices, and the damping factor. 
- */ - nvgraphStatus_t NVGRAPH_API nvgraphPagerank(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const void *alpha, - const size_t bookmark_index, - const int has_guess, - const size_t pagerank_index, - const float tolerance, - const int max_iter); - - /* nvGRAPH contraction - * given array of agregates contract graph with - * given (Combine, Reduce) operators for Vertex Set - * and Edge Set; - */ - nvgraphStatus_t NVGRAPH_API nvgraphContractGraph(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t contrdescrG, - int *aggregates, - size_t numaggregates, - nvgraphSemiringOps_t VertexCombineOp, - nvgraphSemiringOps_t VertexReduceOp, - nvgraphSemiringOps_t EdgeCombineOp, - nvgraphSemiringOps_t EdgeReduceOp, - int flag); - - /* nvGRAPH spectral clustering - * given a graph and solver parameters of struct SpectralClusteringParameter, - * assign vertices to groups such as - * intra-group connections are strong and/or inter-groups connections are weak - * using spectral technique. - */ - nvgraphStatus_t NVGRAPH_API nvgraphSpectralClustering(nvgraphHandle_t handle, - const nvgraphGraphDescr_t graph_descr, - const size_t weight_index, - const struct SpectralClusteringParameter *params, - int* clustering, - void* eig_vals, - void* eig_vects); - - /* nvGRAPH analyze clustering - * Given a graph, a clustering, and a metric - * compute the score that measures the clustering quality according to the metric. - */ - nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeClustering(nvgraphHandle_t handle, - const nvgraphGraphDescr_t graph_descr, - const size_t weight_index, - const int n_clusters, - const int* clustering, - nvgraphClusteringMetric_t metric, - float * score); - - /* nvGRAPH Triangles counting - * count number of triangles (cycles of size 3) formed by graph edges - */ - nvgraphStatus_t NVGRAPH_API nvgraphTriangleCount(nvgraphHandle_t handle, - const nvgraphGraphDescr_t graph_descr, - uint64_t* result); - - /* nvGRAPH Louvain implementation - */ - nvgraphStatus_t NVGRAPH_API nvgraphLouvain ( cudaDataType_t index_type, - cudaDataType_t val_type, - const size_t num_vertex, - const size_t num_edges, - void* csr_ptr, - void* csr_ind, - void* csr_val, - int weighted, - int has_init_cluster, - void* init_cluster, - void* final_modularity, - void* best_cluster_vec, - void* num_level); - - - /* nvGRAPH Jaccard implementation - */ - nvgraphStatus_t NVGRAPH_API nvgraphJaccard ( cudaDataType_t index_type, - cudaDataType_t val_type, - const size_t n, - const size_t e, - void* csr_ptr, - void *csr_ind, - void* csr_val, - int weighted, - void* v, - void* gamma, - void* weight_j); - - /* nvGRAPH attach structure - * Warp external device data into a nvgraphGraphDescr_t - * Warning : this data remain owned by the user - */ - nvgraphStatus_t NVGRAPH_API nvgraphAttachGraphStructure(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t TT); - - /* nvGRAPH attach Vertex Data - * Warp external device data into a vertex dim - * Warning : this data remain owned by the user - */ - nvgraphStatus_t NVGRAPH_API nvgraphAttachVertexData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t setnum, - cudaDataType_t settype, - void *vertexData); - - /* nvGRAPH attach Edge Data - * Warp external device data into an edge dim - * Warning : this data remain owned by the user - */ - nvgraphStatus_t NVGRAPH_API nvgraphAttachEdgeData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t setnum, - cudaDataType_t 
settype, - void *edgeData); + typedef enum { + NVGRAPH_TRAVERSAL_BFS = 0 + } nvgraphTraversal_t; + + /* nvGRAPH Traversal API + * Compute a traversal of the graph from a single vertex using algorithm specified by traversalT parameter + */ + nvgraphStatus_t NVGRAPH_API nvgraphTraversal(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const nvgraphTraversal_t traversalT, + const int *source_vert, + const nvgraphTraversalParameter_t params); + + /** + * CAPI Method for calling 2d BFS algorithm. + * @param handle Nvgraph context handle. + * @param descrG Graph handle (must be 2D partitioned) + * @param source_vert The source vertex ID + * @param distances Pointer to memory allocated to store the distances. + * @param predecessors Pointer to memory allocated to store the predecessors + * @return Status code. + */ + nvgraphStatus_t NVGRAPH_API nvgraph2dBfs(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const int32_t source_vert, + int32_t* distances, + int32_t* predecessors); + + /* nvGRAPH Single Source Shortest Path (SSSP) + * Calculate the shortest path distance from a single vertex in the graph to all other vertices. + */ + nvgraphStatus_t NVGRAPH_API nvgraphSssp(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t sssp_index); + + /* nvGRAPH WidestPath + * Find widest path potential from source_index to every other vertices. + */ + nvgraphStatus_t NVGRAPH_API nvgraphWidestPath(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t widest_path_index); + + /* nvGRAPH PageRank + * Find PageRank for each vertex of a graph with a given transition probabilities, a bookmark vector of dangling vertices, and the damping factor. + */ + nvgraphStatus_t NVGRAPH_API nvgraphPagerank(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t bookmark_index, + const int has_guess, + const size_t pagerank_index, + const float tolerance, + const int max_iter); + + /* nvGRAPH contraction + * given array of agregates contract graph with + * given (Combine, Reduce) operators for Vertex Set + * and Edge Set; + */ + nvgraphStatus_t NVGRAPH_API nvgraphContractGraph(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t contrdescrG, + int *aggregates, + size_t numaggregates, + nvgraphSemiringOps_t VertexCombineOp, + nvgraphSemiringOps_t VertexReduceOp, + nvgraphSemiringOps_t EdgeCombineOp, + nvgraphSemiringOps_t EdgeReduceOp, + int flag); + + /* nvGRAPH spectral clustering + * given a graph and solver parameters of struct SpectralClusteringParameter, + * assign vertices to groups such as + * intra-group connections are strong and/or inter-groups connections are weak + * using spectral technique. + */ + nvgraphStatus_t NVGRAPH_API nvgraphSpectralClustering(nvgraphHandle_t handle, + const nvgraphGraphDescr_t graph_descr, + const size_t weight_index, + const struct SpectralClusteringParameter *params, + int* clustering, + void* eig_vals, + void* eig_vects); + + /* nvGRAPH analyze clustering + * Given a graph, a clustering, and a metric + * compute the score that measures the clustering quality according to the metric. 
+ */ + nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeClustering(nvgraphHandle_t handle, + const nvgraphGraphDescr_t graph_descr, + const size_t weight_index, + const int n_clusters, + const int* clustering, + nvgraphClusteringMetric_t metric, + float * score); + + /* nvGRAPH Triangles counting + * count number of triangles (cycles of size 3) formed by graph edges + */ + nvgraphStatus_t NVGRAPH_API nvgraphTriangleCount(nvgraphHandle_t handle, + const nvgraphGraphDescr_t graph_descr, + uint64_t* result); + + /* nvGRAPH Louvain implementation + */ + nvgraphStatus_t NVGRAPH_API nvgraphLouvain(cudaDataType_t index_type, + cudaDataType_t val_type, + const size_t num_vertex, + const size_t num_edges, + void* csr_ptr, + void* csr_ind, + void* csr_val, + int weighted, + int has_init_cluster, + void* init_cluster, + void* final_modularity, + void* best_cluster_vec, + void* num_level); + + + /* nvGRAPH Jaccard implementation + */ + nvgraphStatus_t NVGRAPH_API nvgraphJaccard(cudaDataType_t index_type, + cudaDataType_t val_type, + const size_t n, + const size_t e, + void* csr_ptr, + void *csr_ind, + void* csr_val, + int weighted, + void* v, + void* gamma, + void* weight_j); + + /* nvGRAPH attach structure + * Warp external device data into a nvgraphGraphDescr_t + * Warning : this data remain owned by the user + */ + nvgraphStatus_t NVGRAPH_API nvgraphAttachGraphStructure(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t TT); + + /* nvGRAPH attach Vertex Data + * Warp external device data into a vertex dim + * Warning : this data remain owned by the user + */ + nvgraphStatus_t NVGRAPH_API nvgraphAttachVertexData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *vertexData); + + /* nvGRAPH attach Edge Data + * Warp external device data into an edge dim + * Warning : this data remain owned by the user + */ + nvgraphStatus_t NVGRAPH_API nvgraphAttachEdgeData(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *edgeData); #if defined(__cplusplus) } /* extern "C" */ #endif #endif /* _NVGRAPH_H_ */ - diff --git a/cpp/nvgraph/cpp/include/nvgraphP.h b/cpp/nvgraph/cpp/include/nvgraphP.h index 8e6080e874d..5ca1c369b1b 100644 --- a/cpp/nvgraph/cpp/include/nvgraphP.h +++ b/cpp/nvgraph/cpp/include/nvgraphP.h @@ -24,7 +24,7 @@ #pragma once #include "nvgraph.h" -#include "cnmem.h" +#include "rmm/rmm.h" #if defined(__cplusplus) extern "C" { @@ -41,7 +41,6 @@ typedef enum struct nvgraphContext { cudaStream_t stream; - cnmemDevice_t cnmem_device; int nvgraphIsInitialized; }; diff --git a/cpp/nvgraph/cpp/include/nvgraph_convert.hxx b/cpp/nvgraph/cpp/include/nvgraph_convert.hxx index f0c5620e7e7..0cd29195470 100644 --- a/cpp/nvgraph/cpp/include/nvgraph_convert.hxx +++ b/cpp/nvgraph/cpp/include/nvgraph_convert.hxx @@ -17,7 +17,6 @@ #include #include -#include namespace nvgraph{ void csr2coo( const int *csrSortedRowPtr, diff --git a/cpp/nvgraph/cpp/include/nvgraph_error.hxx b/cpp/nvgraph/cpp/include/nvgraph_error.hxx index 14815c83acd..a8fe364ebff 100644 --- a/cpp/nvgraph/cpp/include/nvgraph_error.hxx +++ b/cpp/nvgraph/cpp/include/nvgraph_error.hxx @@ -136,6 +136,25 @@ int NVGRAPH_GetErrorString( NVGRAPH_ERROR error, char* buffer, int buf_len); } #endif +// This is a gap filler, and should be replaced with a RAPIDS-wise error handling mechanism. 
+#undef rmmCheckError +#if defined(DEBUG) || defined(VERBOSE_DIAG) +#define rmmCheckError(e) { \ + if (e != RMM_SUCCESS) { \ + std::stringstream _error; \ + _error << "RMM failure."; \ + FatalError(_error.str(), NVGRAPH_ERR_CUDA_FAILURE); \ + } \ +} +#else // NO DEBUG +#define rmmCheckError(e) \ + { \ + if (e != RMM_SUCCESS) { \ + FatalError("", NVGRAPH_ERR_CUDA_FAILURE); \ + } \ + } +#endif + #define CHECK_CUDA(call) \ { \ cudaError_t _e = (call); \ diff --git a/cpp/nvgraph/cpp/include/nvgraph_vector.hxx b/cpp/nvgraph/cpp/include/nvgraph_vector.hxx index 33a69e9c1a1..5e03ccbde73 100644 --- a/cpp/nvgraph/cpp/include/nvgraph_vector.hxx +++ b/cpp/nvgraph/cpp/include/nvgraph_vector.hxx @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include "nvgraph_error.hxx" #include "nvgraph_vector_kernels.hxx" @@ -36,7 +36,7 @@ public: protected: /*! Storage for the values. */ - SHARED_PREFIX::shared_ptr values; + std::shared_ptr values; /*! Size of the array */ diff --git a/cpp/nvgraph/cpp/include/nvlouvain.cuh b/cpp/nvgraph/cpp/include/nvlouvain.cuh index 9644a17d40d..cabc923575f 100644 --- a/cpp/nvgraph/cpp/include/nvlouvain.cuh +++ b/cpp/nvgraph/cpp/include/nvlouvain.cuh @@ -30,6 +30,9 @@ #include #include +#include +#include + #include "graph_utils.cuh" #include "modularity.cuh" #include "delta_modularity.cuh" @@ -66,12 +69,12 @@ NVLOUVAIN_STATUS louvain(IdxType* csr_ptr, IdxType* csr_ind, ValType* csr_val, int n_edges = num_edges; int n_vertex = num_vertex; - thrust::device_vector csr_ptr_d(csr_ptr, csr_ptr + n_vertex + 1); - thrust::device_vector csr_ind_d(csr_ind, csr_ind + n_edges); - thrust::device_vector csr_val_d(csr_val, csr_val + n_edges); + rmm::device_vector csr_ptr_d(csr_ptr, csr_ptr + n_vertex + 1); + rmm::device_vector csr_ind_d(csr_ind, csr_ind + n_edges); + rmm::device_vector csr_val_d(csr_val, csr_val + n_edges); //std::vector clustering(n_vertex); - thrust::device_vector clustering(n_vertex); + rmm::device_vector clustering(n_vertex); int upper_bound = 100; HighResClock hr_clock; @@ -87,18 +90,18 @@ NVLOUVAIN_STATUS louvain(IdxType* csr_ptr, IdxType* csr_ind, ValType* csr_val, ValType best_modularity = -1; - thrust::device_vector new_csr_ptr(n_vertex, 0); - thrust::device_vector new_csr_ind(n_edges, 0); - thrust::device_vector new_csr_val(n_edges, 0); - - thrust::device_vector cluster_d(n_vertex); - thrust::device_vector aggregates_tmp_d(n_vertex, 0); - thrust::device_vector cluster_inv_ptr(c_size + 1, 0); - thrust::device_vector cluster_inv_ind(n_vertex, 0); - thrust::device_vector k_vec(n_vertex, 0); - thrust::device_vector Q_arr(n_vertex, 0); - thrust::device_vector delta_Q_arr(n_edges, 0); - thrust::device_vector cluster_sum_vec(c_size, 0); + rmm::device_vector new_csr_ptr(n_vertex, 0); + rmm::device_vector new_csr_ind(n_edges, 0); + rmm::device_vector new_csr_val(n_edges, 0); + + rmm::device_vector cluster_d(n_vertex); + rmm::device_vector aggregates_tmp_d(n_vertex, 0); + rmm::device_vector cluster_inv_ptr(c_size + 1, 0); + rmm::device_vector cluster_inv_ind(n_vertex, 0); + rmm::device_vector k_vec(n_vertex, 0); + rmm::device_vector Q_arr(n_vertex, 0); + rmm::device_vector delta_Q_arr(n_edges, 0); + rmm::device_vector cluster_sum_vec(c_size, 0); thrust::host_vector best_cluster_h(n_vertex, 0); Vector aggregates((int) current_n_vertex, 0); @@ -454,9 +457,9 @@ NVLOUVAIN_STATUS louvain(IdxType* csr_ptr, IdxType* csr_ind, ValType* csr_val, int n_edges = num_edges; int n_vertex = num_vertex; - thrust::device_vector csr_ptr_d(csr_ptr, csr_ptr + n_vertex + 1); - 
thrust::device_vector csr_ind_d(csr_ind, csr_ind + n_edges); - thrust::device_vector csr_val_d(csr_val, csr_val + n_edges); + rmm::device_vector csr_ptr_d(csr_ptr, csr_ptr + n_vertex + 1); + rmm::device_vector csr_ind_d(csr_ind, csr_ind + n_edges); + rmm::device_vector csr_val_d(csr_val, csr_val + n_edges); int upper_bound = 100; @@ -472,18 +475,18 @@ NVLOUVAIN_STATUS louvain(IdxType* csr_ptr, IdxType* csr_ind, ValType* csr_val, ValType best_modularity = -1; - thrust::device_vector new_csr_ptr(n_vertex, 0); - thrust::device_vector new_csr_ind(n_edges, 0); - thrust::device_vector new_csr_val(n_edges, 0); - - thrust::device_vector cluster_d(n_vertex); - thrust::device_vector aggregates_tmp_d(n_vertex, 0); - thrust::device_vector cluster_inv_ptr(c_size + 1, 0); - thrust::device_vector cluster_inv_ind(n_vertex, 0); - thrust::device_vector k_vec(n_vertex, 0); - thrust::device_vector Q_arr(n_vertex, 0); - thrust::device_vector delta_Q_arr(n_edges, 0); - thrust::device_vector cluster_sum_vec(c_size, 0); + rmm::device_vector new_csr_ptr(n_vertex, 0); + rmm::device_vector new_csr_ind(n_edges, 0); + rmm::device_vector new_csr_val(n_edges, 0); + + rmm::device_vector cluster_d(n_vertex); + rmm::device_vector aggregates_tmp_d(n_vertex, 0); + rmm::device_vector cluster_inv_ptr(c_size + 1, 0); + rmm::device_vector cluster_inv_ind(n_vertex, 0); + rmm::device_vector k_vec(n_vertex, 0); + rmm::device_vector Q_arr(n_vertex, 0); + rmm::device_vector delta_Q_arr(n_edges, 0); + rmm::device_vector cluster_sum_vec(c_size, 0); std::vector best_cluster_h(n_vertex, 0); Vector aggregates(current_n_vertex, 0); diff --git a/cpp/nvgraph/cpp/include/cnmem_shared_ptr.hxx b/cpp/nvgraph/cpp/include/rmm_shared_ptr.hxx similarity index 60% rename from cpp/nvgraph/cpp/include/cnmem_shared_ptr.hxx rename to cpp/nvgraph/cpp/include/rmm_shared_ptr.hxx index 2143ec8e4ac..da777bfdd86 100644 --- a/cpp/nvgraph/cpp/include/cnmem_shared_ptr.hxx +++ b/cpp/nvgraph/cpp/include/rmm_shared_ptr.hxx @@ -13,42 +13,30 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - + #pragma once -#include #include - - -// - -#if __cplusplus > 199711L +#include #include -#define SHARED_PREFIX std -#else -#include -#define SHARED_PREFIX boost +#include "rmm/rmm.h" -#endif - -#include #include "nvgraph_error.hxx" namespace nvgraph { template< typename T > -class DeviceDeleter +class DeviceDeleter { cudaStream_t mStream; public: DeviceDeleter(cudaStream_t stream) : mStream(stream) {} - void operator()(T *ptr) + void operator()(T *ptr) { - cnmemStatus_t status = cnmemFree(ptr, mStream); - if( status != CNMEM_STATUS_SUCCESS ) - { + auto status = RMM_FREE(ptr, mStream); + if (status != RMM_SUCCESS) { FatalError("Memory manager internal error (free)", NVGRAPH_ERR_UNKNOWN); } } @@ -56,38 +44,36 @@ public: template< typename T > -inline SHARED_PREFIX::shared_ptr allocateDevice(size_t n, cudaStream_t stream) +inline std::shared_ptr allocateDevice(size_t n, cudaStream_t stream) { T *ptr = NULL; - cnmemStatus_t status = cnmemMalloc((void**) &ptr, n*sizeof(T), stream); - if( status == CNMEM_STATUS_OUT_OF_MEMORY) - { + auto status = RMM_ALLOC(&ptr, n * sizeof(T), stream); + if (status == RMM_ERROR_OUT_OF_MEMORY) { FatalError("Not enough memory", NVGRAPH_ERR_NO_MEMORY); } - else if (status != CNMEM_STATUS_SUCCESS) - { - FatalError("Memory manager internal error (alloc)", NVGRAPH_ERR_UNKNOWN); + else if (status != RMM_SUCCESS) { + FatalError("Memory manager internal error (alloc)", NVGRAPH_ERR_UNKNOWN); } - return SHARED_PREFIX::shared_ptr(ptr, DeviceDeleter(stream)); + return std::shared_ptr(ptr, DeviceDeleter(stream)); } template< typename T > -class DeviceReleaser +class DeviceReleaser { cudaStream_t mStream; public: DeviceReleaser(cudaStream_t stream) : mStream(stream) {} - void operator()(T *ptr) + void operator()(T *ptr) { } }; template< typename T > -inline SHARED_PREFIX::shared_ptr attachDevicePtr(T * ptr_in, cudaStream_t stream) +inline std::shared_ptr attachDevicePtr(T * ptr_in, cudaStream_t stream) { T *ptr = ptr_in; - return SHARED_PREFIX::shared_ptr(ptr, DeviceReleaser(stream)); + return std::shared_ptr(ptr, DeviceReleaser(stream)); } diff --git a/cpp/nvgraph/cpp/include/test/cluster_inv.cuh b/cpp/nvgraph/cpp/include/test_to_be_removed/cluster_inv.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/cluster_inv.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/cluster_inv.cuh diff --git a/cpp/nvgraph/cpp/include/test/delta_modularity_test.cuh b/cpp/nvgraph/cpp/include/test_to_be_removed/delta_modularity_test.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/delta_modularity_test.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/delta_modularity_test.cuh diff --git a/cpp/nvgraph/cpp/include/test/k_compute_test.cuh b/cpp/nvgraph/cpp/include/test_to_be_removed/k_compute_test.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/k_compute_test.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/k_compute_test.cuh diff --git a/cpp/nvgraph/cpp/include/test/k_in_test.cuh b/cpp/nvgraph/cpp/include/test_to_be_removed/k_in_test.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/k_in_test.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/k_in_test.cuh diff --git a/cpp/nvgraph/cpp/include/test/mem_test.cuh b/cpp/nvgraph/cpp/include/test_to_be_removed/mem_test.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/mem_test.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/mem_test.cuh diff --git a/cpp/nvgraph/cpp/include/test/modularity_test.cuh 
b/cpp/nvgraph/cpp/include/test_to_be_removed/modularity_test.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/modularity_test.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/modularity_test.cuh diff --git a/cpp/nvgraph/cpp/include/test/phase_1_color_test.cuh b/cpp/nvgraph/cpp/include/test_to_be_removed/phase_1_color_test.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/phase_1_color_test.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/phase_1_color_test.cuh diff --git a/cpp/nvgraph/cpp/include/test/phase_1_test.cuh b/cpp/nvgraph/cpp/include/test_to_be_removed/phase_1_test.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/phase_1_test.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/phase_1_test.cuh diff --git a/cpp/nvgraph/cpp/include/test/thrust_test.cuh b/cpp/nvgraph/cpp/include/test_to_be_removed/thrust_test.cuh similarity index 100% rename from cpp/nvgraph/cpp/include/test/thrust_test.cuh rename to cpp/nvgraph/cpp/include/test_to_be_removed/thrust_test.cuh diff --git a/cpp/nvgraph/cpp/include/thrust_coarse_generator.cuh b/cpp/nvgraph/cpp/include/thrust_coarse_generator.cuh index 7faec5ee85d..1a017d80c80 100644 --- a/cpp/nvgraph/cpp/include/thrust_coarse_generator.cuh +++ b/cpp/nvgraph/cpp/include/thrust_coarse_generator.cuh @@ -20,9 +20,12 @@ #include #include #include -#include "util.cuh" + +#include +#include + #include "graph_utils.cuh" -//#include //indices_to_offsets +#include "util.cuh" template void indices_to_offsets(const thrust::execution_policy &exec, @@ -161,21 +164,21 @@ void jToJKernel(const IndexType *column_indices, const IndexType *aggregates, In // Method to compute Ac on DEVICE using csr format template void generate_superverticies_graph(const int n_vertex, const int num_aggregates, - thrust::device_vector &csr_ptr_d, - thrust::device_vector &csr_ind_d, - thrust::device_vector &csr_val_d, - thrust::device_vector &new_csr_ptr_d, - thrust::device_vector &new_csr_ind_d, - thrust::device_vector &new_csr_val_d, - const thrust::device_vector &aggregates + rmm::device_vector &csr_ptr_d, + rmm::device_vector &csr_ind_d, + rmm::device_vector &csr_val_d, + rmm::device_vector &new_csr_ptr_d, + rmm::device_vector &new_csr_ind_d, + rmm::device_vector &new_csr_val_d, + const rmm::device_vector &aggregates ){ const int n_edges = csr_ptr_d[n_vertex]; - thrust::device_vector I(n_edges,-1); - thrust::device_vector J(n_edges,-1); - thrust::device_vector V(n_edges,-1); + rmm::device_vector I(n_edges,-1); + rmm::device_vector J(n_edges,-1); + rmm::device_vector V(n_edges,-1); const int block_size_I = 128; const int block_size_J = 256; @@ -229,7 +232,7 @@ void generate_superverticies_graph(const int n_vertex, const int num_aggregates, // Reduce by key to fill in Ac.column_indices and Ac.values - thrust::device_vector new_row_indices(NNZ,0); + rmm::device_vector new_row_indices(NNZ,0); thrust::reduce_by_key(thrust::make_zip_iterator(thrust::make_tuple(I.begin(), J.begin())), diff --git a/cpp/nvgraph/cpp/include/thrust_traits.hxx b/cpp/nvgraph/cpp/include/thrust_traits.hxx index 922d680474d..89a026d8c53 100644 --- a/cpp/nvgraph/cpp/include/thrust_traits.hxx +++ b/cpp/nvgraph/cpp/include/thrust_traits.hxx @@ -14,65 +14,35 @@ * limitations under the License. 
*/ - - #ifndef THRUST_TRAITS_HXX - #define THRUST_TRAITS_HXX - - -#include - +#include #include - +#include +#include namespace nvgraph - { - //generic Vector Ptr Type facade: - - // - template - struct VectorPtrT; - - //partial specialization for device_vector: - - // - template - - struct VectorPtrT > - + struct VectorPtrT> { - typedef thrust::device_ptr PtrT; }; - - //partial specialization for host_vector: - - // - template - - struct VectorPtrT > - + struct VectorPtrT> { - typedef typename thrust::host_vector::value_type* PtrT; - }; - } #endif - diff --git a/cpp/nvgraph/cpp/include/valued_csr_graph.cuh b/cpp/nvgraph/cpp/include/valued_csr_graph.cuh index 81e0e517f06..cf000da24a9 100644 --- a/cpp/nvgraph/cpp/include/valued_csr_graph.cuh +++ b/cpp/nvgraph/cpp/include/valued_csr_graph.cuh @@ -16,27 +16,30 @@ #pragma once +#include +#include + namespace nvlouvain{ template -class Vector: public thrust::device_vector{ +class Vector: public rmm::device_vector{ public: - Vector(): thrust::device_vector(){} - Vector(int size): thrust::device_vector(size){} + Vector(): rmm::device_vector(){} + Vector(int size): rmm::device_vector(size){} template - Vector(Iter begin, Iter end): thrust::device_vector(begin, end){} + Vector(Iter begin, Iter end): rmm::device_vector(begin, end){} inline void fill(const ValType val){ thrust::fill(thrust::cuda::par, this->begin(), this->end(), val); } - inline thrust::device_vector& to_device_vector(){ - return static_cast> (*this); + inline rmm::device_vector& to_device_vector(){ + return static_cast> (*this); } inline ValType* raw(){ - return (ValType*)thrust::raw_pointer_cast( thrust::device_vector::data() ); + return (ValType*)thrust::raw_pointer_cast( rmm::device_vector::data() ); } inline int get_size(){ @@ -49,7 +52,7 @@ template class CsrGraph{ public: - CsrGraph( thrust::device_vector& csr_ptr_d, thrust::device_vector& csr_ind_d, thrust::device_vector& csr_val_d, IndexType v, IndexType e, bool _w=false): + CsrGraph( rmm::device_vector& csr_ptr_d, rmm::device_vector& csr_ind_d, rmm::device_vector& csr_val_d, IndexType v, IndexType e, bool _w=false): _n_vertices(v), _n_edges(e), csr_ptr(csr_ptr_d.begin(), csr_ptr_d.end()), csr_ind(csr_ind_d.begin(), csr_ind_d.end()), csr_val(csr_val_d.begin(), csr_val_d.end()), weighted(_w){ } @@ -93,32 +96,32 @@ class CsrGraph{ return csr_val; } - inline void update_csr_ptr(thrust::device_vector & d_v){ + inline void update_csr_ptr(rmm::device_vector & d_v){ thrust::copy(thrust::cuda::par, d_v.begin(), d_v.end(), csr_ptr.begin()); } - inline void update_csr_ptr_n(thrust::device_vector & d_v,unsigned size){ + inline void update_csr_ptr_n(rmm::device_vector & d_v,unsigned size){ csr_ptr.resize(size); thrust::copy_n(thrust::cuda::par, d_v.begin(), size, csr_ptr.begin()); } - inline void update_csr_ind(thrust::device_vector & d_v){ + inline void update_csr_ind(rmm::device_vector & d_v){ thrust::copy(thrust::cuda::par, d_v.begin(), d_v.end(), csr_ind.begin()); } - inline void update_csr_ind_n(thrust::device_vector & d_v,unsigned size){ + inline void update_csr_ind_n(rmm::device_vector & d_v,unsigned size){ csr_ind.resize(size); thrust::copy_n(thrust::cuda::par, d_v.begin(), size, csr_ind.begin()); } - inline void update_csr_val(thrust::device_vector & d_v){ + inline void update_csr_val(rmm::device_vector & d_v){ thrust::copy(thrust::cuda::par, d_v.begin(), d_v.end(), csr_val.begin()); } - inline void update_csr_val_n(thrust::device_vector & d_v,unsigned size){ + inline void update_csr_val_n(rmm::device_vector & d_v,unsigned 
size){ csr_val.resize(size); thrust::copy_n(thrust::cuda::par, d_v.begin(), size, csr_val.begin()); } - inline void update_graph(size_t n_v, size_t n_e, thrust::device_vector & ptr, thrust::device_vector & ind, thrust::device_vector & val, bool w){ + inline void update_graph(size_t n_v, size_t n_e, rmm::device_vector & ptr, rmm::device_vector & ind, rmm::device_vector & val, bool w){ _n_vertices = n_v; _n_edges = n_e; #ifdef DEBUG diff --git a/cpp/nvgraph/cpp/include/valued_csr_graph.hxx b/cpp/nvgraph/cpp/include/valued_csr_graph.hxx index 5fe1986c449..0469eabf2fa 100644 --- a/cpp/nvgraph/cpp/include/valued_csr_graph.hxx +++ b/cpp/nvgraph/cpp/include/valued_csr_graph.hxx @@ -38,7 +38,7 @@ private: protected: /*! Storage for the nonzero entries of the CSR data structure. */ - SHARED_PREFIX::shared_ptr values; + std::shared_ptr values; public: diff --git a/cpp/nvgraph/cpp/src/arnoldi.cu b/cpp/nvgraph/cpp/src/arnoldi.cu index 8975b985f83..617adb893ad 100644 --- a/cpp/nvgraph/cpp/src/arnoldi.cu +++ b/cpp/nvgraph/cpp/src/arnoldi.cu @@ -31,13 +31,6 @@ #include "nvgraph_csrmv.hxx" #include "matrix.hxx" - -#include "debug_macros.h" -#ifdef DEBUG -#define IRAM_VERBOSE -// #define IRAM_DEBUG -#endif - namespace nvgraph { @@ -88,19 +81,6 @@ NVGRAPH_ERROR ImplicitArnoldi::solve(const int restart_i const int nested_subspaces_freq) { //try { - #ifdef IRAM_VERBOSE - std::stringstream ss; - ss.str(std::string()); - size_t used_mem, free_mem, total_mem; - ss <<" ------------------ImplicitArnoldi------------------"<< std::endl; - ss <<" --------------------------------------------"<< std::endl; - ss << std::setw(10) << "Iteration" << std::setw(20) << " Mem Usage (MB)" << std::setw(15) << "Residual" << std::endl; - ss <<" --------------------------------------------"<< std::endl; - COUT()<::solve(const int restart_i bool converged = false; int i = 0; // we can print stats after setup to have the initial residual - #ifdef IRAM_VERBOSE - ss.str(std::string()); - cnmemMemGetInfo(&free_mem, &total_mem, NULL); - used_mem=total_mem-free_mem; - ss << std::setw(10) << i ; - ss.precision(3); - ss << std::setw(20) << std::fixed << used_mem/1024.0/1024.0; - ss << std::setw(15) << std::scientific << m_residual; - if (m_miramns) ss << " (Krylov size: " << m_select << ")"; - ss << std::endl; - COUT()<::solve(const int restart_i compute_eigenvectors(); cudaMemcpyAsync(eigVals.raw(), &m_ritz_eigenvalues[0], (size_t)(m_nr_eigenvalues*sizeof(m_ritz_eigenvalues[0])), cudaMemcpyHostToDevice); cudaCheckError(); - #ifdef IRAM_VERBOSE - COUT() <<" --------------------------------------------"<< std::endl; - //stop timer - COUT() <<" Total Time : "<< timer.stop() << "ms"<::solve_arnoldi(int lower_bound, int } } - #ifdef IRAM_DEBUG - COUT() - <<"---------------------------------------------"<::solve_it() template void ImplicitArnoldi::select_subspace() { - #ifdef IRAM_DEBUG - COUT() < 199711L typename std::vector::iterator it = std::min_element(std::begin(m_mns_residuals), std::end(m_mns_residuals)); #else @@ -682,25 +604,10 @@ void ImplicitArnoldi::compute_residual(int subspace_size lam = std::abs(m_ritz_eigenvalues[i]); tmp_residual = residual_norm / lam; - //tmp_residual = residual_norm ; - //COUT() << "last_ritz_vector : "<::implicit_restart() // optim: avoid the cpy here if (!m_miramns) std::copy(m_H.begin(), m_H.end(), m_H_select.begin()); select_shifts(m_dirty_bit); - #ifdef IRAM_DEBUG - for(int i = 0; i::select_shifts(bool dirty_bit) std::copy(m_H_select.begin(), m_H_select.end(), m_H_tmp.begin()); 
//Lapack::geev(&m_H_tmp[0], &m_ritz_eigenvalues[0], &m_ritz_eigenvectors[0], m_select , m_select, m_select); Lapack::geev(&m_H_tmp[0], &m_ritz_eigenvalues[0],&m_ritz_eigenvalues_i[0], &m_ritz_eigenvectors[0], NULL, m_select , m_select, m_select); - // #ifdef IRAM_DEBUG - // COUT() << "m_ritz_eigenvalues : "<::qr_step() //for (int j = 0; j < m_select; j++) // m_Q[j*m_select+j] = 1.0; - #ifdef IRAM_DEBUG - COUT() << "m_ritz_eigenvalues : "<= m_n_eigenvalues) { @@ -1144,17 +995,6 @@ void ImplicitArnoldi::compute_eigenvectors() //nrm 1 for pagerank if(m_markov) Cublas::scal(n, (ValueType_)1.0/m_eigenvectors.nrm1(), m_eigenvectors.raw(), 1); - - #ifdef IRAM_DEBUG - COUT()< @@ -1200,15 +1040,9 @@ void ImplicitArnoldi::cleanup_subspace(std::vector void ImplicitArnoldi::shift(std::vector& H, int ld, int m, ValueType mu) { - #ifdef IRAM_DEBUG - dump_host_dense_mat(H,ld); - #endif int start = ld-m; for (int i = start; i < ld; i++) H[i*ld+i-start] -= mu; - #ifdef IRAM_DEBUG - dump_host_dense_mat(H,ld); - #endif } template diff --git a/cpp/nvgraph/cpp/src/bfs.cu b/cpp/nvgraph/cpp/src/bfs.cu index 218f01a87ac..8c4934ca442 100644 --- a/cpp/nvgraph/cpp/src/bfs.cu +++ b/cpp/nvgraph/cpp/src/bfs.cu @@ -19,542 +19,540 @@ #include "bfs.hxx" #include +#include + #include "nvgraph_error.hxx" #include "bfs_kernels.cu" using namespace bfs_kernels; namespace nvgraph { - enum BFS_ALGO_STATE { - TOPDOWN, BOTTOMUP - }; - - template - NVGRAPH_ERROR Bfs::setup() { - - // Determinism flag, false by default - deterministic = false; - //Working data - //Each vertex can be in the frontier at most once - cudaMalloc(&frontier, n * sizeof(IndexType)); - cudaCheckError() - ; - - //We will update frontier during the execution - //We need the orig to reset frontier, or cudaFree - original_frontier = frontier; - - //size of bitmaps for vertices - vertices_bmap_size = (n / (8 * sizeof(int)) + 1); - //ith bit of visited_bmap is set <=> ith vertex is visited - cudaMalloc(&visited_bmap, sizeof(int) * vertices_bmap_size); - cudaCheckError() - ; - - //ith bit of isolated_bmap is set <=> degree of ith vertex = 0 - cudaMalloc(&isolated_bmap, sizeof(int) * vertices_bmap_size); - cudaCheckError() - ; - - //vertices_degree[i] = degree of vertex i - cudaMalloc(&vertex_degree, sizeof(IndexType) * n); - cudaCheckError() - ; - - //Cub working data - cub_exclusive_sum_alloc(n + 1, d_cub_exclusive_sum_storage, cub_exclusive_sum_storage_bytes); - - //We will need (n+1) ints buffer for two differents things (bottom up or top down) - sharing it since those uses are mutually exclusive - cudaMalloc(&buffer_np1_1, (n + 1) * sizeof(IndexType)); - cudaCheckError() - ; - cudaMalloc(&buffer_np1_2, (n + 1) * sizeof(IndexType)); - cudaCheckError() - ; - - //Using buffers : top down - - //frontier_vertex_degree[i] is the degree of vertex frontier[i] - frontier_vertex_degree = buffer_np1_1; - //exclusive sum of frontier_vertex_degree - exclusive_sum_frontier_vertex_degree = buffer_np1_2; - - //Using buffers : bottom up - - //contains list of unvisited vertices - unvisited_queue = buffer_np1_1; - //size of the "last" unvisited queue : size_last_unvisited_queue - //refers to the size of unvisited_queue - //which may not be up to date (the queue may contains vertices that are now visited) - - //We may leave vertices unvisited after bottom up main kernels - storing them here - left_unvisited_queue = buffer_np1_2; - - //We use buckets of edges (32 edges per bucket for now, see exact macro in bfs_kernels). 
frontier_vertex_degree_buckets_offsets[i] is the index k such as frontier[k] is the source of the first edge of the bucket - //See top down kernels for more details - cudaMalloc( &exclusive_sum_frontier_vertex_buckets_offsets, - ((nnz / TOP_DOWN_EXPAND_DIMX + 1) * NBUCKETS_PER_BLOCK + 2) * sizeof(IndexType)); - cudaCheckError() - ; - - //Init device-side counters - //Those counters must be/can be reset at each bfs iteration - //Keeping them adjacent in memory allow use call only one cudaMemset - launch latency is the current bottleneck - cudaMalloc(&d_counters_pad, 4 * sizeof(IndexType)); - cudaCheckError() - ; - - d_new_frontier_cnt = &d_counters_pad[0]; - d_mu = &d_counters_pad[1]; - d_unvisited_cnt = &d_counters_pad[2]; - d_left_unvisited_cnt = &d_counters_pad[3]; - - //Lets use this int* for the next 3 lines - //Its dereferenced value is not initialized - so we dont care about what we put in it - IndexType * d_nisolated = d_new_frontier_cnt; - cudaMemsetAsync(d_nisolated, 0, sizeof(IndexType), stream); - cudaCheckError() - ; - - //Computing isolated_bmap - //Only dependent on graph - not source vertex - done once - flag_isolated_vertices(n, isolated_bmap, row_offsets, vertex_degree, d_nisolated, stream); - cudaMemcpyAsync(&nisolated, d_nisolated, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - cudaCheckError() - ; - - //We need nisolated to be ready to use - cudaStreamSynchronize(stream); - cudaCheckError() - ; - - return NVGRAPH_OK; - } - - template - NVGRAPH_ERROR Bfs::configure( IndexType *_distances, - IndexType *_predecessors, - int *_edge_mask) - { - distances = _distances; - predecessors = _predecessors; - edge_mask = _edge_mask; - - useEdgeMask = (edge_mask != NULL); - computeDistances = (distances != NULL); - computePredecessors = (predecessors != NULL); - - //We need distances to use bottom up - if (directed && !computeDistances) - cudaMalloc(&distances, n * sizeof(IndexType)); - - cudaCheckError() - ; - - return NVGRAPH_OK; - } - - template - NVGRAPH_ERROR Bfs::traverse(IndexType source_vertex) { - - //Init visited_bmap - //If the graph is undirected, we not that - //we will never discover isolated vertices (in degree = out degree = 0) - //we avoid a lot of work by flagging them now - //in g500 graphs they represent ~25% of total vertices - //more than that for wiki and twitter graphs - - if (directed) { - cudaMemsetAsync(visited_bmap, 0, vertices_bmap_size * sizeof(int), stream); - } else { - cudaMemcpyAsync( visited_bmap, - isolated_bmap, - vertices_bmap_size * sizeof(int), - cudaMemcpyDeviceToDevice, - stream); - } - cudaCheckError() - ; - - //If needed, setting all vertices as undiscovered (inf distance) - //We dont use computeDistances here - //if the graph is undirected, we may need distances even if - //computeDistances is false - if (distances) - fill_vec(distances, n, vec_t::max, stream); - - //If needed, setting all predecessors to non-existent (-1) - if (computePredecessors) - { - cudaMemsetAsync(predecessors, -1, n * sizeof(IndexType), stream); - cudaCheckError() - ; - } - - // - //Initial frontier - // - - frontier = original_frontier; - - if (distances) - { - cudaMemsetAsync(&distances[source_vertex], 0, sizeof(IndexType), stream); - cudaCheckError() - ; - } - - //Setting source_vertex as visited - //There may be bit already set on that bmap (isolated vertices) - if the graph is undirected - int current_visited_bmap_source_vert = 0; - - if (!directed) { - cudaMemcpyAsync(¤t_visited_bmap_source_vert, - &visited_bmap[source_vertex / INT_SIZE], - 
sizeof(int), - cudaMemcpyDeviceToHost); - cudaCheckError() - ; - //We need current_visited_bmap_source_vert - cudaStreamSynchronize(stream); - cudaCheckError() - ; - //We could detect that source is isolated here - } - - int m = (1 << (source_vertex % INT_SIZE)); - - //In that case, source is isolated, done now - if (!directed && (m & current_visited_bmap_source_vert)) { - //Init distances and predecessors are done, (cf Streamsync in previous if) - cudaCheckError() - ; - return NVGRAPH_OK; - } - - m |= current_visited_bmap_source_vert; - - cudaMemcpyAsync( &visited_bmap[source_vertex / INT_SIZE], - &m, - sizeof(int), - cudaMemcpyHostToDevice, - stream); - cudaCheckError() - ; - - //Adding source_vertex to init frontier - cudaMemcpyAsync( &frontier[0], - &source_vertex, - sizeof(IndexType), - cudaMemcpyHostToDevice, - stream); - cudaCheckError() - ; - - //mf : edges in frontier - //nf : vertices in frontier - //mu : edges undiscovered - //nu : nodes undiscovered - //lvl : current frontier's depth - IndexType mf, nf, mu, nu; - bool growing; - IndexType lvl = 1; - - //Frontier has one vertex - nf = 1; - - //all edges are undiscovered (by def isolated vertices have 0 edges) - mu = nnz; - - //all non isolated vertices are undiscovered (excepted source vertex, which is in frontier) - //That number is wrong if source_vertex is also isolated - but it's not important - nu = n - nisolated - nf; - - //Last frontier was 0, now it is 1 - growing = true; - - IndexType size_last_left_unvisited_queue = n; //we just need value > 0 - IndexType size_last_unvisited_queue = 0; //queue empty - - //Typical pre-top down workflow. set_frontier_degree + exclusive-scan - set_frontier_degree(frontier_vertex_degree, frontier, vertex_degree, nf, stream); - exclusive_sum( d_cub_exclusive_sum_storage, - cub_exclusive_sum_storage_bytes, - frontier_vertex_degree, - exclusive_sum_frontier_vertex_degree, - nf + 1, - stream); - - cudaMemcpyAsync( &mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError() - ; - - //We need mf - cudaStreamSynchronize(stream); - cudaCheckError() - ; - - //At first we know we have to use top down - BFS_ALGO_STATE algo_state = TOPDOWN; - - //useDistances : we check if a vertex is a parent using distances in bottom up - distances become working data - //undirected g : need parents to be in children's neighbors - bool can_use_bottom_up = !directed && distances; - - while (nf > 0) { - //Each vertices can appear only once in the frontierer array - we know it will fit - new_frontier = frontier + nf; - IndexType old_nf = nf; - resetDevicePointers(); - - if (can_use_bottom_up) { - //Choosing algo - //Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf - - switch (algo_state) { - case TOPDOWN: - if (mf > mu / alpha) - algo_state = BOTTOMUP; - break; - case BOTTOMUP: - if (!growing && nf < n / beta) { - - //We need to prepare the switch back to top down - //We couldnt keep track of mu during bottom up - because we dont know what mf is. Computing mu here - count_unvisited_edges( unvisited_queue, - size_last_unvisited_queue, - visited_bmap, - vertex_degree, - d_mu, - stream); - - //Typical pre-top down workflow. 
set_frontier_degree + exclusive-scan - set_frontier_degree(frontier_vertex_degree, frontier, vertex_degree, nf, stream); - exclusive_sum( d_cub_exclusive_sum_storage, - cub_exclusive_sum_storage_bytes, - frontier_vertex_degree, - exclusive_sum_frontier_vertex_degree, - nf + 1, - stream); - - cudaMemcpyAsync( &mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError() - ; - - cudaMemcpyAsync(&mu, d_mu, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - cudaCheckError() - ; - - //We will need mf and mu - cudaStreamSynchronize(stream); - cudaCheckError() - ; - - algo_state = TOPDOWN; - } - break; - } - } - - //Executing algo - - switch (algo_state) { - case TOPDOWN: - compute_bucket_offsets( exclusive_sum_frontier_vertex_degree, - exclusive_sum_frontier_vertex_buckets_offsets, - nf, - mf, - stream); - frontier_expand( row_offsets, - col_indices, - frontier, - nf, - mf, - lvl, - new_frontier, - d_new_frontier_cnt, - exclusive_sum_frontier_vertex_degree, - exclusive_sum_frontier_vertex_buckets_offsets, - visited_bmap, - distances, - predecessors, - edge_mask, - isolated_bmap, - directed, - stream, - deterministic); - - mu -= mf; - - cudaMemcpyAsync( &nf, - d_new_frontier_cnt, - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError(); - - //We need nf - cudaStreamSynchronize(stream); - cudaCheckError(); - - if (nf) { - - //Typical pre-top down workflow. set_frontier_degree + exclusive-scan - set_frontier_degree(frontier_vertex_degree, new_frontier, vertex_degree, nf, stream); - exclusive_sum( d_cub_exclusive_sum_storage, - cub_exclusive_sum_storage_bytes, - frontier_vertex_degree, - exclusive_sum_frontier_vertex_degree, - nf + 1, - stream); - cudaMemcpyAsync( &mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError() - ; - - //We need mf - cudaStreamSynchronize(stream); - cudaCheckError() - ; - } - break; - - case BOTTOMUP: - fill_unvisited_queue(visited_bmap, - vertices_bmap_size, - n, - unvisited_queue, - d_unvisited_cnt, - stream, - deterministic); - - size_last_unvisited_queue = nu; - - bottom_up_main(unvisited_queue, - size_last_unvisited_queue, - left_unvisited_queue, - d_left_unvisited_cnt, - visited_bmap, - row_offsets, - col_indices, - lvl, - new_frontier, - d_new_frontier_cnt, - distances, - predecessors, - edge_mask, - stream, - deterministic); - - //The number of vertices left unvisited decreases - //If it wasnt necessary last time, it wont be this time - if (size_last_left_unvisited_queue) { - cudaMemcpyAsync( &size_last_left_unvisited_queue, - d_left_unvisited_cnt, - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError() - ; - //We need last_left_unvisited_size - cudaStreamSynchronize(stream); - cudaCheckError() - ; - bottom_up_large( left_unvisited_queue, - size_last_left_unvisited_queue, - visited_bmap, - row_offsets, - col_indices, - lvl, - new_frontier, - d_new_frontier_cnt, - distances, - predecessors, - edge_mask, - stream, - deterministic); - } - cudaMemcpyAsync( &nf, - d_new_frontier_cnt, - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError() - ; - - //We will need nf - cudaStreamSynchronize(stream); - cudaCheckError() - ; - - break; - } - - //Updating undiscovered edges count - nu -= nf; - - //Using new frontier - frontier = new_frontier; - growing = (nf > old_nf); - - ++lvl; - } - - cudaCheckError() - ; - return NVGRAPH_OK; - } - - //Just used for benchmarks now - template - 
NVGRAPH_ERROR Bfs::traverse(IndexType *source_vertices, IndexType nsources) { - for (IndexType i = 0; i < nsources; ++i) - traverse(source_vertices[i]); - - return NVGRAPH_OK; - } - - template - void Bfs::resetDevicePointers() { - cudaMemsetAsync(d_counters_pad, 0, 4 * sizeof(IndexType), stream); - cudaCheckError() - ; - } - - template - void Bfs::clean() { - cudaCheckError() - ; - - //the vectors have a destructor that takes care of cleaning - cudaFree(original_frontier); - cudaFree(visited_bmap); - cudaFree(isolated_bmap); - cudaFree(vertex_degree); - cudaFree(d_cub_exclusive_sum_storage); - cudaFree(buffer_np1_1); - cudaFree(buffer_np1_2); - cudaFree(exclusive_sum_frontier_vertex_buckets_offsets); - cudaFree(d_counters_pad); - - //In that case, distances is a working data - if (directed && !computeDistances) - cudaFree(distances); - - cudaCheckError() - ; - } - - template class Bfs ; + enum BFS_ALGO_STATE { + TOPDOWN, BOTTOMUP + }; + + template + NVGRAPH_ERROR Bfs::setup() { + + // Determinism flag, false by default + deterministic = false; + + auto rmm_result = RMM_SUCCESS; + + //Working data + //Each vertex can be in the frontier at most once + rmm_result = RMM_ALLOC(&frontier, n * sizeof(IndexType), stream); + rmmCheckError(rmm_result); + + //We will update frontier during the execution + //We need the orig to reset frontier, or cudaFree + original_frontier = frontier; + + //size of bitmaps for vertices + vertices_bmap_size = (n / (8 * sizeof(int)) + 1); + //ith bit of visited_bmap is set <=> ith vertex is visited + rmm_result = RMM_ALLOC(&visited_bmap, sizeof(int) * vertices_bmap_size, stream); + rmmCheckError(rmm_result); + + //ith bit of isolated_bmap is set <=> degree of ith vertex = 0 + rmm_result = RMM_ALLOC(&isolated_bmap, sizeof(int) * vertices_bmap_size, stream); + rmmCheckError(rmm_result); + + //vertices_degree[i] = degree of vertex i + rmm_result = RMM_ALLOC(&vertex_degree, sizeof(IndexType) * n, stream); + rmmCheckError(rmm_result); + + //Cub working data + cub_exclusive_sum_alloc(n + 1, d_cub_exclusive_sum_storage, cub_exclusive_sum_storage_bytes); + + //We will need (n+1) ints buffer for two differents things (bottom up or top down) - sharing it since those uses are mutually exclusive + rmm_result = RMM_ALLOC(&buffer_np1_1, (n + 1) * sizeof(IndexType), stream); + rmmCheckError(rmm_result); + + rmm_result = RMM_ALLOC(&buffer_np1_2, (n + 1) * sizeof(IndexType), stream); + rmmCheckError(rmm_result); + + //Using buffers : top down + + //frontier_vertex_degree[i] is the degree of vertex frontier[i] + frontier_vertex_degree = buffer_np1_1; + //exclusive sum of frontier_vertex_degree + exclusive_sum_frontier_vertex_degree = buffer_np1_2; + + //Using buffers : bottom up + + //contains list of unvisited vertices + unvisited_queue = buffer_np1_1; + //size of the "last" unvisited queue : size_last_unvisited_queue + //refers to the size of unvisited_queue + //which may not be up to date (the queue may contains vertices that are now visited) + + //We may leave vertices unvisited after bottom up main kernels - storing them here + left_unvisited_queue = buffer_np1_2; + + //We use buckets of edges (32 edges per bucket for now, see exact macro in bfs_kernels). 
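// --- Editorial sketch, not part of this diff -----------------------------------
// Host analog of the bucket bookkeeping described in the surrounding comment and
// built later by compute_bucket_offsets: the frontier's edges are cut into
// fixed-size buckets and, for each bucket start, we record the largest frontier
// index k whose exclusive degree sum is <= that edge slot, so the per-thread
// binary search in the expand kernel only has to cover one bucket.
// Worked example: degrees {3, 1, 4} give the exclusive sum {0, 3, 4, 8}; edge
// slot 5 falls in [4, 8) and therefore belongs to frontier[2].
// The real kernel also appends a trailing bucket and clamps the last edge slot;
// that detail is omitted here. Helper name is illustrative only.
inline void bucket_offsets_reference(const int *exsum, int nf,        // exsum has nf + 1 entries
                                     int total_degree, int bucket_size,
                                     int *bucket_offsets) {
    int b = 0;
    for (int edge = 0; edge < total_degree; edge += bucket_size) {
        int k = 0;
        while (k + 1 < nf && exsum[k + 1] <= edge)
            ++k;                                 // largest k with exsum[k] <= edge
        bucket_offsets[b++] = k;                 // frontier[k] owns this bucket's first edge
    }
}
// --------------------------------------------------------------------------------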
frontier_vertex_degree_buckets_offsets[i] is the index k such as frontier[k] is the source of the first edge of the bucket + //See top down kernels for more details + rmm_result = RMM_ALLOC(&exclusive_sum_frontier_vertex_buckets_offsets, + ((nnz / TOP_DOWN_EXPAND_DIMX + 1) * NBUCKETS_PER_BLOCK + 2) * sizeof(IndexType), + stream); + rmmCheckError(rmm_result); + + //Init device-side counters + //Those counters must be/can be reset at each bfs iteration + //Keeping them adjacent in memory allow use call only one cudaMemset - launch latency is the current bottleneck + rmm_result = RMM_ALLOC(&d_counters_pad, 4 * sizeof(IndexType), stream); + rmmCheckError(rmm_result); + + d_new_frontier_cnt = &d_counters_pad[0]; + d_mu = &d_counters_pad[1]; + d_unvisited_cnt = &d_counters_pad[2]; + d_left_unvisited_cnt = &d_counters_pad[3]; + + //Lets use this int* for the next 3 lines + //Its dereferenced value is not initialized - so we dont care about what we put in it + IndexType * d_nisolated = d_new_frontier_cnt; + cudaMemsetAsync(d_nisolated, 0, sizeof(IndexType), stream); + cudaCheckError() + ; + + //Computing isolated_bmap + //Only dependent on graph - not source vertex - done once + flag_isolated_vertices(n, isolated_bmap, row_offsets, vertex_degree, d_nisolated, stream); + cudaMemcpyAsync(&nisolated, d_nisolated, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); + cudaCheckError() + ; + + //We need nisolated to be ready to use + cudaStreamSynchronize(stream); + cudaCheckError() + ; + + return NVGRAPH_OK; + } + + template + NVGRAPH_ERROR Bfs::configure( IndexType *_distances, + IndexType *_predecessors, + int *_edge_mask) + { + distances = _distances; + predecessors = _predecessors; + edge_mask = _edge_mask; + + useEdgeMask = (edge_mask != NULL); + computeDistances = (distances != NULL); + computePredecessors = (predecessors != NULL); + + //We need distances to use bottom up + if (directed && !computeDistances) { + auto rmm_result = RMM_ALLOC(&distances, n * sizeof(IndexType), stream); + rmmCheckError(rmm_result); + } + + return NVGRAPH_OK; + } + + template + NVGRAPH_ERROR Bfs::traverse(IndexType source_vertex) { + + //Init visited_bmap + //If the graph is undirected, we not that + //we will never discover isolated vertices (in degree = out degree = 0) + //we avoid a lot of work by flagging them now + //in g500 graphs they represent ~25% of total vertices + //more than that for wiki and twitter graphs + + if (directed) { + cudaMemsetAsync(visited_bmap, 0, vertices_bmap_size * sizeof(int), stream); + } else { + cudaMemcpyAsync( visited_bmap, + isolated_bmap, + vertices_bmap_size * sizeof(int), + cudaMemcpyDeviceToDevice, + stream); + } + cudaCheckError() + ; + + //If needed, setting all vertices as undiscovered (inf distance) + //We dont use computeDistances here + //if the graph is undirected, we may need distances even if + //computeDistances is false + if (distances) + fill_vec(distances, n, vec_t::max, stream); + + //If needed, setting all predecessors to non-existent (-1) + if (computePredecessors) + { + cudaMemsetAsync(predecessors, -1, n * sizeof(IndexType), stream); + cudaCheckError() + ; + } + + // + //Initial frontier + // + + frontier = original_frontier; + + if (distances) + { + cudaMemsetAsync(&distances[source_vertex], 0, sizeof(IndexType), stream); + cudaCheckError() + ; + } + + //Setting source_vertex as visited + //There may be bit already set on that bmap (isolated vertices) - if the graph is undirected + int current_visited_bmap_source_vert = 0; + + if (!directed) { + 
cudaMemcpyAsync(¤t_visited_bmap_source_vert, + &visited_bmap[source_vertex / INT_SIZE], + sizeof(int), + cudaMemcpyDeviceToHost); + cudaCheckError() + ; + //We need current_visited_bmap_source_vert + cudaStreamSynchronize(stream); + cudaCheckError() + ; + //We could detect that source is isolated here + } + + int m = (1 << (source_vertex % INT_SIZE)); + + //In that case, source is isolated, done now + if (!directed && (m & current_visited_bmap_source_vert)) { + //Init distances and predecessors are done, (cf Streamsync in previous if) + cudaCheckError() + ; + return NVGRAPH_OK; + } + + m |= current_visited_bmap_source_vert; + + cudaMemcpyAsync( &visited_bmap[source_vertex / INT_SIZE], + &m, + sizeof(int), + cudaMemcpyHostToDevice, + stream); + cudaCheckError() + ; + + //Adding source_vertex to init frontier + cudaMemcpyAsync( &frontier[0], + &source_vertex, + sizeof(IndexType), + cudaMemcpyHostToDevice, + stream); + cudaCheckError() + ; + + //mf : edges in frontier + //nf : vertices in frontier + //mu : edges undiscovered + //nu : nodes undiscovered + //lvl : current frontier's depth + IndexType mf, nf, mu, nu; + bool growing; + IndexType lvl = 1; + + //Frontier has one vertex + nf = 1; + + //all edges are undiscovered (by def isolated vertices have 0 edges) + mu = nnz; + + //all non isolated vertices are undiscovered (excepted source vertex, which is in frontier) + //That number is wrong if source_vertex is also isolated - but it's not important + nu = n - nisolated - nf; + + //Last frontier was 0, now it is 1 + growing = true; + + IndexType size_last_left_unvisited_queue = n; //we just need value > 0 + IndexType size_last_unvisited_queue = 0; //queue empty + + //Typical pre-top down workflow. set_frontier_degree + exclusive-scan + set_frontier_degree(frontier_vertex_degree, frontier, vertex_degree, nf, stream); + exclusive_sum( d_cub_exclusive_sum_storage, + cub_exclusive_sum_storage_bytes, + frontier_vertex_degree, + exclusive_sum_frontier_vertex_degree, + nf + 1, + stream); + + cudaMemcpyAsync( &mf, + &exclusive_sum_frontier_vertex_degree[nf], + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + ; + + //We need mf + cudaStreamSynchronize(stream); + cudaCheckError() + ; + + //At first we know we have to use top down + BFS_ALGO_STATE algo_state = TOPDOWN; + + //useDistances : we check if a vertex is a parent using distances in bottom up - distances become working data + //undirected g : need parents to be in children's neighbors + bool can_use_bottom_up = !directed && distances; + + while (nf > 0) { + //Each vertices can appear only once in the frontierer array - we know it will fit + new_frontier = frontier + nf; + IndexType old_nf = nf; + resetDevicePointers(); + + if (can_use_bottom_up) { + //Choosing algo + //Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf + + switch (algo_state) { + case TOPDOWN: + if (mf > mu / alpha) + algo_state = BOTTOMUP; + break; + case BOTTOMUP: + if (!growing && nf < n / beta) { + + //We need to prepare the switch back to top down + //We couldnt keep track of mu during bottom up - because we dont know what mf is. Computing mu here + count_unvisited_edges( unvisited_queue, + size_last_unvisited_queue, + visited_bmap, + vertex_degree, + d_mu, + stream); + + //Typical pre-top down workflow. 
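// --- Editorial sketch, not part of this diff -----------------------------------
// Condensed form of the TOPDOWN/BOTTOMUP decision made just above (the
// direction-optimizing heuristic from the paper linked in the comment).
// mf = edges in the frontier, mu = edges still undiscovered, nf = frontier size,
// n = number of vertices, growing = the frontier grew last level; alpha and beta
// are the tuning parameters of this class, defined elsewhere. Helper name is
// illustrative only.
template <typename IndexType>
BFS_ALGO_STATE next_direction(BFS_ALGO_STATE cur,
                              IndexType mf, IndexType mu,
                              IndexType nf, IndexType n,
                              bool growing,
                              IndexType alpha, IndexType beta) {
    if (cur == TOPDOWN && mf > mu / alpha)
        return BOTTOMUP;   // frontier touches too many edges: look for parents instead
    if (cur == BOTTOMUP && !growing && nf < n / beta)
        return TOPDOWN;    // frontier is small again: expand it directly
    return cur;            // otherwise keep the current direction
}
// --------------------------------------------------------------------------------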
set_frontier_degree + exclusive-scan + set_frontier_degree(frontier_vertex_degree, frontier, vertex_degree, nf, stream); + exclusive_sum( d_cub_exclusive_sum_storage, + cub_exclusive_sum_storage_bytes, + frontier_vertex_degree, + exclusive_sum_frontier_vertex_degree, + nf + 1, + stream); + + cudaMemcpyAsync( &mf, + &exclusive_sum_frontier_vertex_degree[nf], + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + ; + + cudaMemcpyAsync(&mu, d_mu, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); + cudaCheckError() + ; + + //We will need mf and mu + cudaStreamSynchronize(stream); + cudaCheckError() + ; + + algo_state = TOPDOWN; + } + break; + } + } + + //Executing algo + + switch (algo_state) { + case TOPDOWN: + compute_bucket_offsets( exclusive_sum_frontier_vertex_degree, + exclusive_sum_frontier_vertex_buckets_offsets, + nf, + mf, + stream); + frontier_expand( row_offsets, + col_indices, + frontier, + nf, + mf, + lvl, + new_frontier, + d_new_frontier_cnt, + exclusive_sum_frontier_vertex_degree, + exclusive_sum_frontier_vertex_buckets_offsets, + visited_bmap, + distances, + predecessors, + edge_mask, + isolated_bmap, + directed, + stream, + deterministic); + + mu -= mf; + + cudaMemcpyAsync( &nf, + d_new_frontier_cnt, + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError(); + + //We need nf + cudaStreamSynchronize(stream); + cudaCheckError(); + + if (nf) { + + //Typical pre-top down workflow. set_frontier_degree + exclusive-scan + set_frontier_degree(frontier_vertex_degree, new_frontier, vertex_degree, nf, stream); + exclusive_sum( d_cub_exclusive_sum_storage, + cub_exclusive_sum_storage_bytes, + frontier_vertex_degree, + exclusive_sum_frontier_vertex_degree, + nf + 1, + stream); + cudaMemcpyAsync( &mf, + &exclusive_sum_frontier_vertex_degree[nf], + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + ; + + //We need mf + cudaStreamSynchronize(stream); + cudaCheckError() + ; + } + break; + + case BOTTOMUP: + fill_unvisited_queue(visited_bmap, + vertices_bmap_size, + n, + unvisited_queue, + d_unvisited_cnt, + stream, + deterministic); + + size_last_unvisited_queue = nu; + + bottom_up_main(unvisited_queue, + size_last_unvisited_queue, + left_unvisited_queue, + d_left_unvisited_cnt, + visited_bmap, + row_offsets, + col_indices, + lvl, + new_frontier, + d_new_frontier_cnt, + distances, + predecessors, + edge_mask, + stream, + deterministic); + + //The number of vertices left unvisited decreases + //If it wasnt necessary last time, it wont be this time + if (size_last_left_unvisited_queue) { + cudaMemcpyAsync( &size_last_left_unvisited_queue, + d_left_unvisited_cnt, + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + ; + //We need last_left_unvisited_size + cudaStreamSynchronize(stream); + cudaCheckError() + ; + bottom_up_large( left_unvisited_queue, + size_last_left_unvisited_queue, + visited_bmap, + row_offsets, + col_indices, + lvl, + new_frontier, + d_new_frontier_cnt, + distances, + predecessors, + edge_mask, + stream, + deterministic); + } + cudaMemcpyAsync( &nf, + d_new_frontier_cnt, + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + ; + + //We will need nf + cudaStreamSynchronize(stream); + cudaCheckError() + ; + + break; + } + + //Updating undiscovered edges count + nu -= nf; + + //Using new frontier + frontier = new_frontier; + growing = (nf > old_nf); + + ++lvl; + } + + cudaCheckError() + ; + return NVGRAPH_OK; + } + + //Just used for benchmarks now + template + 
NVGRAPH_ERROR Bfs::traverse(IndexType *source_vertices, IndexType nsources) { + for (IndexType i = 0; i < nsources; ++i) + traverse(source_vertices[i]); + + return NVGRAPH_OK; + } + + template + void Bfs::resetDevicePointers() { + cudaMemsetAsync(d_counters_pad, 0, 4 * sizeof(IndexType), stream); + cudaCheckError() + ; + } + + template + void Bfs::clean() { + cudaCheckError() + ; + + //the vectors have a destructor that takes care of cleaning + RMM_FREE(original_frontier, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(visited_bmap, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(isolated_bmap, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(vertex_degree, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(d_cub_exclusive_sum_storage, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(buffer_np1_1, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(buffer_np1_2, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(exclusive_sum_frontier_vertex_buckets_offsets, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + RMM_FREE(d_counters_pad, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + + //In that case, distances is a working data + if (directed && !computeDistances) + RMM_FREE(distances, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. 
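// --- Editorial sketch, not part of this diff -----------------------------------
// The pattern this PR applies throughout setup()/configure()/clean(): cudaMalloc
// and cudaFree are replaced by the stream-ordered RMM macros and the returned
// status is checked. Only calls already used above (RMM_ALLOC, RMM_FREE,
// rmmCheckError) are assumed; the helper names below are illustrative only.
template <typename T>
NVGRAPH_ERROR rmm_alloc_checked(T **ptr, size_t nelems, cudaStream_t stream) {
    auto rmm_result = RMM_ALLOC(ptr, nelems * sizeof(T), stream);  // pool-backed allocation
    rmmCheckError(rmm_result);                                     // same check as in setup() above
    return NVGRAPH_OK;
}

template <typename T>
void rmm_free_unchecked(T *ptr, cudaStream_t stream) {
    RMM_FREE(ptr, stream);  // error handling deferred here, as noted in clean() above
}
// --------------------------------------------------------------------------------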
+ + cudaCheckError() + ; + } + + template class Bfs ; } // end namespace nvgraph diff --git a/cpp/nvgraph/cpp/src/bfs_kernels.cu b/cpp/nvgraph/cpp/src/bfs_kernels.cu index 594e2b980ca..7024036def5 100644 --- a/cpp/nvgraph/cpp/src/bfs_kernels.cu +++ b/cpp/nvgraph/cpp/src/bfs_kernels.cu @@ -18,6 +18,9 @@ #include #include + +#include + #include #define MAXBLOCKS 65535 @@ -85,1496 +88,1497 @@ using namespace nvgraph; namespace bfs_kernels { - // - // gives the equivalent vectors from a type - // for the max val, would be better to use numeric_limits<>::max() once - // cpp11 is allowed in nvgraph - // - - template - struct vec_t { - typedef int4 vec4; - typedef int2 vec2; - }; - - template<> - struct vec_t { - typedef int4 vec4; - typedef int2 vec2; - static const int max = INT_MAX; - }; - - template<> - struct vec_t { - typedef longlong4 vec4; - typedef longlong2 vec2; - static const long long int max = LLONG_MAX; - }; - - // - // ------------------------- Helper device functions ------------------- - // - - __forceinline__ __device__ int getMaskNRightmostBitSet(int n) { - if (n == INT_SIZE) - return (~0); - int mask = (1 << n) - 1; - return mask; - } - - __forceinline__ __device__ int getMaskNLeftmostBitSet(int n) { - if (n == 0) - return 0; - int mask = ~((1 << (INT_SIZE - n)) - 1); - return mask; - } - - __forceinline__ __device__ int getNextZeroBit(int& val) { - int ibit = __ffs(~val) - 1; - val |= (1 << ibit); - - return ibit; - } - - struct BitwiseAnd - { - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return (a & b); - } - }; - - struct BitwiseOr - { - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return (a | b); - } - }; - - template - __device__ IndexType binsearch_maxle( const IndexType *vec, - const IndexType val, - IndexType low, - IndexType high) { - while (true) { - if (low == high) - return low; //we know it exists - if ((low + 1) == high) - return (vec[high] <= val) ? 
high : low; - - IndexType mid = low + (high - low) / 2; - - if (vec[mid] > val) - high = mid - 1; - else - low = mid; - - } - } - - // - // ------------------------- Bottom up ------------------------- - // - - // - // fill_unvisited_queue_kernel - // - // Finding unvisited vertices in the visited_bmap, and putting them in the queue - // Vertices represented by the same int in the bitmap are adjacent in the queue, and sorted - // For instance, the queue can look like this : - // 34 38 45 58 61 4 18 24 29 71 84 85 90 - // Because they are represented by those ints in the bitmap : - // [34 38 45 58 61] [4 18 24 29] [71 84 85 90] - - //visited_bmap_nints = the visited_bmap is made of that number of ints - - template - __global__ void fill_unvisited_queue_kernel( int *visited_bmap, - IndexType visited_bmap_nints, - IndexType n, - IndexType *unvisited, - IndexType *unvisited_cnt) { - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage scan_temp_storage; - - //When filling the "unvisited" queue, we use "unvisited_cnt" to know where to write in the queue (equivalent of int off = atomicAddd(unvisited_cnt, 1) ) - //We will actually do only one atomicAdd per block - we first do a scan, then call one atomicAdd, and store the common offset for the block in - //unvisited_common_block_offset - __shared__ IndexType unvisited_common_block_offset; - - //We don't want threads divergence in the loop (we're going to call __syncthreads) - //Using a block-only dependent in the condition of the loop - for (IndexType block_v_idx = blockIdx.x * blockDim.x; - block_v_idx < visited_bmap_nints; - block_v_idx += blockDim.x * gridDim.x) { - - //Index of visited_bmap that this thread will compute - IndexType v_idx = block_v_idx + threadIdx.x; - - int thread_visited_int = (v_idx < visited_bmap_nints) - ? 
visited_bmap[v_idx] - : - (~0); //will be neutral in the next lines (virtual vertices all visited) - - //The last int can only be partially valid - //If we are indeed taking care of the last visited int in this thread, - //We need to first disable (ie set as "visited") the inactive bits (vertices >= n) - if (v_idx == (visited_bmap_nints - 1)) { - int active_bits = n - (INT_SIZE * v_idx); - int inactive_bits = INT_SIZE - active_bits; - int mask = getMaskNLeftmostBitSet(inactive_bits); - thread_visited_int |= mask; //Setting inactive bits as visited - } - - //Counting number of unvisited vertices represented by this int - int n_unvisited_in_int = __popc(~thread_visited_int); - int unvisited_thread_offset; - - //We will need to write n_unvisited_in_int unvisited vertices to the unvisited queue - //We ask for that space when computing the block scan, that will tell where to write those - //vertices in the queue, using the common offset of the block (see below) - BlockScan(scan_temp_storage).ExclusiveSum(n_unvisited_in_int, unvisited_thread_offset); - - //Last thread knows how many vertices will be written to the queue by this block - //Asking for that space in the queue using the global count, and saving the common offset - if (threadIdx.x == (FILL_UNVISITED_QUEUE_DIMX - 1)) { - IndexType total = unvisited_thread_offset + n_unvisited_in_int; - unvisited_common_block_offset = atomicAdd(unvisited_cnt, total); - } - - //syncthreads for two reasons : - // - we need to broadcast unvisited_common_block_offset - // - we will reuse scan_temp_storage (cf CUB doc) - __syncthreads(); - - IndexType current_unvisited_index = unvisited_common_block_offset - + unvisited_thread_offset; - int nvertices_to_write = n_unvisited_in_int; - - // getNextZeroBit uses __ffs, which gives least significant bit set - // which means that as long as n_unvisited_in_int is valid, - // we will use valid bits - - while (nvertices_to_write > 0) { - if (nvertices_to_write >= 4 && (current_unvisited_index % 4) == 0) { - typename vec_t::vec4 vec_v; - - vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - vec_v.z = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - vec_v.w = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - - typename vec_t::vec4 *unvisited_i4 = reinterpret_cast::vec4*>(&unvisited[current_unvisited_index]); - *unvisited_i4 = vec_v; - - current_unvisited_index += 4; - nvertices_to_write -= 4; - } - else if (nvertices_to_write >= 2 && (current_unvisited_index % 2) == 0) { - typename vec_t::vec2 vec_v; - - vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - - typename vec_t::vec2 *unvisited_i2 = reinterpret_cast::vec2*>(&unvisited[current_unvisited_index]); - *unvisited_i2 = vec_v; - - current_unvisited_index += 2; - nvertices_to_write -= 2; - } else { - IndexType v = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - - unvisited[current_unvisited_index] = v; - - current_unvisited_index += 1; - nvertices_to_write -= 1; - } - - } - } - } - - //Wrapper - template - void fill_unvisited_queue( int *visited_bmap, - IndexType visited_bmap_nints, - IndexType n, - IndexType *unvisited, - IndexType *unvisited_cnt, - cudaStream_t m_stream, - bool deterministic) { - dim3 grid, block; - block.x = FILL_UNVISITED_QUEUE_DIMX; - - grid.x = min((IndexType) MAXBLOCKS, (visited_bmap_nints + block.x - 1) / block.x); - - fill_unvisited_queue_kernel<<>>( 
visited_bmap, - visited_bmap_nints, - n, - unvisited, - unvisited_cnt); - cudaCheckError() - ; - } - - // - // count_unvisited_edges_kernel - // Couting the total number of unvisited edges in the graph - using an potentially unvisited queue - // We need the current unvisited vertices to be in the unvisited queue - // But visited vertices can be in the potentially_unvisited queue - // We first check if the vertex is still unvisited before using it - // Useful when switching from "Bottom up" to "Top down" - // - - template - __global__ void count_unvisited_edges_kernel(const IndexType *potentially_unvisited, - const IndexType potentially_unvisited_size, - const int *visited_bmap, - IndexType *degree_vertices, - IndexType *mu) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage reduce_temp_storage; - - //number of undiscovered edges counted by this thread - IndexType thread_unvisited_edges_count = 0; - - for (IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < potentially_unvisited_size; - idx += blockDim.x * gridDim.x) { - - IndexType u = potentially_unvisited[idx]; - int u_visited_bmap = visited_bmap[u / INT_SIZE]; - int is_visited = u_visited_bmap & (1 << (u % INT_SIZE)); - - if (!is_visited) - thread_unvisited_edges_count += degree_vertices[u]; - - } - - //We need all thread_unvisited_edges_count to be ready before reducing - __syncthreads(); - - IndexType block_unvisited_edges_count = - BlockReduce(reduce_temp_storage).Sum(thread_unvisited_edges_count); - - //block_unvisited_edges_count is only defined is th.x == 0 - if (threadIdx.x == 0) - atomicAdd(mu, block_unvisited_edges_count); - } - - //Wrapper - template - void count_unvisited_edges(const IndexType *potentially_unvisited, - const IndexType potentially_unvisited_size, - const int *visited_bmap, - IndexType *node_degree, - IndexType *mu, - cudaStream_t m_stream) { - dim3 grid, block; - block.x = COUNT_UNVISITED_EDGES_DIMX; - grid.x = min((IndexType) MAXBLOCKS, (potentially_unvisited_size + block.x - 1) / block.x); - - count_unvisited_edges_kernel<<>>( potentially_unvisited, - potentially_unvisited_size, - visited_bmap, - node_degree, - mu); - cudaCheckError() - ; - } - - // - // Main Bottom Up kernel - // Here we will start to process unvisited vertices in the unvisited queue - // We will only consider the first MAIN_BOTTOMUP_MAX_EDGES edges - // If it's not possible to define a valid parent using only those edges, - // add it to the "left_unvisited_queue" - // - - // - // We will use the "vertices represented by the same int in the visited bmap are adjacents and sorted in the unvisited queue" property - // It is used to do a reduction locally and fully build the new visited_bmap - // - - template - __global__ void main_bottomup_kernel( const IndexType *unvisited, - const IndexType unvisited_size, - IndexType *left_unvisited, - IndexType *left_unvisited_cnt, - int *visited_bmap, - const IndexType *row_ptr, - const IndexType *col_ind, - IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - IndexType *distances, - IndexType *predecessors, - int *edge_mask) { - typedef cub::BlockDiscontinuity BlockDiscontinuity; - typedef cub::WarpReduce WarpReduce; - typedef cub::BlockScan BlockScan; - - __shared__ typename BlockDiscontinuity::TempStorage discontinuity_temp_storage; - __shared__ typename WarpReduce::TempStorage reduce_temp_storage; - __shared__ typename BlockScan::TempStorage scan_temp_storage; - - //To write vertices in the frontier, - //We will use a block scan to 
locally compute the offsets - //frontier_common_block_offset contains the common offset for the block - __shared__ IndexType frontier_common_block_offset; - - // When building the new visited_bmap, we reduce (using a bitwise and) the visited_bmap ints - // from the vertices represented by the same int (for instance vertices 1, 5, 9, 13, 23) - // vertices represented by the same int will be designed as part of the same "group" - // To detect the deliminations between those groups, we use BlockDiscontinuity - // Then we need to create the new "visited_bmap" within those group. - // We use a warp reduction that takes into account limits between groups to do it - // But a group can be cut in two different warps : in that case, the second warp - // put the result of its local reduction in local_visited_bmap_warp_head - // the first warp will then read it and finish the reduction - - __shared__ int local_visited_bmap_warp_head[MAIN_BOTTOMUP_NWARPS]; - - const int warpid = threadIdx.x / WARP_SIZE; - const int laneid = threadIdx.x % WARP_SIZE; - - // we will call __syncthreads inside the loop - // we need to keep complete block active - for (IndexType block_off = blockIdx.x * blockDim.x; - block_off < unvisited_size; - block_off += blockDim.x * gridDim.x) - { - IndexType idx = block_off + threadIdx.x; - - // This thread will take care of unvisited_vertex - // in the visited_bmap, it is represented by the int at index - // visited_bmap_index = unvisited_vertex/INT_SIZE - // it will be used by BlockDiscontinuity - // to flag the separation between groups of vertices (vertices represented by different in in visited_bmap) - IndexType visited_bmap_index[1]; //this is an array of size 1 because CUB needs one - visited_bmap_index[0] = -1; - IndexType unvisited_vertex = -1; - - // local_visited_bmap gives info on the visited bit of unvisited_vertex - // - // By default, everything is visited - // This is because we only take care of unvisited vertices here, - // The other are by default unvisited - // If a vertex remain unvisited, we will notice it here - // That's why by default we consider everything visited ( ie ~0 ) - // If we fail to assign one parent to an unvisited vertex, we will - // explicitly unset the bit - int local_visited_bmap = (~0); - int found = 0; - int more_to_visit = 0; - IndexType valid_parent; - IndexType left_unvisited_off; - - if (idx < unvisited_size) - { - //Processing first STPV edges of unvisited v - //If bigger than that, push to left_unvisited queue - unvisited_vertex = unvisited[idx]; - - IndexType edge_begin = row_ptr[unvisited_vertex]; - IndexType edge_end = row_ptr[unvisited_vertex + 1]; - - visited_bmap_index[0] = unvisited_vertex / INT_SIZE; - - IndexType degree = edge_end - edge_begin; - - for (IndexType edge = edge_begin; - edge < min(edge_end, edge_begin + MAIN_BOTTOMUP_MAX_EDGES); ++edge) - { - if (edge_mask && !edge_mask[edge]) - continue; - - IndexType parent_candidate = col_ind[edge]; - - if (distances[parent_candidate] == (lvl - 1)) - { - found = 1; - valid_parent = parent_candidate; - break; - } - } - - // This vertex will remain unvisited at the end of this kernel - // Explicitly say it - if (!found) - local_visited_bmap &= ~(1 << (unvisited_vertex % INT_SIZE)); //let this one unvisited - else - { - if (distances) - distances[unvisited_vertex] = lvl; - if (predecessors) - predecessors[unvisited_vertex] = valid_parent; - } - - //If we haven't found a parent and there's more edge to check - if (!found && degree > MAIN_BOTTOMUP_MAX_EDGES) - { - 
left_unvisited_off = atomicAdd(left_unvisited_cnt, (IndexType) 1); //TODO scan - more_to_visit = 1; - } - - } - - // - // We will separate vertices in group - // Two vertices are in the same group if represented by same int in visited_bmap - // ie u and v in same group <=> u/32 == v/32 - // - // We will now flag the head of those group (first element of each group) - // - // 1) All vertices within the same group are adjacent in the queue (cf fill_unvisited_queue) - // 2) A group is of size <= 32, so a warp will contain at least one head, and a group will be contained - // at most by two warps - - int is_head_a[1]; //CUB need an array - BlockDiscontinuity(discontinuity_temp_storage).FlagHeads(is_head_a, - visited_bmap_index, - cub::Inequality()); - int is_head = is_head_a[0]; - - // Computing the warp reduce within group - // This primitive uses the is_head flags to know where the limits of the groups are - // We use bitwise and as operator, because of the fact that 1 is the default value - // If a vertex is unvisited, we have to explicitly ask for it - int local_bmap_agg = - WarpReduce(reduce_temp_storage).HeadSegmentedReduce( local_visited_bmap, - is_head, - BitwiseAnd()); - - // We need to take care of the groups cut in two in two different warps - // Saving second part of the reduce here, then applying it on the first part bellow - // Corner case : if the first thread of the warp is a head, then this group is not cut in two - // and then we have to be neutral (for an bitwise and, it's an ~0) - if (laneid == 0) - { - local_visited_bmap_warp_head[warpid] = (is_head) ? (~0) : local_bmap_agg; - } - - //broadcasting local_visited_bmap_warp_head - __syncthreads(); - - int head_ballot = nvgraph::utils::ballot(is_head); - - //As long as idx < unvisited_size, we know there's at least one head per warp - int laneid_last_head_in_warp = INT_SIZE - 1 - __clz(head_ballot); - - int is_last_head_in_warp = (laneid == laneid_last_head_in_warp); - - // if laneid == 0 && is_last_head_in_warp, it's a special case where - // a group of size 32 starts exactly at lane 0 - // in that case, nothing to do (this group is not cut by a warp delimitation) - // we also have to make sure that a warp actually exists after this one (this corner case is handled after) - if (laneid != 0 && is_last_head_in_warp & (warpid + 1) < MAIN_BOTTOMUP_NWARPS) - { - local_bmap_agg &= local_visited_bmap_warp_head[warpid + 1]; - } - - //Three cases : - // -> This is the first group of the block - it may be cut in two (with previous block) - // -> This is the last group of the block - same thing - // -> This group is completely contained in this block - - if (warpid == 0 && laneid == 0) - { - //The first elt of this group considered in this block is unvisited_vertex - //We know that's the case because elts are sorted in a group, and we are at laneid == 0 - //We will do an atomicOr - we have to be neutral about elts < unvisited_vertex - int iv = unvisited_vertex % INT_SIZE; // we know that this unvisited_vertex is valid - int mask = getMaskNLeftmostBitSet(INT_SIZE - iv); - local_bmap_agg &= mask; //we have to be neutral for elts < unvisited_vertex - atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); - } - else if (warpid == (MAIN_BOTTOMUP_NWARPS - 1) && - laneid >= laneid_last_head_in_warp && // We need the other ones to go in else case - idx < unvisited_size //we could be out - ) - { - //Last head of the block - //We don't know if this group is complete - - //last_v is the last unvisited_vertex of the group IN THIS 
block - //we dont know about the rest - we have to be neutral about elts > last_v - - //the destination thread of the __shfl is active - int laneid_max = min((IndexType) (WARP_SIZE - 1), - (unvisited_size - (block_off + 32 * warpid))); - IndexType last_v = nvgraph::utils::shfl( unvisited_vertex, - laneid_max, - WARP_SIZE, - __activemask()); - - if (is_last_head_in_warp) - { - int ilast_v = last_v % INT_SIZE + 1; - int mask = getMaskNRightmostBitSet(ilast_v); - local_bmap_agg &= mask; //we have to be neutral for elts > last_unvisited_vertex - atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); - } - } - else - { - //group completely in block - if (is_head && idx < unvisited_size) { - visited_bmap[unvisited_vertex / INT_SIZE] = local_bmap_agg; //no atomics needed, we know everything about this int - } - } - - //Saving in frontier - - int thread_frontier_offset; - BlockScan(scan_temp_storage).ExclusiveSum(found, thread_frontier_offset); - IndexType inclusive_sum = thread_frontier_offset + found; - if (threadIdx.x == (MAIN_BOTTOMUP_DIMX - 1) && inclusive_sum) - { - frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); - } - - //1) Broadcasting frontier_common_block_offset - //2) we want to reuse the *_temp_storage - __syncthreads(); - - if (found) - new_frontier[frontier_common_block_offset + thread_frontier_offset] = unvisited_vertex; - if (more_to_visit) - left_unvisited[left_unvisited_off] = unvisited_vertex; - - } - } - - template - void bottom_up_main( IndexType *unvisited, - IndexType unvisited_size, - IndexType *left_unvisited, - IndexType *d_left_unvisited_idx, - int *visited, - const IndexType *row_ptr, - const IndexType *col_ind, - IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_idx, - IndexType *distances, - IndexType *predecessors, - int *edge_mask, - cudaStream_t m_stream, - bool deterministic) { - dim3 grid, block; - block.x = MAIN_BOTTOMUP_DIMX; - - grid.x = min((IndexType) MAXBLOCKS, ((unvisited_size + block.x - 1)) / block.x); - - main_bottomup_kernel<<>>(unvisited, - unvisited_size, - left_unvisited, - d_left_unvisited_idx, - visited, - row_ptr, - col_ind, - lvl, - new_frontier, - new_frontier_idx, - distances, - predecessors, - edge_mask); - cudaCheckError() - ; - } - - // - // bottom_up_large_degree_kernel - // finishing the work started in main_bottomup_kernel for vertex with degree > MAIN_BOTTOMUP_MAX_EDGES && no parent found - // - template - __global__ void bottom_up_large_degree_kernel( IndexType *left_unvisited, - IndexType left_unvisited_size, - int *visited, - const IndexType *row_ptr, - const IndexType *col_ind, - IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - IndexType *distances, - IndexType *predecessors, - int *edge_mask) { - - int logical_lane_id = threadIdx.x % BOTTOM_UP_LOGICAL_WARP_SIZE; - int logical_warp_id = threadIdx.x / BOTTOM_UP_LOGICAL_WARP_SIZE; - int logical_warps_per_block = blockDim.x / BOTTOM_UP_LOGICAL_WARP_SIZE; - - //Inactive threads are not a pb for __ballot (known behaviour) - for (IndexType idx = logical_warps_per_block * blockIdx.x + logical_warp_id; - idx < left_unvisited_size; - idx += gridDim.x * logical_warps_per_block) { - - //Unvisited vertices - potentially in the next frontier - IndexType v = left_unvisited[idx]; - - //Used only with symmetric graphs - //Parents are included in v's neighbors - IndexType first_i_edge = row_ptr[v] + MAIN_BOTTOMUP_MAX_EDGES; //we already have checked the first MAIN_BOTTOMUP_MAX_EDGES edges in find_unvisited - - 
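// --- Editorial sketch, not part of this diff -----------------------------------
// Host reference of one bottom-up level (what main_bottomup_kernel and this
// large-degree kernel compute together): every still-unvisited vertex scans its
// own adjacency list for any neighbor discovered at the previous level and adopts
// it as parent. The MAIN_BOTTOMUP_MAX_EDGES split handled here is only a
// load-balancing detail and is omitted; -1 stands in for "undiscovered".
inline void bottom_up_level_reference(const int *row_ptr, const int *col_ind, int n,
                                      int lvl, int *distances, int *predecessors,
                                      int *new_frontier, int *new_frontier_cnt) {
    for (int v = 0; v < n; ++v) {
        if (distances[v] != -1) continue;                 // already discovered
        for (int e = row_ptr[v]; e < row_ptr[v + 1]; ++e) {
            int u = col_ind[e];
            if (distances[u] == lvl - 1) {                // u was reached at the previous level
                distances[v] = lvl;
                predecessors[v] = u;
                new_frontier[(*new_frontier_cnt)++] = v;  // the kernels do this with atomicAdd
                break;                                    // one valid parent is enough
            }
        }
    }
}
// --------------------------------------------------------------------------------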
IndexType end_i_edge = row_ptr[v + 1]; - - //We can have warp divergence in the next loop - //It's not a pb because the behaviour of __ballot - //is know with inactive threads - for (IndexType i_edge = first_i_edge + logical_lane_id; - i_edge < end_i_edge; - i_edge += BOTTOM_UP_LOGICAL_WARP_SIZE) { - - IndexType valid_parent = -1; - - if (!edge_mask || edge_mask[i_edge]) { - IndexType u = col_ind[i_edge]; - IndexType lvl_u = distances[u]; - - if (lvl_u == (lvl - 1)) { - valid_parent = u; - } - } - - unsigned int warp_valid_p_ballot = nvgraph::utils::ballot((valid_parent != -1)); - - int logical_warp_id_in_warp = (threadIdx.x % WARP_SIZE) / BOTTOM_UP_LOGICAL_WARP_SIZE; - unsigned int mask = (1 << BOTTOM_UP_LOGICAL_WARP_SIZE) - 1; - unsigned int logical_warp_valid_p_ballot = warp_valid_p_ballot - >> (BOTTOM_UP_LOGICAL_WARP_SIZE * logical_warp_id_in_warp); - logical_warp_valid_p_ballot &= mask; - - int chosen_thread = __ffs(logical_warp_valid_p_ballot) - 1; - - if (chosen_thread == logical_lane_id) { - //Using only one valid parent (reduce bw) - IndexType off = atomicAdd(new_frontier_cnt, (IndexType) 1); - int m = 1 << (v % INT_SIZE); - atomicOr(&visited[v / INT_SIZE], m); - distances[v] = lvl; - - if (predecessors) - predecessors[v] = valid_parent; - - new_frontier[off] = v; - } - - if (logical_warp_valid_p_ballot) { - break; - } - } - - } - } - - template - void bottom_up_large(IndexType *left_unvisited, - IndexType left_unvisited_size, - int *visited, - const IndexType *row_ptr, - const IndexType *col_ind, - IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_idx, - IndexType *distances, - IndexType *predecessors, - int *edge_mask, - cudaStream_t m_stream, - bool deterministic) { - dim3 grid, block; - block.x = LARGE_BOTTOMUP_DIMX; - grid.x = min( (IndexType) MAXBLOCKS, - ((left_unvisited_size + block.x - 1) * BOTTOM_UP_LOGICAL_WARP_SIZE) / block.x); - - bottom_up_large_degree_kernel<<>>(left_unvisited, - left_unvisited_size, - visited, - row_ptr, - col_ind, - lvl, - new_frontier, - new_frontier_idx, - distances, - predecessors, - edge_mask); - cudaCheckError() - ; - } - - // - // - // ------------------------------ Top down ------------------------------ - // - // - - // - // compute_bucket_offsets_kernel - // simply compute the position in the frontier corresponding all valid edges with index=TOP_DOWN_BUCKET_SIZE * k, k integer - // - - template - __global__ void compute_bucket_offsets_kernel( const IndexType *frontier_degrees_exclusive_sum, - IndexType *bucket_offsets, - const IndexType frontier_size, - IndexType total_degree) { - IndexType end = ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX - * NBUCKETS_PER_BLOCK + 1); - - for (IndexType bid = blockIdx.x * blockDim.x + threadIdx.x; - bid <= end; - bid += gridDim.x * blockDim.x) { - - IndexType eid = min(bid * TOP_DOWN_BUCKET_SIZE, total_degree - 1); - - bucket_offsets[bid] = binsearch_maxle( frontier_degrees_exclusive_sum, - eid, - (IndexType) 0, - frontier_size - 1); - - } - } - - template - void compute_bucket_offsets( IndexType *cumul, - IndexType *bucket_offsets, - IndexType frontier_size, - IndexType total_degree, - cudaStream_t m_stream) { - dim3 grid, block; - block.x = COMPUTE_BUCKET_OFFSETS_DIMX; - - grid.x = min( (IndexType) MAXBLOCKS, - ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX - * NBUCKETS_PER_BLOCK + 1 + block.x - 1) / block.x); - - compute_bucket_offsets_kernel<<>>(cumul, - bucket_offsets, - frontier_size, - total_degree); - cudaCheckError() - ; - } - - // - // 
topdown_expand_kernel - // Read current frontier and compute new one with top down paradigm - // One thread = One edge - // To know origin of edge, we have to find where is index_edge in the values of frontier_degrees_exclusive_sum (using a binary search, max less or equal than) - // This index k will give us the origin of this edge, which is frontier[k] - // This thread will then process the (linear_idx_thread - frontier_degrees_exclusive_sum[k])-ith edge of vertex frontier[k] - // - // To process blockDim.x = TOP_DOWN_EXPAND_DIMX edges, we need to first load NBUCKETS_PER_BLOCK bucket offsets - those will help us do the binary searches - // We can load up to TOP_DOWN_EXPAND_DIMX of those bucket offsets - that way we prepare for the next MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x edges - // - // Once we have those offsets, we may still need a few values from frontier_degrees_exclusive_sum to compute exact index k - // To be able to do it, we will load the values that we need from frontier_degrees_exclusive_sum in shared memory - // We know that it will fit because we never add node with degree == 0 in the frontier, so we have an upper bound on the number of value to load (see below) - // - // We will then look which vertices are not visited yet : - // 1) if the unvisited vertex is isolated (=> degree == 0), we mark it as visited, update distances and predecessors, and move on - // 2) if the unvisited vertex has degree > 0, we add it to the "frontier_candidates" queue - // - // We then treat the candidates queue using the threadIdx.x < ncandidates - // If we are indeed the first thread to discover that vertex (result of atomicOr(visited)) - // We add it to the new frontier - // - - template - __global__ void topdown_expand_kernel( const IndexType *row_ptr, - const IndexType *col_ind, - const IndexType *frontier, - const IndexType frontier_size, - const IndexType totaldegree, - const IndexType max_items_per_thread, - const IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - const IndexType *frontier_degrees_exclusive_sum, - const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, - int *bmap, - IndexType *distances, - IndexType *predecessors, - const int *edge_mask, - const int *isolated_bmap, - bool directed) { - //BlockScan - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage scan_storage; - - // We will do a scan to know where to write in frontier - // This will contain the common offset of the block - __shared__ IndexType frontier_common_block_offset; - - __shared__ IndexType shared_buckets_offsets[TOP_DOWN_EXPAND_DIMX - NBUCKETS_PER_BLOCK + 1]; - __shared__ IndexType shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX + 1]; - - // - // Frontier candidates local queue - // We process TOP_DOWN_BATCH_SIZE vertices in parallel, so we need to be able to store everything - // We also save the predecessors here, because we will not be able to retrieve it after - // - __shared__ IndexType shared_local_new_frontier_candidates[TOP_DOWN_BATCH_SIZE - * TOP_DOWN_EXPAND_DIMX]; - __shared__ IndexType shared_local_new_frontier_predecessors[TOP_DOWN_BATCH_SIZE - * TOP_DOWN_EXPAND_DIMX]; - __shared__ IndexType block_n_frontier_candidates; - - IndexType block_offset = (blockDim.x * blockIdx.x) * max_items_per_thread; - IndexType n_items_per_thread_left = (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1) - / TOP_DOWN_EXPAND_DIMX; - - n_items_per_thread_left = min(max_items_per_thread, n_items_per_thread_left); - - for (; - 
(n_items_per_thread_left > 0) && (block_offset < totaldegree); - - block_offset += MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x, - n_items_per_thread_left -= MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD) { - - // In this loop, we will process batch_set_size batches - IndexType nitems_per_thread = min( n_items_per_thread_left, - (IndexType) MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD); - - // Loading buckets offset (see compute_bucket_offsets_kernel) - - if (threadIdx.x < (nitems_per_thread * NBUCKETS_PER_BLOCK + 1)) - shared_buckets_offsets[threadIdx.x] = - frontier_degrees_exclusive_sum_buckets_offsets[block_offset / TOP_DOWN_BUCKET_SIZE - + threadIdx.x]; - - // We will use shared_buckets_offsets - __syncthreads(); - - // - // shared_buckets_offsets gives us a range of the possible indexes - // for edge of linear_threadx, we are looking for the value k such as - // k is the max value such as frontier_degrees_exclusive_sum[k] <= linear_threadx - // - // we have 0 <= k < frontier_size - // but we also have : - // - // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE] - // <= k - // <= frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE + 1] - // - // To find the exact value in that range, we need a few values from frontier_degrees_exclusive_sum (see below) - // We will load them here - // We will load as much as we can - if it doesn't fit we will make multiple iteration of the next loop - // Because all vertices in frontier have degree > 0, we know it will fits if left + 1 = right (see below) - - //We're going to load values in frontier_degrees_exclusive_sum for batch [left; right[ - //If it doesn't fit, --right until it does, then loop - //It is excepted to fit on the first try, that's why we start right = nitems_per_thread - - IndexType left = 0; - IndexType right = nitems_per_thread; - - while (left < nitems_per_thread) { - // - // Values that are necessary to compute the local binary searches - // We only need those with indexes between extremes indexes of buckets_offsets - // We need the next val for the binary search, hence the +1 - // - - IndexType nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] - - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; - - //If left = right + 1 we are sure to have nvalues_to_load < TOP_DOWN_EXPAND_DIMX+1 - while (nvalues_to_load > (TOP_DOWN_EXPAND_DIMX + 1)) { - --right; - - nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] - - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; - } - - IndexType nitems_per_thread_for_this_load = right - left; - - IndexType frontier_degrees_exclusive_sum_block_offset = shared_buckets_offsets[left - * NBUCKETS_PER_BLOCK]; - - //TODO put again the nvalues_to_load == 1 - if (threadIdx.x < nvalues_to_load) { - shared_frontier_degrees_exclusive_sum[threadIdx.x] = - frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset - + threadIdx.x]; - } - - if (nvalues_to_load == (TOP_DOWN_EXPAND_DIMX + 1) && threadIdx.x == 0) { - shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX] = - frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset - + TOP_DOWN_EXPAND_DIMX]; - } - - //shared_frontier_degrees_exclusive_sum is in shared mem, we will use it, sync - //TODO we don't use it if nvalues_to_load == 1 - __syncthreads(); - - // Now we will process the edges - // Here each thread will process nitems_per_thread_for_this_load - for (IndexType item_index = 0; - item_index < 
nitems_per_thread_for_this_load; - item_index += TOP_DOWN_BATCH_SIZE) { - - // We process TOP_DOWN_BATCH_SIZE edge in parallel (instruction parallism) - // Reduces latency - - IndexType current_max_edge_index = min(block_offset - + (left - + nitems_per_thread_for_this_load) - * blockDim.x, - totaldegree); - - //We will need vec_u (source of the edge) until the end if we need to save the predecessors - //For others informations, we will reuse pointers on the go (nvcc does not color well the registers in that case) - - IndexType vec_u[TOP_DOWN_BATCH_SIZE]; - IndexType local_buf1[TOP_DOWN_BATCH_SIZE]; - IndexType local_buf2[TOP_DOWN_BATCH_SIZE]; - - IndexType *vec_frontier_degrees_exclusive_sum_index = &local_buf2[0]; + // + // gives the equivalent vectors from a type + // for the max val, would be better to use numeric_limits<>::max() once + // cpp11 is allowed in nvgraph + // + + template + struct vec_t { + typedef int4 vec4; + typedef int2 vec2; + }; + + template<> + struct vec_t { + typedef int4 vec4; + typedef int2 vec2; + static const int max = INT_MAX; + }; + + template<> + struct vec_t { + typedef longlong4 vec4; + typedef longlong2 vec2; + static const long long int max = LLONG_MAX; + }; + + // + // ------------------------- Helper device functions ------------------- + // + + __forceinline__ __device__ int getMaskNRightmostBitSet(int n) { + if (n == INT_SIZE) + return (~0); + int mask = (1 << n) - 1; + return mask; + } + + __forceinline__ __device__ int getMaskNLeftmostBitSet(int n) { + if (n == 0) + return 0; + int mask = ~((1 << (INT_SIZE - n)) - 1); + return mask; + } + + __forceinline__ __device__ int getNextZeroBit(int& val) { + int ibit = __ffs(~val) - 1; + val |= (1 << ibit); + + return ibit; + } + + struct BitwiseAnd + { + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return (a & b); + } + }; + + struct BitwiseOr + { + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return (a | b); + } + }; + + template + __device__ IndexType binsearch_maxle( const IndexType *vec, + const IndexType val, + IndexType low, + IndexType high) { + while (true) { + if (low == high) + return low; //we know it exists + if ((low + 1) == high) + return (vec[high] <= val) ? 
high : low; + + IndexType mid = low + (high - low) / 2; + + if (vec[mid] > val) + high = mid - 1; + else + low = mid; + + } + } + + // + // ------------------------- Bottom up ------------------------- + // + + // + // fill_unvisited_queue_kernel + // + // Finding unvisited vertices in the visited_bmap, and putting them in the queue + // Vertices represented by the same int in the bitmap are adjacent in the queue, and sorted + // For instance, the queue can look like this : + // 34 38 45 58 61 4 18 24 29 71 84 85 90 + // Because they are represented by those ints in the bitmap : + // [34 38 45 58 61] [4 18 24 29] [71 84 85 90] + + //visited_bmap_nints = the visited_bmap is made of that number of ints + + template + __global__ void fill_unvisited_queue_kernel( int *visited_bmap, + IndexType visited_bmap_nints, + IndexType n, + IndexType *unvisited, + IndexType *unvisited_cnt) { + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage scan_temp_storage; + + //When filling the "unvisited" queue, we use "unvisited_cnt" to know where to write in the queue (equivalent of int off = atomicAddd(unvisited_cnt, 1) ) + //We will actually do only one atomicAdd per block - we first do a scan, then call one atomicAdd, and store the common offset for the block in + //unvisited_common_block_offset + __shared__ IndexType unvisited_common_block_offset; + + //We don't want threads divergence in the loop (we're going to call __syncthreads) + //Using a block-only dependent in the condition of the loop + for (IndexType block_v_idx = blockIdx.x * blockDim.x; + block_v_idx < visited_bmap_nints; + block_v_idx += blockDim.x * gridDim.x) { + + //Index of visited_bmap that this thread will compute + IndexType v_idx = block_v_idx + threadIdx.x; + + int thread_visited_int = (v_idx < visited_bmap_nints) + ? 
visited_bmap[v_idx] + : + (~0); //will be neutral in the next lines (virtual vertices all visited) + + //The last int can only be partially valid + //If we are indeed taking care of the last visited int in this thread, + //We need to first disable (ie set as "visited") the inactive bits (vertices >= n) + if (v_idx == (visited_bmap_nints - 1)) { + int active_bits = n - (INT_SIZE * v_idx); + int inactive_bits = INT_SIZE - active_bits; + int mask = getMaskNLeftmostBitSet(inactive_bits); + thread_visited_int |= mask; //Setting inactive bits as visited + } + + //Counting number of unvisited vertices represented by this int + int n_unvisited_in_int = __popc(~thread_visited_int); + int unvisited_thread_offset; + + //We will need to write n_unvisited_in_int unvisited vertices to the unvisited queue + //We ask for that space when computing the block scan, that will tell where to write those + //vertices in the queue, using the common offset of the block (see below) + BlockScan(scan_temp_storage).ExclusiveSum(n_unvisited_in_int, unvisited_thread_offset); + + //Last thread knows how many vertices will be written to the queue by this block + //Asking for that space in the queue using the global count, and saving the common offset + if (threadIdx.x == (FILL_UNVISITED_QUEUE_DIMX - 1)) { + IndexType total = unvisited_thread_offset + n_unvisited_in_int; + unvisited_common_block_offset = atomicAdd(unvisited_cnt, total); + } + + //syncthreads for two reasons : + // - we need to broadcast unvisited_common_block_offset + // - we will reuse scan_temp_storage (cf CUB doc) + __syncthreads(); + + IndexType current_unvisited_index = unvisited_common_block_offset + + unvisited_thread_offset; + int nvertices_to_write = n_unvisited_in_int; + + // getNextZeroBit uses __ffs, which gives least significant bit set + // which means that as long as n_unvisited_in_int is valid, + // we will use valid bits + + while (nvertices_to_write > 0) { + if (nvertices_to_write >= 4 && (current_unvisited_index % 4) == 0) { + typename vec_t::vec4 vec_v; + + vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.z = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.w = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + + typename vec_t::vec4 *unvisited_i4 = reinterpret_cast::vec4*>(&unvisited[current_unvisited_index]); + *unvisited_i4 = vec_v; + + current_unvisited_index += 4; + nvertices_to_write -= 4; + } + else if (nvertices_to_write >= 2 && (current_unvisited_index % 2) == 0) { + typename vec_t::vec2 vec_v; + + vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + + typename vec_t::vec2 *unvisited_i2 = reinterpret_cast::vec2*>(&unvisited[current_unvisited_index]); + *unvisited_i2 = vec_v; + + current_unvisited_index += 2; + nvertices_to_write -= 2; + } else { + IndexType v = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + + unvisited[current_unvisited_index] = v; + + current_unvisited_index += 1; + nvertices_to_write -= 1; + } + + } + } + } + + //Wrapper + template + void fill_unvisited_queue( int *visited_bmap, + IndexType visited_bmap_nints, + IndexType n, + IndexType *unvisited, + IndexType *unvisited_cnt, + cudaStream_t m_stream, + bool deterministic) { + dim3 grid, block; + block.x = FILL_UNVISITED_QUEUE_DIMX; + + grid.x = min((IndexType) MAXBLOCKS, (visited_bmap_nints + block.x - 1) / block.x); + + fill_unvisited_queue_kernel<<>>( 
visited_bmap, + visited_bmap_nints, + n, + unvisited, + unvisited_cnt); + cudaCheckError() + ; + } + + // + // count_unvisited_edges_kernel + // Couting the total number of unvisited edges in the graph - using an potentially unvisited queue + // We need the current unvisited vertices to be in the unvisited queue + // But visited vertices can be in the potentially_unvisited queue + // We first check if the vertex is still unvisited before using it + // Useful when switching from "Bottom up" to "Top down" + // + + template + __global__ void count_unvisited_edges_kernel(const IndexType *potentially_unvisited, + const IndexType potentially_unvisited_size, + const int *visited_bmap, + IndexType *degree_vertices, + IndexType *mu) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage reduce_temp_storage; + + //number of undiscovered edges counted by this thread + IndexType thread_unvisited_edges_count = 0; + + for (IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < potentially_unvisited_size; + idx += blockDim.x * gridDim.x) { + + IndexType u = potentially_unvisited[idx]; + int u_visited_bmap = visited_bmap[u / INT_SIZE]; + int is_visited = u_visited_bmap & (1 << (u % INT_SIZE)); + + if (!is_visited) + thread_unvisited_edges_count += degree_vertices[u]; + + } + + //We need all thread_unvisited_edges_count to be ready before reducing + __syncthreads(); + + IndexType block_unvisited_edges_count = + BlockReduce(reduce_temp_storage).Sum(thread_unvisited_edges_count); + + //block_unvisited_edges_count is only defined is th.x == 0 + if (threadIdx.x == 0) + atomicAdd(mu, block_unvisited_edges_count); + } + + //Wrapper + template + void count_unvisited_edges(const IndexType *potentially_unvisited, + const IndexType potentially_unvisited_size, + const int *visited_bmap, + IndexType *node_degree, + IndexType *mu, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = COUNT_UNVISITED_EDGES_DIMX; + grid.x = min((IndexType) MAXBLOCKS, (potentially_unvisited_size + block.x - 1) / block.x); + + count_unvisited_edges_kernel<<>>( potentially_unvisited, + potentially_unvisited_size, + visited_bmap, + node_degree, + mu); + cudaCheckError() + ; + } + + // + // Main Bottom Up kernel + // Here we will start to process unvisited vertices in the unvisited queue + // We will only consider the first MAIN_BOTTOMUP_MAX_EDGES edges + // If it's not possible to define a valid parent using only those edges, + // add it to the "left_unvisited_queue" + // + + // + // We will use the "vertices represented by the same int in the visited bmap are adjacents and sorted in the unvisited queue" property + // It is used to do a reduction locally and fully build the new visited_bmap + // + + template + __global__ void main_bottomup_kernel( const IndexType *unvisited, + const IndexType unvisited_size, + IndexType *left_unvisited, + IndexType *left_unvisited_cnt, + int *visited_bmap, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + IndexType *distances, + IndexType *predecessors, + int *edge_mask) { + typedef cub::BlockDiscontinuity BlockDiscontinuity; + typedef cub::WarpReduce WarpReduce; + typedef cub::BlockScan BlockScan; + + __shared__ typename BlockDiscontinuity::TempStorage discontinuity_temp_storage; + __shared__ typename WarpReduce::TempStorage reduce_temp_storage; + __shared__ typename BlockScan::TempStorage scan_temp_storage; + + //To write vertices in the frontier, + //We will use a block scan to 
locally compute the offsets + //frontier_common_block_offset contains the common offset for the block + __shared__ IndexType frontier_common_block_offset; + + // When building the new visited_bmap, we reduce (using a bitwise and) the visited_bmap ints + // from the vertices represented by the same int (for instance vertices 1, 5, 9, 13, 23) + // vertices represented by the same int will be designed as part of the same "group" + // To detect the deliminations between those groups, we use BlockDiscontinuity + // Then we need to create the new "visited_bmap" within those group. + // We use a warp reduction that takes into account limits between groups to do it + // But a group can be cut in two different warps : in that case, the second warp + // put the result of its local reduction in local_visited_bmap_warp_head + // the first warp will then read it and finish the reduction + + __shared__ int local_visited_bmap_warp_head[MAIN_BOTTOMUP_NWARPS]; + + const int warpid = threadIdx.x / WARP_SIZE; + const int laneid = threadIdx.x % WARP_SIZE; + + // we will call __syncthreads inside the loop + // we need to keep complete block active + for (IndexType block_off = blockIdx.x * blockDim.x; + block_off < unvisited_size; + block_off += blockDim.x * gridDim.x) + { + IndexType idx = block_off + threadIdx.x; + + // This thread will take care of unvisited_vertex + // in the visited_bmap, it is represented by the int at index + // visited_bmap_index = unvisited_vertex/INT_SIZE + // it will be used by BlockDiscontinuity + // to flag the separation between groups of vertices (vertices represented by different in in visited_bmap) + IndexType visited_bmap_index[1]; //this is an array of size 1 because CUB needs one + visited_bmap_index[0] = -1; + IndexType unvisited_vertex = -1; + + // local_visited_bmap gives info on the visited bit of unvisited_vertex + // + // By default, everything is visited + // This is because we only take care of unvisited vertices here, + // The other are by default unvisited + // If a vertex remain unvisited, we will notice it here + // That's why by default we consider everything visited ( ie ~0 ) + // If we fail to assign one parent to an unvisited vertex, we will + // explicitly unset the bit + int local_visited_bmap = (~0); + int found = 0; + int more_to_visit = 0; + IndexType valid_parent; + IndexType left_unvisited_off; + + if (idx < unvisited_size) + { + //Processing first STPV edges of unvisited v + //If bigger than that, push to left_unvisited queue + unvisited_vertex = unvisited[idx]; + + IndexType edge_begin = row_ptr[unvisited_vertex]; + IndexType edge_end = row_ptr[unvisited_vertex + 1]; + + visited_bmap_index[0] = unvisited_vertex / INT_SIZE; + + IndexType degree = edge_end - edge_begin; + + for (IndexType edge = edge_begin; + edge < min(edge_end, edge_begin + MAIN_BOTTOMUP_MAX_EDGES); ++edge) + { + if (edge_mask && !edge_mask[edge]) + continue; + + IndexType parent_candidate = col_ind[edge]; + + if (distances[parent_candidate] == (lvl - 1)) + { + found = 1; + valid_parent = parent_candidate; + break; + } + } + + // This vertex will remain unvisited at the end of this kernel + // Explicitly say it + if (!found) + local_visited_bmap &= ~(1 << (unvisited_vertex % INT_SIZE)); //let this one unvisited + else + { + if (distances) + distances[unvisited_vertex] = lvl; + if (predecessors) + predecessors[unvisited_vertex] = valid_parent; + } + + //If we haven't found a parent and there's more edge to check + if (!found && degree > MAIN_BOTTOMUP_MAX_EDGES) + { + 
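                        // Deferral note: reaching this branch means no parent at distance (lvl - 1)
                        // was found among the first MAIN_BOTTOMUP_MAX_EDGES (unmasked) edges and the
                        // vertex still has unchecked edges, so it reserves a slot in left_unvisited
                        // below; bottom_up_large_degree_kernel later finishes the scan starting at
                        // row_ptr[v] + MAIN_BOTTOMUP_MAX_EDGES.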
left_unvisited_off = atomicAdd(left_unvisited_cnt, (IndexType) 1); //TODO scan + more_to_visit = 1; + } + + } + + // + // We will separate vertices in group + // Two vertices are in the same group if represented by same int in visited_bmap + // ie u and v in same group <=> u/32 == v/32 + // + // We will now flag the head of those group (first element of each group) + // + // 1) All vertices within the same group are adjacent in the queue (cf fill_unvisited_queue) + // 2) A group is of size <= 32, so a warp will contain at least one head, and a group will be contained + // at most by two warps + + int is_head_a[1]; //CUB need an array + BlockDiscontinuity(discontinuity_temp_storage).FlagHeads(is_head_a, + visited_bmap_index, + cub::Inequality()); + int is_head = is_head_a[0]; + + // Computing the warp reduce within group + // This primitive uses the is_head flags to know where the limits of the groups are + // We use bitwise and as operator, because of the fact that 1 is the default value + // If a vertex is unvisited, we have to explicitly ask for it + int local_bmap_agg = + WarpReduce(reduce_temp_storage).HeadSegmentedReduce( local_visited_bmap, + is_head, + BitwiseAnd()); + + // We need to take care of the groups cut in two in two different warps + // Saving second part of the reduce here, then applying it on the first part bellow + // Corner case : if the first thread of the warp is a head, then this group is not cut in two + // and then we have to be neutral (for an bitwise and, it's an ~0) + if (laneid == 0) + { + local_visited_bmap_warp_head[warpid] = (is_head) ? (~0) : local_bmap_agg; + } + + //broadcasting local_visited_bmap_warp_head + __syncthreads(); + + int head_ballot = nvgraph::utils::ballot(is_head); + + //As long as idx < unvisited_size, we know there's at least one head per warp + int laneid_last_head_in_warp = INT_SIZE - 1 - __clz(head_ballot); + + int is_last_head_in_warp = (laneid == laneid_last_head_in_warp); + + // if laneid == 0 && is_last_head_in_warp, it's a special case where + // a group of size 32 starts exactly at lane 0 + // in that case, nothing to do (this group is not cut by a warp delimitation) + // we also have to make sure that a warp actually exists after this one (this corner case is handled after) + if (laneid != 0 && is_last_head_in_warp & (warpid + 1) < MAIN_BOTTOMUP_NWARPS) + { + local_bmap_agg &= local_visited_bmap_warp_head[warpid + 1]; + } + + //Three cases : + // -> This is the first group of the block - it may be cut in two (with previous block) + // -> This is the last group of the block - same thing + // -> This group is completely contained in this block + + if (warpid == 0 && laneid == 0) + { + //The first elt of this group considered in this block is unvisited_vertex + //We know that's the case because elts are sorted in a group, and we are at laneid == 0 + //We will do an atomicOr - we have to be neutral about elts < unvisited_vertex + int iv = unvisited_vertex % INT_SIZE; // we know that this unvisited_vertex is valid + int mask = getMaskNLeftmostBitSet(INT_SIZE - iv); + local_bmap_agg &= mask; //we have to be neutral for elts < unvisited_vertex + atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); + } + else if (warpid == (MAIN_BOTTOMUP_NWARPS - 1) && + laneid >= laneid_last_head_in_warp && // We need the other ones to go in else case + idx < unvisited_size //we could be out + ) + { + //Last head of the block + //We don't know if this group is complete + + //last_v is the last unvisited_vertex of the group IN THIS 
block + //we dont know about the rest - we have to be neutral about elts > last_v + + //the destination thread of the __shfl is active + int laneid_max = min((IndexType) (WARP_SIZE - 1), + (unvisited_size - (block_off + 32 * warpid))); + IndexType last_v = nvgraph::utils::shfl( unvisited_vertex, + laneid_max, + WARP_SIZE, + __activemask()); + + if (is_last_head_in_warp) + { + int ilast_v = last_v % INT_SIZE + 1; + int mask = getMaskNRightmostBitSet(ilast_v); + local_bmap_agg &= mask; //we have to be neutral for elts > last_unvisited_vertex + atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); + } + } + else + { + //group completely in block + if (is_head && idx < unvisited_size) { + visited_bmap[unvisited_vertex / INT_SIZE] = local_bmap_agg; //no atomics needed, we know everything about this int + } + } + + //Saving in frontier + + int thread_frontier_offset; + BlockScan(scan_temp_storage).ExclusiveSum(found, thread_frontier_offset); + IndexType inclusive_sum = thread_frontier_offset + found; + if (threadIdx.x == (MAIN_BOTTOMUP_DIMX - 1) && inclusive_sum) + { + frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); + } + + //1) Broadcasting frontier_common_block_offset + //2) we want to reuse the *_temp_storage + __syncthreads(); + + if (found) + new_frontier[frontier_common_block_offset + thread_frontier_offset] = unvisited_vertex; + if (more_to_visit) + left_unvisited[left_unvisited_off] = unvisited_vertex; + + } + } + + template + void bottom_up_main( IndexType *unvisited, + IndexType unvisited_size, + IndexType *left_unvisited, + IndexType *d_left_unvisited_idx, + int *visited, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_idx, + IndexType *distances, + IndexType *predecessors, + int *edge_mask, + cudaStream_t m_stream, + bool deterministic) { + dim3 grid, block; + block.x = MAIN_BOTTOMUP_DIMX; + + grid.x = min((IndexType) MAXBLOCKS, ((unvisited_size + block.x - 1)) / block.x); + + main_bottomup_kernel<<>>(unvisited, + unvisited_size, + left_unvisited, + d_left_unvisited_idx, + visited, + row_ptr, + col_ind, + lvl, + new_frontier, + new_frontier_idx, + distances, + predecessors, + edge_mask); + cudaCheckError() + ; + } + + // + // bottom_up_large_degree_kernel + // finishing the work started in main_bottomup_kernel for vertex with degree > MAIN_BOTTOMUP_MAX_EDGES && no parent found + // + template + __global__ void bottom_up_large_degree_kernel( IndexType *left_unvisited, + IndexType left_unvisited_size, + int *visited, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + IndexType *distances, + IndexType *predecessors, + int *edge_mask) { + + int logical_lane_id = threadIdx.x % BOTTOM_UP_LOGICAL_WARP_SIZE; + int logical_warp_id = threadIdx.x / BOTTOM_UP_LOGICAL_WARP_SIZE; + int logical_warps_per_block = blockDim.x / BOTTOM_UP_LOGICAL_WARP_SIZE; + + //Inactive threads are not a pb for __ballot (known behaviour) + for (IndexType idx = logical_warps_per_block * blockIdx.x + logical_warp_id; + idx < left_unvisited_size; + idx += gridDim.x * logical_warps_per_block) { + + //Unvisited vertices - potentially in the next frontier + IndexType v = left_unvisited[idx]; + + //Used only with symmetric graphs + //Parents are included in v's neighbors + IndexType first_i_edge = row_ptr[v] + MAIN_BOTTOMUP_MAX_EDGES; //we already have checked the first MAIN_BOTTOMUP_MAX_EDGES edges in find_unvisited + + 
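                // Cooperative scan note: the BOTTOM_UP_LOGICAL_WARP_SIZE lanes of this logical warp
                // stride through v's remaining edges together; the ballot below combines the
                // per-lane parent candidates and a single winning lane marks v visited and appends
                // it to the new frontier.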
IndexType end_i_edge = row_ptr[v + 1]; + + //We can have warp divergence in the next loop + //It's not a pb because the behaviour of __ballot + //is know with inactive threads + for (IndexType i_edge = first_i_edge + logical_lane_id; + i_edge < end_i_edge; + i_edge += BOTTOM_UP_LOGICAL_WARP_SIZE) { + + IndexType valid_parent = -1; + + if (!edge_mask || edge_mask[i_edge]) { + IndexType u = col_ind[i_edge]; + IndexType lvl_u = distances[u]; + + if (lvl_u == (lvl - 1)) { + valid_parent = u; + } + } + + unsigned int warp_valid_p_ballot = nvgraph::utils::ballot((valid_parent != -1)); + + int logical_warp_id_in_warp = (threadIdx.x % WARP_SIZE) / BOTTOM_UP_LOGICAL_WARP_SIZE; + unsigned int mask = (1 << BOTTOM_UP_LOGICAL_WARP_SIZE) - 1; + unsigned int logical_warp_valid_p_ballot = warp_valid_p_ballot + >> (BOTTOM_UP_LOGICAL_WARP_SIZE * logical_warp_id_in_warp); + logical_warp_valid_p_ballot &= mask; + + int chosen_thread = __ffs(logical_warp_valid_p_ballot) - 1; + + if (chosen_thread == logical_lane_id) { + //Using only one valid parent (reduce bw) + IndexType off = atomicAdd(new_frontier_cnt, (IndexType) 1); + int m = 1 << (v % INT_SIZE); + atomicOr(&visited[v / INT_SIZE], m); + distances[v] = lvl; + + if (predecessors) + predecessors[v] = valid_parent; + + new_frontier[off] = v; + } + + if (logical_warp_valid_p_ballot) { + break; + } + } + + } + } + + template + void bottom_up_large(IndexType *left_unvisited, + IndexType left_unvisited_size, + int *visited, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_idx, + IndexType *distances, + IndexType *predecessors, + int *edge_mask, + cudaStream_t m_stream, + bool deterministic) { + dim3 grid, block; + block.x = LARGE_BOTTOMUP_DIMX; + grid.x = min( (IndexType) MAXBLOCKS, + ((left_unvisited_size + block.x - 1) * BOTTOM_UP_LOGICAL_WARP_SIZE) / block.x); + + bottom_up_large_degree_kernel<<>>(left_unvisited, + left_unvisited_size, + visited, + row_ptr, + col_ind, + lvl, + new_frontier, + new_frontier_idx, + distances, + predecessors, + edge_mask); + cudaCheckError() + ; + } + + // + // + // ------------------------------ Top down ------------------------------ + // + // + + // + // compute_bucket_offsets_kernel + // simply compute the position in the frontier corresponding all valid edges with index=TOP_DOWN_BUCKET_SIZE * k, k integer + // + + template + __global__ void compute_bucket_offsets_kernel( const IndexType *frontier_degrees_exclusive_sum, + IndexType *bucket_offsets, + const IndexType frontier_size, + IndexType total_degree) { + IndexType end = ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX + * NBUCKETS_PER_BLOCK + 1); + + for (IndexType bid = blockIdx.x * blockDim.x + threadIdx.x; + bid <= end; + bid += gridDim.x * blockDim.x) { + + IndexType eid = min(bid * TOP_DOWN_BUCKET_SIZE, total_degree - 1); + + bucket_offsets[bid] = binsearch_maxle( frontier_degrees_exclusive_sum, + eid, + (IndexType) 0, + frontier_size - 1); + + } + } + + template + void compute_bucket_offsets( IndexType *cumul, + IndexType *bucket_offsets, + IndexType frontier_size, + IndexType total_degree, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = COMPUTE_BUCKET_OFFSETS_DIMX; + + grid.x = min( (IndexType) MAXBLOCKS, + ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX + * NBUCKETS_PER_BLOCK + 1 + block.x - 1) / block.x); + + compute_bucket_offsets_kernel<<>>(cumul, + bucket_offsets, + frontier_size, + total_degree); + cudaCheckError() + ; + } + + // + // 
topdown_expand_kernel + // Read current frontier and compute new one with top down paradigm + // One thread = One edge + // To know origin of edge, we have to find where is index_edge in the values of frontier_degrees_exclusive_sum (using a binary search, max less or equal than) + // This index k will give us the origin of this edge, which is frontier[k] + // This thread will then process the (linear_idx_thread - frontier_degrees_exclusive_sum[k])-ith edge of vertex frontier[k] + // + // To process blockDim.x = TOP_DOWN_EXPAND_DIMX edges, we need to first load NBUCKETS_PER_BLOCK bucket offsets - those will help us do the binary searches + // We can load up to TOP_DOWN_EXPAND_DIMX of those bucket offsets - that way we prepare for the next MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x edges + // + // Once we have those offsets, we may still need a few values from frontier_degrees_exclusive_sum to compute exact index k + // To be able to do it, we will load the values that we need from frontier_degrees_exclusive_sum in shared memory + // We know that it will fit because we never add node with degree == 0 in the frontier, so we have an upper bound on the number of value to load (see below) + // + // We will then look which vertices are not visited yet : + // 1) if the unvisited vertex is isolated (=> degree == 0), we mark it as visited, update distances and predecessors, and move on + // 2) if the unvisited vertex has degree > 0, we add it to the "frontier_candidates" queue + // + // We then treat the candidates queue using the threadIdx.x < ncandidates + // If we are indeed the first thread to discover that vertex (result of atomicOr(visited)) + // We add it to the new frontier + // + + template + __global__ void topdown_expand_kernel( const IndexType *row_ptr, + const IndexType *col_ind, + const IndexType *frontier, + const IndexType frontier_size, + const IndexType totaldegree, + const IndexType max_items_per_thread, + const IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + const IndexType *frontier_degrees_exclusive_sum, + const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, + int *bmap, + IndexType *distances, + IndexType *predecessors, + const int *edge_mask, + const int *isolated_bmap, + bool directed) { + //BlockScan + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage scan_storage; + + // We will do a scan to know where to write in frontier + // This will contain the common offset of the block + __shared__ IndexType frontier_common_block_offset; + + __shared__ IndexType shared_buckets_offsets[TOP_DOWN_EXPAND_DIMX - NBUCKETS_PER_BLOCK + 1]; + __shared__ IndexType shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX + 1]; + + // + // Frontier candidates local queue + // We process TOP_DOWN_BATCH_SIZE vertices in parallel, so we need to be able to store everything + // We also save the predecessors here, because we will not be able to retrieve it after + // + __shared__ IndexType shared_local_new_frontier_candidates[TOP_DOWN_BATCH_SIZE + * TOP_DOWN_EXPAND_DIMX]; + __shared__ IndexType shared_local_new_frontier_predecessors[TOP_DOWN_BATCH_SIZE + * TOP_DOWN_EXPAND_DIMX]; + __shared__ IndexType block_n_frontier_candidates; + + IndexType block_offset = (blockDim.x * blockIdx.x) * max_items_per_thread; + IndexType n_items_per_thread_left = (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1) + / TOP_DOWN_EXPAND_DIMX; + + n_items_per_thread_left = min(max_items_per_thread, n_items_per_thread_left); + + for (; + 
(n_items_per_thread_left > 0) && (block_offset < totaldegree); + + block_offset += MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x, + n_items_per_thread_left -= MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD) { + + // In this loop, we will process batch_set_size batches + IndexType nitems_per_thread = min( n_items_per_thread_left, + (IndexType) MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD); + + // Loading buckets offset (see compute_bucket_offsets_kernel) + + if (threadIdx.x < (nitems_per_thread * NBUCKETS_PER_BLOCK + 1)) + shared_buckets_offsets[threadIdx.x] = + frontier_degrees_exclusive_sum_buckets_offsets[block_offset / TOP_DOWN_BUCKET_SIZE + + threadIdx.x]; + + // We will use shared_buckets_offsets + __syncthreads(); + + // + // shared_buckets_offsets gives us a range of the possible indexes + // for edge of linear_threadx, we are looking for the value k such as + // k is the max value such as frontier_degrees_exclusive_sum[k] <= linear_threadx + // + // we have 0 <= k < frontier_size + // but we also have : + // + // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE] + // <= k + // <= frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE + 1] + // + // To find the exact value in that range, we need a few values from frontier_degrees_exclusive_sum (see below) + // We will load them here + // We will load as much as we can - if it doesn't fit we will make multiple iteration of the next loop + // Because all vertices in frontier have degree > 0, we know it will fits if left + 1 = right (see below) + + //We're going to load values in frontier_degrees_exclusive_sum for batch [left; right[ + //If it doesn't fit, --right until it does, then loop + //It is excepted to fit on the first try, that's why we start right = nitems_per_thread + + IndexType left = 0; + IndexType right = nitems_per_thread; + + while (left < nitems_per_thread) { + // + // Values that are necessary to compute the local binary searches + // We only need those with indexes between extremes indexes of buckets_offsets + // We need the next val for the binary search, hence the +1 + // + + IndexType nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] + - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; + + //If left = right + 1 we are sure to have nvalues_to_load < TOP_DOWN_EXPAND_DIMX+1 + while (nvalues_to_load > (TOP_DOWN_EXPAND_DIMX + 1)) { + --right; + + nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] + - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; + } + + IndexType nitems_per_thread_for_this_load = right - left; + + IndexType frontier_degrees_exclusive_sum_block_offset = shared_buckets_offsets[left + * NBUCKETS_PER_BLOCK]; + + //TODO put again the nvalues_to_load == 1 + if (threadIdx.x < nvalues_to_load) { + shared_frontier_degrees_exclusive_sum[threadIdx.x] = + frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + + threadIdx.x]; + } + + if (nvalues_to_load == (TOP_DOWN_EXPAND_DIMX + 1) && threadIdx.x == 0) { + shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX] = + frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + + TOP_DOWN_EXPAND_DIMX]; + } + + //shared_frontier_degrees_exclusive_sum is in shared mem, we will use it, sync + //TODO we don't use it if nvalues_to_load == 1 + __syncthreads(); + + // Now we will process the edges + // Here each thread will process nitems_per_thread_for_this_load + for (IndexType item_index = 0; + item_index < 
nitems_per_thread_for_this_load; + item_index += TOP_DOWN_BATCH_SIZE) { + + // We process TOP_DOWN_BATCH_SIZE edge in parallel (instruction parallism) + // Reduces latency + + IndexType current_max_edge_index = min(block_offset + + (left + + nitems_per_thread_for_this_load) + * blockDim.x, + totaldegree); + + //We will need vec_u (source of the edge) until the end if we need to save the predecessors + //For others informations, we will reuse pointers on the go (nvcc does not color well the registers in that case) + + IndexType vec_u[TOP_DOWN_BATCH_SIZE]; + IndexType local_buf1[TOP_DOWN_BATCH_SIZE]; + IndexType local_buf2[TOP_DOWN_BATCH_SIZE]; + + IndexType *vec_frontier_degrees_exclusive_sum_index = &local_buf2[0]; #pragma unroll - for (IndexType iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - - IndexType ibatch = left + item_index + iv; - IndexType gid = block_offset + ibatch * blockDim.x + threadIdx.x; - - if (gid < current_max_edge_index) { - IndexType start_off_idx = (ibatch * blockDim.x + threadIdx.x) - / TOP_DOWN_BUCKET_SIZE; - IndexType bucket_start = shared_buckets_offsets[start_off_idx] - - frontier_degrees_exclusive_sum_block_offset; - IndexType bucket_end = shared_buckets_offsets[start_off_idx + 1] - - frontier_degrees_exclusive_sum_block_offset; - - IndexType k = binsearch_maxle(shared_frontier_degrees_exclusive_sum, - gid, - bucket_start, - bucket_end) - + frontier_degrees_exclusive_sum_block_offset; - vec_u[iv] = frontier[k]; // origin of this edge - vec_frontier_degrees_exclusive_sum_index[iv] = - frontier_degrees_exclusive_sum[k]; - } else { - vec_u[iv] = -1; - vec_frontier_degrees_exclusive_sum_index[iv] = -1; - } - - } - - IndexType *vec_row_ptr_u = &local_buf1[0]; + for (IndexType iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + + IndexType ibatch = left + item_index + iv; + IndexType gid = block_offset + ibatch * blockDim.x + threadIdx.x; + + if (gid < current_max_edge_index) { + IndexType start_off_idx = (ibatch * blockDim.x + threadIdx.x) + / TOP_DOWN_BUCKET_SIZE; + IndexType bucket_start = shared_buckets_offsets[start_off_idx] + - frontier_degrees_exclusive_sum_block_offset; + IndexType bucket_end = shared_buckets_offsets[start_off_idx + 1] + - frontier_degrees_exclusive_sum_block_offset; + + IndexType k = binsearch_maxle(shared_frontier_degrees_exclusive_sum, + gid, + bucket_start, + bucket_end) + + frontier_degrees_exclusive_sum_block_offset; + vec_u[iv] = frontier[k]; // origin of this edge + vec_frontier_degrees_exclusive_sum_index[iv] = + frontier_degrees_exclusive_sum[k]; + } else { + vec_u[iv] = -1; + vec_frontier_degrees_exclusive_sum_index[iv] = -1; + } + + } + + IndexType *vec_row_ptr_u = &local_buf1[0]; #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType u = vec_u[iv]; - //row_ptr for this vertex origin u - vec_row_ptr_u[iv] = (u != -1) - ? row_ptr[u] - : - -1; - } - - //We won't need row_ptr after that, reusing pointer - IndexType *vec_dest_v = vec_row_ptr_u; + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType u = vec_u[iv]; + //row_ptr for this vertex origin u + vec_row_ptr_u[iv] = (u != -1) + ? 
row_ptr[u] + : + -1; + } + + //We won't need row_ptr after that, reusing pointer + IndexType *vec_dest_v = vec_row_ptr_u; #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType thread_item_index = left + item_index + iv; - IndexType gid = block_offset + thread_item_index * blockDim.x + threadIdx.x; + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType thread_item_index = left + item_index + iv; + IndexType gid = block_offset + thread_item_index * blockDim.x + threadIdx.x; - IndexType row_ptr_u = vec_row_ptr_u[iv]; - IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv]; + IndexType row_ptr_u = vec_row_ptr_u[iv]; + IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv]; - if (edge_mask && !edge_mask[edge]) - row_ptr_u = -1; //disabling edge + if (edge_mask && !edge_mask[edge]) + row_ptr_u = -1; //disabling edge - //Destination of this edge - vec_dest_v[iv] = (row_ptr_u != -1) - ? col_ind[edge] - : - -1; - } + //Destination of this edge + vec_dest_v[iv] = (row_ptr_u != -1) + ? col_ind[edge] + : + -1; + } - //We don't need vec_frontier_degrees_exclusive_sum_index anymore - IndexType *vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index; + //We don't need vec_frontier_degrees_exclusive_sum_index anymore + IndexType *vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index; #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_dest_v[iv]; - vec_v_visited_bmap[iv] = (v != -1) - ? bmap[v / INT_SIZE] - : - (~0); //will look visited - } - - // From now on we will consider v as a frontier candidate - // If for some reason vec_candidate[iv] should be put in the new_frontier - // Then set vec_candidate[iv] = -1 - IndexType *vec_frontier_candidate = vec_dest_v; + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_dest_v[iv]; + vec_v_visited_bmap[iv] = (v != -1) + ? bmap[v / INT_SIZE] + : + (~0); //will look visited + } + + // From now on we will consider v as a frontier candidate + // If for some reason vec_candidate[iv] should be put in the new_frontier + // Then set vec_candidate[iv] = -1 + IndexType *vec_frontier_candidate = vec_dest_v; #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - int m = 1 << (v % INT_SIZE); + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + int m = 1 << (v % INT_SIZE); - int is_visited = vec_v_visited_bmap[iv] & m; + int is_visited = vec_v_visited_bmap[iv] & m; - if (is_visited) - vec_frontier_candidate[iv] = -1; - } + if (is_visited) + vec_frontier_candidate[iv] = -1; + } - if (directed) { - //vec_v_visited_bmap is available + if (directed) { + //vec_v_visited_bmap is available - IndexType *vec_is_isolated_bmap = vec_v_visited_bmap; + IndexType *vec_is_isolated_bmap = vec_v_visited_bmap; #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - vec_is_isolated_bmap[iv] = (v != -1) - ? isolated_bmap[v / INT_SIZE] - : - -1; - } + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + vec_is_isolated_bmap[iv] = (v != -1) + ? 
isolated_bmap[v / INT_SIZE] + : + -1; + } #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - int m = 1 << (v % INT_SIZE); - int is_isolated = vec_is_isolated_bmap[iv] & m; + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + int m = 1 << (v % INT_SIZE); + int is_isolated = vec_is_isolated_bmap[iv] & m; - //If v is isolated, we will not add it to the frontier (it's not a frontier candidate) - // 1st reason : it's useless - // 2nd reason : it will make top down algo fail - // we need each node in frontier to have a degree > 0 - // If it is isolated, we just need to mark it as visited, and save distance and predecessor here. Not need to check return value of atomicOr + //If v is isolated, we will not add it to the frontier (it's not a frontier candidate) + // 1st reason : it's useless + // 2nd reason : it will make top down algo fail + // we need each node in frontier to have a degree > 0 + // If it is isolated, we just need to mark it as visited, and save distance and predecessor here. Not need to check return value of atomicOr - if (is_isolated && v != -1) { - int m = 1 << (v % INT_SIZE); - atomicOr(&bmap[v / INT_SIZE], m); - if (distances) - distances[v] = lvl; + if (is_isolated && v != -1) { + int m = 1 << (v % INT_SIZE); + atomicOr(&bmap[v / INT_SIZE], m); + if (distances) + distances[v] = lvl; - if (predecessors) - predecessors[v] = vec_u[iv]; + if (predecessors) + predecessors[v] = vec_u[iv]; - //This is no longer a candidate, neutralize it - vec_frontier_candidate[iv] = -1; - } + //This is no longer a candidate, neutralize it + vec_frontier_candidate[iv] = -1; + } - } - } + } + } - //Number of successor candidate hold by this thread - IndexType thread_n_frontier_candidates = 0; + //Number of successor candidate hold by this thread + IndexType thread_n_frontier_candidates = 0; #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - if (v != -1) - ++thread_n_frontier_candidates; - } + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + if (v != -1) + ++thread_n_frontier_candidates; + } - // We need to have all nfrontier_candidates to be ready before doing the scan - __syncthreads(); + // We need to have all nfrontier_candidates to be ready before doing the scan + __syncthreads(); - // We will put the frontier candidates in a local queue - // Computing offsets - IndexType thread_frontier_candidate_offset = 0; //offset inside block - BlockScan(scan_storage).ExclusiveSum( thread_n_frontier_candidates, - thread_frontier_candidate_offset); + // We will put the frontier candidates in a local queue + // Computing offsets + IndexType thread_frontier_candidate_offset = 0; //offset inside block + BlockScan(scan_storage).ExclusiveSum( thread_n_frontier_candidates, + thread_frontier_candidate_offset); #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - //May have bank conflicts - IndexType frontier_candidate = vec_frontier_candidate[iv]; - - if (frontier_candidate != -1) { - shared_local_new_frontier_candidates[thread_frontier_candidate_offset] = - frontier_candidate; - shared_local_new_frontier_predecessors[thread_frontier_candidate_offset] = - vec_u[iv]; - ++thread_frontier_candidate_offset; - } - } - - if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { - //No need to add nsuccessor_candidate, even if its an - //exclusive sum - //We incremented the 
thread_frontier_candidate_offset - block_n_frontier_candidates = thread_frontier_candidate_offset; - } - - //broadcast block_n_frontier_candidates - __syncthreads(); - - IndexType naccepted_vertices = 0; - //We won't need vec_frontier_candidate after that - IndexType *vec_frontier_accepted_vertex = vec_frontier_candidate; + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + //May have bank conflicts + IndexType frontier_candidate = vec_frontier_candidate[iv]; + + if (frontier_candidate != -1) { + shared_local_new_frontier_candidates[thread_frontier_candidate_offset] = + frontier_candidate; + shared_local_new_frontier_predecessors[thread_frontier_candidate_offset] = + vec_u[iv]; + ++thread_frontier_candidate_offset; + } + } + + if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { + //No need to add nsuccessor_candidate, even if its an + //exclusive sum + //We incremented the thread_frontier_candidate_offset + block_n_frontier_candidates = thread_frontier_candidate_offset; + } + + //broadcast block_n_frontier_candidates + __syncthreads(); + + IndexType naccepted_vertices = 0; + //We won't need vec_frontier_candidate after that + IndexType *vec_frontier_accepted_vertex = vec_frontier_candidate; #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - const int idx_shared = iv * blockDim.x + threadIdx.x; - vec_frontier_accepted_vertex[iv] = -1; + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + const int idx_shared = iv * blockDim.x + threadIdx.x; + vec_frontier_accepted_vertex[iv] = -1; - if (idx_shared < block_n_frontier_candidates) { - IndexType v = shared_local_new_frontier_candidates[idx_shared]; //popping queue - int m = 1 << (v % INT_SIZE); - int q = atomicOr(&bmap[v / INT_SIZE], m); //atomicOr returns old + if (idx_shared < block_n_frontier_candidates) { + IndexType v = shared_local_new_frontier_candidates[idx_shared]; //popping queue + int m = 1 << (v % INT_SIZE); + int q = atomicOr(&bmap[v / INT_SIZE], m); //atomicOr returns old - if (!(m & q)) { //if this thread was the first to discover this node - if (distances) - distances[v] = lvl; + if (!(m & q)) { //if this thread was the first to discover this node + if (distances) + distances[v] = lvl; - if (predecessors) { - IndexType pred = shared_local_new_frontier_predecessors[idx_shared]; - predecessors[v] = pred; - } + if (predecessors) { + IndexType pred = shared_local_new_frontier_predecessors[idx_shared]; + predecessors[v] = pred; + } - vec_frontier_accepted_vertex[iv] = v; - ++naccepted_vertices; - } - } + vec_frontier_accepted_vertex[iv] = v; + ++naccepted_vertices; + } + } - } + } - //We need naccepted_vertices to be ready - __syncthreads(); + //We need naccepted_vertices to be ready + __syncthreads(); - IndexType thread_new_frontier_offset; + IndexType thread_new_frontier_offset; - BlockScan(scan_storage).ExclusiveSum(naccepted_vertices, thread_new_frontier_offset); + BlockScan(scan_storage).ExclusiveSum(naccepted_vertices, thread_new_frontier_offset); - if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { + if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { - IndexType inclusive_sum = thread_new_frontier_offset + naccepted_vertices; - //for this thread, thread_new_frontier_offset + has_successor (exclusive sum) - if (inclusive_sum) - frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); - } + IndexType inclusive_sum = thread_new_frontier_offset + naccepted_vertices; + //for this thread, thread_new_frontier_offset + has_successor (exclusive sum) + if (inclusive_sum) + frontier_common_block_offset = 
atomicAdd(new_frontier_cnt, inclusive_sum); + } - //Broadcasting frontier_common_block_offset - __syncthreads(); + //Broadcasting frontier_common_block_offset + __syncthreads(); #pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - const int idx_shared = iv * blockDim.x + threadIdx.x; - if (idx_shared < block_n_frontier_candidates) { - - IndexType new_frontier_vertex = vec_frontier_accepted_vertex[iv]; - - if (new_frontier_vertex != -1) { - IndexType off = frontier_common_block_offset + thread_new_frontier_offset++; - //TODO Access is not good - new_frontier[off] = new_frontier_vertex; - } - } - } - - } - - //We need to keep shared_frontier_degrees_exclusive_sum coherent - __syncthreads(); - - //Preparing for next load - left = right; - right = nitems_per_thread; - } - - //we need to keep shared_buckets_offsets coherent - __syncthreads(); - } - - } - - template - void frontier_expand(const IndexType *row_ptr, - const IndexType *col_ind, - const IndexType *frontier, - const IndexType frontier_size, - const IndexType totaldegree, - const IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - const IndexType *frontier_degrees_exclusive_sum, - const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, - int *visited_bmap, - IndexType *distances, - IndexType *predecessors, - const int *edge_mask, - const int *isolated_bmap, - bool directed, - cudaStream_t m_stream, - bool deterministic) { - if (!totaldegree) - return; - - dim3 block; - block.x = TOP_DOWN_EXPAND_DIMX; - - IndexType max_items_per_thread = (totaldegree + MAXBLOCKS * block.x - 1) - / (MAXBLOCKS * block.x); - - dim3 grid; - grid.x = min( (totaldegree + max_items_per_thread * block.x - 1) - / (max_items_per_thread * block.x), - (IndexType) MAXBLOCKS); - - topdown_expand_kernel<<>>( row_ptr, - col_ind, - frontier, - frontier_size, - totaldegree, - max_items_per_thread, - lvl, - new_frontier, - new_frontier_cnt, - frontier_degrees_exclusive_sum, - frontier_degrees_exclusive_sum_buckets_offsets, - visited_bmap, - distances, - predecessors, - edge_mask, - isolated_bmap, - directed); - cudaCheckError() - ; - } - - template - __global__ void flag_isolated_vertices_kernel( IndexType n, - int *isolated_bmap, - const IndexType *row_ptr, - IndexType *degrees, - IndexType *nisolated) { - typedef cub::BlockLoad BlockLoad; - typedef cub::BlockStore BlockStore; - typedef cub::BlockReduce BlockReduce; - typedef cub::WarpReduce WarpReduce; - - __shared__ typename BlockLoad::TempStorage load_temp_storage; - __shared__ typename BlockStore::TempStorage store_temp_storage; - __shared__ typename BlockReduce::TempStorage block_reduce_temp_storage; - - __shared__ typename WarpReduce::TempStorage warp_reduce_temp_storage[FLAG_ISOLATED_VERTICES_DIMX - / FLAG_ISOLATED_VERTICES_THREADS_PER_INT]; - - __shared__ IndexType row_ptr_tail[FLAG_ISOLATED_VERTICES_DIMX]; - - for (IndexType block_off = FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - * (blockDim.x * blockIdx.x); - block_off < n; - block_off += FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * gridDim.x)) { - - IndexType thread_off = block_off - + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * threadIdx.x; - IndexType last_node_thread = thread_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1; - - IndexType thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; - IndexType block_valid_items = n - block_off + 1; //+1, we need row_ptr[last_node+1] - - BlockLoad(load_temp_storage).Load( row_ptr + block_off, - thread_row_ptr, - block_valid_items, - -1); - 
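            // Guarded load note: only block_valid_items entries of row_ptr are in range here;
            // cub::BlockLoad fills the out-of-range slots of thread_row_ptr with the supplied
            // default (-1), so the degree computation that follows never reads uninitialized values.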
- //To compute 4 degrees, we need 5 values of row_ptr - //Saving the "5th" value in shared memory for previous thread to use - if (threadIdx.x > 0) { - row_ptr_tail[threadIdx.x - 1] = thread_row_ptr[0]; - } - - //If this is the last thread, it needs to load its row ptr tail value - if (threadIdx.x == (FLAG_ISOLATED_VERTICES_DIMX - 1) && last_node_thread < n) { - row_ptr_tail[threadIdx.x] = row_ptr[last_node_thread + 1]; - - } - __syncthreads(); // we may reuse temp_storage - - int local_isolated_bmap = 0; - - IndexType imax = (n - thread_off); - - IndexType local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + const int idx_shared = iv * blockDim.x + threadIdx.x; + if (idx_shared < block_n_frontier_candidates) { + + IndexType new_frontier_vertex = vec_frontier_accepted_vertex[iv]; + + if (new_frontier_vertex != -1) { + IndexType off = frontier_common_block_offset + thread_new_frontier_offset++; + //TODO Access is not good + new_frontier[off] = new_frontier_vertex; + } + } + } + + } + + //We need to keep shared_frontier_degrees_exclusive_sum coherent + __syncthreads(); + + //Preparing for next load + left = right; + right = nitems_per_thread; + } + + //we need to keep shared_buckets_offsets coherent + __syncthreads(); + } + + } + + template + void frontier_expand(const IndexType *row_ptr, + const IndexType *col_ind, + const IndexType *frontier, + const IndexType frontier_size, + const IndexType totaldegree, + const IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + const IndexType *frontier_degrees_exclusive_sum, + const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, + int *visited_bmap, + IndexType *distances, + IndexType *predecessors, + const int *edge_mask, + const int *isolated_bmap, + bool directed, + cudaStream_t m_stream, + bool deterministic) { + if (!totaldegree) + return; + + dim3 block; + block.x = TOP_DOWN_EXPAND_DIMX; + + IndexType max_items_per_thread = (totaldegree + MAXBLOCKS * block.x - 1) + / (MAXBLOCKS * block.x); + + dim3 grid; + grid.x = min( (totaldegree + max_items_per_thread * block.x - 1) + / (max_items_per_thread * block.x), + (IndexType) MAXBLOCKS); + + topdown_expand_kernel<<>>( row_ptr, + col_ind, + frontier, + frontier_size, + totaldegree, + max_items_per_thread, + lvl, + new_frontier, + new_frontier_cnt, + frontier_degrees_exclusive_sum, + frontier_degrees_exclusive_sum_buckets_offsets, + visited_bmap, + distances, + predecessors, + edge_mask, + isolated_bmap, + directed); + cudaCheckError() + ; + } + + template + __global__ void flag_isolated_vertices_kernel( IndexType n, + int *isolated_bmap, + const IndexType *row_ptr, + IndexType *degrees, + IndexType *nisolated) { + typedef cub::BlockLoad BlockLoad; + typedef cub::BlockStore BlockStore; + typedef cub::BlockReduce BlockReduce; + typedef cub::WarpReduce WarpReduce; + + __shared__ typename BlockLoad::TempStorage load_temp_storage; + __shared__ typename BlockStore::TempStorage store_temp_storage; + __shared__ typename BlockReduce::TempStorage block_reduce_temp_storage; + + __shared__ typename WarpReduce::TempStorage warp_reduce_temp_storage[FLAG_ISOLATED_VERTICES_DIMX + / FLAG_ISOLATED_VERTICES_THREADS_PER_INT]; + + __shared__ IndexType row_ptr_tail[FLAG_ISOLATED_VERTICES_DIMX]; + + for (IndexType block_off = FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + * (blockDim.x * blockIdx.x); + block_off < n; + block_off += FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * gridDim.x)) { + + IndexType thread_off = 
block_off + + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * threadIdx.x; + IndexType last_node_thread = thread_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1; + + IndexType thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; + IndexType block_valid_items = n - block_off + 1; //+1, we need row_ptr[last_node+1] + + BlockLoad(load_temp_storage).Load( row_ptr + block_off, + thread_row_ptr, + block_valid_items, + -1); + + //To compute 4 degrees, we need 5 values of row_ptr + //Saving the "5th" value in shared memory for previous thread to use + if (threadIdx.x > 0) { + row_ptr_tail[threadIdx.x - 1] = thread_row_ptr[0]; + } + + //If this is the last thread, it needs to load its row ptr tail value + if (threadIdx.x == (FLAG_ISOLATED_VERTICES_DIMX - 1) && last_node_thread < n) { + row_ptr_tail[threadIdx.x] = row_ptr[last_node_thread + 1]; + + } + __syncthreads(); // we may reuse temp_storage + + int local_isolated_bmap = 0; + + IndexType imax = (n - thread_off); + + IndexType local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; #pragma unroll - for (int i = 0; i < (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1); ++i) { - IndexType degree = local_degree[i] = thread_row_ptr[i + 1] - thread_row_ptr[i]; - - if (i < imax) - local_isolated_bmap |= ((degree == 0) << i); - } - - if (last_node_thread < n) { - IndexType degree = local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1] = - row_ptr_tail[threadIdx.x] - - thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1]; - - local_isolated_bmap |= ((degree == 0) - << (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1)); - - } - - local_isolated_bmap <<= (thread_off % INT_SIZE); - - IndexType local_nisolated = __popc(local_isolated_bmap); - - //We need local_nisolated and local_isolated_bmap to be ready for next steps - __syncthreads(); - - IndexType total_nisolated = BlockReduce(block_reduce_temp_storage).Sum(local_nisolated); - - if (threadIdx.x == 0 && total_nisolated) { - atomicAdd(nisolated, total_nisolated); - } - - int logicalwarpid = threadIdx.x / FLAG_ISOLATED_VERTICES_THREADS_PER_INT; - - //Building int for bmap - int int_aggregate_isolated_bmap = - WarpReduce(warp_reduce_temp_storage[logicalwarpid]).Reduce( local_isolated_bmap, - BitwiseOr()); - - int is_head_of_visited_int = - ((threadIdx.x % (FLAG_ISOLATED_VERTICES_THREADS_PER_INT)) == 0); - if (is_head_of_visited_int) { - isolated_bmap[thread_off / INT_SIZE] = int_aggregate_isolated_bmap; - } - - BlockStore(store_temp_storage).Store(degrees + block_off, local_degree, block_valid_items); - } - } - - template - void flag_isolated_vertices( IndexType n, - int *isolated_bmap, - const IndexType *row_ptr, - IndexType *degrees, - IndexType *nisolated, - cudaStream_t m_stream) { - dim3 grid, block; - block.x = FLAG_ISOLATED_VERTICES_DIMX; - - grid.x = min( (IndexType) MAXBLOCKS, - (n / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + 1 + block.x - 1) / block.x); - - flag_isolated_vertices_kernel<<>>(n, - isolated_bmap, - row_ptr, - degrees, - nisolated); - cudaCheckError() - ; - } - - // - // - // - // Some utils functions - // - // - - //Creates CUB data for graph size n - template - void cub_exclusive_sum_alloc(IndexType n, void*& d_temp_storage, size_t &temp_storage_bytes) { - // Determine temporary device storage requirements for exclusive prefix scan - d_temp_storage = NULL; - temp_storage_bytes = 0; - IndexType *d_in = NULL, *d_out = NULL; - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, n); - // Allocate temporary storage for exclusive 
prefix scan - cudaMalloc(&d_temp_storage, temp_storage_bytes); - } - - template - __global__ void fill_kernel(IndexType *vec, IndexType n, IndexType val) { - for (IndexType u = blockDim.x * blockIdx.x + threadIdx.x; - u < n; - u += gridDim.x * blockDim.x) - vec[u] = val; - - } - - template - void fill(IndexType *vec, IndexType n, IndexType val, cudaStream_t m_stream) { - dim3 grid, block; - block.x = 256; - grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); - fill_kernel<<>>(vec, n, val); - cudaCheckError() - ; - } - - template - __global__ void set_frontier_degree_kernel( IndexType *frontier_degree, - IndexType *frontier, - const IndexType *degree, - IndexType n) { - for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x; - idx < n; - idx += gridDim.x * blockDim.x) { - IndexType u = frontier[idx]; - frontier_degree[idx] = degree[u]; - } - } - - template - void set_frontier_degree( IndexType *frontier_degree, - IndexType *frontier, - const IndexType *degree, - IndexType n, - cudaStream_t m_stream) { - dim3 grid, block; - block.x = 256; - grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); - set_frontier_degree_kernel<<>>(frontier_degree, - frontier, - degree, - n); - cudaCheckError() - ; - } - - template - void exclusive_sum( void *d_temp_storage, - size_t temp_storage_bytes, - IndexType *d_in, - IndexType *d_out, - IndexType num_items, - cudaStream_t m_stream) { - if (num_items <= 1) - return; //DeviceScan fails if n==1 - cub::DeviceScan::ExclusiveSum(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - m_stream); - } - - template - __global__ void fill_vec_kernel(T *vec, T n, T val) { - for (T idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < n; - idx += blockDim.x * gridDim.x) - vec[idx] = val; - } - - template - void fill_vec(T *vec, T n, T val, cudaStream_t stream) { - dim3 grid, block; - block.x = 256; - grid.x = (n + block.x - 1) / block.x; - - fill_vec_kernel<<>>(vec, n, val); - cudaCheckError() - ; - } + for (int i = 0; i < (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1); ++i) { + IndexType degree = local_degree[i] = thread_row_ptr[i + 1] - thread_row_ptr[i]; + + if (i < imax) + local_isolated_bmap |= ((degree == 0) << i); + } + + if (last_node_thread < n) { + IndexType degree = local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1] = + row_ptr_tail[threadIdx.x] + - thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1]; + + local_isolated_bmap |= ((degree == 0) + << (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1)); + + } + + local_isolated_bmap <<= (thread_off % INT_SIZE); + + IndexType local_nisolated = __popc(local_isolated_bmap); + + //We need local_nisolated and local_isolated_bmap to be ready for next steps + __syncthreads(); + + IndexType total_nisolated = BlockReduce(block_reduce_temp_storage).Sum(local_nisolated); + + if (threadIdx.x == 0 && total_nisolated) { + atomicAdd(nisolated, total_nisolated); + } + + int logicalwarpid = threadIdx.x / FLAG_ISOLATED_VERTICES_THREADS_PER_INT; + + //Building int for bmap + int int_aggregate_isolated_bmap = + WarpReduce(warp_reduce_temp_storage[logicalwarpid]).Reduce( local_isolated_bmap, + BitwiseOr()); + + int is_head_of_visited_int = + ((threadIdx.x % (FLAG_ISOLATED_VERTICES_THREADS_PER_INT)) == 0); + if (is_head_of_visited_int) { + isolated_bmap[thread_off / INT_SIZE] = int_aggregate_isolated_bmap; + } + + BlockStore(store_temp_storage).Store(degrees + block_off, local_degree, block_valid_items); + } + } + + template + void flag_isolated_vertices( IndexType n, + 
int *isolated_bmap, + const IndexType *row_ptr, + IndexType *degrees, + IndexType *nisolated, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = FLAG_ISOLATED_VERTICES_DIMX; + + grid.x = min( (IndexType) MAXBLOCKS, + (n / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + 1 + block.x - 1) / block.x); + + flag_isolated_vertices_kernel<<>>(n, + isolated_bmap, + row_ptr, + degrees, + nisolated); + cudaCheckError() + ; + } + + // + // + // + // Some utils functions + // + // + + //Creates CUB data for graph size n + template + void cub_exclusive_sum_alloc(IndexType n, void*& d_temp_storage, size_t &temp_storage_bytes) { + // Determine temporary device storage requirements for exclusive prefix scan + d_temp_storage = NULL; + temp_storage_bytes = 0; + IndexType *d_in = NULL, *d_out = NULL; + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, n); + // Allocate temporary storage for exclusive prefix scan + cudaStream_t stream{nullptr}; + RMM_ALLOC(&d_temp_storage, temp_storage_bytes, stream);//Better to be error checked, but we do not have a policy for error checking yet (in particular for void functions), so I defer error check as future work. + } + + template + __global__ void fill_kernel(IndexType *vec, IndexType n, IndexType val) { + for (IndexType u = blockDim.x * blockIdx.x + threadIdx.x; + u < n; + u += gridDim.x * blockDim.x) + vec[u] = val; + + } + + template + void fill(IndexType *vec, IndexType n, IndexType val, cudaStream_t m_stream) { + dim3 grid, block; + block.x = 256; + grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); + fill_kernel<<>>(vec, n, val); + cudaCheckError() + ; + } + + template + __global__ void set_frontier_degree_kernel( IndexType *frontier_degree, + IndexType *frontier, + const IndexType *degree, + IndexType n) { + for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x; + idx < n; + idx += gridDim.x * blockDim.x) { + IndexType u = frontier[idx]; + frontier_degree[idx] = degree[u]; + } + } + + template + void set_frontier_degree( IndexType *frontier_degree, + IndexType *frontier, + const IndexType *degree, + IndexType n, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = 256; + grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); + set_frontier_degree_kernel<<>>(frontier_degree, + frontier, + degree, + n); + cudaCheckError() + ; + } + + template + void exclusive_sum( void *d_temp_storage, + size_t temp_storage_bytes, + IndexType *d_in, + IndexType *d_out, + IndexType num_items, + cudaStream_t m_stream) { + if (num_items <= 1) + return; //DeviceScan fails if n==1 + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + m_stream); + } + + template + __global__ void fill_vec_kernel(T *vec, T n, T val) { + for (T idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < n; + idx += blockDim.x * gridDim.x) + vec[idx] = val; + } + + template + void fill_vec(T *vec, T n, T val, cudaStream_t stream) { + dim3 grid, block; + block.x = 256; + grid.x = (n + block.x - 1) / block.x; + + fill_vec_kernel<<>>(vec, n, val); + cudaCheckError() + ; + } } // diff --git a/cpp/nvgraph/cpp/src/convert.cu b/cpp/nvgraph/cpp/src/convert.cu index bb6c34146ee..3d1e0ad99e1 100644 --- a/cpp/nvgraph/cpp/src/convert.cu +++ b/cpp/nvgraph/cpp/src/convert.cu @@ -61,7 +61,7 @@ int *cscRowInd, int *cscColPtr, int *p, cusparseIndexBase_t idxBase){ - SHARED_PREFIX::shared_ptr pBuffer; + std::shared_ptr pBuffer; // Step 1: Allocate buffer size_t pBufferSizeInBytes = 0; @@ -79,8 +79,8 @@ void 
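// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] The cub_exclusive_sum_alloc() /
// exclusive_sum() helpers above follow CUB's two-phase convention: calling
// cub::DeviceScan::ExclusiveSum() with a null temp-storage pointer only
// reports the required scratch size, which this patch now allocates through
// RMM_ALLOC instead of cudaMalloc. A minimal, self-contained version of that
// pattern (RMM_FREE and the rmm header path are assumptions, not taken from
// this diff; as noted above, the scan is skipped when n <= 1):
#include <cub/cub.cuh>
#include <rmm/rmm.h>   // assumed header exposing RMM_ALLOC / RMM_FREE

void degrees_to_offsets(const int *d_degrees, int *d_offsets, int n,
                        cudaStream_t stream) {
  if (n <= 1) return;                              // DeviceScan fails if n == 1
  void  *d_temp     = nullptr;
  size_t temp_bytes = 0;
  // Phase 1: size query only; no scan runs while d_temp is null.
  cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_degrees, d_offsets, n, stream);
  RMM_ALLOC(&d_temp, temp_bytes, stream);          // pool-backed scratch buffer
  // Phase 2: the actual scan; d_offsets[i] = sum of d_degrees[0 .. i-1].
  cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_degrees, d_offsets, n, stream);
  RMM_FREE(d_temp, stream);                        // return scratch to the pool
}
// ---------------------------------------------------------------------------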
*dstVal, int *dstRowInd, int *dstColInd, cusparseIndexBase_t idxBase, cudaDataType_t *dataType){ size_t pBufferSizeInBytes = 0; - SHARED_PREFIX::shared_ptr pBuffer; - SHARED_PREFIX::shared_ptr P; // permutation array + std::shared_ptr pBuffer; + std::shared_ptr P; // permutation array // step 0: copy src to dst if(dstRowInd!=srcRowInd) @@ -103,8 +103,8 @@ void *dstVal, int *dstRowInd, int *dstColInd, cusparseIndexBase_t idxBase, cudaDataType_t *dataType){ size_t pBufferSizeInBytes = 0; - SHARED_PREFIX::shared_ptr pBuffer; - SHARED_PREFIX::shared_ptr P; // permutation array + std::shared_ptr pBuffer; + std::shared_ptr P; // permutation array // step 0: copy src to dst CHECK_CUDA( cudaMemcpy(dstRowInd, srcRowInd, nnz*sizeof(int), cudaMemcpyDefault) ); @@ -126,7 +126,7 @@ void *dstVal, int *dstRowInd, int *dstColPtr, cusparseIndexBase_t idxBase, cudaDataType_t *dataType){ // coos -> cood -> csc - SHARED_PREFIX::shared_ptr tmp = allocateDevice(nnz, NULL); + std::shared_ptr tmp = allocateDevice(nnz, NULL); cooSortByDestination(m, n, nnz, srcVal, srcRowInd, srcColInd, dstVal, dstRowInd, tmp.get(), idxBase, dataType); coo2csr(tmp.get(), nnz, m, dstColPtr, idxBase); } @@ -135,7 +135,7 @@ void *dstVal, int *dstRowPtr, int *dstColInd, cusparseIndexBase_t idxBase, cudaDataType_t *dataType){ // cood -> coos -> csr - SHARED_PREFIX::shared_ptr tmp = allocateDevice(nnz, NULL); + std::shared_ptr tmp = allocateDevice(nnz, NULL); cooSortBySource(m, n, nnz, srcVal, srcRowInd, srcColInd, dstVal, tmp.get(), dstColInd, idxBase, dataType); coo2csr(tmp.get(), nnz, m, dstRowPtr, idxBase); } diff --git a/cpp/nvgraph/cpp/src/nvgraph.cu b/cpp/nvgraph/cpp/src/nvgraph.cu index 38124c148e4..ee2131b9da1 100644 --- a/cpp/nvgraph/cpp/src/nvgraph.cu +++ b/cpp/nvgraph/cpp/src/nvgraph.cu @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include @@ -52,64 +52,57 @@ #include "2d_partitioning.h" #include "bfs2d.hxx" -static inline int check_context(const nvgraphHandle_t h) - { - int ret = 0; - if (h == NULL || !h->nvgraphIsInitialized) - ret = 1; - return ret; +static inline int check_context(const nvgraphHandle_t h) { + int ret = 0; + if (h == NULL || !h->nvgraphIsInitialized) + ret = 1; + return ret; } -static inline int check_graph(const nvgraphGraphDescr_t d) - { - int ret = 0; - if (d == NULL || d->graphStatus == IS_EMPTY) - ret = 1; - return ret; +static inline int check_graph(const nvgraphGraphDescr_t d) { + int ret = 0; + if (d == NULL || d->graphStatus == IS_EMPTY) + ret = 1; + return ret; } -static inline int check_topology(const nvgraphGraphDescr_t d) - { - int ret = 0; - if (d->graphStatus == IS_EMPTY) - ret = 1; - return ret; +static inline int check_topology(const nvgraphGraphDescr_t d) { + int ret = 0; + if (d->graphStatus == IS_EMPTY) + ret = 1; + return ret; } -static inline int check_int_size(size_t sz) - { - int ret = 0; - if (sz >= INT_MAX) - ret = 1; - return ret; +static inline int check_int_size(size_t sz) { + int ret = 0; + if (sz >= INT_MAX) + ret = 1; + return ret; } -static inline int check_int_ptr(const int* p) - { - int ret = 0; - if (!p) - ret = 1; - return ret; +static inline int check_int_ptr(const int* p) { + int ret = 0; + if (!p) + ret = 1; + return ret; } -static inline int check_uniform_type_array(const cudaDataType_t * t, size_t sz) - { - int ret = 0; - cudaDataType_t uniform_type = t[0]; - for (size_t i = 1; i < sz; i++) - { - if (t[i] != uniform_type) - ret = 1; - } - return ret; +static inline int check_uniform_type_array(const cudaDataType_t * t, size_t sz) { + 
int ret = 0; + cudaDataType_t uniform_type = t[0]; + for (size_t i = 1; i < sz; i++) + { + if (t[i] != uniform_type) + ret = 1; + } + return ret; } template -bool check_ptr(const T* p) - { - bool ret = false; - if (!p) - ret = true; - return ret; +bool check_ptr(const T* p) { + bool ret = false; + if (!p) + ret = true; + return ret; } namespace nvgraph @@ -120,3417 +113,3319 @@ namespace nvgraph //right now this header does not exist and including graph_concrete_visitors.hxx //doesn't compile because of the Thrust code; // - extern CsrGraph* extract_subgraph_by_vertices(CsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream); - extern MultiValuedCsrGraph* extract_subgraph_by_vertices(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream); - extern MultiValuedCsrGraph* extract_subgraph_by_vertices(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream); - - extern CsrGraph* extract_subgraph_by_edges(CsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream); - extern MultiValuedCsrGraph* extract_subgraph_by_edges(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream); - extern MultiValuedCsrGraph* extract_subgraph_by_edges(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream); + extern CsrGraph* extract_subgraph_by_vertices(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); + extern MultiValuedCsrGraph* extract_subgraph_by_vertices(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); + extern MultiValuedCsrGraph* extract_subgraph_by_vertices(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); + + extern CsrGraph* extract_subgraph_by_edges(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); + extern MultiValuedCsrGraph* extract_subgraph_by_edges(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); + extern MultiValuedCsrGraph* extract_subgraph_by_edges(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream); #ifndef NVGRAPH_LIGHT - extern CsrGraph* contract_graph_csr_mul(CsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern CsrGraph* contract_graph_csr_sum(CsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern CsrGraph* contract_graph_csr_min(CsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern CsrGraph* contract_graph_csr_max(CsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern MultiValuedCsrGraph* contract_graph_mv_float_mul(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern MultiValuedCsrGraph* contract_graph_mv_float_sum(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern MultiValuedCsrGraph* contract_graph_mv_float_min(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern MultiValuedCsrGraph* 
contract_graph_mv_float_max(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern MultiValuedCsrGraph* contract_graph_mv_double_mul(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern MultiValuedCsrGraph* contract_graph_mv_double_sum(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern MultiValuedCsrGraph* contract_graph_mv_double_min(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); - - extern MultiValuedCsrGraph* contract_graph_mv_double_max(MultiValuedCsrGraph& graph, - int* pV, - size_t n, - cudaStream_t stream, - const int& VCombine, - const int& VReduce, - const int& ECombine, - const int& EReduce); + extern CsrGraph* contract_graph_csr_mul(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern CsrGraph* contract_graph_csr_sum(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern CsrGraph* contract_graph_csr_min(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern CsrGraph* contract_graph_csr_max(CsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_float_mul(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_float_sum(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_float_min(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_float_max(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_double_mul(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_double_sum(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_double_min(MultiValuedCsrGraph& graph, + int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); + + extern MultiValuedCsrGraph* contract_graph_mv_double_max(MultiValuedCsrGraph& graph, 
+ int* pV, + size_t n, + cudaStream_t stream, + const int& VCombine, + const int& VReduce, + const int& ECombine, + const int& EReduce); #endif - nvgraphStatus_t getCAPIStatusForError(NVGRAPH_ERROR err) - { - nvgraphStatus_t ret = NVGRAPH_STATUS_SUCCESS; - - switch (err) - { - case NVGRAPH_OK: - ret = NVGRAPH_STATUS_SUCCESS; - break; - case NVGRAPH_ERR_BAD_PARAMETERS: - ret = NVGRAPH_STATUS_INVALID_VALUE; - break; - case NVGRAPH_ERR_UNKNOWN: - ret = NVGRAPH_STATUS_INTERNAL_ERROR; - break; - case NVGRAPH_ERR_CUDA_FAILURE: - ret = NVGRAPH_STATUS_EXECUTION_FAILED; - break; - case NVGRAPH_ERR_THRUST_FAILURE: - ret = NVGRAPH_STATUS_EXECUTION_FAILED; - break; - case NVGRAPH_ERR_IO: - ret = NVGRAPH_STATUS_INTERNAL_ERROR; - break; - case NVGRAPH_ERR_NOT_IMPLEMENTED: - ret = NVGRAPH_STATUS_INVALID_VALUE; - break; - case NVGRAPH_ERR_NO_MEMORY: - ret = NVGRAPH_STATUS_ALLOC_FAILED; - break; - case NVGRAPH_ERR_NOT_CONVERGED: - ret = NVGRAPH_STATUS_NOT_CONVERGED; - break; - default: - ret = NVGRAPH_STATUS_INTERNAL_ERROR; - } - return ret; - } - - extern "C" { - const char* nvgraphStatusGetString(nvgraphStatus_t status) - { - switch (status) { - case NVGRAPH_STATUS_SUCCESS: - return "Success"; - case NVGRAPH_STATUS_NOT_INITIALIZED: - return "nvGRAPH not initialized"; - case NVGRAPH_STATUS_ALLOC_FAILED: - return "nvGRAPH alloc failed"; - case NVGRAPH_STATUS_INVALID_VALUE: - return "nvGRAPH invalid value"; - case NVGRAPH_STATUS_ARCH_MISMATCH: - return "nvGRAPH arch mismatch"; - case NVGRAPH_STATUS_MAPPING_ERROR: - return "nvGRAPH mapping error"; - case NVGRAPH_STATUS_EXECUTION_FAILED: - return "nvGRAPH execution failed"; - case NVGRAPH_STATUS_INTERNAL_ERROR: - return "nvGRAPH internal error"; - case NVGRAPH_STATUS_TYPE_NOT_SUPPORTED: - return "nvGRAPH type not supported"; - case NVGRAPH_STATUS_NOT_CONVERGED: - return "nvGRAPH algorithm failed to converge"; - case NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED: - return "nvGRAPH graph type not supported"; - default: - return "Unknown nvGRAPH Status"; - } - } - ; - } - - static nvgraphStatus_t nvgraphCreateMulti_impl(struct nvgraphContext **outCtx, - int numDevices, - int* _devices) { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - int device; - - CHECK_CUDA(cudaFree((void * )0)); - CHECK_CUDA(cudaGetDevice(&device)); - struct nvgraphContext *ctx = NULL; - ctx = (struct nvgraphContext *) malloc(sizeof(*ctx)); - if (!ctx) { - FatalError("Cannot allocate NVGRAPH context.", NVGRAPH_ERR_UNKNOWN); - } - - //cnmem - memset(&ctx->cnmem_device, 0, sizeof(ctx->cnmem_device)); // init all to 0 - ctx->cnmem_device.device = device; // cnmem runs on the device set by cudaSetDevice - - size_t init_alloc = 1; // Initial allocation tentative, it is currently 1 so this feature is basically disabeled. 
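// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] Caller-side counterpart of the
// status handling defined just above: every public entry point returns an
// nvgraphStatus_t, and nvgraphStatusGetString() renders it as text. The
// CHECK_NVGRAPH macro and the nvgraph.h include path are illustrative
// assumptions, not part of this diff.
#include <cstdio>
#include <cstdlib>
#include <nvgraph.h>

#define CHECK_NVGRAPH(call)                                                  \
  do {                                                                       \
    nvgraphStatus_t st_ = (call);                                            \
    if (st_ != NVGRAPH_STATUS_SUCCESS) {                                     \
      std::fprintf(stderr, "%s failed: %s\n", #call,                         \
                   nvgraphStatusGetString(st_));                             \
      std::exit(EXIT_FAILURE);                                               \
    }                                                                        \
  } while (0)
// Typical use: CHECK_NVGRAPH(nvgraphCreate(&handle));
// ---------------------------------------------------------------------------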
- - // Warning : Should uncomment that if using init_alloc > 1 - //size_t freeMem, totalMem; - //cudaMemGetInfo(&freeMem, &totalMem); - //if (freeMem < init_alloc) // Couldn't find enough memory to do the initial alloc - // init_alloc = 1; // (0 is used as default parameter in cnmem) - - ctx->cnmem_device.size = init_alloc; - cnmemDevice_t* devices = (cnmemDevice_t*) malloc(sizeof(cnmemDevice_t) * numDevices); - memset(devices, 0, sizeof(cnmemDevice_t) * numDevices); - for (int i = 0; i < numDevices; i++) { - devices[i].device = _devices[i]; - devices[i].size = 1; - } - cnmemStatus_t cm_status = cnmemInit(numDevices, devices, CNMEM_FLAGS_DEFAULT); - free(devices); - if (cm_status != CNMEM_STATUS_SUCCESS) - FatalError("Cannot initialize memory manager.", NVGRAPH_ERR_UNKNOWN); - - //Cublas and Cusparse - nvgraph::Cusparse::get_handle(); - nvgraph::Cublas::get_handle(); - - //others - ctx->stream = 0; - ctx->nvgraphIsInitialized = true; - - if (outCtx) { - *outCtx = ctx; - } - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - static nvgraphStatus_t nvgraphCreate_impl(struct nvgraphContext **outCtx) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - int device; - - CHECK_CUDA(cudaFree((void * )0)); - CHECK_CUDA(cudaGetDevice(&device)); - struct nvgraphContext *ctx = NULL; - ctx = (struct nvgraphContext *) malloc(sizeof(*ctx)); - if (!ctx) { - FatalError("Cannot allocate NVGRAPH context.", NVGRAPH_ERR_UNKNOWN); - } - - //cnmem - memset(&ctx->cnmem_device, 0, sizeof(ctx->cnmem_device)); // init all to 0 - ctx->cnmem_device.device = device; // cnmem runs on the device set by cudaSetDevice - - size_t init_alloc = 1; // Initial allocation tentative, it is currently 1 so this feature is basically disabeled. - - // Warning : Should uncomment that if using init_alloc > 1 - //size_t freeMem, totalMem; - //cudaMemGetInfo(&freeMem, &totalMem); - //if (freeMem < init_alloc) // Couldn't find enough memory to do the initial alloc - // init_alloc = 1; // (0 is used as default parameter in cnmem) - - ctx->cnmem_device.size = init_alloc; - - cnmemStatus_t cm_status = cnmemInit(1, &ctx->cnmem_device, CNMEM_FLAGS_DEFAULT); - if (cm_status != CNMEM_STATUS_SUCCESS) - FatalError("Cannot initialize memory manager.", NVGRAPH_ERR_UNKNOWN); - - //Cublas and Cusparse - nvgraph::Cusparse::get_handle(); - nvgraph::Cublas::get_handle(); - - //others - ctx->stream = 0; - ctx->nvgraphIsInitialized = true; - - if (outCtx) { - *outCtx = ctx; - } - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - static nvgraphStatus_t nvgraphDestroy_impl(nvgraphHandle_t handle) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle)) - FatalError("Cannot initialize memory manager.", NVGRAPH_ERR_NO_MEMORY); - - //Cublas and Cusparse - nvgraph::Cusparse::destroy_handle(); - nvgraph::Cublas::destroy_handle(); - //cnmem - -// compiler is complaining, cm_status is not used in release build -#ifdef DEBUG - cnmemStatus_t cm_status = cnmemFinalize(); - if( cm_status != CNMEM_STATUS_SUCCESS ) { - CERR() << "Warning: " << cnmemGetErrorString(cm_status) << std::endl; - } -#else - cnmemFinalize(); -#endif - //others - free(handle); - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - static nvgraphStatus_t nvgraphCreateGraphDescr_impl(nvgraphHandle_t handle, - struct nvgraphGraphDescr **outGraphDescr) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - struct nvgraphGraphDescr *descrG = 
NULL; - descrG = (struct nvgraphGraphDescr*) malloc(sizeof(*descrG)); - if (!descrG) - { - FatalError("Cannot allocate graph descriptor.", NVGRAPH_ERR_UNKNOWN); - } - descrG->graphStatus = IS_EMPTY; - if (outGraphDescr) - { - *outGraphDescr = descrG; - } - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - static nvgraphStatus_t nvgraphDestroyGraphDescr_impl(nvgraphHandle_t handle, - struct nvgraphGraphDescr *descrG) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG) - { - if (descrG->TT == NVGRAPH_2D_32I_32I) { - switch (descrG->T) { - case CUDA_R_32I: { - nvgraph::Matrix2d* m = - static_cast*>(descrG->graph_handle); - delete m; - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - else { - switch (descrG->graphStatus) { - case IS_EMPTY: { - break; - } - case HAS_TOPOLOGY: { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - delete CSRG; - break; - } - case HAS_VALUES: { - if (descrG->T == CUDA_R_32F) { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - delete MCSRG; - } - else if (descrG->T == CUDA_R_64F) { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - delete MCSRG; - } - else if (descrG->T == CUDA_R_32I) { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - delete MCSRG; - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - break; - } - default: - return NVGRAPH_STATUS_INVALID_VALUE; - } - } - free(descrG); - } - else - return NVGRAPH_STATUS_INVALID_VALUE; - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphSetStream_impl(nvgraphHandle_t handle, cudaStream_t stream) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - //CnMem - cnmemStatus_t cm_status = cnmemRegisterStream(stream); - if (cm_status != CNMEM_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - // nvgraph handle - handle->stream = stream; - //Cublas and Cusparse - nvgraph::Cublas::setStream(stream); - nvgraph::Cusparse::setStream(stream); - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphSetGraphStructure_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t TT) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (descrG->graphStatus != IS_EMPTY) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (check_ptr(topologyData)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (TT == NVGRAPH_CSR_32 || TT == NVGRAPH_CSC_32) - { - int v = 0, e = 0, *neighborhood = NULL, *edgedest = NULL; - switch (TT) - { - case NVGRAPH_CSR_32: - { - nvgraphCSRTopology32I_t t = static_cast(topologyData); - if (!t->nvertices || !t->nedges || check_ptr(t->source_offsets) - || check_ptr(t->destination_indices)) - return NVGRAPH_STATUS_INVALID_VALUE; - v = t->nvertices; - e = t->nedges; - neighborhood = t->source_offsets; - edgedest = t->destination_indices; - break; - } - case NVGRAPH_CSC_32: - { - nvgraphCSCTopology32I_t t = static_cast(topologyData); - if (!t->nvertices || !t->nedges || check_ptr(t->destination_offsets) - || check_ptr(t->source_indices)) - return NVGRAPH_STATUS_INVALID_VALUE; - 
v = t->nvertices; - e = t->nedges; - neighborhood = t->destination_offsets; - edgedest = t->source_indices; - break; - } - default: - return NVGRAPH_STATUS_INVALID_VALUE; - } - - descrG->TT = TT; - - // Create the internal CSR representation - nvgraph::CsrGraph * CSRG = new nvgraph::CsrGraph(v, e, handle->stream); - - CHECK_CUDA(cudaMemcpy(CSRG->get_raw_row_offsets(), - neighborhood, - (size_t )((CSRG->get_num_vertices() + 1) * sizeof(int)), - cudaMemcpyDefault)); - - CHECK_CUDA(cudaMemcpy(CSRG->get_raw_column_indices(), - edgedest, - (size_t )((CSRG->get_num_edges()) * sizeof(int)), - cudaMemcpyDefault)); - - // Set the graph handle - descrG->graph_handle = CSRG; - descrG->graphStatus = HAS_TOPOLOGY; - } - else if (TT == NVGRAPH_2D_32I_32I) { - nvgraph2dCOOTopology32I_t td = static_cast(topologyData); - switch (td->valueType) { - case CUDA_R_32I: { - if (!td->nvertices || !td->nedges || !td->source_indices - || !td->destination_indices || !td->numDevices || !td->devices - || !td->blockN) - return NVGRAPH_STATUS_INVALID_VALUE; - descrG->TT = TT; - descrG->graphStatus = HAS_TOPOLOGY; - if (td->values) - descrG->graphStatus = HAS_VALUES; - descrG->T = td->valueType; - std::vector devices; - for (int32_t i = 0; i < td->numDevices; i++) - devices.push_back(td->devices[i]); - nvgraph::MatrixDecompositionDescription description( td->nvertices, - td->blockN, - td->nedges, - devices); - nvgraph::Matrix2d* m = new nvgraph::Matrix2d(); - *m = nvgraph::COOto2d(description, - td->source_indices, - td->destination_indices, - (int32_t*) td->values); - descrG->graph_handle = m; - break; - } - default: { - return NVGRAPH_STATUS_INVALID_VALUE; - } - } - } - else - { - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - - } - - nvgraphStatus_t NVGRAPH_API nvgraphAttachGraphStructure_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t TT) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (descrG->graphStatus != IS_EMPTY) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (check_ptr(topologyData)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (TT == NVGRAPH_CSR_32 || TT == NVGRAPH_CSC_32) - { - int v = 0, e = 0, *neighborhood = NULL, *edgedest = NULL; - switch (TT) - { - case NVGRAPH_CSR_32: - { - nvgraphCSRTopology32I_t t = static_cast(topologyData); - if (!t->nvertices || !t->nedges || check_ptr(t->source_offsets) - || check_ptr(t->destination_indices)) - return NVGRAPH_STATUS_INVALID_VALUE; - v = t->nvertices; - e = t->nedges; - neighborhood = t->source_offsets; - edgedest = t->destination_indices; - break; - } - case NVGRAPH_CSC_32: - { - nvgraphCSCTopology32I_t t = static_cast(topologyData); - if (!t->nvertices || !t->nedges || check_ptr(t->destination_offsets) - || check_ptr(t->source_indices)) - return NVGRAPH_STATUS_INVALID_VALUE; - v = t->nvertices; - e = t->nedges; - neighborhood = t->destination_offsets; - edgedest = t->source_indices; - break; - } - default: - return NVGRAPH_STATUS_INVALID_VALUE; - } - - descrG->TT = TT; - - // Create the internal CSR representation - nvgraph::CsrGraph * CSRG = new nvgraph::CsrGraph(v, e, handle->stream); - - CSRG->set_raw_row_offsets(neighborhood); - CSRG->set_raw_column_indices(edgedest); - - // Set the graph handle - descrG->graph_handle = CSRG; - descrG->graphStatus = HAS_TOPOLOGY; - } - else if (TT == 
NVGRAPH_2D_32I_32I) { - nvgraph2dCOOTopology32I_t td = static_cast(topologyData); - switch (td->valueType) { - case CUDA_R_32I: { - if (!td->nvertices || !td->nedges || !td->source_indices - || !td->destination_indices || !td->numDevices || !td->devices - || !td->blockN) - return NVGRAPH_STATUS_INVALID_VALUE; - descrG->TT = TT; - descrG->graphStatus = HAS_TOPOLOGY; - if (td->values) - descrG->graphStatus = HAS_VALUES; - descrG->T = td->valueType; - std::vector devices; - for (int32_t i = 0; i < td->numDevices; i++) - devices.push_back(td->devices[i]); - nvgraph::MatrixDecompositionDescription description( td->nvertices, - td->blockN, - td->nedges, - devices); - nvgraph::Matrix2d* m = new nvgraph::Matrix2d(); - *m = nvgraph::COOto2d(description, - td->source_indices, - td->destination_indices, - (int32_t*) td->values); - descrG->graph_handle = m; - break; - } - default: { - return NVGRAPH_STATUS_INVALID_VALUE; - } - } - } - else - { - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - - } - - nvgraphStatus_t NVGRAPH_API nvgraphGetGraphStructure_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t* TT) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_topology(descrG)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - nvgraphTopologyType_t graphTType = descrG->TT; - - if (TT != NULL) - *TT = graphTType; - - if (topologyData != NULL) { - nvgraph::CsrGraph *CSRG = - static_cast *>(descrG->graph_handle); - int v = static_cast(CSRG->get_num_vertices()); - int e = static_cast(CSRG->get_num_edges()); - int *neighborhood = NULL, *edgedest = NULL; - - switch (graphTType) - { - case NVGRAPH_CSR_32: - { - nvgraphCSRTopology32I_t t = static_cast(topologyData); - t->nvertices = static_cast(v); - t->nedges = static_cast(e); - neighborhood = t->source_offsets; - edgedest = t->destination_indices; - break; - } - case NVGRAPH_CSC_32: - { - nvgraphCSCTopology32I_t t = static_cast(topologyData); - t->nvertices = static_cast(v); - t->nedges = static_cast(e); - neighborhood = t->destination_offsets; - edgedest = t->source_indices; - break; - } - default: - return NVGRAPH_STATUS_INTERNAL_ERROR; - } - - if (neighborhood != NULL) { - CHECK_CUDA(cudaMemcpy(neighborhood, - CSRG->get_raw_row_offsets(), - (size_t )((v + 1) * sizeof(int)), - cudaMemcpyDefault)); - } - - if (edgedest != NULL) { - CHECK_CUDA(cudaMemcpy(edgedest, - CSRG->get_raw_column_indices(), - (size_t )((e) * sizeof(int)), - cudaMemcpyDefault)); - } - - } - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphAllocateVertexData_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t numsets, - cudaDataType_t *settypes) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(numsets) - || check_ptr(settypes)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (check_uniform_type_array(settypes, numsets)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first - { - if (*settypes == CUDA_R_32F) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< - int, float>(*CSRG); - descrG->graph_handle = MCSRG; - } - else if 
(*settypes == CUDA_R_64F) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< - int, double>(*CSRG); - descrG->graph_handle = MCSRG; - } - else if (*settypes == CUDA_R_32I) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); - descrG->graph_handle = MCSRG; - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - descrG->T = *settypes; - descrG->graphStatus = HAS_VALUES; - } - else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type - { - if (*settypes != descrG->T) - return NVGRAPH_STATUS_INVALID_VALUE; - } - else - return NVGRAPH_STATUS_INVALID_VALUE; - - // Allocate and transfer - if (*settypes == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->allocateVertexData(numsets, NULL); - } - else if (*settypes == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->allocateVertexData(numsets, NULL); - } - else if (*settypes == CUDA_R_32I) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->allocateVertexData(numsets, NULL); - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphAttachVertexData_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t setnum, - cudaDataType_t settype, - void *vertexData) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(setnum)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first - { - if (settype == CUDA_R_32F) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< - int, float>(*CSRG); - descrG->graph_handle = MCSRG; - } - else if (settype == CUDA_R_64F) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< - int, double>(*CSRG); - descrG->graph_handle = MCSRG; - } - else if (settype == CUDA_R_32I) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); - descrG->graph_handle = MCSRG; - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - descrG->T = settype; - descrG->graphStatus = HAS_VALUES; - } - else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type - { - if (settype != descrG->T) - return NVGRAPH_STATUS_INVALID_VALUE; - } - else - return NVGRAPH_STATUS_INVALID_VALUE; - - // transfer - if (settype == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->attachVertexData(setnum, (float*)vertexData, NULL); - } - else if (settype == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->attachVertexData(setnum, (double*)vertexData, NULL); - } - else if (settype == CUDA_R_32I) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->attachVertexData(setnum, (int*)vertexData, NULL); - } - else - return 
NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - nvgraphStatus_t NVGRAPH_API nvgraphAllocateEdgeData_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t numsets, - cudaDataType_t *settypes) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(numsets) - || check_ptr(settypes)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (check_uniform_type_array(settypes, numsets)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - // Look at what kind of graph we have - if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first - { - if (*settypes == CUDA_R_32F) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< - int, float>(*CSRG); - descrG->graph_handle = MCSRG; - } - else if (*settypes == CUDA_R_64F) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< - int, double>(*CSRG); - descrG->graph_handle = MCSRG; - } - else if (*settypes == CUDA_R_32I) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); - descrG->graph_handle = MCSRG; - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - descrG->T = *settypes; - descrG->graphStatus = HAS_VALUES; - } - else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type - { - if (*settypes != descrG->T) - return NVGRAPH_STATUS_INVALID_VALUE; - } - else - return NVGRAPH_STATUS_INVALID_VALUE; - - // Allocate and transfer - if (*settypes == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->allocateEdgeData(numsets, NULL); - } - else if (*settypes == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->allocateEdgeData(numsets, NULL); - } - else if (*settypes == CUDA_R_32I) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->allocateEdgeData(numsets, NULL); - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphAttachEdgeData_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t setnum, - cudaDataType_t settype, - void *edgeData) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(setnum)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - // Look at what kind of graph we have - if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first - { - if (settype == CUDA_R_32F) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< - int, float>(*CSRG); - descrG->graph_handle = MCSRG; - } - else if (settype == CUDA_R_64F) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< - int, double>(*CSRG); - descrG->graph_handle = MCSRG; - } - else if (settype == CUDA_R_32I) - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph *MCSRG = 
new nvgraph::MultiValuedCsrGraph(*CSRG); - descrG->graph_handle = MCSRG; - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - descrG->T = settype; - descrG->graphStatus = HAS_VALUES; - } - else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type - { - if (settype != descrG->T) - return NVGRAPH_STATUS_INVALID_VALUE; - } - else - return NVGRAPH_STATUS_INVALID_VALUE; - - // Allocate and transfer - if (settype == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->attachEdgeData(setnum, (float*)edgeData, NULL); - } - else if (settype == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->attachEdgeData(setnum, (double*)edgeData, NULL); - } - else if (settype == CUDA_R_32I) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - MCSRG->attachEdgeData(setnum, (int*)edgeData, NULL); - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphSetVertexData_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *vertexData, - size_t setnum) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) - || check_ptr(vertexData)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - FatalError("Graph should have allocated values.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->T == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy(MCSRG->get_raw_vertex_dim(setnum), - (float*) vertexData, - (size_t) ((MCSRG->get_num_vertices()) * sizeof(float)), - cudaMemcpyDefault); - } - else if (descrG->T == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy(MCSRG->get_raw_vertex_dim(setnum), - (double*) vertexData, - (size_t) ((MCSRG->get_num_vertices()) * sizeof(double)), - cudaMemcpyDefault); - } - else if (descrG->T == CUDA_R_32I) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy(MCSRG->get_raw_vertex_dim(setnum), - (int*) vertexData, - (size_t) ((MCSRG->get_num_vertices()) * sizeof(int)), - cudaMemcpyDefault); - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - - cudaCheckError() - ; - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphGetVertexData_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *vertexData, - size_t setnum) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) - || check_ptr(vertexData)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - FatalError("Graph should have values.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->T == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= 
MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy((float*) vertexData, - MCSRG->get_raw_vertex_dim(setnum), - (size_t) ((MCSRG->get_num_vertices()) * sizeof(float)), - cudaMemcpyDefault); - } - else if (descrG->T == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy((double*) vertexData, - MCSRG->get_raw_vertex_dim(setnum), - (size_t) ((MCSRG->get_num_vertices()) * sizeof(double)), - cudaMemcpyDefault); - } - else if (descrG->T == CUDA_R_32I) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy((int*) vertexData, - MCSRG->get_raw_vertex_dim(setnum), - (size_t) ((MCSRG->get_num_vertices()) * sizeof(int)), - cudaMemcpyDefault); - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - - cudaCheckError() - ; - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphConvertTopology_impl(nvgraphHandle_t handle, - nvgraphTopologyType_t srcTType, - void *srcTopology, - void *srcEdgeData, - cudaDataType_t *dataType, - nvgraphTopologyType_t dstTType, - void *dstTopology, - void *dstEdgeData) - { - - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_ptr(dstEdgeData) || check_ptr(srcEdgeData)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - size_t sizeT; - if (*dataType == CUDA_R_32F) - sizeT = sizeof(float); - else if (*dataType == CUDA_R_64F) - sizeT = sizeof(double); - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - - // Trust me, this better than nested if's. 
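// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] The dispatch that follows pairs a
// cuSPARSE structure conversion (csr2csc, csr2coo, coo2csr, the COO sorts)
// with a matching shuffle of the edge weights. Seen from the caller, and
// assuming the public nvgraphConvertTopology() mirrors the _impl signature
// here, converting CSR to CSC looks like the sketch below; every d_* pointer
// is an illustrative, caller-owned device buffer of the stated size.
void csr_to_csc_example(nvgraphHandle_t handle, int n, int nnz,
                        int *d_row_offsets, int *d_col_indices, float *d_weights,
                        int *d_csc_offsets, int *d_csc_indices, float *d_csc_weights) {
  nvgraphCSRTopology32I_st src;
  src.nvertices           = n;
  src.nedges              = nnz;
  src.source_offsets      = d_row_offsets;    // n + 1 ints, device memory
  src.destination_indices = d_col_indices;    // nnz ints, device memory

  nvgraphCSCTopology32I_st dst;               // outputs, pre-allocated by caller
  dst.destination_offsets = d_csc_offsets;    // n + 1 ints
  dst.source_indices      = d_csc_indices;    // nnz ints

  cudaDataType_t t = CUDA_R_32F;              // edge weights are float
  nvgraphConvertTopology(handle, NVGRAPH_CSR_32, &src, d_weights,
                         &t, NVGRAPH_CSC_32, &dst, d_csc_weights);
}
// ---------------------------------------------------------------------------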
- if (srcTType == NVGRAPH_CSR_32 && dstTType == NVGRAPH_CSR_32) { // CSR2CSR - nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); - nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - CHECK_CUDA(cudaMemcpy(dstT->source_offsets, - srcT->source_offsets, - (srcT->nvertices + 1) * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstT->destination_indices, - srcT->destination_indices, - srcT->nedges * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstEdgeData, - srcEdgeData, - srcT->nedges * sizeT, - cudaMemcpyDefault)); - } else if (srcTType == NVGRAPH_CSR_32 && dstTType == NVGRAPH_CSC_32) { // CSR2CSC - nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); - nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - csr2csc(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_offsets, srcT->destination_indices, - dstEdgeData, - dstT->source_indices, dstT->destination_offsets, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, dataType); - } else if (srcTType == NVGRAPH_CSR_32 && dstTType == NVGRAPH_COO_32) { // CSR2COO - nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); - nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - if (dstT->tag == NVGRAPH_SORTED_BY_SOURCE || dstT->tag == NVGRAPH_DEFAULT - || dstT->tag == NVGRAPH_UNSORTED) { - csr2coo(srcT->source_offsets, - srcT->nedges, - srcT->nvertices, - dstT->source_indices, - CUSPARSE_INDEX_BASE_ZERO); - CHECK_CUDA(cudaMemcpy(dstT->destination_indices, - srcT->destination_indices, - srcT->nedges * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstEdgeData, - srcEdgeData, - srcT->nedges * sizeT, - cudaMemcpyDefault)); - } else if (dstT->tag == NVGRAPH_SORTED_BY_DESTINATION) { - // Step 1: Convert to COO_Source - csr2coo(srcT->source_offsets, - srcT->nedges, - srcT->nvertices, - dstT->source_indices, - CUSPARSE_INDEX_BASE_ZERO); - // Step 2: Convert to COO_Destination - cooSortByDestination(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - dstT->source_indices, srcT->destination_indices, - dstEdgeData, - dstT->source_indices, dstT->destination_indices, - CUSPARSE_INDEX_BASE_ZERO, - dataType); - } else { - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////// - } else if (srcTType == NVGRAPH_CSC_32 && dstTType == NVGRAPH_CSR_32) { // CSC2CSR - nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); - nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - csc2csr(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_indices, srcT->destination_offsets, - dstEdgeData, - dstT->source_offsets, dstT->destination_indices, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, dataType); - } else if (srcTType == NVGRAPH_CSC_32 && dstTType == NVGRAPH_CSC_32) { // CSC2CSC - nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); - nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - CHECK_CUDA(cudaMemcpy(dstT->destination_offsets, - srcT->destination_offsets, - (srcT->nvertices + 1) * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstT->source_indices, - 
srcT->source_indices, - srcT->nedges * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstEdgeData, - srcEdgeData, - srcT->nedges * sizeT, - cudaMemcpyDefault)); - } else if (srcTType == NVGRAPH_CSC_32 && dstTType == NVGRAPH_COO_32) { // CSC2COO - nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); - nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - if (dstT->tag == NVGRAPH_SORTED_BY_SOURCE) { - // Step 1: Convert to COO_Destination - csr2coo(srcT->destination_offsets, - srcT->nedges, - srcT->nvertices, - dstT->destination_indices, - CUSPARSE_INDEX_BASE_ZERO); - // Step 2: Convert to COO_Source - cooSortBySource(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_indices, dstT->destination_indices, - dstEdgeData, - dstT->source_indices, dstT->destination_indices, - CUSPARSE_INDEX_BASE_ZERO, - dataType); - } else if (dstT->tag == NVGRAPH_SORTED_BY_DESTINATION || dstT->tag == NVGRAPH_DEFAULT - || dstT->tag == NVGRAPH_UNSORTED) { - csr2coo(srcT->destination_offsets, - srcT->nedges, - srcT->nvertices, - dstT->destination_indices, - CUSPARSE_INDEX_BASE_ZERO); - CHECK_CUDA(cudaMemcpy(dstT->source_indices, - srcT->source_indices, - srcT->nedges * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstEdgeData, - srcEdgeData, - srcT->nedges * sizeT, - cudaMemcpyDefault)); - } else { - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - /////////////////////////////////////////////////////////////////////////////////////////////////////////// - } else if (srcTType == NVGRAPH_COO_32 && dstTType == NVGRAPH_CSR_32) { // COO2CSR - nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); - nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - if (srcT->tag == NVGRAPH_SORTED_BY_SOURCE) { - coo2csr(srcT->source_indices, - srcT->nedges, - srcT->nvertices, - dstT->source_offsets, - CUSPARSE_INDEX_BASE_ZERO); - CHECK_CUDA(cudaMemcpy(dstT->destination_indices, - srcT->destination_indices, - srcT->nedges * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstEdgeData, - srcEdgeData, - srcT->nedges * sizeT, - cudaMemcpyDefault)); - } else if (srcT->tag == NVGRAPH_SORTED_BY_DESTINATION) { - cood2csr(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_indices, srcT->destination_indices, - dstEdgeData, - dstT->source_offsets, dstT->destination_indices, - CUSPARSE_INDEX_BASE_ZERO, - dataType); - } else if (srcT->tag == NVGRAPH_DEFAULT || srcT->tag == NVGRAPH_UNSORTED) { - coou2csr(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_indices, srcT->destination_indices, - dstEdgeData, - dstT->source_offsets, dstT->destination_indices, - CUSPARSE_INDEX_BASE_ZERO, - dataType); - } else { - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } else if (srcTType == NVGRAPH_COO_32 && dstTType == NVGRAPH_CSC_32) { // COO2CSC - nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); - nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - if (srcT->tag == NVGRAPH_SORTED_BY_SOURCE) { - coos2csc(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_indices, srcT->destination_indices, - dstEdgeData, - dstT->source_indices, dstT->destination_offsets, - CUSPARSE_INDEX_BASE_ZERO, - dataType); - } else if (srcT->tag == NVGRAPH_SORTED_BY_DESTINATION) { - 
coo2csr(srcT->destination_indices, - srcT->nedges, - srcT->nvertices, - dstT->destination_offsets, - CUSPARSE_INDEX_BASE_ZERO); - CHECK_CUDA(cudaMemcpy(dstT->source_indices, - srcT->source_indices, - srcT->nedges * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstEdgeData, - srcEdgeData, - srcT->nedges * sizeT, - cudaMemcpyDefault)); - } else if (srcT->tag == NVGRAPH_DEFAULT || srcT->tag == NVGRAPH_UNSORTED) { - coou2csc(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_indices, srcT->destination_indices, - dstEdgeData, - dstT->source_indices, dstT->destination_offsets, - CUSPARSE_INDEX_BASE_ZERO, - dataType); - } else { - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } else if (srcTType == NVGRAPH_COO_32 && dstTType == NVGRAPH_COO_32) { // COO2COO - nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); - nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); - dstT->nvertices = srcT->nvertices; - dstT->nedges = srcT->nedges; - if (srcT->tag == dstT->tag || dstT->tag == NVGRAPH_DEFAULT - || dstT->tag == NVGRAPH_UNSORTED) { - CHECK_CUDA(cudaMemcpy(dstT->source_indices, - srcT->source_indices, - srcT->nedges * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstT->destination_indices, - srcT->destination_indices, - srcT->nedges * sizeof(int), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy(dstEdgeData, - srcEdgeData, - srcT->nedges * sizeT, - cudaMemcpyDefault)); - } else if (dstT->tag == NVGRAPH_SORTED_BY_SOURCE) { - cooSortBySource(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_indices, srcT->destination_indices, - dstEdgeData, - dstT->source_indices, dstT->destination_indices, - CUSPARSE_INDEX_BASE_ZERO, - dataType); - } else if (dstT->tag == NVGRAPH_SORTED_BY_DESTINATION) { - cooSortByDestination(srcT->nvertices, srcT->nvertices, srcT->nedges, - srcEdgeData, - srcT->source_indices, srcT->destination_indices, - dstEdgeData, - dstT->source_indices, dstT->destination_indices, - CUSPARSE_INDEX_BASE_ZERO, - dataType); - } else { - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - - /////////////////////////////////////////////////////////////////////////////////////////////////////////// - } else { - return NVGRAPH_STATUS_INVALID_VALUE; - } - - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphConvertGraph_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t srcDescrG, - nvgraphGraphDescr_t dstDescrG, - nvgraphTopologyType_t dstTType) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - nvgraphStatus_t status = NVGRAPH_STATUS_SUCCESS; - try - { - if (check_context(handle) || check_graph(srcDescrG)) // Graph must have a topology - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (dstDescrG->graphStatus != IS_EMPTY) // dst Graph must be empty - return NVGRAPH_STATUS_INVALID_VALUE; - - // graphs can only have CSR or CSC topology (EL is for storage only) - if (srcDescrG->TT != NVGRAPH_CSR_32 && srcDescrG->TT != NVGRAPH_CSC_32) - return NVGRAPH_STATUS_INTERNAL_ERROR; // invalid state, you can only create graph with CSR/CSC - if (dstTType != NVGRAPH_CSR_32 && dstTType != NVGRAPH_CSC_32) - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; // only conversion to CSR/CSC is allowed - - int nvertices, nedges; - int *srcOffsets = NULL, *srcIndices = NULL, *dstOffsets = NULL, *dstIndices = NULL; - SHARED_PREFIX::shared_ptr permutation, offsets, indices; - - // Step 1: get source graph structure - nvgraph::CsrGraph *CSRG = - static_cast *>(srcDescrG->graph_handle); - 
nvertices = static_cast(CSRG->get_num_vertices()); - nedges = static_cast(CSRG->get_num_edges()); - srcOffsets = CSRG->get_raw_row_offsets(); - srcIndices = CSRG->get_raw_column_indices(); - - // Step 2: convert topology and get permutation array. - if (srcDescrG->TT != dstTType) { // Otherwise conversion is not needed, only copy. - offsets = allocateDevice(nvertices + 1, NULL); - indices = allocateDevice(nedges, NULL); - permutation = allocateDevice(nedges, NULL); - csr2cscP(nvertices, nvertices, nedges, - srcOffsets, - srcIndices, - indices.get(), - offsets.get(), permutation.get(), CUSPARSE_INDEX_BASE_ZERO); - dstOffsets = offsets.get(); - dstIndices = indices.get(); - } else { - dstOffsets = srcOffsets; - dstIndices = srcIndices; - } - - // Step 3: Set dst graph structure - if (dstTType == NVGRAPH_CSR_32) { - nvgraphCSRTopology32I_st dstTopology; - dstTopology.nedges = nedges; - dstTopology.nvertices = nvertices; - dstTopology.source_offsets = dstOffsets; - dstTopology.destination_indices = dstIndices; - status = nvgraphSetGraphStructure(handle, dstDescrG, &dstTopology, dstTType); - } else if (dstTType == NVGRAPH_CSC_32) { - nvgraphCSCTopology32I_st dstTopology; - dstTopology.nedges = nedges; - dstTopology.nvertices = nvertices; - dstTopology.destination_offsets = dstOffsets; - dstTopology.source_indices = dstIndices; - status = nvgraphSetGraphStructure(handle, dstDescrG, &dstTopology, dstTType); - } else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - offsets.reset(); - indices.reset(); - - // Step 4: Allocate, convert and set edge+vertex data on the new graph - if (srcDescrG->graphStatus == HAS_VALUES) { - if (srcDescrG->T == CUDA_R_32F) { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(srcDescrG->graph_handle); - size_t vertexDim = MCSRG->get_num_vertex_dim(); - size_t edgesDim = MCSRG->get_num_edge_dim(); - // Step 4.1: allocate and set vertex data (no need for convert) - if (vertexDim > 0) { - std::vector vertexDataType(vertexDim); - std::fill(vertexDataType.begin(), vertexDataType.end(), srcDescrG->T); - status = nvgraphAllocateVertexData(handle, - dstDescrG, - vertexDim, - vertexDataType.data()); - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - for (size_t i = 0; i < vertexDim; ++i) { - void *vertexData = MCSRG->get_raw_vertex_dim(i); - status = nvgraphSetVertexData(handle, dstDescrG, vertexData, i); - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - } - } - // Step 4.2: allocate and set vertex data - if (edgesDim > 0) { - void *dstEdgeData = NULL; - SHARED_PREFIX::shared_ptr dstEdgeDataSP; - - std::vector edgeDataType(edgesDim); - std::fill(edgeDataType.begin(), edgeDataType.end(), srcDescrG->T); - status = nvgraphAllocateEdgeData(handle, - dstDescrG, - edgesDim, - edgeDataType.data()); - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - // allocate edge data memory (if there is a need) - if (edgesDim > 0 && srcDescrG->TT != dstTType) { - dstEdgeDataSP = allocateDevice(nedges, NULL); - dstEdgeData = dstEdgeDataSP.get(); - } - // Convert and set edge data (using permutation array) - for (size_t i = 0; i < edgesDim; ++i) { - void *srcEdgeData = (void*) (MCSRG->get_raw_edge_dim((int) i)); - if (srcDescrG->TT != dstTType) // Convert using permutation array - gthrX(nedges, - srcEdgeData, - dstEdgeData, - permutation.get(), - CUSPARSE_INDEX_BASE_ZERO, - &(srcDescrG->T)); - else - dstEdgeData = srcEdgeData; 
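// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] When the topology actually changes
// (CSR to CSC here), csr2cscP() above also emits a permutation array, and
// gthrX() then reorders each edge-value set so the weights follow their edges
// into the new ordering; when source and destination topologies match, the
// values are passed through untouched. Conceptually the gather performs
// dst[i] = src[perm[i]], as in this stand-alone CUDA kernel:
template <typename ValueT, typename IndexT>
__global__ void gather_edge_values(const ValueT *src, ValueT *dst,
                                   const IndexT *perm, IndexT nnz) {
  for (IndexT i = blockIdx.x * blockDim.x + threadIdx.x; i < nnz;
       i += gridDim.x * blockDim.x)
    dst[i] = src[perm[i]];   // pull each weight from its old position
}
// ---------------------------------------------------------------------------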
- // set edgedata - status = nvgraphSetEdgeData(handle, dstDescrG, dstEdgeData, i); - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - } - } - } else if (srcDescrG->T == CUDA_R_64F) { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(srcDescrG->graph_handle); - size_t vertexDim = MCSRG->get_num_vertex_dim(); - size_t edgesDim = MCSRG->get_num_edge_dim(); - // Step 4.1: allocate and set vertex data (no need for convert) - if (vertexDim > 0) { - std::vector vertexDataType(vertexDim); - std::fill(vertexDataType.begin(), vertexDataType.end(), srcDescrG->T); - status = nvgraphAllocateVertexData(handle, - dstDescrG, - vertexDim, - vertexDataType.data()); - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - for (size_t i = 0; i < vertexDim; ++i) { - void *vertexData = MCSRG->get_raw_vertex_dim(i); - status = nvgraphSetVertexData(handle, dstDescrG, vertexData, i); - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - } - } - // Step 4.2: allocate and set vertex data - if (edgesDim > 0) { - void *dstEdgeData = NULL; - SHARED_PREFIX::shared_ptr dstEdgeDataSP; - - std::vector edgeDataType(edgesDim); - std::fill(edgeDataType.begin(), edgeDataType.end(), srcDescrG->T); - status = nvgraphAllocateEdgeData(handle, - dstDescrG, - edgesDim, - edgeDataType.data()); - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - // allocate edge data memory (if there is a need) - if (edgesDim > 0 && srcDescrG->TT != dstTType) { - dstEdgeDataSP = allocateDevice(nedges, NULL); - dstEdgeData = dstEdgeDataSP.get(); - } - // Convert and set edge data (using permutation array) - for (size_t i = 0; i < edgesDim; ++i) { - void *srcEdgeData = (void*) (MCSRG->get_raw_edge_dim((int) i)); - if (srcDescrG->TT != dstTType) // Convert using permutation array - gthrX(nedges, - srcEdgeData, - dstEdgeData, - permutation.get(), - CUSPARSE_INDEX_BASE_ZERO, - &(srcDescrG->T)); - else - dstEdgeData = srcEdgeData; - // set edgedata - status = nvgraphSetEdgeData(handle, dstDescrG, dstEdgeData, i); - if (status != NVGRAPH_STATUS_SUCCESS) - return NVGRAPH_STATUS_INTERNAL_ERROR; - } - } - } else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - - } - - nvgraphStatus_t NVGRAPH_API nvgraphSetEdgeData_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *edgeData, - size_t setnum) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) - || check_ptr(edgeData)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->T == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy(MCSRG->get_raw_edge_dim(setnum), - (float*) edgeData, - (size_t) ((MCSRG->get_num_edges()) * sizeof(float)), - cudaMemcpyDefault); - } - else if (descrG->T == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy(MCSRG->get_raw_edge_dim(setnum), - (double*) edgeData, - (size_t) ((MCSRG->get_num_edges()) * sizeof(double)), - cudaMemcpyDefault); - } - else if (descrG->T == 
CUDA_R_32I) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy(MCSRG->get_raw_edge_dim(setnum), - (int*) edgeData, - (size_t) ((MCSRG->get_num_edges()) * sizeof(int)), - cudaMemcpyDefault); - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - - cudaCheckError() - ; - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphGetEdgeData_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *edgeData, - size_t setnum) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) - || check_ptr(edgeData)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->T == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy((float*) edgeData, - MCSRG->get_raw_edge_dim(setnum), - (size_t) ((MCSRG->get_num_edges()) * sizeof(float)), - cudaMemcpyDefault); - } - else if (descrG->T == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - cudaMemcpy((double*) edgeData, - MCSRG->get_raw_edge_dim(setnum), - (size_t) ((MCSRG->get_num_edges()) * sizeof(double)), - cudaMemcpyDefault); - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - - cudaCheckError() - ; - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphSrSpmv_impl_cub(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const void *alpha, - const size_t x, - const void *beta, - const size_t y, - const nvgraphSemiring_t SR) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - - try - { - // some basic checks - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - rc = SemiringAPILauncher(handle, descrG, weight_index, alpha, x, beta, y, SR); - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphSssp_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int *source_vert, - const size_t sssp) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) - || check_int_ptr(source_vert)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->TT != NVGRAPH_CSC_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; + nvgraphStatus_t getCAPIStatusForError(NVGRAPH_ERROR err) { + nvgraphStatus_t ret = NVGRAPH_STATUS_SUCCESS; + + switch (err) { + case NVGRAPH_OK: + ret = NVGRAPH_STATUS_SUCCESS; + break; + case NVGRAPH_ERR_BAD_PARAMETERS: + ret = NVGRAPH_STATUS_INVALID_VALUE; + break; + case NVGRAPH_ERR_UNKNOWN: + ret = NVGRAPH_STATUS_INTERNAL_ERROR; + break; + case NVGRAPH_ERR_CUDA_FAILURE: + ret = NVGRAPH_STATUS_EXECUTION_FAILED; + break; + case NVGRAPH_ERR_THRUST_FAILURE: + ret = NVGRAPH_STATUS_EXECUTION_FAILED; + break; + case NVGRAPH_ERR_IO: + ret = NVGRAPH_STATUS_INTERNAL_ERROR; 
+ break; + case NVGRAPH_ERR_NOT_IMPLEMENTED: + ret = NVGRAPH_STATUS_INVALID_VALUE; + break; + case NVGRAPH_ERR_NO_MEMORY: + ret = NVGRAPH_STATUS_ALLOC_FAILED; + break; + case NVGRAPH_ERR_NOT_CONVERGED: + ret = NVGRAPH_STATUS_NOT_CONVERGED; + break; + default: + ret = NVGRAPH_STATUS_INTERNAL_ERROR; + } + return ret; + } + + extern "C" { + const char* nvgraphStatusGetString(nvgraphStatus_t status) { + switch (status) { + case NVGRAPH_STATUS_SUCCESS: + return "Success"; + case NVGRAPH_STATUS_NOT_INITIALIZED: + return "nvGRAPH not initialized"; + case NVGRAPH_STATUS_ALLOC_FAILED: + return "nvGRAPH alloc failed"; + case NVGRAPH_STATUS_INVALID_VALUE: + return "nvGRAPH invalid value"; + case NVGRAPH_STATUS_ARCH_MISMATCH: + return "nvGRAPH arch mismatch"; + case NVGRAPH_STATUS_MAPPING_ERROR: + return "nvGRAPH mapping error"; + case NVGRAPH_STATUS_EXECUTION_FAILED: + return "nvGRAPH execution failed"; + case NVGRAPH_STATUS_INTERNAL_ERROR: + return "nvGRAPH internal error"; + case NVGRAPH_STATUS_TYPE_NOT_SUPPORTED: + return "nvGRAPH type not supported"; + case NVGRAPH_STATUS_NOT_CONVERGED: + return "nvGRAPH algorithm failed to converge"; + case NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED: + return "nvGRAPH graph type not supported"; + default: + return "Unknown nvGRAPH Status"; + } + } + } + + static nvgraphStatus_t nvgraphCreateMulti_impl(struct nvgraphContext **outCtx, + int numDevices, + int* _devices) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + // First, initialize NVGraph's context + + auto ctx = static_cast(calloc(1, sizeof(struct nvgraphContext))); + if (ctx == nullptr) { + FatalError("Cannot allocate NVGRAPH context.", NVGRAPH_ERR_UNKNOWN); + } + + auto option = rmmOptions_t{}; + if (rmmIsInitialized(&option) == true) { + if ((option.allocation_mode & PoolAllocation) != 0) { + FatalError("RMM does not support multi-GPUs with pool allocation, yet.", NVGRAPH_ERR_UNKNOWN); + } + } + // if RMM is unintialized, RMM_ALLOC/RMM_FREE are just aliases for cudaMalloc/cudaFree + + ctx->stream = nullptr; + ctx->nvgraphIsInitialized = true; + + if (outCtx != nullptr) { + *outCtx = ctx; + } + + // Second, initialize Cublas and Cusparse (get_handle() creates a new handle + // if there is no existing handle). + + nvgraph::Cusparse::get_handle(); + nvgraph::Cublas::get_handle(); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + static nvgraphStatus_t nvgraphCreate_impl(struct nvgraphContext **outCtx) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + // First, initialize NVGraph's context + + auto ctx = static_cast(calloc(1, sizeof(struct nvgraphContext))); + if (ctx == nullptr) { + FatalError("Cannot allocate NVGRAPH context.", NVGRAPH_ERR_UNKNOWN); + } + + // Now NVGraph assumes that RMM is initialized outside NVGraph + // if RMM is unintialized, RMM_ALLOC/RMM_FREE are just aliases for cudaMalloc/cudaFree + + ctx->stream = nullptr; + ctx->nvgraphIsInitialized = true; + + if (outCtx != nullptr) { + *outCtx = ctx; + } + + // Second, initialize Cublas and Cusparse (get_handle() creates a new handle + // if there is no existing handle). 
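The reindented block above maps the internal NVGRAPH_ERROR codes onto the public nvgraphStatus_t values, exposes nvgraphStatusGetString(), and rewrites the context constructors so that RMM is expected to be initialized by the caller. A minimal caller-side sketch of the usual error-checking pattern, assuming the bundled nvgraph.h header and the public nvgraphCreate/nvgraphDestroy entry points that these _impl functions back (not a definitive harness):

#include <nvgraph.h>
#include <cstdio>
#include <cstdlib>

// Report any non-success status through nvgraphStatusGetString() and abort.
#define CHECK_NVGRAPH(call)                                             \
  do {                                                                  \
    nvgraphStatus_t s = (call);                                         \
    if (s != NVGRAPH_STATUS_SUCCESS) {                                  \
      std::fprintf(stderr, "%s failed: %s\n", #call,                    \
                   nvgraphStatusGetString(s));                          \
      std::exit(EXIT_FAILURE);                                          \
    }                                                                   \
  } while (0)

int main() {
  nvgraphHandle_t handle = nullptr;
  // Per the comment in nvgraphCreate_impl, RMM (if used) is initialized
  // outside nvGraph; otherwise RMM_ALLOC/RMM_FREE fall back to cudaMalloc/cudaFree.
  CHECK_NVGRAPH(nvgraphCreate(&handle));
  // ... create graph descriptors and run algorithms here ...
  CHECK_NVGRAPH(nvgraphDestroy(handle));
  return 0;
}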
+ + nvgraph::Cusparse::get_handle(); + nvgraph::Cublas::get_handle(); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + static nvgraphStatus_t nvgraphDestroy_impl(nvgraphHandle_t handle) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Cannot initialize memory manager.", NVGRAPH_ERR_NO_MEMORY); + + // First, destroy Cublas and Cusparse + + nvgraph::Cusparse::destroy_handle(); + nvgraph::Cublas::destroy_handle(); + + // Second, destroy NVGraph's context + + free(handle); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + static nvgraphStatus_t nvgraphCreateGraphDescr_impl(nvgraphHandle_t handle, + struct nvgraphGraphDescr **outGraphDescr) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + struct nvgraphGraphDescr *descrG = NULL; + descrG = (struct nvgraphGraphDescr*) malloc(sizeof(*descrG)); + if (!descrG) + { + FatalError("Cannot allocate graph descriptor.", NVGRAPH_ERR_UNKNOWN); + } + descrG->graphStatus = IS_EMPTY; + if (outGraphDescr) + { + *outGraphDescr = descrG; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + static nvgraphStatus_t nvgraphDestroyGraphDescr_impl(nvgraphHandle_t handle, + struct nvgraphGraphDescr *descrG) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG) { + if (descrG->TT == NVGRAPH_2D_32I_32I) { + switch (descrG->T) { + case CUDA_R_32I: { + nvgraph::Matrix2d* m = + static_cast*>(descrG->graph_handle); + delete m; + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + else { + switch (descrG->graphStatus) { + case IS_EMPTY: { + break; + } + case HAS_TOPOLOGY: { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + delete CSRG; + break; + } + case HAS_VALUES: { + if (descrG->T == CUDA_R_32F) { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + delete MCSRG; + } + else if (descrG->T == CUDA_R_64F) { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + delete MCSRG; + } + else if (descrG->T == CUDA_R_32I) { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + delete MCSRG; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + break; + } + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + } + free(descrG); + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphSetStream_impl(nvgraphHandle_t handle, cudaStream_t stream) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + // nvgraph handle + handle->stream = stream; + //Cublas and Cusparse + nvgraph::Cublas::setStream(stream); + nvgraph::Cusparse::setStream(stream); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphSetGraphStructure_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t TT) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (descrG->graphStatus != IS_EMPTY) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (check_ptr(topologyData)) + FatalError("Incorrect 
parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (TT == NVGRAPH_CSR_32 || TT == NVGRAPH_CSC_32) + { + int v = 0, e = 0, *neighborhood = NULL, *edgedest = NULL; + switch (TT) + { + case NVGRAPH_CSR_32: + { + nvgraphCSRTopology32I_t t = static_cast(topologyData); + if (!t->nvertices || !t->nedges || check_ptr(t->source_offsets) + || check_ptr(t->destination_indices)) + return NVGRAPH_STATUS_INVALID_VALUE; + v = t->nvertices; + e = t->nedges; + neighborhood = t->source_offsets; + edgedest = t->destination_indices; + break; + } + case NVGRAPH_CSC_32: + { + nvgraphCSCTopology32I_t t = static_cast(topologyData); + if (!t->nvertices || !t->nedges || check_ptr(t->destination_offsets) + || check_ptr(t->source_indices)) + return NVGRAPH_STATUS_INVALID_VALUE; + v = t->nvertices; + e = t->nedges; + neighborhood = t->destination_offsets; + edgedest = t->source_indices; + break; + } + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + + descrG->TT = TT; + + // Create the internal CSR representation + nvgraph::CsrGraph * CSRG = new nvgraph::CsrGraph(v, e, handle->stream); + + CHECK_CUDA(cudaMemcpy(CSRG->get_raw_row_offsets(), + neighborhood, + (size_t )((CSRG->get_num_vertices() + 1) * sizeof(int)), + cudaMemcpyDefault)); + + CHECK_CUDA(cudaMemcpy(CSRG->get_raw_column_indices(), + edgedest, + (size_t )((CSRG->get_num_edges()) * sizeof(int)), + cudaMemcpyDefault)); + + // Set the graph handle + descrG->graph_handle = CSRG; + descrG->graphStatus = HAS_TOPOLOGY; + } + else if (TT == NVGRAPH_2D_32I_32I) { + nvgraph2dCOOTopology32I_t td = static_cast(topologyData); + switch (td->valueType) { + case CUDA_R_32I: { + if (!td->nvertices || !td->nedges || !td->source_indices + || !td->destination_indices || !td->numDevices || !td->devices + || !td->blockN) + return NVGRAPH_STATUS_INVALID_VALUE; + descrG->TT = TT; + descrG->graphStatus = HAS_TOPOLOGY; + if (td->values) + descrG->graphStatus = HAS_VALUES; + descrG->T = td->valueType; + std::vector devices; + for (int32_t i = 0; i < td->numDevices; i++) + devices.push_back(td->devices[i]); + nvgraph::MatrixDecompositionDescription description(td->nvertices, + td->blockN, + td->nedges, + devices); + nvgraph::Matrix2d* m = new nvgraph::Matrix2d(); + *m = nvgraph::COOto2d(description, + td->source_indices, + td->destination_indices, + (int32_t*) td->values); + descrG->graph_handle = m; + break; + } + default: { + return NVGRAPH_STATUS_INVALID_VALUE; + } + } + } + else + { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + + } + + nvgraphStatus_t NVGRAPH_API nvgraphAttachGraphStructure_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t TT) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (descrG->graphStatus != IS_EMPTY) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (check_ptr(topologyData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (TT == NVGRAPH_CSR_32 || TT == NVGRAPH_CSC_32) + { + int v = 0, e = 0, *neighborhood = NULL, *edgedest = NULL; + switch (TT) + { + case NVGRAPH_CSR_32: + { + nvgraphCSRTopology32I_t t = static_cast(topologyData); + if (!t->nvertices || !t->nedges || check_ptr(t->source_offsets) + || check_ptr(t->destination_indices)) + return NVGRAPH_STATUS_INVALID_VALUE; + v = t->nvertices; + e = t->nedges; + neighborhood = t->source_offsets; + edgedest = t->destination_indices; + 
break; + } + case NVGRAPH_CSC_32: + { + nvgraphCSCTopology32I_t t = static_cast(topologyData); + if (!t->nvertices || !t->nedges || check_ptr(t->destination_offsets) + || check_ptr(t->source_indices)) + return NVGRAPH_STATUS_INVALID_VALUE; + v = t->nvertices; + e = t->nedges; + neighborhood = t->destination_offsets; + edgedest = t->source_indices; + break; + } + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + + descrG->TT = TT; + + // Create the internal CSR representation + nvgraph::CsrGraph * CSRG = new nvgraph::CsrGraph(v, e, handle->stream); + + CSRG->set_raw_row_offsets(neighborhood); + CSRG->set_raw_column_indices(edgedest); + + // Set the graph handle + descrG->graph_handle = CSRG; + descrG->graphStatus = HAS_TOPOLOGY; + } + else if (TT == NVGRAPH_2D_32I_32I) { + nvgraph2dCOOTopology32I_t td = static_cast(topologyData); + switch (td->valueType) { + case CUDA_R_32I: { + if (!td->nvertices || !td->nedges || !td->source_indices + || !td->destination_indices || !td->numDevices || !td->devices + || !td->blockN) + return NVGRAPH_STATUS_INVALID_VALUE; + descrG->TT = TT; + descrG->graphStatus = HAS_TOPOLOGY; + if (td->values) + descrG->graphStatus = HAS_VALUES; + descrG->T = td->valueType; + std::vector devices; + for (int32_t i = 0; i < td->numDevices; i++) + devices.push_back(td->devices[i]); + nvgraph::MatrixDecompositionDescription description(td->nvertices, + td->blockN, + td->nedges, + devices); + nvgraph::Matrix2d* m = new nvgraph::Matrix2d(); + *m = nvgraph::COOto2d(description, + td->source_indices, + td->destination_indices, + (int32_t*) td->values); + descrG->graph_handle = m; + break; + } + default: { + return NVGRAPH_STATUS_INVALID_VALUE; + } + } + } + else + { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + + } + + nvgraphStatus_t NVGRAPH_API nvgraphGetGraphStructure_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t* TT) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_topology(descrG)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + nvgraphTopologyType_t graphTType = descrG->TT; + + if (TT != NULL) + *TT = graphTType; + + if (topologyData != NULL) { + nvgraph::CsrGraph *CSRG = + static_cast *>(descrG->graph_handle); + int v = static_cast(CSRG->get_num_vertices()); + int e = static_cast(CSRG->get_num_edges()); + int *neighborhood = NULL, *edgedest = NULL; + + switch (graphTType) + { + case NVGRAPH_CSR_32: + { + nvgraphCSRTopology32I_t t = static_cast(topologyData); + t->nvertices = static_cast(v); + t->nedges = static_cast(e); + neighborhood = t->source_offsets; + edgedest = t->destination_indices; + break; + } + case NVGRAPH_CSC_32: + { + nvgraphCSCTopology32I_t t = static_cast(topologyData); + t->nvertices = static_cast(v); + t->nedges = static_cast(e); + neighborhood = t->destination_offsets; + edgedest = t->source_indices; + break; + } + default: + return NVGRAPH_STATUS_INTERNAL_ERROR; + } + + if (neighborhood != NULL) { + CHECK_CUDA(cudaMemcpy(neighborhood, + CSRG->get_raw_row_offsets(), + (size_t )((v + 1) * sizeof(int)), + cudaMemcpyDefault)); + } + + if (edgedest != NULL) { + CHECK_CUDA(cudaMemcpy(edgedest, + CSRG->get_raw_column_indices(), + (size_t )((e) * sizeof(int)), + cudaMemcpyDefault)); + } + + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphAllocateVertexData_impl(nvgraphHandle_t handle, + 
nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(numsets) + || check_ptr(settypes)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (check_uniform_type_array(settypes, numsets)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first + { + if (*settypes == CUDA_R_32F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, float>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (*settypes == CUDA_R_64F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, double>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (*settypes == CUDA_R_32I) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); + descrG->graph_handle = MCSRG; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + descrG->T = *settypes; + descrG->graphStatus = HAS_VALUES; + } + else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type + { + if (*settypes != descrG->T) + return NVGRAPH_STATUS_INVALID_VALUE; + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + + // Allocate and transfer + if (*settypes == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateVertexData(numsets, NULL); + } + else if (*settypes == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateVertexData(numsets, NULL); + } + else if (*settypes == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateVertexData(numsets, NULL); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphAttachVertexData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *vertexData) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first + { + if (settype == CUDA_R_32F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, float>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (settype == CUDA_R_64F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, double>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (settype == CUDA_R_32I) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); + descrG->graph_handle = MCSRG; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + descrG->T = settype; + descrG->graphStatus = HAS_VALUES; + } + else if (descrG->graphStatus == HAS_VALUES) // Already in 
MultiValuedCsrGraph, just need to check the type + { + if (settype != descrG->T) + return NVGRAPH_STATUS_INVALID_VALUE; + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + + // transfer + if (settype == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachVertexData(setnum, (float*)vertexData, NULL); + } + else if (settype == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachVertexData(setnum, (double*)vertexData, NULL); + } + else if (settype == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachVertexData(setnum, (int*)vertexData, NULL); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + nvgraphStatus_t NVGRAPH_API nvgraphAllocateEdgeData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(numsets) + || check_ptr(settypes)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (check_uniform_type_array(settypes, numsets)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + // Look at what kind of graph we have + if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first + { + if (*settypes == CUDA_R_32F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, float>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (*settypes == CUDA_R_64F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, double>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (*settypes == CUDA_R_32I) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); + descrG->graph_handle = MCSRG; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + descrG->T = *settypes; + descrG->graphStatus = HAS_VALUES; + } + else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type + { + if (*settypes != descrG->T) + return NVGRAPH_STATUS_INVALID_VALUE; + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + + // Allocate and transfer + if (*settypes == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateEdgeData(numsets, NULL); + } + else if (*settypes == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateEdgeData(numsets, NULL); + } + else if (*settypes == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->allocateEdgeData(numsets, NULL); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphAttachEdgeData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *edgeData) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + // Look at what kind 
of graph we have + if (descrG->graphStatus == HAS_TOPOLOGY) // need to convert CsrGraph to MultiValuedCsrGraph first + { + if (settype == CUDA_R_32F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, float>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (settype == CUDA_R_64F) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph< + int, double>(*CSRG); + descrG->graph_handle = MCSRG; + } + else if (settype == CUDA_R_32I) + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph *MCSRG = new nvgraph::MultiValuedCsrGraph(*CSRG); + descrG->graph_handle = MCSRG; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + descrG->T = settype; + descrG->graphStatus = HAS_VALUES; + } + else if (descrG->graphStatus == HAS_VALUES) // Already in MultiValuedCsrGraph, just need to check the type + { + if (settype != descrG->T) + return NVGRAPH_STATUS_INVALID_VALUE; + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + + // Allocate and transfer + if (settype == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachEdgeData(setnum, (float*)edgeData, NULL); + } + else if (settype == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachEdgeData(setnum, (double*)edgeData, NULL); + } + else if (settype == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + MCSRG->attachEdgeData(setnum, (int*)edgeData, NULL); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphSetVertexData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) + || check_ptr(vertexData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + FatalError("Graph should have allocated values.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_vertex_dim(setnum), + (float*) vertexData, + (size_t) ((MCSRG->get_num_vertices()) * sizeof(float)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_vertex_dim(setnum), + (double*) vertexData, + (size_t) ((MCSRG->get_num_vertices()) * sizeof(double)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_vertex_dim(setnum), + (int*) vertexData, + (size_t) ((MCSRG->get_num_vertices()) * sizeof(int)), + cudaMemcpyDefault); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + cudaCheckError(); 
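Once a descriptor has topology, the Allocate*Data calls above promote the internal CsrGraph to a MultiValuedCsrGraph of the requested cudaDataType, and the Set*Data calls copy caller buffers into a given value set (again via cudaMemcpyDefault). A sketch assuming one CUDA_R_32F set each for edges and vertices, with host vectors sized to nedges and nvertices respectively (error checking omitted):

#include <nvgraph.h>
#include <vector>

// Attach one float value set per edge and per vertex, then upload host data.
void attach_weights(nvgraphHandle_t handle, nvgraphGraphDescr_t descr,
                    std::vector<float>& edge_weights,      // size == nedges
                    std::vector<float>& vertex_values) {   // size == nvertices
  cudaDataType_t t = CUDA_R_32F;

  // First allocation converts CsrGraph -> MultiValuedCsrGraph<int, float>.
  nvgraphAllocateEdgeData(handle, descr, 1, &t);
  nvgraphSetEdgeData(handle, descr, edge_weights.data(), 0);

  nvgraphAllocateVertexData(handle, descr, 1, &t);
  nvgraphSetVertexData(handle, descr, vertex_values.data(), 0);
}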
+ } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphGetVertexData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) + || check_ptr(vertexData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + FatalError("Graph should have values.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy((float*) vertexData, + MCSRG->get_raw_vertex_dim(setnum), + (size_t) ((MCSRG->get_num_vertices()) * sizeof(float)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy((double*) vertexData, + MCSRG->get_raw_vertex_dim(setnum), + (size_t) ((MCSRG->get_num_vertices()) * sizeof(double)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_32I) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy((int*) vertexData, + MCSRG->get_raw_vertex_dim(setnum), + (size_t) ((MCSRG->get_num_vertices()) * sizeof(int)), + cudaMemcpyDefault); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + cudaCheckError(); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphConvertTopology_impl(nvgraphHandle_t handle, + nvgraphTopologyType_t srcTType, + void *srcTopology, + void *srcEdgeData, + cudaDataType_t *dataType, + nvgraphTopologyType_t dstTType, + void *dstTopology, + void *dstEdgeData) { + + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_ptr(dstEdgeData) || check_ptr(srcEdgeData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + size_t sizeT; + if (*dataType == CUDA_R_32F) + sizeT = sizeof(float); + else if (*dataType == CUDA_R_64F) + sizeT = sizeof(double); + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + // Trust me, this better than nested if's. 
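The conversion dispatch that follows handles every (source, destination) pairing of CSR, CSC and COO, with the caller supplying both topology structs and pre-allocated destination buffers; only CUDA_R_32F and CUDA_R_64F edge data are accepted. A hedged sketch of the CSR-to-CSC case with device-resident arrays and float weights (buffer allocation and error checks omitted):

#include <nvgraph.h>

// Convert a CSR graph to CSC. All index/offset/weight arrays referenced by
// src and dst are device pointers; dst must already provide room for
// nvertices+1 offsets, nedges indices and nedges weights.
void csr_to_csc(nvgraphHandle_t handle,
                nvgraphCSRTopology32I_t src, float* d_src_weights,
                nvgraphCSCTopology32I_t dst, float* d_dst_weights) {
  cudaDataType_t value_type = CUDA_R_32F;  // 32F/64F are the supported weight types
  nvgraphConvertTopology(handle,
                         NVGRAPH_CSR_32, src, d_src_weights,
                         &value_type,
                         NVGRAPH_CSC_32, dst, d_dst_weights);
}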
+ if (srcTType == NVGRAPH_CSR_32 && dstTType == NVGRAPH_CSR_32) { // CSR2CSR + nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + CHECK_CUDA(cudaMemcpy(dstT->source_offsets, + srcT->source_offsets, + (srcT->nvertices + 1) * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstT->destination_indices, + srcT->destination_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (srcTType == NVGRAPH_CSR_32 && dstTType == NVGRAPH_CSC_32) { // CSR2CSC + nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + csr2csc(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_offsets, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_offsets, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, dataType); + } else if (srcTType == NVGRAPH_CSR_32 && dstTType == NVGRAPH_COO_32) { // CSR2COO + nvgraphCSRTopology32I_t srcT = static_cast(srcTopology); + nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if (dstT->tag == NVGRAPH_SORTED_BY_SOURCE || dstT->tag == NVGRAPH_DEFAULT + || dstT->tag == NVGRAPH_UNSORTED) { + csr2coo(srcT->source_offsets, + srcT->nedges, + srcT->nvertices, + dstT->source_indices, + CUSPARSE_INDEX_BASE_ZERO); + CHECK_CUDA(cudaMemcpy(dstT->destination_indices, + srcT->destination_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (dstT->tag == NVGRAPH_SORTED_BY_DESTINATION) { + // Step 1: Convert to COO_Source + csr2coo(srcT->source_offsets, + srcT->nedges, + srcT->nvertices, + dstT->source_indices, + CUSPARSE_INDEX_BASE_ZERO); + // Step 2: Convert to COO_Destination + cooSortByDestination(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + dstT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////// + } else if (srcTType == NVGRAPH_CSC_32 && dstTType == NVGRAPH_CSR_32) { // CSC2CSR + nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + csc2csr(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_offsets, + dstEdgeData, + dstT->source_offsets, dstT->destination_indices, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, dataType); + } else if (srcTType == NVGRAPH_CSC_32 && dstTType == NVGRAPH_CSC_32) { // CSC2CSC + nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + CHECK_CUDA(cudaMemcpy(dstT->destination_offsets, + srcT->destination_offsets, + (srcT->nvertices + 1) * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstT->source_indices, + 
srcT->source_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (srcTType == NVGRAPH_CSC_32 && dstTType == NVGRAPH_COO_32) { // CSC2COO + nvgraphCSCTopology32I_t srcT = static_cast(srcTopology); + nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if (dstT->tag == NVGRAPH_SORTED_BY_SOURCE) { + // Step 1: Convert to COO_Destination + csr2coo(srcT->destination_offsets, + srcT->nedges, + srcT->nvertices, + dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO); + // Step 2: Convert to COO_Source + cooSortBySource(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, dstT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else if (dstT->tag == NVGRAPH_SORTED_BY_DESTINATION || dstT->tag == NVGRAPH_DEFAULT + || dstT->tag == NVGRAPH_UNSORTED) { + csr2coo(srcT->destination_offsets, + srcT->nedges, + srcT->nvertices, + dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO); + CHECK_CUDA(cudaMemcpy(dstT->source_indices, + srcT->source_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + /////////////////////////////////////////////////////////////////////////////////////////////////////////// + } else if (srcTType == NVGRAPH_COO_32 && dstTType == NVGRAPH_CSR_32) { // COO2CSR + nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSRTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if (srcT->tag == NVGRAPH_SORTED_BY_SOURCE) { + coo2csr(srcT->source_indices, + srcT->nedges, + srcT->nvertices, + dstT->source_offsets, + CUSPARSE_INDEX_BASE_ZERO); + CHECK_CUDA(cudaMemcpy(dstT->destination_indices, + srcT->destination_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (srcT->tag == NVGRAPH_SORTED_BY_DESTINATION) { + cood2csr(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_offsets, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else if (srcT->tag == NVGRAPH_DEFAULT || srcT->tag == NVGRAPH_UNSORTED) { + coou2csr(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_offsets, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } else if (srcTType == NVGRAPH_COO_32 && dstTType == NVGRAPH_CSC_32) { // COO2CSC + nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); + nvgraphCSCTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if (srcT->tag == NVGRAPH_SORTED_BY_SOURCE) { + coos2csc(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_offsets, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else if (srcT->tag == NVGRAPH_SORTED_BY_DESTINATION) { + 
coo2csr(srcT->destination_indices, + srcT->nedges, + srcT->nvertices, + dstT->destination_offsets, + CUSPARSE_INDEX_BASE_ZERO); + CHECK_CUDA(cudaMemcpy(dstT->source_indices, + srcT->source_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (srcT->tag == NVGRAPH_DEFAULT || srcT->tag == NVGRAPH_UNSORTED) { + coou2csc(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_offsets, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } else if (srcTType == NVGRAPH_COO_32 && dstTType == NVGRAPH_COO_32) { // COO2COO + nvgraphCOOTopology32I_t srcT = static_cast(srcTopology); + nvgraphCOOTopology32I_t dstT = static_cast(dstTopology); + dstT->nvertices = srcT->nvertices; + dstT->nedges = srcT->nedges; + if (srcT->tag == dstT->tag || dstT->tag == NVGRAPH_DEFAULT + || dstT->tag == NVGRAPH_UNSORTED) { + CHECK_CUDA(cudaMemcpy(dstT->source_indices, + srcT->source_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstT->destination_indices, + srcT->destination_indices, + srcT->nedges * sizeof(int), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dstEdgeData, + srcEdgeData, + srcT->nedges * sizeT, + cudaMemcpyDefault)); + } else if (dstT->tag == NVGRAPH_SORTED_BY_SOURCE) { + cooSortBySource(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else if (dstT->tag == NVGRAPH_SORTED_BY_DESTINATION) { + cooSortByDestination(srcT->nvertices, srcT->nvertices, srcT->nedges, + srcEdgeData, + srcT->source_indices, srcT->destination_indices, + dstEdgeData, + dstT->source_indices, dstT->destination_indices, + CUSPARSE_INDEX_BASE_ZERO, + dataType); + } else { + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + + /////////////////////////////////////////////////////////////////////////////////////////////////////////// + } else { + return NVGRAPH_STATUS_INVALID_VALUE; + } + + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphConvertGraph_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t srcDescrG, + nvgraphGraphDescr_t dstDescrG, + nvgraphTopologyType_t dstTType) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + nvgraphStatus_t status = NVGRAPH_STATUS_SUCCESS; + try + { + if (check_context(handle) || check_graph(srcDescrG)) // Graph must have a topology + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (dstDescrG->graphStatus != IS_EMPTY) // dst Graph must be empty + return NVGRAPH_STATUS_INVALID_VALUE; + + // graphs can only have CSR or CSC topology (EL is for storage only) + if (srcDescrG->TT != NVGRAPH_CSR_32 && srcDescrG->TT != NVGRAPH_CSC_32) + return NVGRAPH_STATUS_INTERNAL_ERROR; // invalid state, you can only create graph with CSR/CSC + if (dstTType != NVGRAPH_CSR_32 && dstTType != NVGRAPH_CSC_32) + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; // only conversion to CSR/CSC is allowed + + int nvertices, nedges; + int *srcOffsets = NULL, *srcIndices = NULL, *dstOffsets = NULL, *dstIndices = NULL; + std::shared_ptr permutation, offsets, indices; + + // Step 1: get source graph structure + nvgraph::CsrGraph *CSRG = + static_cast *>(srcDescrG->graph_handle); + nvertices = 
static_cast(CSRG->get_num_vertices()); + nedges = static_cast(CSRG->get_num_edges()); + srcOffsets = CSRG->get_raw_row_offsets(); + srcIndices = CSRG->get_raw_column_indices(); + + // Step 2: convert topology and get permutation array. + if (srcDescrG->TT != dstTType) { // Otherwise conversion is not needed, only copy. + offsets = allocateDevice(nvertices + 1, NULL); + indices = allocateDevice(nedges, NULL); + permutation = allocateDevice(nedges, NULL); + csr2cscP(nvertices, nvertices, nedges, + srcOffsets, + srcIndices, + indices.get(), + offsets.get(), permutation.get(), CUSPARSE_INDEX_BASE_ZERO); + dstOffsets = offsets.get(); + dstIndices = indices.get(); + } else { + dstOffsets = srcOffsets; + dstIndices = srcIndices; + } + + // Step 3: Set dst graph structure + if (dstTType == NVGRAPH_CSR_32) { + nvgraphCSRTopology32I_st dstTopology; + dstTopology.nedges = nedges; + dstTopology.nvertices = nvertices; + dstTopology.source_offsets = dstOffsets; + dstTopology.destination_indices = dstIndices; + status = nvgraphSetGraphStructure(handle, dstDescrG, &dstTopology, dstTType); + } else if (dstTType == NVGRAPH_CSC_32) { + nvgraphCSCTopology32I_st dstTopology; + dstTopology.nedges = nedges; + dstTopology.nvertices = nvertices; + dstTopology.destination_offsets = dstOffsets; + dstTopology.source_indices = dstIndices; + status = nvgraphSetGraphStructure(handle, dstDescrG, &dstTopology, dstTType); + } else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + offsets.reset(); + indices.reset(); + + // Step 4: Allocate, convert and set edge+vertex data on the new graph + if (srcDescrG->graphStatus == HAS_VALUES) { + if (srcDescrG->T == CUDA_R_32F) { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(srcDescrG->graph_handle); + size_t vertexDim = MCSRG->get_num_vertex_dim(); + size_t edgesDim = MCSRG->get_num_edge_dim(); + // Step 4.1: allocate and set vertex data (no need for convert) + if (vertexDim > 0) { + std::vector vertexDataType(vertexDim); + std::fill(vertexDataType.begin(), vertexDataType.end(), srcDescrG->T); + status = nvgraphAllocateVertexData(handle, + dstDescrG, + vertexDim, + vertexDataType.data()); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + for (size_t i = 0; i < vertexDim; ++i) { + void *vertexData = MCSRG->get_raw_vertex_dim(i); + status = nvgraphSetVertexData(handle, dstDescrG, vertexData, i); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + } + } + // Step 4.2: allocate and set vertex data + if (edgesDim > 0) { + void *dstEdgeData = NULL; + std::shared_ptr dstEdgeDataSP; + + std::vector edgeDataType(edgesDim); + std::fill(edgeDataType.begin(), edgeDataType.end(), srcDescrG->T); + status = nvgraphAllocateEdgeData(handle, + dstDescrG, + edgesDim, + edgeDataType.data()); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + // allocate edge data memory (if there is a need) + if (edgesDim > 0 && srcDescrG->TT != dstTType) { + dstEdgeDataSP = allocateDevice(nedges, NULL); + dstEdgeData = dstEdgeDataSP.get(); + } + // Convert and set edge data (using permutation array) + for (size_t i = 0; i < edgesDim; ++i) { + void *srcEdgeData = (void*) (MCSRG->get_raw_edge_dim((int) i)); + if (srcDescrG->TT != dstTType) // Convert using permutation array + gthrX(nedges, + srcEdgeData, + dstEdgeData, + permutation.get(), + CUSPARSE_INDEX_BASE_ZERO, + &(srcDescrG->T)); + else + dstEdgeData = srcEdgeData; + // set edgedata + 
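nvgraphConvertGraph_impl reuses nvgraphSetGraphStructure and the Allocate*Data calls on the destination descriptor, and permutes any attached edge values through the gthrX path when the topology actually changes. From the caller's side the whole procedure reduces to a single call; a sketch under the assumption that src_csr already holds a CSR graph with values (status checks omitted):

#include <nvgraph.h>

// Clone an existing CSR descriptor into a CSC descriptor, letting
// nvgraphConvertGraph permute any attached edge data. The destination
// descriptor must be freshly created and still empty.
nvgraphGraphDescr_t to_csc(nvgraphHandle_t handle, nvgraphGraphDescr_t src_csr) {
  nvgraphGraphDescr_t dst_csc = nullptr;
  nvgraphCreateGraphDescr(handle, &dst_csc);
  nvgraphConvertGraph(handle, src_csr, dst_csc, NVGRAPH_CSC_32);
  return dst_csc;  // caller releases it with nvgraphDestroyGraphDescr()
}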
status = nvgraphSetEdgeData(handle, dstDescrG, dstEdgeData, i); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + } + } + } else if (srcDescrG->T == CUDA_R_64F) { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(srcDescrG->graph_handle); + size_t vertexDim = MCSRG->get_num_vertex_dim(); + size_t edgesDim = MCSRG->get_num_edge_dim(); + // Step 4.1: allocate and set vertex data (no need for convert) + if (vertexDim > 0) { + std::vector vertexDataType(vertexDim); + std::fill(vertexDataType.begin(), vertexDataType.end(), srcDescrG->T); + status = nvgraphAllocateVertexData(handle, + dstDescrG, + vertexDim, + vertexDataType.data()); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + for (size_t i = 0; i < vertexDim; ++i) { + void *vertexData = MCSRG->get_raw_vertex_dim(i); + status = nvgraphSetVertexData(handle, dstDescrG, vertexData, i); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + } + } + // Step 4.2: allocate and set vertex data + if (edgesDim > 0) { + void *dstEdgeData = NULL; + std::shared_ptr dstEdgeDataSP; + + std::vector edgeDataType(edgesDim); + std::fill(edgeDataType.begin(), edgeDataType.end(), srcDescrG->T); + status = nvgraphAllocateEdgeData(handle, + dstDescrG, + edgesDim, + edgeDataType.data()); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + // allocate edge data memory (if there is a need) + if (edgesDim > 0 && srcDescrG->TT != dstTType) { + dstEdgeDataSP = allocateDevice(nedges, NULL); + dstEdgeData = dstEdgeDataSP.get(); + } + // Convert and set edge data (using permutation array) + for (size_t i = 0; i < edgesDim; ++i) { + void *srcEdgeData = (void*) (MCSRG->get_raw_edge_dim((int) i)); + if (srcDescrG->TT != dstTType) // Convert using permutation array + gthrX(nedges, + srcEdgeData, + dstEdgeData, + permutation.get(), + CUSPARSE_INDEX_BASE_ZERO, + &(srcDescrG->T)); + else + dstEdgeData = srcEdgeData; + // set edgedata + status = nvgraphSetEdgeData(handle, dstDescrG, dstEdgeData, i); + if (status != NVGRAPH_STATUS_SUCCESS) + return NVGRAPH_STATUS_INTERNAL_ERROR; + } + } + } else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + + } + + nvgraphStatus_t NVGRAPH_API nvgraphSetEdgeData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) + || check_ptr(edgeData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_edge_dim(setnum), + (float*) edgeData, + (size_t) ((MCSRG->get_num_edges()) * sizeof(float)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_edge_dim(setnum), + (double*) edgeData, + (size_t) ((MCSRG->get_num_edges()) * sizeof(double)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_32I) + { + 
nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy(MCSRG->get_raw_edge_dim(setnum), + (int*) edgeData, + (size_t) ((MCSRG->get_num_edges()) * sizeof(int)), + cudaMemcpyDefault); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + cudaCheckError(); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphGetEdgeData_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(setnum) + || check_ptr(edgeData)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy((float*) edgeData, + MCSRG->get_raw_edge_dim(setnum), + (size_t) ((MCSRG->get_num_edges()) * sizeof(float)), + cudaMemcpyDefault); + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (setnum >= MCSRG->get_num_edge_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + cudaMemcpy((double*) edgeData, + MCSRG->get_raw_edge_dim(setnum), + (size_t) ((MCSRG->get_num_edges()) * sizeof(double)), + cudaMemcpyDefault); + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + cudaCheckError(); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphSrSpmv_impl_cub(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t x, + const void *beta, + const size_t y, + const nvgraphSemiring_t SR) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + + try + { + // some basic checks + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + rc = SemiringAPILauncher(handle, descrG, weight_index, alpha, x, beta, y, SR); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphSssp_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t sssp) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) + || check_int_ptr(source_vert)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->TT != NVGRAPH_CSC_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; // cudaError_t cuda_status; - if (descrG->graphStatus != HAS_VALUES) - return NVGRAPH_STATUS_INVALID_VALUE; - - switch (descrG->T) - { - case CUDA_R_32F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() || sssp >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - - int n = static_cast(MCSRG->get_num_vertices()); - nvgraph::Vector co(n, handle->stream); - nvgraph::Sssp sssp_solver(*MCSRG->get_valued_csr_graph(weight_index)); - nvgraph::set_connectivity(n, *source_vert, 0.0, FLT_MAX, 
co.raw()); - MCSRG->get_vertex_dim(sssp).copy(co); - rc = sssp_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(sssp)); - break; - } - case CUDA_R_64F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() || sssp >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - - int n = static_cast(MCSRG->get_num_vertices()); - nvgraph::Vector co(n, handle->stream); - nvgraph::Sssp sssp_solver(*MCSRG->get_valued_csr_graph(weight_index)); - nvgraph::set_connectivity(n, *source_vert, 0.0, DBL_MAX, co.raw()); - MCSRG->get_vertex_dim(sssp).copy(co); - rc = sssp_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(sssp)); - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphTraversal_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const nvgraphTraversal_t traversalT, - const int *source_vertex_ptr, - const nvgraphTraversalParameter_t params) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_ptr(source_vertex_ptr)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph (storing results) - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSR_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->T != CUDA_R_32I) //results are ints - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - - //Results (bfs distances, predecessors..) are written in dimension in mvcsrg - nvgraph::MultiValuedCsrGraph *MCSRG = static_cast*>(descrG->graph_handle); - - // - //Computing traversal parameters - // - - size_t distancesIndex, predecessorsIndex, edgeMaskIndex; - size_t undirectedFlagParam; - size_t alpha_ul, beta_ul; - - int *distances = NULL, *predecessors = NULL, *edge_mask = NULL; - - nvgraphTraversalGetDistancesIndex(params, &distancesIndex); - nvgraphTraversalGetPredecessorsIndex(params, &predecessorsIndex); - nvgraphTraversalGetEdgeMaskIndex(params, &edgeMaskIndex); - nvgraphTraversalGetUndirectedFlag(params, &undirectedFlagParam); - nvgraphTraversalGetAlpha(params, &alpha_ul); - nvgraphTraversalGetBeta(params, &beta_ul); - - int alpha = static_cast(alpha_ul); - int beta = static_cast(beta_ul); - - //If distances_index was set by user, then use it - if (distancesIndex <= MCSRG->get_num_vertex_dim()) { - distances = MCSRG->get_vertex_dim(distancesIndex).raw(); - } - - //If predecessors_index was set by user, then use it - if (predecessorsIndex <= MCSRG->get_num_vertex_dim()) { - predecessors = MCSRG->get_vertex_dim(predecessorsIndex).raw(); - } - - //If edgemask_index was set by user, then use it - if (edgeMaskIndex <= MCSRG->get_num_vertex_dim()) { - edge_mask = MCSRG->get_edge_dim(edgeMaskIndex).raw(); - } - - int source_vertex = *source_vertex_ptr; - - int n = static_cast(MCSRG->get_num_vertices()); - int nnz = static_cast(MCSRG->get_num_edges()); - int *row_offsets = MCSRG->get_raw_row_offsets(); - int *col_indices = MCSRG->get_raw_column_indices(); - - bool undirected = (bool) undirectedFlagParam; - - if (source_vertex < 0 || source_vertex >= n) { - return NVGRAPH_STATUS_INVALID_VALUE; - } - - //Calling corresponding implementation - switch (traversalT) { - case NVGRAPH_TRAVERSAL_BFS: - nvgraph::Bfs bfs_solver(n, - nnz, - row_offsets, - col_indices, - !undirected, - alpha, - 
beta, - handle->stream); - - //To easily implement multi source with single source, - //loop on those two - rc = bfs_solver.configure(distances, predecessors, edge_mask); - rc = bfs_solver.traverse(source_vertex); - break; - }; - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - /** - * CAPI Method for calling 2d BFS algorithm. - * @param handle Nvgraph context handle. - * @param descrG Graph handle (must be 2D partitioned) - * @param source_vert The source vertex ID - * @param distances Pointer to memory allocated to store the distances. - * @param predecessors Pointer to memory allocated to store the predecessors - * @return Status code. - */ - nvgraphStatus_t NVGRAPH_API nvgraph2dBfs_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const int32_t source_vert, - int32_t* distances, - int32_t* predecessors) { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try { - if (check_context(handle) || check_graph(descrG)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (descrG->graphStatus == IS_EMPTY) - return NVGRAPH_STATUS_INVALID_VALUE; - if (descrG->TT != NVGRAPH_2D_32I_32I) - return NVGRAPH_STATUS_INVALID_VALUE; - if (descrG->T != CUDA_R_32I) - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::Matrix2d* m = static_cast*>(descrG->graph_handle); -// std::cout << m->toString(); - nvgraph::Bfs2d bfs(m, true, 0, 0); - rc = bfs.configure(distances, predecessors); - rc = bfs.traverse(source_vert); - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphWidestPath_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int *source_vert, - const size_t widest_path) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) - || check_int_ptr(source_vert)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSC_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; + if (descrG->graphStatus != HAS_VALUES) + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() || sssp >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector co(n, handle->stream); + nvgraph::Sssp sssp_solver(*MCSRG->get_valued_csr_graph(weight_index)); + nvgraph::set_connectivity(n, *source_vert, 0.0, FLT_MAX, co.raw()); + MCSRG->get_vertex_dim(sssp).copy(co); + rc = sssp_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(sssp)); + break; + } + case CUDA_R_64F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() || sssp >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector co(n, handle->stream); + nvgraph::Sssp sssp_solver(*MCSRG->get_valued_csr_graph(weight_index)); + nvgraph::set_connectivity(n, *source_vert, 0.0, DBL_MAX, co.raw()); + MCSRG->get_vertex_dim(sssp).copy(co); + rc = sssp_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(sssp)); + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } 
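For orientation, the SSSP wrapper above only validates the descriptor state and dispatches on the stored value type; the edge weights and the output distances live in edge and vertex dimensions that the caller registers beforehand. A minimal host-side sketch of driving this path through the public C entry points declared in nvgraph.h follows; the 3-vertex CSC graph, the dimension indices, and the error-free flow are illustrative assumptions, not taken from this patch.

    // Hedged sketch: exercises nvgraphSssp() against the dispatch code above.
    // Assumes the public nvGraph C API from <nvgraph.h>; error handling elided.
    #include <nvgraph.h>
    #include <cstdio>

    int main() {
        // 3 vertices, 3 edges in CSC form (illustrative data, not from this patch)
        int   destination_offsets[] = {0, 1, 2, 3};
        int   source_indices[]      = {2, 0, 1};
        float weights[]             = {0.5f, 1.0f, 2.0f};

        nvgraphHandle_t handle;     nvgraphCreate(&handle);
        nvgraphGraphDescr_t graph;  nvgraphCreateGraphDescr(handle, &graph);

        nvgraphCSCTopology32I_st topo;
        topo.nvertices = 3;  topo.nedges = 3;
        topo.destination_offsets = destination_offsets;
        topo.source_indices      = source_indices;
        nvgraphSetGraphStructure(handle, graph, &topo, NVGRAPH_CSC_32);

        // One edge dimension for the weights, one vertex dimension for the distances
        cudaDataType_t etype = CUDA_R_32F, vtype = CUDA_R_32F;
        nvgraphAllocateEdgeData(handle, graph, 1, &etype);
        nvgraphAllocateVertexData(handle, graph, 1, &vtype);
        nvgraphSetEdgeData(handle, graph, weights, 0);

        int source = 0;
        nvgraphSssp(handle, graph, 0 /*weight_index*/, &source, 0 /*sssp vertex dim*/);

        float dist[3];
        nvgraphGetVertexData(handle, graph, dist, 0);
        printf("distance(0 -> 2) = %f\n", dist[2]);

        nvgraphDestroyGraphDescr(handle, graph);
        nvgraphDestroy(handle);
        return 0;
    }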
+ } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphTraversal_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const nvgraphTraversal_t traversalT, + const int *source_vertex_ptr, + const nvgraphTraversalParameter_t params) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_ptr(source_vertex_ptr)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph (storing results) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->T != CUDA_R_32I) //results are ints + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + + //Results (bfs distances, predecessors..) are written in dimension in mvcsrg + nvgraph::MultiValuedCsrGraph *MCSRG = static_cast*>(descrG->graph_handle); + + // + //Computing traversal parameters + // + + size_t distancesIndex, predecessorsIndex, edgeMaskIndex; + size_t undirectedFlagParam; + size_t alpha_ul, beta_ul; + + int *distances = NULL, *predecessors = NULL, *edge_mask = NULL; + + nvgraphTraversalGetDistancesIndex(params, &distancesIndex); + nvgraphTraversalGetPredecessorsIndex(params, &predecessorsIndex); + nvgraphTraversalGetEdgeMaskIndex(params, &edgeMaskIndex); + nvgraphTraversalGetUndirectedFlag(params, &undirectedFlagParam); + nvgraphTraversalGetAlpha(params, &alpha_ul); + nvgraphTraversalGetBeta(params, &beta_ul); + + int alpha = static_cast(alpha_ul); + int beta = static_cast(beta_ul); + + //If distances_index was set by user, then use it + if (distancesIndex <= MCSRG->get_num_vertex_dim()) { + distances = MCSRG->get_vertex_dim(distancesIndex).raw(); + } + + //If predecessors_index was set by user, then use it + if (predecessorsIndex <= MCSRG->get_num_vertex_dim()) { + predecessors = MCSRG->get_vertex_dim(predecessorsIndex).raw(); + } + + //If edgemask_index was set by user, then use it + if (edgeMaskIndex <= MCSRG->get_num_vertex_dim()) { + edge_mask = MCSRG->get_edge_dim(edgeMaskIndex).raw(); + } + + int source_vertex = *source_vertex_ptr; + + int n = static_cast(MCSRG->get_num_vertices()); + int nnz = static_cast(MCSRG->get_num_edges()); + int *row_offsets = MCSRG->get_raw_row_offsets(); + int *col_indices = MCSRG->get_raw_column_indices(); + + bool undirected = (bool) undirectedFlagParam; + + if (source_vertex < 0 || source_vertex >= n) { + return NVGRAPH_STATUS_INVALID_VALUE; + } + + //Calling corresponding implementation + switch (traversalT) { + case NVGRAPH_TRAVERSAL_BFS: + nvgraph::Bfs bfs_solver(n, + nnz, + row_offsets, + col_indices, + !undirected, + alpha, + beta, + handle->stream); + + //To easily implement multi source with single source, + //loop on those two + rc = bfs_solver.configure(distances, predecessors, edge_mask); + rc = bfs_solver.traverse(source_vertex); + break; + }; + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + /** + * CAPI Method for calling 2d BFS algorithm. + * @param handle Nvgraph context handle. + * @param descrG Graph handle (must be 2D partitioned) + * @param source_vert The source vertex ID + * @param distances Pointer to memory allocated to store the distances. + * @param predecessors Pointer to memory allocated to store the predecessors + * @return Status code. 
+ */ + nvgraphStatus_t NVGRAPH_API nvgraph2dBfs_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const int32_t source_vert, + int32_t* distances, + int32_t* predecessors) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try { + if (check_context(handle) || check_graph(descrG)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (descrG->graphStatus == IS_EMPTY) + return NVGRAPH_STATUS_INVALID_VALUE; + if (descrG->TT != NVGRAPH_2D_32I_32I) + return NVGRAPH_STATUS_INVALID_VALUE; + if (descrG->T != CUDA_R_32I) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::Matrix2d* m = static_cast*>(descrG->graph_handle); +// std::cout << m->toString(); + nvgraph::Bfs2d bfs(m, true, 0, 0); + rc = bfs.configure(distances, predecessors); + rc = bfs.traverse(source_vert); + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphWidestPath_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t widest_path) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) + || check_int_ptr(source_vert)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSC_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; // cudaError_t cuda_status; - switch (descrG->T) - { - case CUDA_R_32F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || widest_path >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - - int n = static_cast(MCSRG->get_num_vertices()); - nvgraph::Vector co(n, handle->stream); - nvgraph::WidestPath widest_path_solver(*MCSRG->get_valued_csr_graph(weight_index)); - nvgraph::set_connectivity(n, *source_vert, FLT_MAX, -FLT_MAX, co.raw()); - MCSRG->get_vertex_dim(widest_path).copy(co); - rc = widest_path_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(widest_path)); - break; - } - case CUDA_R_64F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || widest_path >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - - int n = static_cast(MCSRG->get_num_vertices()); - nvgraph::Vector co(n, handle->stream); - nvgraph::WidestPath widest_path_solver(*MCSRG->get_valued_csr_graph(weight_index)); - nvgraph::set_connectivity(n, *source_vert, DBL_MAX, -DBL_MAX, co.raw()); - MCSRG->get_vertex_dim(widest_path).copy(co); - rc = widest_path_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(widest_path)); - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphPagerank_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const void *alpha, - const size_t bookmark, - const int has_guess, - const size_t rank, - const float tolerance, - const int max_iter) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) - || check_ptr(alpha)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a 
MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSC_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; - - if (!(has_guess == 0 || has_guess == 1)) - return NVGRAPH_STATUS_INVALID_VALUE; - - int max_it; - float tol; - - if (max_iter > 0) - max_it = max_iter; - else - max_it = 500; - - if (tolerance == 0.0f) - tol = 1.0E-6f; - else if (tolerance < 1.0f && tolerance > 0.0f) - tol = tolerance; - else - return NVGRAPH_STATUS_INVALID_VALUE; - - switch (descrG->T) - { - case CUDA_R_32F: - { - float alphaT = *static_cast(alpha); - if (alphaT <= 0.0f || alphaT >= 1.0f) - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || bookmark >= MCSRG->get_num_vertex_dim() - || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - - int n = static_cast(MCSRG->get_num_vertices()); - nvgraph::Vector guess(n, handle->stream); - nvgraph::Vector bm(n, handle->stream); - if (has_guess) - guess.copy(MCSRG->get_vertex_dim(rank)); - else - guess.fill(static_cast(1.0 / n)); - bm.copy(MCSRG->get_vertex_dim(bookmark)); - nvgraph::Pagerank pagerank_solver( *MCSRG->get_valued_csr_graph(weight_index), - bm); - rc = pagerank_solver.solve(alphaT, guess, MCSRG->get_vertex_dim(rank), tol, max_it); - break; - } - case CUDA_R_64F: - { - double alphaT = *static_cast(alpha); - if (alphaT <= 0.0 || alphaT >= 1.0) - return NVGRAPH_STATUS_INVALID_VALUE; - - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || bookmark >= MCSRG->get_num_vertex_dim() - || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - - int n = static_cast(MCSRG->get_num_vertices()); - nvgraph::Vector guess(n, handle->stream); - nvgraph::Vector bm(n, handle->stream); - bm.copy(MCSRG->get_vertex_dim(bookmark)); - if (has_guess) - guess.copy(MCSRG->get_vertex_dim(rank)); - else - guess.fill(static_cast(1.0 / n)); - nvgraph::Pagerank pagerank_solver( *MCSRG->get_valued_csr_graph(weight_index), - bm); - rc = pagerank_solver.solve(alphaT, guess, MCSRG->get_vertex_dim(rank), tol, max_it); - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphKrylovPagerank_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const void *alpha, - const size_t bookmark, - const float tolerance, - const int max_iter, - const int subspace_size, - const int has_guess, - const size_t rank) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) - || check_ptr(alpha)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSC_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; + switch (descrG->T) + { + case CUDA_R_32F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || widest_path >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector co(n, handle->stream); + 
nvgraph::WidestPath widest_path_solver(*MCSRG->get_valued_csr_graph(weight_index)); + nvgraph::set_connectivity(n, *source_vert, FLT_MAX, -FLT_MAX, co.raw()); + MCSRG->get_vertex_dim(widest_path).copy(co); + rc = widest_path_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(widest_path)); + break; + } + case CUDA_R_64F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || widest_path >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector co(n, handle->stream); + nvgraph::WidestPath widest_path_solver(*MCSRG->get_valued_csr_graph(weight_index)); + nvgraph::set_connectivity(n, *source_vert, DBL_MAX, -DBL_MAX, co.raw()); + MCSRG->get_vertex_dim(widest_path).copy(co); + rc = widest_path_solver.solve(*source_vert, co, MCSRG->get_vertex_dim(widest_path)); + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphPagerank_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t bookmark, + const int has_guess, + const size_t rank, + const float tolerance, + const int max_iter) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) + || check_ptr(alpha)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSC_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + if (!(has_guess == 0 || has_guess == 1)) + return NVGRAPH_STATUS_INVALID_VALUE; + + int max_it; + float tol; + + if (max_iter > 0) + max_it = max_iter; + else + max_it = 500; + + if (tolerance == 0.0f) + tol = 1.0E-6f; + else if (tolerance < 1.0f && tolerance > 0.0f) + tol = tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + float alphaT = *static_cast(alpha); + if (alphaT <= 0.0f || alphaT >= 1.0f) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || bookmark >= MCSRG->get_num_vertex_dim() + || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector guess(n, handle->stream); + nvgraph::Vector bm(n, handle->stream); + if (has_guess) + guess.copy(MCSRG->get_vertex_dim(rank)); + else + guess.fill(static_cast(1.0 / n)); + bm.copy(MCSRG->get_vertex_dim(bookmark)); + nvgraph::Pagerank pagerank_solver(*MCSRG->get_valued_csr_graph(weight_index), bm); + rc = pagerank_solver.solve(alphaT, guess, MCSRG->get_vertex_dim(rank), tol, max_it); + break; + } + case CUDA_R_64F: + { + double alphaT = *static_cast(alpha); + if (alphaT <= 0.0 || alphaT >= 1.0) + return NVGRAPH_STATUS_INVALID_VALUE; + + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || bookmark >= MCSRG->get_num_vertex_dim() + || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector guess(n, handle->stream); 
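The widest-path wrapper above mirrors the SSSP path: same CSC descriptor and dimension bookkeeping, with the semiring seeded to FLT_MAX at the source and -FLT_MAX elsewhere. A small hedged sketch of the corresponding public call, assuming a descriptor prepared exactly like the SSSP sketch earlier (one float edge dimension for weights, one float vertex dimension for the result):

    // Hedged sketch: widest-path query via the public C API; assumes a CSC
    // descriptor already populated as in the SSSP sketch above.
    #include <nvgraph.h>

    nvgraphStatus_t widest_from(nvgraphHandle_t handle, nvgraphGraphDescr_t graph,
                                int source, float* out /* nvertices floats */) {
        // weight_index 0 and widest-path vertex dimension 0 are illustrative choices
        nvgraphStatus_t st = nvgraphWidestPath(handle, graph, 0, &source, 0);
        if (st != NVGRAPH_STATUS_SUCCESS) return st;
        return nvgraphGetVertexData(handle, graph, out, 0);
    }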
+ nvgraph::Vector bm(n, handle->stream); + bm.copy(MCSRG->get_vertex_dim(bookmark)); + if (has_guess) + guess.copy(MCSRG->get_vertex_dim(rank)); + else + guess.fill(static_cast(1.0 / n)); + nvgraph::Pagerank pagerank_solver(*MCSRG->get_valued_csr_graph(weight_index), bm); + rc = pagerank_solver.solve(alphaT, guess, MCSRG->get_vertex_dim(rank), tol, max_it); + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphKrylovPagerank_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t bookmark, + const float tolerance, + const int max_iter, + const int subspace_size, + const int has_guess, + const size_t rank) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index) + || check_ptr(alpha)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSC_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; // cudaError_t cuda_status; - int max_it; - int ss_sz; - float tol; - - if (max_iter > 0) - max_it = max_iter; - else - max_it = 500; - - if (subspace_size > 0) - ss_sz = subspace_size; - else - ss_sz = 8; - - if (tolerance == 0.0f) - tol = 1.0E-6f; - else if (tolerance < 1.0f && tolerance > 0.0f) - tol = tolerance; - else - return NVGRAPH_STATUS_INVALID_VALUE; - - switch (descrG->T) - { - case CUDA_R_32F: - { - float alphaT = *static_cast(alpha); - if (alphaT <= 0.0f || alphaT >= 1.0f) - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || bookmark >= MCSRG->get_num_vertex_dim() - || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - - int n = static_cast(MCSRG->get_num_vertices()); - nvgraph::Vector guess(n, handle->stream), eigVals(1, handle->stream); - if (has_guess) - guess.copy(MCSRG->get_vertex_dim(rank)); - else - guess.fill(static_cast(1.0 / n)); - nvgraph::ImplicitArnoldi iram_solver( *MCSRG->get_valued_csr_graph(weight_index), - MCSRG->get_vertex_dim(bookmark), - tol, - max_it, - alphaT); - rc = iram_solver.solve(ss_sz, 1, guess, eigVals, MCSRG->get_vertex_dim(rank)); - break; - } - case CUDA_R_64F: - { - // curently iram solver accept float for alpha - double alphaTemp = *static_cast(alpha); - float alphaT = static_cast(alphaTemp); - if (alphaT <= 0.0f || alphaT >= 1.0f) - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || bookmark >= MCSRG->get_num_vertex_dim() - || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - - int n = static_cast(MCSRG->get_num_vertices()); - nvgraph::Vector guess(n, handle->stream), eigVals(1, handle->stream); - if (has_guess) - guess.copy(MCSRG->get_vertex_dim(rank)); - else - guess.fill(static_cast(1.0 / n)); - nvgraph::ImplicitArnoldi iram_solver( *MCSRG->get_valued_csr_graph(weight_index), - MCSRG->get_vertex_dim(bookmark), - tol, - max_it, - alphaT); - rc = iram_solver.solve(ss_sz, 1, guess, eigVals, MCSRG->get_vertex_dim(rank)); - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - 
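Likewise for PageRank: the wrapper above expects transition weights in an edge dimension, the dangling-vertex bookmark in one vertex dimension and the ranks in another, and it substitutes defaults (500 iterations, 1e-6 tolerance) when max_iter or tolerance is left at zero. A hedged end-to-end sketch against the public API; the graph data, bookmark values and dimension indices are illustrative:

    // Hedged sketch: exercises nvgraphPagerank() against the wrapper above.
    // Assumes the public nvGraph C API from <nvgraph.h>; error handling elided.
    #include <nvgraph.h>
    #include <cstdio>

    int main() {
        // 3 vertices, 3 edges in CSC form; weights stand in for the
        // column-stochastic transition probabilities (illustrative values).
        int   destination_offsets[] = {0, 1, 2, 3};
        int   source_indices[]      = {2, 0, 1};
        float weights[]             = {1.0f, 1.0f, 1.0f};
        float bookmark[]            = {0.0f, 0.0f, 0.0f};  // no dangling vertices
        float alpha                 = 0.85f;               // damping factor, must be in (0, 1)

        nvgraphHandle_t handle;     nvgraphCreate(&handle);
        nvgraphGraphDescr_t graph;  nvgraphCreateGraphDescr(handle, &graph);

        nvgraphCSCTopology32I_st topo;
        topo.nvertices = 3;  topo.nedges = 3;
        topo.destination_offsets = destination_offsets;
        topo.source_indices      = source_indices;
        nvgraphSetGraphStructure(handle, graph, &topo, NVGRAPH_CSC_32);

        cudaDataType_t etype    = CUDA_R_32F;
        cudaDataType_t vtypes[] = {CUDA_R_32F, CUDA_R_32F};  // 0: bookmark, 1: rank
        nvgraphAllocateEdgeData(handle, graph, 1, &etype);
        nvgraphAllocateVertexData(handle, graph, 2, vtypes);
        nvgraphSetEdgeData(handle, graph, weights, 0);
        nvgraphSetVertexData(handle, graph, bookmark, 0);

        // tolerance 0.0f and max_iter 0 pick up the defaults (1e-6, 500) applied above
        nvgraphPagerank(handle, graph, 0, &alpha, 0, 0 /*has_guess*/, 1, 0.0f, 0);

        float ranks[3];
        nvgraphGetVertexData(handle, graph, ranks, 1);
        printf("rank[0] = %f\n", ranks[0]);

        nvgraphDestroyGraphDescr(handle, graph);
        nvgraphDestroy(handle);
        return 0;
    }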
NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByVertex_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t subdescrG, - int *subvertices, - size_t numvertices) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - typedef int IndexType; - - try - { - if (check_context(handle) || - check_graph(descrG) || - !subdescrG || - check_int_size(numvertices) || - check_ptr(subvertices)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (!numvertices) - return NVGRAPH_STATUS_INVALID_VALUE; - - subdescrG->TT = descrG->TT; - subdescrG->T = descrG->T; - - switch (descrG->graphStatus) - { - case HAS_TOPOLOGY: //CsrGraph - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - - Graph* subgraph = extract_subgraph_by_vertices(*CSRG, - subvertices, - numvertices, - handle->stream); - - subdescrG->graph_handle = subgraph; - subdescrG->graphStatus = HAS_TOPOLOGY; - } - break; - - case HAS_VALUES: //MultiValuedCsrGraph - if (descrG->T == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - - nvgraph::MultiValuedCsrGraph* subgraph = - extract_subgraph_by_vertices(*MCSRG, - subvertices, - numvertices, - handle->stream); - - subdescrG->graph_handle = subgraph; - subdescrG->graphStatus = HAS_VALUES; - } - else if (descrG->T == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - - nvgraph::MultiValuedCsrGraph* subgraph = - extract_subgraph_by_vertices(*MCSRG, - subvertices, - numvertices, - handle->stream); - - subdescrG->graph_handle = subgraph; - subdescrG->graphStatus = HAS_VALUES; - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - break; - - default: - return NVGRAPH_STATUS_INVALID_VALUE; - } - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByEdge_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t subdescrG, - int *subedges, - size_t numedges) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - //TODO: extract handle->stream info, from handler/nvgraphContext (?) 
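The vertex-induced extraction implemented above is normally reached through nvgraphExtractSubgraphByVertex on a second, freshly created descriptor, which then takes over the topology and value types of the source graph. A short hedged sketch; the vertex ids and the helper name are illustrative:

    // Hedged sketch: extracting an induced subgraph through the public C API,
    // which lands in nvgraphExtractSubgraphByVertex_impl above.
    #include <nvgraph.h>

    // Assumes 'graph' already holds CSR topology (and optionally values).
    nvgraphStatus_t extract_two_vertices(nvgraphHandle_t handle,
                                         nvgraphGraphDescr_t graph,
                                         nvgraphGraphDescr_t* subgraph_out) {
        nvgraphStatus_t st = nvgraphCreateGraphDescr(handle, subgraph_out);
        if (st != NVGRAPH_STATUS_SUCCESS) return st;

        int subvertices[] = {0, 2};   // illustrative vertex ids
        // The new descriptor inherits the topology and value types of the source.
        return nvgraphExtractSubgraphByVertex(handle, graph, *subgraph_out,
                                              subvertices, 2);
    }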
- typedef int IndexType; - - try - { - if (check_context(handle) || - check_graph(descrG) || - !subdescrG || - check_int_size(numedges) || - check_ptr(subedges)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (!numedges) - return NVGRAPH_STATUS_INVALID_VALUE; - - subdescrG->TT = descrG->TT; - subdescrG->T = descrG->T; - - switch (descrG->graphStatus) - { - case HAS_TOPOLOGY: //CsrGraph - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - Graph* subgraph = extract_subgraph_by_edges(*CSRG, - subedges, - numedges, - handle->stream); - - subdescrG->graph_handle = subgraph; - subdescrG->graphStatus = HAS_TOPOLOGY; - } - break; - - case HAS_VALUES: //MultiValuedCsrGraph - if (descrG->T == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - - nvgraph::MultiValuedCsrGraph* subgraph = - extract_subgraph_by_edges(*MCSRG, subedges, numedges, handle->stream); - - subdescrG->graph_handle = subgraph; - subdescrG->graphStatus = HAS_VALUES; - } - else if (descrG->T == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - - nvgraph::MultiValuedCsrGraph* subgraph = - extract_subgraph_by_edges(*MCSRG, subedges, numedges, handle->stream); - - subdescrG->graph_handle = subgraph; - subdescrG->graphStatus = HAS_VALUES; - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - break; - - default: - return NVGRAPH_STATUS_INVALID_VALUE; - } - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphBalancedCutClustering_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int n_clusters, - const int n_eig_vects, - const int evs_type, - const float evs_tolerance, - const int evs_max_iter, - const float kmean_tolerance, - const int kmean_max_iter, - int* clustering, - void* eig_vals, - void* eig_vects) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSR_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; - - int evs_max_it, kmean_max_it; - int iters_lanczos, iters_kmeans; - float evs_tol, kmean_tol; - - if (evs_max_iter > 0) - evs_max_it = evs_max_iter; - else - evs_max_it = 4000; - - if (evs_tolerance == 0.0f) - evs_tol = 1.0E-3f; - else if (evs_tolerance < 1.0f && evs_tolerance > 0.0f) - evs_tol = evs_tolerance; - else - return NVGRAPH_STATUS_INVALID_VALUE; - - if (kmean_max_iter > 0) - kmean_max_it = kmean_max_iter; - else - kmean_max_it = 200; - - if (kmean_tolerance == 0.0f) - kmean_tol = 1.0E-2f; - else if (kmean_tolerance < 1.0f && kmean_tolerance > 0.0f) - kmean_tol = kmean_tolerance; - else - return NVGRAPH_STATUS_INVALID_VALUE; - - if (n_clusters < 2) - return NVGRAPH_STATUS_INVALID_VALUE; - - if (n_eig_vects > n_clusters) - return NVGRAPH_STATUS_INVALID_VALUE; - - if (!(evs_type == 0 || evs_type == 1)) - return NVGRAPH_STATUS_INVALID_VALUE; - - if (clustering == NULL || eig_vals == NULL || eig_vects == NULL) - return NVGRAPH_STATUS_INVALID_VALUE; - - switch (descrG->T) - { - case CUDA_R_32F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || n_clusters > static_cast(MCSRG->get_num_vertices())) 
// base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector clust(MCSRG->get_num_vertices(), handle->stream); - Vector eigVals(n_eig_vects, handle->stream); - Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); - - if (evs_type == 0) - { - int restartIter_lanczos = 15 + n_eig_vects; - rc = partition(network, - n_clusters, - n_eig_vects, - evs_max_it, - restartIter_lanczos, - evs_tol, - kmean_max_it, - kmean_tol, - clust.raw(), - eigVals, - eigVecs, - iters_lanczos, - iters_kmeans); - } - else - { - cusolverDnHandle_t cusolverHandle; - cusolverDnCreate(&cusolverHandle); - rc = partition_lobpcg(network, - NULL, // preconditioner - cusolverHandle, - n_clusters, - n_eig_vects, - evs_max_it, - evs_tol, - kmean_max_it, - kmean_tol, - clust.raw(), - eigVals, - eigVecs, - iters_lanczos, - iters_kmeans); - } - // give a copy of results to the user - if (rc == NVGRAPH_OK) - { - CHECK_CUDA(cudaMemcpy((int* )clustering, - clust.raw(), - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy((float* )eig_vals, - eigVals.raw(), - (size_t )(n_eig_vects * sizeof(float)), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy((float* )eig_vects, - eigVecs.raw(), - (size_t )(n_eig_vects * MCSRG->get_num_vertices() - * sizeof(float)), - cudaMemcpyDefault)); - } - - break; - } - case CUDA_R_64F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector clust(MCSRG->get_num_vertices(), handle->stream); - Vector eigVals(n_eig_vects, handle->stream); - Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); - if (evs_type == 0) - { - int restartIter_lanczos = 15 + n_eig_vects; - rc = partition(network, - n_clusters, - n_eig_vects, - evs_max_it, - restartIter_lanczos, - evs_tol, - kmean_max_it, - kmean_tol, - clust.raw(), - eigVals, - eigVecs, - iters_lanczos, - iters_kmeans); - } - else - { - cusolverDnHandle_t cusolverHandle; - cusolverDnCreate(&cusolverHandle); - rc = partition_lobpcg(network, - NULL, // preconditioner - cusolverHandle, - n_clusters, - n_eig_vects, - evs_max_it, - evs_tol, - kmean_max_it, - kmean_tol, - clust.raw(), - eigVals, - eigVecs, - iters_lanczos, - iters_kmeans); - } - // give a copy of results to the user - if (rc == NVGRAPH_OK) - { - CHECK_CUDA(cudaMemcpy((int* )clustering, - clust.raw(), - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy((double* )eig_vals, - eigVals.raw(), - (size_t )(n_eig_vects * sizeof(double)), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy((double* )eig_vects, - eigVecs.raw(), - (size_t )(n_eig_vects * MCSRG->get_num_vertices() - * sizeof(double)), - cudaMemcpyDefault)); - } - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeBalancedCut_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int n_clusters, - const int* clustering, - float * edgeCut, - float * ratioCut) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) - 
FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSR_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; - - if (n_clusters < 2) - return NVGRAPH_STATUS_INVALID_VALUE; - - if (clustering == NULL || edgeCut == NULL || ratioCut == NULL) - return NVGRAPH_STATUS_INVALID_VALUE; - - switch (descrG->T) - { - case CUDA_R_32F: - { - float edge_cut, ratio_cut; - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || n_clusters > static_cast(MCSRG->get_num_vertices())) - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector clust(MCSRG->get_num_vertices(), handle->stream); - CHECK_CUDA(cudaMemcpy(clust.raw(), - (int* )clustering, - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - rc = analyzePartition(network, - n_clusters, - clust.raw(), - edge_cut, - ratio_cut); - *edgeCut = edge_cut; - *ratioCut = ratio_cut; - break; - } - case CUDA_R_64F: - { - double edge_cut, ratio_cut; - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector clust(MCSRG->get_num_vertices(), handle->stream); - CHECK_CUDA(cudaMemcpy(clust.raw(), - (int* )clustering, - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - rc = analyzePartition(network, - n_clusters, - clust.raw(), - edge_cut, - ratio_cut); - *edgeCut = static_cast(edge_cut); - *ratioCut = static_cast(ratio_cut); - break; - } - - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - - } - - nvgraphStatus_t NVGRAPH_API nvgraphHeavyEdgeMatching_impl( nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const nvgraphEdgeWeightMatching_t similarity_metric, - int* aggregates, - size_t* num_aggregates) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSR_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; - - if (aggregates == NULL) - return NVGRAPH_STATUS_INVALID_VALUE; - Matching_t sim_metric; - switch (similarity_metric) - { - case NVGRAPH_UNSCALED: { - sim_metric = USER_PROVIDED; - break; - } - case NVGRAPH_SCALED_BY_ROW_SUM: { - sim_metric = SCALED_BY_ROW_SUM; - break; - } - case NVGRAPH_SCALED_BY_DIAGONAL: { - sim_metric = SCALED_BY_DIAGONAL; - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - - switch (descrG->T) - { - case CUDA_R_32F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim()) - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector agg(MCSRG->get_num_vertices(), handle->stream); - int num_agg = 0; - nvgraph::Size2Selector 
one_phase_hand_checking(sim_metric); - rc = one_phase_hand_checking.setAggregates(network, agg, num_agg); - *num_aggregates = static_cast(num_agg); - CHECK_CUDA(cudaMemcpy((int* )aggregates, - agg.raw(), - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - break; - } - case CUDA_R_64F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim()) - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector agg(MCSRG->get_num_vertices(), handle->stream); - Vector agg_global(MCSRG->get_num_vertices(), handle->stream); - int num_agg = 0; - nvgraph::Size2Selector one_phase_hand_checking(sim_metric); - rc = one_phase_hand_checking.setAggregates(network, agg, num_agg); - *num_aggregates = static_cast(num_agg); - CHECK_CUDA(cudaMemcpy((int* )aggregates, - agg.raw(), - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - - } - - nvgraphStatus_t NVGRAPH_API nvgraphSpectralModularityMaximization_impl( nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int n_clusters, - const int n_eig_vects, - const float evs_tolerance, - const int evs_max_iter, - const float kmean_tolerance, - const int kmean_max_iter, - int* clustering, - void* eig_vals, - void* eig_vects) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSR_32) // supported topologies - return NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED; - - int evs_max_it, kmean_max_it; - int iters_lanczos, iters_kmeans; - float evs_tol, kmean_tol; - - if (evs_max_iter > 0) - evs_max_it = evs_max_iter; - else - evs_max_it = 4000; - - if (evs_tolerance == 0.0f) - evs_tol = 1.0E-3f; - else if (evs_tolerance < 1.0f && evs_tolerance > 0.0f) - evs_tol = evs_tolerance; - else - return NVGRAPH_STATUS_INVALID_VALUE; - - if (kmean_max_iter > 0) - kmean_max_it = kmean_max_iter; - else - kmean_max_it = 200; - - if (kmean_tolerance == 0.0f) - kmean_tol = 1.0E-2f; - else if (kmean_tolerance < 1.0f && kmean_tolerance > 0.0f) - kmean_tol = kmean_tolerance; - else - return NVGRAPH_STATUS_INVALID_VALUE; - - if (n_clusters < 2) - return NVGRAPH_STATUS_INVALID_VALUE; - - if (n_eig_vects > n_clusters) - return NVGRAPH_STATUS_INVALID_VALUE; - - if (clustering == NULL || eig_vals == NULL || eig_vects == NULL) - return NVGRAPH_STATUS_INVALID_VALUE; - - switch (descrG->T) - { - case CUDA_R_32F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector clust(MCSRG->get_num_vertices(), handle->stream); - Vector eigVals(n_eig_vects, handle->stream); - Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); - int restartIter_lanczos = 15 + n_eig_vects; - rc = modularity_maximization(network, - n_clusters, - n_eig_vects, - evs_max_it, - 
restartIter_lanczos, - evs_tol, - kmean_max_it, - kmean_tol, - clust.raw(), - eigVals, - eigVecs, - iters_lanczos, - iters_kmeans); - - // give a copy of results to the user - if (rc == NVGRAPH_OK) - { - CHECK_CUDA(cudaMemcpy((int* )clustering, - clust.raw(), - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy((float* )eig_vals, - eigVals.raw(), - (size_t )(n_eig_vects * sizeof(float)), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy((float* )eig_vects, - eigVecs.raw(), - (size_t )(n_eig_vects * MCSRG->get_num_vertices() - * sizeof(float)), - cudaMemcpyDefault)); - } - - break; - } - case CUDA_R_64F: - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector clust(MCSRG->get_num_vertices(), handle->stream); - Vector eigVals(n_eig_vects, handle->stream); - Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); - int restartIter_lanczos = 15 + n_eig_vects; - rc = modularity_maximization(network, - n_clusters, - n_eig_vects, - evs_max_it, - restartIter_lanczos, - evs_tol, - kmean_max_it, - kmean_tol, - clust.raw(), - eigVals, - eigVecs, - iters_lanczos, - iters_kmeans); - // give a copy of results to the user - if (rc == NVGRAPH_OK) - { - CHECK_CUDA(cudaMemcpy((int* )clustering, - clust.raw(), - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy((double* )eig_vals, - eigVals.raw(), - (size_t )(n_eig_vects * sizeof(double)), - cudaMemcpyDefault)); - CHECK_CUDA(cudaMemcpy((double* )eig_vects, - eigVecs.raw(), - (size_t )(n_eig_vects * MCSRG->get_num_vertices() - * sizeof(double)), - cudaMemcpyDefault)); - } - break; - } - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - } - - nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeModularityClustering_impl( nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int n_clusters, - const int* clustering, - float * modularity) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->TT != NVGRAPH_CSR_32) // supported topologies - return NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED; - - if (n_clusters < 2) - return NVGRAPH_STATUS_INVALID_VALUE; - - if (clustering == NULL || modularity == NULL) - return NVGRAPH_STATUS_INVALID_VALUE; - - switch (descrG->T) - { - case CUDA_R_32F: - { - float mod; - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || n_clusters > static_cast(MCSRG->get_num_vertices())) - return NVGRAPH_STATUS_INVALID_VALUE; - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - Vector clust(MCSRG->get_num_vertices(), handle->stream); - CHECK_CUDA(cudaMemcpy(clust.raw(), - (int* )clustering, - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - rc = analyzeModularity(network, - n_clusters, - clust.raw(), - mod); - *modularity = mod; - break; - } - case CUDA_R_64F: - 
{ - double mod; - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - if (weight_index >= MCSRG->get_num_edge_dim() - || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 - return NVGRAPH_STATUS_INVALID_VALUE; - Vector clust(MCSRG->get_num_vertices(), handle->stream); - CHECK_CUDA(cudaMemcpy(clust.raw(), - (int* )clustering, - (size_t )(MCSRG->get_num_vertices() * sizeof(int)), - cudaMemcpyDefault)); - nvgraph::ValuedCsrGraph network = - *MCSRG->get_valued_csr_graph(weight_index); - rc = analyzeModularity(network, - n_clusters, - clust.raw(), - mod); - *modularity = static_cast(mod); - break; - } - - default: - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - } - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - } + int max_it; + int ss_sz; + float tol; + + if (max_iter > 0) + max_it = max_iter; + else + max_it = 500; + + if (subspace_size > 0) + ss_sz = subspace_size; + else + ss_sz = 8; + + if (tolerance == 0.0f) + tol = 1.0E-6f; + else if (tolerance < 1.0f && tolerance > 0.0f) + tol = tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + float alphaT = *static_cast(alpha); + if (alphaT <= 0.0f || alphaT >= 1.0f) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || bookmark >= MCSRG->get_num_vertex_dim() + || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector guess(n, handle->stream), eigVals(1, handle->stream); + if (has_guess) + guess.copy(MCSRG->get_vertex_dim(rank)); + else + guess.fill(static_cast(1.0 / n)); + nvgraph::ImplicitArnoldi iram_solver(*MCSRG->get_valued_csr_graph(weight_index), + MCSRG->get_vertex_dim(bookmark), + tol, + max_it, + alphaT); + rc = iram_solver.solve(ss_sz, 1, guess, eigVals, MCSRG->get_vertex_dim(rank)); + break; + } + case CUDA_R_64F: + { + // curently iram solver accept float for alpha + double alphaTemp = *static_cast(alpha); + float alphaT = static_cast(alphaTemp); + if (alphaT <= 0.0f || alphaT >= 1.0f) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || bookmark >= MCSRG->get_num_vertex_dim() + || rank >= MCSRG->get_num_vertex_dim()) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + + int n = static_cast(MCSRG->get_num_vertices()); + nvgraph::Vector guess(n, handle->stream), eigVals(1, handle->stream); + if (has_guess) + guess.copy(MCSRG->get_vertex_dim(rank)); + else + guess.fill(static_cast(1.0 / n)); + nvgraph::ImplicitArnoldi iram_solver(*MCSRG->get_valued_csr_graph(weight_index), + MCSRG->get_vertex_dim(bookmark), + tol, + max_it, + alphaT); + rc = iram_solver.solve(ss_sz, 1, guess, eigVals, MCSRG->get_vertex_dim(rank)); + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByVertex_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subvertices, + size_t numvertices) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + typedef int IndexType; + + try + { + if (check_context(handle) || + check_graph(descrG) || + !subdescrG || + check_int_size(numvertices) || + check_ptr(subvertices)) + FatalError("Incorrect 
parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (!numvertices) + return NVGRAPH_STATUS_INVALID_VALUE; + + subdescrG->TT = descrG->TT; + subdescrG->T = descrG->T; + + switch (descrG->graphStatus) + { + case HAS_TOPOLOGY: //CsrGraph + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + + Graph* subgraph = extract_subgraph_by_vertices(*CSRG, + subvertices, + numvertices, + handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_TOPOLOGY; + } + break; + + case HAS_VALUES: //MultiValuedCsrGraph + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + + nvgraph::MultiValuedCsrGraph* subgraph = + extract_subgraph_by_vertices(*MCSRG, + subvertices, + numvertices, + handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_VALUES; + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + + nvgraph::MultiValuedCsrGraph* subgraph = + extract_subgraph_by_vertices(*MCSRG, + subvertices, + numvertices, + handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_VALUES; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + break; + + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByEdge_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subedges, + size_t numedges) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + //TODO: extract handle->stream info, from handler/nvgraphContext (?) + typedef int IndexType; + + try + { + if (check_context(handle) || + check_graph(descrG) || + !subdescrG || + check_int_size(numedges) || + check_ptr(subedges)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (!numedges) + return NVGRAPH_STATUS_INVALID_VALUE; + + subdescrG->TT = descrG->TT; + subdescrG->T = descrG->T; + + switch (descrG->graphStatus) + { + case HAS_TOPOLOGY: //CsrGraph + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + Graph* subgraph = extract_subgraph_by_edges(*CSRG, + subedges, + numedges, + handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_TOPOLOGY; + } + break; + + case HAS_VALUES: //MultiValuedCsrGraph + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + + nvgraph::MultiValuedCsrGraph* subgraph = + extract_subgraph_by_edges(*MCSRG, subedges, numedges, handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_VALUES; + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + + nvgraph::MultiValuedCsrGraph* subgraph = + extract_subgraph_by_edges(*MCSRG, subedges, numedges, handle->stream); + + subdescrG->graph_handle = subgraph; + subdescrG->graphStatus = HAS_VALUES; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + break; + + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphBalancedCutClustering_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int n_eig_vects, + const int evs_type, + const float evs_tolerance, + const int evs_max_iter, + const float kmean_tolerance, + const int 
kmean_max_iter, + int* clustering, + void* eig_vals, + void* eig_vects) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + int evs_max_it, kmean_max_it; + int iters_lanczos, iters_kmeans; + float evs_tol, kmean_tol; + + if (evs_max_iter > 0) + evs_max_it = evs_max_iter; + else + evs_max_it = 4000; + + if (evs_tolerance == 0.0f) + evs_tol = 1.0E-3f; + else if (evs_tolerance < 1.0f && evs_tolerance > 0.0f) + evs_tol = evs_tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + if (kmean_max_iter > 0) + kmean_max_it = kmean_max_iter; + else + kmean_max_it = 200; + + if (kmean_tolerance == 0.0f) + kmean_tol = 1.0E-2f; + else if (kmean_tolerance < 1.0f && kmean_tolerance > 0.0f) + kmean_tol = kmean_tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + if (n_clusters < 2) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (n_eig_vects > n_clusters) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (!(evs_type == 0 || evs_type == 1)) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (clustering == NULL || eig_vals == NULL || eig_vects == NULL) + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + Vector eigVals(n_eig_vects, handle->stream); + Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); + + if (evs_type == 0) + { + int restartIter_lanczos = 15 + n_eig_vects; + rc = partition(network, + n_clusters, + n_eig_vects, + evs_max_it, + restartIter_lanczos, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + } + else + { + cusolverDnHandle_t cusolverHandle; + cusolverDnCreate(&cusolverHandle); + rc = partition_lobpcg(network, + NULL, // preconditioner + cusolverHandle, + n_clusters, + n_eig_vects, + evs_max_it, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + } + // give a copy of results to the user + if (rc == NVGRAPH_OK) + { + CHECK_CUDA(cudaMemcpy((int* )clustering, + clust.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((float* )eig_vals, + eigVals.raw(), + (size_t )(n_eig_vects * sizeof(float)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((float* )eig_vects, + eigVecs.raw(), + (size_t )(n_eig_vects * MCSRG->get_num_vertices() + * sizeof(float)), + cudaMemcpyDefault)); + } + + break; + } + case CUDA_R_64F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + Vector eigVals(n_eig_vects, 
handle->stream); + Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); + if (evs_type == 0) + { + int restartIter_lanczos = 15 + n_eig_vects; + rc = partition(network, + n_clusters, + n_eig_vects, + evs_max_it, + restartIter_lanczos, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + } + else + { + cusolverDnHandle_t cusolverHandle; + cusolverDnCreate(&cusolverHandle); + rc = partition_lobpcg(network, + NULL, // preconditioner + cusolverHandle, + n_clusters, + n_eig_vects, + evs_max_it, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + } + // give a copy of results to the user + if (rc == NVGRAPH_OK) + { + CHECK_CUDA(cudaMemcpy((int* )clustering, + clust.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((double* )eig_vals, + eigVals.raw(), + (size_t )(n_eig_vects * sizeof(double)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((double* )eig_vects, + eigVecs.raw(), + (size_t )(n_eig_vects * MCSRG->get_num_vertices() + * sizeof(double)), + cudaMemcpyDefault)); + } + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeBalancedCut_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int* clustering, + float * edgeCut, + float * ratioCut) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + if (n_clusters < 2) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (clustering == NULL || edgeCut == NULL || ratioCut == NULL) + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + float edge_cut, ratio_cut; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + CHECK_CUDA(cudaMemcpy(clust.raw(), + (int* )clustering, + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + rc = analyzePartition(network, + n_clusters, + clust.raw(), + edge_cut, + ratio_cut); + *edgeCut = edge_cut; + *ratioCut = ratio_cut; + break; + } + case CUDA_R_64F: + { + double edge_cut, ratio_cut; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + CHECK_CUDA(cudaMemcpy(clust.raw(), + (int* )clustering, + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + rc = analyzePartition(network, + n_clusters, + clust.raw(), + edge_cut, + ratio_cut); + *edgeCut = 
static_cast(edge_cut); + *ratioCut = static_cast(ratio_cut); + break; + } + + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + + } + + nvgraphStatus_t NVGRAPH_API nvgraphHeavyEdgeMatching_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const nvgraphEdgeWeightMatching_t similarity_metric, + int* aggregates, + size_t* num_aggregates) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + if (aggregates == NULL) + return NVGRAPH_STATUS_INVALID_VALUE; + Matching_t sim_metric; + switch (similarity_metric) + { + case NVGRAPH_UNSCALED: { + sim_metric = USER_PROVIDED; + break; + } + case NVGRAPH_SCALED_BY_ROW_SUM: { + sim_metric = SCALED_BY_ROW_SUM; + break; + } + case NVGRAPH_SCALED_BY_DIAGONAL: { + sim_metric = SCALED_BY_DIAGONAL; + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + + switch (descrG->T) + { + case CUDA_R_32F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim()) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector agg(MCSRG->get_num_vertices(), handle->stream); + int num_agg = 0; + nvgraph::Size2Selector one_phase_hand_checking(sim_metric); + rc = one_phase_hand_checking.setAggregates(network, agg, num_agg); + *num_aggregates = static_cast(num_agg); + CHECK_CUDA(cudaMemcpy((int* )aggregates, + agg.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + break; + } + case CUDA_R_64F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim()) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector agg(MCSRG->get_num_vertices(), handle->stream); + Vector agg_global(MCSRG->get_num_vertices(), handle->stream); + int num_agg = 0; + nvgraph::Size2Selector one_phase_hand_checking(sim_metric); + rc = one_phase_hand_checking.setAggregates(network, agg, num_agg); + *num_aggregates = static_cast(num_agg); + CHECK_CUDA(cudaMemcpy((int* )aggregates, + agg.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + + } + + nvgraphStatus_t NVGRAPH_API nvgraphSpectralModularityMaximization_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int n_eig_vects, + const float evs_tolerance, + const int evs_max_iter, + const float kmean_tolerance, + const int kmean_max_iter, + int* clustering, + void* eig_vals, + void* eig_vects) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != 
NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED; + + int evs_max_it, kmean_max_it; + int iters_lanczos, iters_kmeans; + float evs_tol, kmean_tol; + + if (evs_max_iter > 0) + evs_max_it = evs_max_iter; + else + evs_max_it = 4000; + + if (evs_tolerance == 0.0f) + evs_tol = 1.0E-3f; + else if (evs_tolerance < 1.0f && evs_tolerance > 0.0f) + evs_tol = evs_tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + if (kmean_max_iter > 0) + kmean_max_it = kmean_max_iter; + else + kmean_max_it = 200; + + if (kmean_tolerance == 0.0f) + kmean_tol = 1.0E-2f; + else if (kmean_tolerance < 1.0f && kmean_tolerance > 0.0f) + kmean_tol = kmean_tolerance; + else + return NVGRAPH_STATUS_INVALID_VALUE; + + if (n_clusters < 2) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (n_eig_vects > n_clusters) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (clustering == NULL || eig_vals == NULL || eig_vects == NULL) + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + Vector eigVals(n_eig_vects, handle->stream); + Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); + int restartIter_lanczos = 15 + n_eig_vects; + rc = modularity_maximization(network, + n_clusters, + n_eig_vects, + evs_max_it, + restartIter_lanczos, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + + // give a copy of results to the user + if (rc == NVGRAPH_OK) + { + CHECK_CUDA(cudaMemcpy((int* )clustering, + clust.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((float* )eig_vals, + eigVals.raw(), + (size_t )(n_eig_vects * sizeof(float)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((float* )eig_vects, + eigVecs.raw(), + (size_t )(n_eig_vects * MCSRG->get_num_vertices() + * sizeof(float)), + cudaMemcpyDefault)); + } + + break; + } + case CUDA_R_64F: + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + Vector eigVals(n_eig_vects, handle->stream); + Vector eigVecs(MCSRG->get_num_vertices() * n_eig_vects, handle->stream); + int restartIter_lanczos = 15 + n_eig_vects; + rc = modularity_maximization(network, + n_clusters, + n_eig_vects, + evs_max_it, + restartIter_lanczos, + evs_tol, + kmean_max_it, + kmean_tol, + clust.raw(), + eigVals, + eigVecs, + iters_lanczos, + iters_kmeans); + // give a copy of results to the user + if (rc == NVGRAPH_OK) + { + CHECK_CUDA(cudaMemcpy((int* )clustering, + clust.raw(), + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((double* )eig_vals, + eigVals.raw(), + (size_t )(n_eig_vects * sizeof(double)), + cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy((double* )eig_vects, + eigVecs.raw(), + (size_t )(n_eig_vects * MCSRG->get_num_vertices() + * 
sizeof(double)), + cudaMemcpyDefault)); + } + break; + } + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } + + nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeModularityClustering_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int* clustering, + float * modularity) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_int_size(weight_index)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->graphStatus != HAS_VALUES) // need a MultiValuedCsrGraph + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->TT != NVGRAPH_CSR_32) // supported topologies + return NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED; + + if (n_clusters < 2) + return NVGRAPH_STATUS_INVALID_VALUE; + + if (clustering == NULL || modularity == NULL) + return NVGRAPH_STATUS_INVALID_VALUE; + + switch (descrG->T) + { + case CUDA_R_32F: + { + float mod; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) + return NVGRAPH_STATUS_INVALID_VALUE; + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + Vector clust(MCSRG->get_num_vertices(), handle->stream); + CHECK_CUDA(cudaMemcpy(clust.raw(), + (int* )clustering, + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + rc = analyzeModularity(network, + n_clusters, + clust.raw(), + mod); + *modularity = mod; + break; + } + case CUDA_R_64F: + { + double mod; + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + if (weight_index >= MCSRG->get_num_edge_dim() + || n_clusters > static_cast(MCSRG->get_num_vertices())) // base index is 0 + return NVGRAPH_STATUS_INVALID_VALUE; + Vector clust(MCSRG->get_num_vertices(), handle->stream); + CHECK_CUDA(cudaMemcpy(clust.raw(), + (int* )clustering, + (size_t )(MCSRG->get_num_vertices() * sizeof(int)), + cudaMemcpyDefault)); + nvgraph::ValuedCsrGraph network = + *MCSRG->get_valued_csr_graph(weight_index); + rc = analyzeModularity(network, + n_clusters, + clust.raw(), + mod); + *modularity = static_cast(mod); + break; + } + + default: + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + } + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } #ifndef NVGRAPH_LIGHT - nvgraphStatus_t NVGRAPH_API nvgraphContractGraph_impl(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t contrdescrG, - int *aggregates, - size_t numaggregates, - nvgraphSemiringOps_t VertexCombineOp, - nvgraphSemiringOps_t VertexReduceOp, - nvgraphSemiringOps_t EdgeCombineOp, - nvgraphSemiringOps_t EdgeReduceOp, - int flag) //unused, for now - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - typedef int IndexType; - - try { - if (check_context(handle) || - check_graph(descrG) || - !contrdescrG || - check_int_size(numaggregates) || - check_ptr(aggregates)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - contrdescrG->TT = descrG->TT; - contrdescrG->T = descrG->T; - - switch (descrG->graphStatus) - { - case HAS_TOPOLOGY: //CsrGraph - { - nvgraph::CsrGraph *CSRG = - static_cast*>(descrG->graph_handle); - - Graph* contracted_graph = NULL; - - switch (VertexCombineOp) - { - case NVGRAPH_MULTIPLY: - contracted_graph = contract_graph_csr_mul(*CSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - 
VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_SUM: - contracted_graph = contract_graph_csr_sum(*CSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_MIN: - contracted_graph = contract_graph_csr_min(*CSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_MAX: - contracted_graph = contract_graph_csr_max(*CSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - } - - contrdescrG->graph_handle = contracted_graph; - contrdescrG->graphStatus = HAS_TOPOLOGY; - } - break; - - case HAS_VALUES: //MultiValuedCsrGraph - if (descrG->T == CUDA_R_32F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - nvgraph::MultiValuedCsrGraph* contracted_graph = NULL; - - switch (VertexCombineOp) - { - case NVGRAPH_MULTIPLY: - contracted_graph = contract_graph_mv_float_mul(*MCSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_SUM: - contracted_graph = contract_graph_mv_float_sum(*MCSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_MIN: - contracted_graph = contract_graph_mv_float_min(*MCSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_MAX: - contracted_graph = contract_graph_mv_float_max(*MCSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - } - - contrdescrG->graph_handle = contracted_graph; - contrdescrG->graphStatus = HAS_VALUES; - } - else if (descrG->T == CUDA_R_64F) - { - nvgraph::MultiValuedCsrGraph *MCSRG = - static_cast*>(descrG->graph_handle); - - nvgraph::MultiValuedCsrGraph* contracted_graph = NULL; - - switch (VertexCombineOp) - { - case NVGRAPH_MULTIPLY: - contracted_graph = contract_graph_mv_double_mul(*MCSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_SUM: - contracted_graph = contract_graph_mv_double_sum(*MCSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_MIN: - contracted_graph = contract_graph_mv_double_min(*MCSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - case NVGRAPH_MAX: - contracted_graph = contract_graph_mv_double_max(*MCSRG, - aggregates, - numaggregates, - handle->stream, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp); - break; - } - - contrdescrG->graph_handle = contracted_graph; - contrdescrG->graphStatus = HAS_VALUES; - } - else - return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; - break; - - default: - return NVGRAPH_STATUS_INVALID_VALUE; - } - - } - NVGRAPH_CATCHES(rc) - - return getCAPIStatusForError(rc); - } + nvgraphStatus_t NVGRAPH_API nvgraphContractGraph_impl(nvgraphHandle_t handle, + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t contrdescrG, + int *aggregates, + size_t numaggregates, + nvgraphSemiringOps_t VertexCombineOp, + nvgraphSemiringOps_t VertexReduceOp, + 
nvgraphSemiringOps_t EdgeCombineOp, + nvgraphSemiringOps_t EdgeReduceOp, + int flag) {//unused, for now + NVGRAPH_ERROR rc = NVGRAPH_OK; + typedef int IndexType; + + try { + if (check_context(handle) || + check_graph(descrG) || + !contrdescrG || + check_int_size(numaggregates) || + check_ptr(aggregates)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + contrdescrG->TT = descrG->TT; + contrdescrG->T = descrG->T; + + switch (descrG->graphStatus) + { + case HAS_TOPOLOGY: //CsrGraph + { + nvgraph::CsrGraph *CSRG = + static_cast*>(descrG->graph_handle); + + Graph* contracted_graph = NULL; + + switch (VertexCombineOp) + { + case NVGRAPH_MULTIPLY: + contracted_graph = contract_graph_csr_mul(*CSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_SUM: + contracted_graph = contract_graph_csr_sum(*CSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_MIN: + contracted_graph = contract_graph_csr_min(*CSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_MAX: + contracted_graph = contract_graph_csr_max(*CSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + } + + contrdescrG->graph_handle = contracted_graph; + contrdescrG->graphStatus = HAS_TOPOLOGY; + } + break; + + case HAS_VALUES: //MultiValuedCsrGraph + if (descrG->T == CUDA_R_32F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + nvgraph::MultiValuedCsrGraph* contracted_graph = NULL; + + switch (VertexCombineOp) + { + case NVGRAPH_MULTIPLY: + contracted_graph = contract_graph_mv_float_mul(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_SUM: + contracted_graph = contract_graph_mv_float_sum(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_MIN: + contracted_graph = contract_graph_mv_float_min(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_MAX: + contracted_graph = contract_graph_mv_float_max(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + } + + contrdescrG->graph_handle = contracted_graph; + contrdescrG->graphStatus = HAS_VALUES; + } + else if (descrG->T == CUDA_R_64F) + { + nvgraph::MultiValuedCsrGraph *MCSRG = + static_cast*>(descrG->graph_handle); + + nvgraph::MultiValuedCsrGraph* contracted_graph = NULL; + + switch (VertexCombineOp) + { + case NVGRAPH_MULTIPLY: + contracted_graph = contract_graph_mv_double_mul(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_SUM: + contracted_graph = contract_graph_mv_double_sum(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + case NVGRAPH_MIN: + contracted_graph = contract_graph_mv_double_min(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + 
EdgeReduceOp); + break; + case NVGRAPH_MAX: + contracted_graph = contract_graph_mv_double_max(*MCSRG, + aggregates, + numaggregates, + handle->stream, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp); + break; + } + + contrdescrG->graph_handle = contracted_graph; + contrdescrG->graphStatus = HAS_VALUES; + } + else + return NVGRAPH_STATUS_TYPE_NOT_SUPPORTED; + break; + + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + + } + NVGRAPH_CATCHES(rc) + + return getCAPIStatusForError(rc); + } #endif - - nvgraphStatus_t NVGRAPH_API nvgraphSpectralClustering_impl(nvgraphHandle_t handle, // nvGRAPH library handle. - const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 or NVGRAPH_CSR_32 at least 1 edge set (weights) - const size_t weight_index, // Index of the edge set for the weights. - const struct SpectralClusteringParameter *params, //parameters, see struct SpectralClusteringParameter - int* clustering, // (output) clustering - void* eig_vals, // (output) eigenvalues - void* eig_vects) // (output) eigenvectors - { - if (check_ptr(params) || check_ptr(clustering) || check_ptr(eig_vals) || check_ptr(eig_vects)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (params->algorithm == NVGRAPH_MODULARITY_MAXIMIZATION) - return nvgraph::nvgraphSpectralModularityMaximization_impl(handle, - descrG, - weight_index, - params->n_clusters, - params->n_eig_vects, - params->evs_tolerance, - params->evs_max_iter, - params->kmean_tolerance, - params->kmean_max_iter, - clustering, - eig_vals, - eig_vects); - else if (params->algorithm == NVGRAPH_BALANCED_CUT_LANCZOS) - return nvgraph::nvgraphBalancedCutClustering_impl(handle, - descrG, - weight_index, - params->n_clusters, - params->n_eig_vects, - 0, - params->evs_tolerance, - params->evs_max_iter, - params->kmean_tolerance, - params->kmean_max_iter, - clustering, - eig_vals, - eig_vects); - else if (params->algorithm == NVGRAPH_BALANCED_CUT_LOBPCG) - return nvgraph::nvgraphBalancedCutClustering_impl(handle, - descrG, - weight_index, - params->n_clusters, - params->n_eig_vects, - 1, - params->evs_tolerance, - params->evs_max_iter, - params->kmean_tolerance, - params->kmean_max_iter, - clustering, - eig_vals, - eig_vects); - else - return NVGRAPH_STATUS_INVALID_VALUE; - } - - nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeClustering_impl(nvgraphHandle_t handle, // nvGRAPH library handle. - const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 at least 1 edge set (weights) - const size_t weight_index, // Index of the edge set for the weights. - const int n_clusters, //number of clusters - const int* clustering, // clustering to analyse - nvgraphClusteringMetric_t metric, // metric to compute to measure the clustering quality - float * score) // (output) clustering score telling how good the clustering is for the selected metric. 
- { - if (check_ptr(clustering) || check_ptr(score)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - if (metric == NVGRAPH_MODULARITY) - return nvgraphAnalyzeModularityClustering_impl(handle, - descrG, - weight_index, - n_clusters, - clustering, - score); - else if (metric == NVGRAPH_EDGE_CUT) - { - float dummy = 0; - return nvgraph::nvgraphAnalyzeBalancedCut_impl(handle, - descrG, - weight_index, - n_clusters, - clustering, - score, - &dummy); - } - else if (metric == NVGRAPH_RATIO_CUT) - { - float dummy = 0; - return nvgraph::nvgraphAnalyzeBalancedCut_impl(handle, - descrG, - weight_index, - n_clusters, - clustering, - &dummy, - score); - } - else - return NVGRAPH_STATUS_INVALID_VALUE; - } - - nvgraphStatus_t NVGRAPH_API nvgraphTriangleCount_impl(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - uint64_t* result) - { - NVGRAPH_ERROR rc = NVGRAPH_OK; - try - { - if (check_context(handle) || check_graph(descrG) || check_ptr(result)) - FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); - - if (descrG->TT != NVGRAPH_CSR_32 && descrG->TT != NVGRAPH_CSC_32) // supported topologies - return NVGRAPH_STATUS_INVALID_VALUE; - - if (descrG->graphStatus != HAS_TOPOLOGY && descrG->graphStatus != HAS_VALUES) - { - return NVGRAPH_STATUS_INVALID_VALUE; // should have topology - } - - nvgraph::CsrGraph *CSRG = static_cast*>(descrG->graph_handle); - if (CSRG == NULL) - return NVGRAPH_STATUS_MAPPING_ERROR; - nvgraph::triangles_counting::TrianglesCount counter(*CSRG); /* stream, device */ - rc = counter.count(); - uint64_t s_res = counter.get_triangles_count(); - *result = static_cast(s_res); - - } - NVGRAPH_CATCHES(rc) - return getCAPIStatusForError(rc); - } + + nvgraphStatus_t NVGRAPH_API nvgraphSpectralClustering_impl(nvgraphHandle_t handle, // nvGRAPH library handle. + const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 or NVGRAPH_CSR_32 at least 1 edge set (weights) + const size_t weight_index, // Index of the edge set for the weights. 
+ const struct SpectralClusteringParameter *params, //parameters, see struct SpectralClusteringParameter + int* clustering, // (output) clustering + void* eig_vals, // (output) eigenvalues + void* eig_vects) {// (output) eigenvectors + if (check_ptr(params) || check_ptr(clustering) || check_ptr(eig_vals) || check_ptr(eig_vects)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (params->algorithm == NVGRAPH_MODULARITY_MAXIMIZATION) + return nvgraph::nvgraphSpectralModularityMaximization_impl(handle, + descrG, + weight_index, + params->n_clusters, + params->n_eig_vects, + params->evs_tolerance, + params->evs_max_iter, + params->kmean_tolerance, + params->kmean_max_iter, + clustering, + eig_vals, + eig_vects); + else if (params->algorithm == NVGRAPH_BALANCED_CUT_LANCZOS) + return nvgraph::nvgraphBalancedCutClustering_impl(handle, + descrG, + weight_index, + params->n_clusters, + params->n_eig_vects, + 0, + params->evs_tolerance, + params->evs_max_iter, + params->kmean_tolerance, + params->kmean_max_iter, + clustering, + eig_vals, + eig_vects); + else if (params->algorithm == NVGRAPH_BALANCED_CUT_LOBPCG) + return nvgraph::nvgraphBalancedCutClustering_impl(handle, + descrG, + weight_index, + params->n_clusters, + params->n_eig_vects, + 1, + params->evs_tolerance, + params->evs_max_iter, + params->kmean_tolerance, + params->kmean_max_iter, + clustering, + eig_vals, + eig_vects); + else + return NVGRAPH_STATUS_INVALID_VALUE; + } + + nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeClustering_impl(nvgraphHandle_t handle, // nvGRAPH library handle. + const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 at least 1 edge set (weights) + const size_t weight_index, // Index of the edge set for the weights. + const int n_clusters, //number of clusters + const int* clustering, // clustering to analyse + nvgraphClusteringMetric_t metric, // metric to compute to measure the clustering quality + float * score) {// (output) clustering score telling how good the clustering is for the selected metric. 
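Note: the dispatch above routes NVGRAPH_MODULARITY_MAXIMIZATION, NVGRAPH_BALANCED_CUT_LANCZOS and NVGRAPH_BALANCED_CUT_LOBPCG to the two _impl routines. A minimal host-side sketch of how a caller would drive this path through the public entry point, assuming the usual SpectralClusteringParameter and nvgraphSpectralClustering declarations from nvgraph.h (only the fields read by the implementation above are set):

// Sketch only: error handling trimmed. descrG is assumed to already hold a
// CSR_32 graph with one CUDA_R_32F edge set (the weights) at index 0.
#include <nvgraph.h>
#include <vector>

nvgraphStatus_t run_spectral_clustering(nvgraphHandle_t handle,
                                        nvgraphGraphDescr_t descrG,
                                        int n_vertices) {
    const int n_clusters  = 4;
    const int n_eig_vects = 4;   // must be <= n_clusters, as checked above

    SpectralClusteringParameter params;
    params.n_clusters      = n_clusters;
    params.n_eig_vects     = n_eig_vects;
    params.algorithm       = NVGRAPH_MODULARITY_MAXIMIZATION; // or NVGRAPH_BALANCED_CUT_LANCZOS / _LOBPCG
    params.evs_tolerance   = 0.0f;  // 0 picks the defaults applied above (1e-3 tol, 4000 iters on the modularity path)
    params.evs_max_iter    = 0;
    params.kmean_tolerance = 0.0f;  // 0 picks 1e-2 tol, 200 iters
    params.kmean_max_iter  = 0;

    // Results are copied back with cudaMemcpyDefault, so plain host buffers
    // work on UVA systems; device buffers work as well. Eigen data is float
    // because the edge set was allocated as CUDA_R_32F.
    std::vector<int>   clustering(n_vertices);
    std::vector<float> eig_vals(n_eig_vects);
    std::vector<float> eig_vects(static_cast<size_t>(n_vertices) * n_eig_vects);

    return nvgraphSpectralClustering(handle, descrG, /*weight_index=*/0, &params,
                                     clustering.data(), eig_vals.data(), eig_vects.data());
}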
+ if (check_ptr(clustering) || check_ptr(score)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + if (metric == NVGRAPH_MODULARITY) + return nvgraphAnalyzeModularityClustering_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + score); + else if (metric == NVGRAPH_EDGE_CUT) { + float dummy = 0; + return nvgraph::nvgraphAnalyzeBalancedCut_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + score, + &dummy); + } + else if (metric == NVGRAPH_RATIO_CUT) { + float dummy = 0; + return nvgraph::nvgraphAnalyzeBalancedCut_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + &dummy, + score); + } + else + return NVGRAPH_STATUS_INVALID_VALUE; + } + + nvgraphStatus_t NVGRAPH_API nvgraphTriangleCount_impl(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + uint64_t* result) { + NVGRAPH_ERROR rc = NVGRAPH_OK; + try + { + if (check_context(handle) || check_graph(descrG) || check_ptr(result)) + FatalError("Incorrect parameters.", NVGRAPH_ERR_BAD_PARAMETERS); + + if (descrG->TT != NVGRAPH_CSR_32 && descrG->TT != NVGRAPH_CSC_32) // supported topologies + return NVGRAPH_STATUS_INVALID_VALUE; + + if (descrG->graphStatus != HAS_TOPOLOGY && descrG->graphStatus != HAS_VALUES) + { + return NVGRAPH_STATUS_INVALID_VALUE; // should have topology + } + + nvgraph::CsrGraph *CSRG = static_cast*>(descrG->graph_handle); + if (CSRG == NULL) + return NVGRAPH_STATUS_MAPPING_ERROR; + nvgraph::triangles_counting::TrianglesCount counter(*CSRG); /* stream, device */ + rc = counter.count(); + uint64_t s_res = counter.get_triangles_count(); + *result = static_cast(s_res); + + } + NVGRAPH_CATCHES(rc) + return getCAPIStatusForError(rc); + } } /*namespace nvgraph*/ @@ -3538,337 +3433,324 @@ namespace nvgraph * API *************************/ -nvgraphStatus_t NVGRAPH_API nvgraphGetProperty(libraryPropertyType type, int *value) - { - switch (type) { - case MAJOR_VERSION: - *value = CUDART_VERSION / 1000; - break; - case MINOR_VERSION: - *value = (CUDART_VERSION % 1000) / 10; - break; - case PATCH_LEVEL: - *value = 0; - break; - default: - return NVGRAPH_STATUS_INVALID_VALUE; - } - return NVGRAPH_STATUS_SUCCESS; +nvgraphStatus_t NVGRAPH_API nvgraphGetProperty(libraryPropertyType type, int *value) { + switch (type) { + case MAJOR_VERSION: + *value = CUDART_VERSION / 1000; + break; + case MINOR_VERSION: + *value = (CUDART_VERSION % 1000) / 10; + break; + case PATCH_LEVEL: + *value = 0; + break; + default: + return NVGRAPH_STATUS_INVALID_VALUE; + } + return NVGRAPH_STATUS_SUCCESS; } -nvgraphStatus_t NVGRAPH_API nvgraphCreate(nvgraphHandle_t *handle) - { - return nvgraph::nvgraphCreate_impl(handle); +nvgraphStatus_t NVGRAPH_API nvgraphCreate(nvgraphHandle_t *handle) { + return nvgraph::nvgraphCreate_impl(handle); } nvgraphStatus_t NVGRAPH_API nvgraphCreateMulti(nvgraphHandle_t *handle, - int numDevices, - int* devices) { - return nvgraph::nvgraphCreateMulti_impl(handle, numDevices, devices); + int numDevices, + int* devices) { + return nvgraph::nvgraphCreateMulti_impl(handle, numDevices, devices); } -nvgraphStatus_t NVGRAPH_API nvgraphDestroy(nvgraphHandle_t handle) - { - return nvgraph::nvgraphDestroy_impl(handle); +nvgraphStatus_t NVGRAPH_API nvgraphDestroy(nvgraphHandle_t handle) { + return nvgraph::nvgraphDestroy_impl(handle); } nvgraphStatus_t NVGRAPH_API nvgraphCreateGraphDescr(nvgraphHandle_t handle, - nvgraphGraphDescr_t *descrG) - { - return nvgraph::nvgraphCreateGraphDescr_impl(handle, descrG); + nvgraphGraphDescr_t *descrG) { + return 
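Note: nvgraphAnalyzeClustering_impl above reuses the balanced-cut analysis for both NVGRAPH_EDGE_CUT and NVGRAPH_RATIO_CUT by pointing the unrequested output at a dummy. A hedged sketch of the corresponding host-side calls, using the public wrappers that appear further down in this file:

// Sketch only: descrG is assumed to hold CSR topology plus one edge weight
// set; clustering_labels is one int label per vertex (e.g. the output of
// nvgraphSpectralClustering above).
#include <nvgraph.h>
#include <cstdint>

nvgraphStatus_t score_clustering(nvgraphHandle_t handle, nvgraphGraphDescr_t descrG,
                                 int n_clusters, const int* clustering_labels) {
    float modularity = 0.0f, edge_cut = 0.0f;

    nvgraphStatus_t st = nvgraphAnalyzeClustering(handle, descrG, /*weight_index=*/0,
                                                  n_clusters, clustering_labels,
                                                  NVGRAPH_MODULARITY, &modularity);
    if (st != NVGRAPH_STATUS_SUCCESS) return st;

    // Internally this is nvgraphAnalyzeBalancedCut with the ratio-cut output
    // pointed at a dummy, exactly as in the dispatch above.
    st = nvgraphAnalyzeClustering(handle, descrG, /*weight_index=*/0,
                                  n_clusters, clustering_labels,
                                  NVGRAPH_EDGE_CUT, &edge_cut);
    if (st != NVGRAPH_STATUS_SUCCESS) return st;

    // Triangle counting needs only the topology (CSR_32 or CSC_32), no weights.
    uint64_t n_triangles = 0;
    return nvgraphTriangleCount(handle, descrG, &n_triangles);
}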
nvgraph::nvgraphCreateGraphDescr_impl(handle, descrG); } nvgraphStatus_t NVGRAPH_API nvgraphDestroyGraphDescr(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG) - { - return nvgraph::nvgraphDestroyGraphDescr_impl(handle, descrG); + nvgraphGraphDescr_t descrG) { + return nvgraph::nvgraphDestroyGraphDescr_impl(handle, descrG); } -nvgraphStatus_t NVGRAPH_API nvgraphSetStream(nvgraphHandle_t handle, cudaStream_t stream) - { - return nvgraph::nvgraphSetStream_impl(handle, stream); +nvgraphStatus_t NVGRAPH_API nvgraphSetStream(nvgraphHandle_t handle, cudaStream_t stream) { + return nvgraph::nvgraphSetStream_impl(handle, stream); } nvgraphStatus_t NVGRAPH_API nvgraphSetGraphStructure(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t topologyType) - { - return nvgraph::nvgraphSetGraphStructure_impl(handle, descrG, topologyData, topologyType); + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t topologyType) { + return nvgraph::nvgraphSetGraphStructure_impl(handle, descrG, topologyData, topologyType); } + nvgraphStatus_t NVGRAPH_API nvgraphGetGraphStructure(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t* topologyType) - { - return nvgraph::nvgraphGetGraphStructure_impl(handle, descrG, topologyData, topologyType); + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t* topologyType) { + return nvgraph::nvgraphGetGraphStructure_impl(handle, descrG, topologyData, topologyType); } nvgraphStatus_t NVGRAPH_API nvgraphAllocateVertexData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t numsets, - cudaDataType_t *settypes) - { - return nvgraph::nvgraphAllocateVertexData_impl(handle, descrG, numsets, settypes); + nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes) { + return nvgraph::nvgraphAllocateVertexData_impl(handle, descrG, numsets, settypes); } nvgraphStatus_t NVGRAPH_API nvgraphAllocateEdgeData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t numsets, - cudaDataType_t *settypes) - { - return nvgraph::nvgraphAllocateEdgeData_impl(handle, descrG, numsets, settypes); + nvgraphGraphDescr_t descrG, + size_t numsets, + cudaDataType_t *settypes) { + return nvgraph::nvgraphAllocateEdgeData_impl(handle, descrG, numsets, settypes); } nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByVertex(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t subdescrG, - int *subvertices, - size_t numvertices) - { - return nvgraph::nvgraphExtractSubgraphByVertex_impl(handle, - descrG, - subdescrG, - subvertices, - numvertices); + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subvertices, + size_t numvertices) { + return nvgraph::nvgraphExtractSubgraphByVertex_impl(handle, + descrG, + subdescrG, + subvertices, + numvertices); } nvgraphStatus_t NVGRAPH_API nvgraphExtractSubgraphByEdge(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t subdescrG, - int *subedges, - size_t numedges) - { - return nvgraph::nvgraphExtractSubgraphByEdge_impl(handle, descrG, subdescrG, subedges, numedges); + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t subdescrG, + int *subedges, + size_t numedges) { + return nvgraph::nvgraphExtractSubgraphByEdge_impl(handle, descrG, subdescrG, subedges, numedges); } nvgraphStatus_t NVGRAPH_API nvgraphSetVertexData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *vertexData, - size_t setnum) - { - return 
nvgraph::nvgraphSetVertexData_impl(handle, descrG, vertexData, setnum); + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum) { + return nvgraph::nvgraphSetVertexData_impl(handle, descrG, vertexData, setnum); } nvgraphStatus_t NVGRAPH_API nvgraphGetVertexData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *vertexData, - size_t setnum) - { - return nvgraph::nvgraphGetVertexData_impl(handle, descrG, vertexData, setnum); + nvgraphGraphDescr_t descrG, + void *vertexData, + size_t setnum) { + return nvgraph::nvgraphGetVertexData_impl(handle, descrG, vertexData, setnum); } nvgraphStatus_t NVGRAPH_API nvgraphConvertTopology(nvgraphHandle_t handle, - nvgraphTopologyType_t srcTType, - void *srcTopology, - void *srcEdgeData, - cudaDataType_t *dataType, - nvgraphTopologyType_t dstTType, - void *dstTopology, - void *dstEdgeData) { - return nvgraph::nvgraphConvertTopology_impl(handle, - srcTType, - srcTopology, - srcEdgeData, - dataType, - dstTType, - dstTopology, - dstEdgeData); + nvgraphTopologyType_t srcTType, + void *srcTopology, + void *srcEdgeData, + cudaDataType_t *dataType, + nvgraphTopologyType_t dstTType, + void *dstTopology, + void *dstEdgeData) { + return nvgraph::nvgraphConvertTopology_impl(handle, + srcTType, + srcTopology, + srcEdgeData, + dataType, + dstTType, + dstTopology, + dstEdgeData); } nvgraphStatus_t NVGRAPH_API nvgraphConvertGraph(nvgraphHandle_t handle, - nvgraphGraphDescr_t srcDescrG, - nvgraphGraphDescr_t dstDescrG, - nvgraphTopologyType_t dstTType) { - return nvgraph::nvgraphConvertGraph_impl(handle, srcDescrG, dstDescrG, dstTType); + nvgraphGraphDescr_t srcDescrG, + nvgraphGraphDescr_t dstDescrG, + nvgraphTopologyType_t dstTType) { + return nvgraph::nvgraphConvertGraph_impl(handle, srcDescrG, dstDescrG, dstTType); } nvgraphStatus_t NVGRAPH_API nvgraphSetEdgeData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *edgeData, - size_t setnum) { - return nvgraph::nvgraphSetEdgeData_impl(handle, descrG, edgeData, setnum); + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum) { + return nvgraph::nvgraphSetEdgeData_impl(handle, descrG, edgeData, setnum); } nvgraphStatus_t NVGRAPH_API nvgraphGetEdgeData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void *edgeData, - size_t setnum) { - return nvgraph::nvgraphGetEdgeData_impl(handle, descrG, edgeData, setnum); + nvgraphGraphDescr_t descrG, + void *edgeData, + size_t setnum) { + return nvgraph::nvgraphGetEdgeData_impl(handle, descrG, edgeData, setnum); } nvgraphStatus_t NVGRAPH_API nvgraphSrSpmv(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const void *alpha, - const size_t x, - const void *beta, - const size_t y, - const nvgraphSemiring_t SR) { - return nvgraph::nvgraphSrSpmv_impl_cub(handle, descrG, weight_index, alpha, x, beta, y, SR); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t x, + const void *beta, + const size_t y, + const nvgraphSemiring_t SR) { + return nvgraph::nvgraphSrSpmv_impl_cub(handle, descrG, weight_index, alpha, x, beta, y, SR); } nvgraphStatus_t NVGRAPH_API nvgraphSssp(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int *source_vert, - const size_t sssp) { - return nvgraph::nvgraphSssp_impl(handle, descrG, weight_index, source_vert, sssp); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t sssp) { + return nvgraph::nvgraphSssp_impl(handle, descrG, 
weight_index, source_vert, sssp); } //nvgraphTraversal typedef enum { - NVGRAPH_TRAVERSAL_DISTANCES_INDEX = 0, - NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX = 1, - NVGRAPH_TRAVERSAL_MASK_INDEX = 2, - NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX = 3, - NVGRAPH_TRAVERSAL_ALPHA = 4, - NVGRAPH_TRAVERSAL_BETA = 5 + NVGRAPH_TRAVERSAL_DISTANCES_INDEX = 0, + NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX = 1, + NVGRAPH_TRAVERSAL_MASK_INDEX = 2, + NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX = 3, + NVGRAPH_TRAVERSAL_ALPHA = 4, + NVGRAPH_TRAVERSAL_BETA = 5 } nvgraphTraversalParameterIndex_t; nvgraphStatus_t NVGRAPH_API nvgraphTraversalParameterInit(nvgraphTraversalParameter_t *param) { - if (check_ptr(param)) - return NVGRAPH_STATUS_INVALID_VALUE; + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; - param->pad[NVGRAPH_TRAVERSAL_DISTANCES_INDEX] = INT_MAX; - param->pad[NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX] = INT_MAX; - param->pad[NVGRAPH_TRAVERSAL_MASK_INDEX] = INT_MAX; - param->pad[NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX] = 0; - param->pad[NVGRAPH_TRAVERSAL_ALPHA] = TRAVERSAL_DEFAULT_ALPHA; - param->pad[NVGRAPH_TRAVERSAL_BETA] = TRAVERSAL_DEFAULT_BETA; + param->pad[NVGRAPH_TRAVERSAL_DISTANCES_INDEX] = INT_MAX; + param->pad[NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX] = INT_MAX; + param->pad[NVGRAPH_TRAVERSAL_MASK_INDEX] = INT_MAX; + param->pad[NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX] = 0; + param->pad[NVGRAPH_TRAVERSAL_ALPHA] = TRAVERSAL_DEFAULT_ALPHA; + param->pad[NVGRAPH_TRAVERSAL_BETA] = TRAVERSAL_DEFAULT_BETA; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetDistancesIndex(nvgraphTraversalParameter_t *param, - const size_t value) { - if (check_ptr(param)) - return NVGRAPH_STATUS_INVALID_VALUE; + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; - param->pad[NVGRAPH_TRAVERSAL_DISTANCES_INDEX] = value; + param->pad[NVGRAPH_TRAVERSAL_DISTANCES_INDEX] = value; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } -nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetDistancesIndex( const nvgraphTraversalParameter_t param, - size_t *value) { - if (check_ptr(value)) - return NVGRAPH_STATUS_INVALID_VALUE; +nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetDistancesIndex(const nvgraphTraversalParameter_t param, + size_t *value) { + if (check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; - *value = param.pad[NVGRAPH_TRAVERSAL_DISTANCES_INDEX]; + *value = param.pad[NVGRAPH_TRAVERSAL_DISTANCES_INDEX]; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetPredecessorsIndex(nvgraphTraversalParameter_t *param, - const size_t value) { - if (check_ptr(param)) - return NVGRAPH_STATUS_INVALID_VALUE; + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; - param->pad[NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX] = value; + param->pad[NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX] = value; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } -nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetPredecessorsIndex( const nvgraphTraversalParameter_t param, - size_t *value) { - if (check_ptr(value)) - return NVGRAPH_STATUS_INVALID_VALUE; +nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetPredecessorsIndex(const nvgraphTraversalParameter_t param, + size_t *value) { + if (check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; - *value = param.pad[NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX]; + *value = param.pad[NVGRAPH_TRAVERSAL_PREDECESSORS_INDEX]; - 
return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetEdgeMaskIndex(nvgraphTraversalParameter_t *param, - const size_t value) { - if (check_ptr(param)) - return NVGRAPH_STATUS_INVALID_VALUE; + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; - param->pad[NVGRAPH_TRAVERSAL_MASK_INDEX] = value; + param->pad[NVGRAPH_TRAVERSAL_MASK_INDEX] = value; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } -nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetEdgeMaskIndex( const nvgraphTraversalParameter_t param, - size_t *value) { - if (check_ptr(value)) - return NVGRAPH_STATUS_INVALID_VALUE; +nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetEdgeMaskIndex(const nvgraphTraversalParameter_t param, + size_t *value) { + if (check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; - *value = param.pad[NVGRAPH_TRAVERSAL_MASK_INDEX]; + *value = param.pad[NVGRAPH_TRAVERSAL_MASK_INDEX]; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetUndirectedFlag(nvgraphTraversalParameter_t *param, - const size_t value) { - if (check_ptr(param)) - return NVGRAPH_STATUS_INVALID_VALUE; + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; - param->pad[NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX] = value; + param->pad[NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX] = value; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } -nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetUndirectedFlag( const nvgraphTraversalParameter_t param, - size_t *value) { - if (check_ptr(value)) - return NVGRAPH_STATUS_INVALID_VALUE; +nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetUndirectedFlag(const nvgraphTraversalParameter_t param, + size_t *value) { + if (check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; - *value = param.pad[NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX]; + *value = param.pad[NVGRAPH_TRAVERSAL_UNDIRECTED_FLAG_INDEX]; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetAlpha(nvgraphTraversalParameter_t *param, - const size_t value) { - if (check_ptr(param)) - return NVGRAPH_STATUS_INVALID_VALUE; + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; - param->pad[NVGRAPH_TRAVERSAL_ALPHA] = value; + param->pad[NVGRAPH_TRAVERSAL_ALPHA] = value; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetAlpha(const nvgraphTraversalParameter_t param, - size_t *value) { - if (check_ptr(value)) - return NVGRAPH_STATUS_INVALID_VALUE; + size_t *value) { + if (check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; - *value = param.pad[NVGRAPH_TRAVERSAL_ALPHA]; + *value = param.pad[NVGRAPH_TRAVERSAL_ALPHA]; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversalSetBeta(nvgraphTraversalParameter_t *param, - const size_t value) { - if (check_ptr(param)) - return NVGRAPH_STATUS_INVALID_VALUE; + const size_t value) { + if (check_ptr(param)) + return NVGRAPH_STATUS_INVALID_VALUE; - param->pad[NVGRAPH_TRAVERSAL_BETA] = value; + param->pad[NVGRAPH_TRAVERSAL_BETA] = value; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversalGetBeta(const nvgraphTraversalParameter_t param, - size_t *value) { - if (check_ptr(value)) - return NVGRAPH_STATUS_INVALID_VALUE; + size_t *value) { + if 
(check_ptr(value)) + return NVGRAPH_STATUS_INVALID_VALUE; - *value = param.pad[NVGRAPH_TRAVERSAL_BETA]; + *value = param.pad[NVGRAPH_TRAVERSAL_BETA]; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } nvgraphStatus_t NVGRAPH_API nvgraphTraversal(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const nvgraphTraversal_t traversalT, - const int *source_vert, - const nvgraphTraversalParameter_t params) { - return nvgraph::nvgraphTraversal_impl(handle, descrG, traversalT, source_vert, params); + const nvgraphGraphDescr_t descrG, + const nvgraphTraversal_t traversalT, + const int *source_vert, + const nvgraphTraversalParameter_t params) { + return nvgraph::nvgraphTraversal_impl(handle, descrG, traversalT, source_vert, params); } /** @@ -3881,243 +3763,234 @@ nvgraphStatus_t NVGRAPH_API nvgraphTraversal(nvgraphHandle_t handle, * @return Status code. */ nvgraphStatus_t NVGRAPH_API nvgraph2dBfs(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const int32_t source_vert, - int32_t* distances, - int32_t* predecessors) { - return nvgraph::nvgraph2dBfs_impl(handle, descrG, source_vert, distances, predecessors); + const nvgraphGraphDescr_t descrG, + const int32_t source_vert, + int32_t* distances, + int32_t* predecessors) { + return nvgraph::nvgraph2dBfs_impl(handle, descrG, source_vert, distances, predecessors); } //nvgraphWidestPath nvgraphStatus_t NVGRAPH_API nvgraphWidestPath(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int *source_vert, - const size_t widest_path) - { - return nvgraph::nvgraphWidestPath_impl(handle, descrG, weight_index, source_vert, widest_path); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int *source_vert, + const size_t widest_path) { + return nvgraph::nvgraphWidestPath_impl(handle, descrG, weight_index, source_vert, widest_path); } nvgraphStatus_t NVGRAPH_API nvgraphPagerank(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const void *alpha, - const size_t bookmark, - const int has_guess, - const size_t pagerank_index, - const float tolerance, - const int max_iter) - { - return nvgraph::nvgraphPagerank_impl(handle, - descrG, - weight_index, - alpha, - bookmark, - has_guess, - pagerank_index, - tolerance, - max_iter); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t bookmark, + const int has_guess, + const size_t pagerank_index, + const float tolerance, + const int max_iter) { + return nvgraph::nvgraphPagerank_impl(handle, + descrG, + weight_index, + alpha, + bookmark, + has_guess, + pagerank_index, + tolerance, + max_iter); } nvgraphStatus_t NVGRAPH_API nvgraphKrylovPagerank(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const void *alpha, - const size_t bookmark, - const float tolerance, - const int max_iter, - const int subspace_size, - const int has_guess, - const size_t rank) - { - return nvgraph::nvgraphKrylovPagerank_impl(handle, - descrG, - weight_index, - alpha, - bookmark, - tolerance, - max_iter, - subspace_size, - has_guess, - rank); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const void *alpha, + const size_t bookmark, + const float tolerance, + const int max_iter, + const int subspace_size, + const int has_guess, + const size_t rank) { + return nvgraph::nvgraphKrylovPagerank_impl(handle, + descrG, + weight_index, + alpha, + bookmark, + tolerance, + max_iter, + subspace_size, + 
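Note: the traversal setters above just pack indices and flags into param.pad[], so a caller composes a parameter block and hands it to nvgraphTraversal by value. A minimal sketch, assuming the BFS enumerator (NVGRAPH_TRAVERSAL_BFS) declared in nvgraph.h:

// Sketch only: the distance and predecessor outputs are vertex data sets that
// must already exist on descrG (e.g. allocated via nvgraphAllocateVertexData).
#include <nvgraph.h>

nvgraphStatus_t bfs_from(nvgraphHandle_t handle, nvgraphGraphDescr_t descrG,
                         int source_vertex) {
    nvgraphTraversalParameter_t params;
    nvgraphTraversalParameterInit(&params);            // INT_MAX sentinels + default alpha/beta

    nvgraphTraversalSetDistancesIndex(&params, 0);     // vertex set 0 receives distances
    nvgraphTraversalSetPredecessorsIndex(&params, 1);  // vertex set 1 receives predecessors
    nvgraphTraversalSetUndirectedFlag(&params, 1);     // treat edges as undirected

    // NVGRAPH_TRAVERSAL_BFS is assumed here; the parameter block is passed by value.
    return nvgraphTraversal(handle, descrG, NVGRAPH_TRAVERSAL_BFS,
                            &source_vertex, params);
}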
has_guess, + rank); } nvgraphStatus_t NVGRAPH_API nvgraphBalancedCutClustering(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int n_clusters, - const int n_eig_vects, - const int evs_type, - const float evs_tolerance, - const int evs_max_iter, - const float kmean_tolerance, - const int kmean_max_iter, - int* clustering, - void* eig_vals, - void* eig_vects) - { - return nvgraph::nvgraphBalancedCutClustering_impl(handle, - descrG, - weight_index, - n_clusters, - n_eig_vects, - evs_type, - evs_tolerance, - evs_max_iter, - kmean_tolerance, - kmean_max_iter, - clustering, - eig_vals, - eig_vects); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int n_eig_vects, + const int evs_type, + const float evs_tolerance, + const int evs_max_iter, + const float kmean_tolerance, + const int kmean_max_iter, + int* clustering, + void* eig_vals, + void* eig_vects) { + return nvgraph::nvgraphBalancedCutClustering_impl(handle, + descrG, + weight_index, + n_clusters, + n_eig_vects, + evs_type, + evs_tolerance, + evs_max_iter, + kmean_tolerance, + kmean_max_iter, + clustering, + eig_vals, + eig_vects); } nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeBalancedCut(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int n_clusters, - const int* clustering, - float * edgeCut, - float * ratioCut) - { - return nvgraph::nvgraphAnalyzeBalancedCut_impl(handle, - descrG, - weight_index, - n_clusters, - clustering, - edgeCut, - ratioCut); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int* clustering, + float * edgeCut, + float * ratioCut) { + return nvgraph::nvgraphAnalyzeBalancedCut_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + edgeCut, + ratioCut); } -nvgraphStatus_t NVGRAPH_API nvgraphHeavyEdgeMatching( nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const nvgraphEdgeWeightMatching_t similarity_metric, - int* aggregates, - size_t* num_aggregates) - { - return nvgraph::nvgraphHeavyEdgeMatching_impl(handle, - descrG, - weight_index, - similarity_metric, - aggregates, - num_aggregates); +nvgraphStatus_t NVGRAPH_API nvgraphHeavyEdgeMatching(nvgraphHandle_t handle, + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const nvgraphEdgeWeightMatching_t similarity_metric, + int* aggregates, + size_t* num_aggregates) { + return nvgraph::nvgraphHeavyEdgeMatching_impl(handle, + descrG, + weight_index, + similarity_metric, + aggregates, + num_aggregates); } nvgraphStatus_t NVGRAPH_API nvgraphSpectralModularityMaximization(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int n_clusters, - const int n_eig_vects, - const float evs_tolerance, - const int evs_max_iter, - const float kmean_tolerance, - const int kmean_max_iter, - int* clustering, - void* eig_vals, - void* eig_vects) - { - return nvgraph::nvgraphSpectralModularityMaximization_impl(handle, - descrG, - weight_index, - n_clusters, - n_eig_vects, - evs_tolerance, - evs_max_iter, - kmean_tolerance, - kmean_max_iter, - clustering, - eig_vals, - eig_vects); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int n_eig_vects, + const float evs_tolerance, + const int evs_max_iter, + const float kmean_tolerance, + const int kmean_max_iter, + int* clustering, + void* eig_vals, + void* eig_vects) { + return 
nvgraph::nvgraphSpectralModularityMaximization_impl(handle, + descrG, + weight_index, + n_clusters, + n_eig_vects, + evs_tolerance, + evs_max_iter, + kmean_tolerance, + kmean_max_iter, + clustering, + eig_vals, + eig_vects); } nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeModularityClustering(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - const size_t weight_index, - const int n_clusters, - const int* clustering, - float * modularity) - { - return nvgraph::nvgraphAnalyzeModularityClustering_impl(handle, - descrG, - weight_index, - n_clusters, - clustering, - modularity); + const nvgraphGraphDescr_t descrG, + const size_t weight_index, + const int n_clusters, + const int* clustering, + float * modularity) { + return nvgraph::nvgraphAnalyzeModularityClustering_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + modularity); } #ifndef NVGRAPH_LIGHT nvgraphStatus_t NVGRAPH_API nvgraphContractGraph(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - nvgraphGraphDescr_t contrdescrG, - int *aggregates, - size_t numaggregates, - nvgraphSemiringOps_t VertexCombineOp, - nvgraphSemiringOps_t VertexReduceOp, - nvgraphSemiringOps_t EdgeCombineOp, - nvgraphSemiringOps_t EdgeReduceOp, - int flag) - { - return nvgraph::nvgraphContractGraph_impl(handle, - descrG, - contrdescrG, - aggregates, - numaggregates, - VertexCombineOp, - VertexReduceOp, - EdgeCombineOp, - EdgeReduceOp, - flag); + nvgraphGraphDescr_t descrG, + nvgraphGraphDescr_t contrdescrG, + int *aggregates, + size_t numaggregates, + nvgraphSemiringOps_t VertexCombineOp, + nvgraphSemiringOps_t VertexReduceOp, + nvgraphSemiringOps_t EdgeCombineOp, + nvgraphSemiringOps_t EdgeReduceOp, + int flag) { + return nvgraph::nvgraphContractGraph_impl(handle, + descrG, + contrdescrG, + aggregates, + numaggregates, + VertexCombineOp, + VertexReduceOp, + EdgeCombineOp, + EdgeReduceOp, + flag); } -#endif +#endif nvgraphStatus_t NVGRAPH_API nvgraphSpectralClustering(nvgraphHandle_t handle, // nvGRAPH library handle. - const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 or NVGRAPH_CSR_32 at least 1 edge set (weights) - const size_t weight_index, // Index of the edge set for the weights. - const struct SpectralClusteringParameter *params, //parameters, see struct SpectralClusteringParameter - int* clustering, // (output) clustering - void* eig_vals, // (output) eigenvalues - void* eig_vects) // (output) eigenvectors - { - return nvgraph::nvgraphSpectralClustering_impl(handle, - descrG, - weight_index, - params, - clustering, - eig_vals, - eig_vects); + const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 or NVGRAPH_CSR_32 at least 1 edge set (weights) + const size_t weight_index, // Index of the edge set for the weights. + const struct SpectralClusteringParameter *params, //parameters, see struct SpectralClusteringParameter + int* clustering, // (output) clustering + void* eig_vals, // (output) eigenvalues + void* eig_vects) // (output) eigenvectors +{ + return nvgraph::nvgraphSpectralClustering_impl(handle, + descrG, + weight_index, + params, + clustering, + eig_vals, + eig_vects); } nvgraphStatus_t NVGRAPH_API nvgraphAnalyzeClustering(nvgraphHandle_t handle, // nvGRAPH library handle. 
- const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 at least 1 edge set (weights) - const size_t weight_index, // Index of the edge set for the weights. - const int n_clusters, //number of clusters - const int* clustering, // clustering to analyse - nvgraphClusteringMetric_t metric, // metric to compute to measure the clustering quality - float * score) // (output) clustering score telling how good the clustering is for the selected metric. - { - return nvgraph::nvgraphAnalyzeClustering_impl(handle, - descrG, - weight_index, - n_clusters, - clustering, - metric, - score); + const nvgraphGraphDescr_t descrG, // nvGRAPH graph descriptor, should contain the connectivity information in NVGRAPH_CSR_32 at least 1 edge set (weights) + const size_t weight_index, // Index of the edge set for the weights. + const int n_clusters, //number of clusters + const int* clustering, // clustering to analyse + nvgraphClusteringMetric_t metric, // metric to compute to measure the clustering quality + float * score) // (output) clustering score telling how good the clustering is for the selected metric. +{ + return nvgraph::nvgraphAnalyzeClustering_impl(handle, + descrG, + weight_index, + n_clusters, + clustering, + metric, + score); } nvgraphStatus_t NVGRAPH_API nvgraphTriangleCount(nvgraphHandle_t handle, - const nvgraphGraphDescr_t descrG, - uint64_t* result) - { - return nvgraph::nvgraphTriangleCount_impl(handle, descrG, result); + const nvgraphGraphDescr_t descrG, + uint64_t* result) +{ + return nvgraph::nvgraphTriangleCount_impl(handle, descrG, result); } -nvgraphStatus_t NVGRAPH_API nvgraphLouvain (cudaDataType_t index_type, cudaDataType_t val_type, const size_t num_vertex, const size_t num_edges, - void* csr_ptr, void* csr_ind, void* csr_val, int weighted, int has_init_cluster, void* init_cluster, +nvgraphStatus_t NVGRAPH_API nvgraphLouvain (cudaDataType_t index_type, cudaDataType_t val_type, const size_t num_vertex, const size_t num_edges, + void* csr_ptr, void* csr_ind, void* csr_val, int weighted, int has_init_cluster, void* init_cluster, void* final_modularity, void* best_cluster_vec, void* num_level) { NVLOUVAIN_STATUS status = NVLOUVAIN_OK; - if ((csr_ptr == NULL) || (csr_ind == NULL) || ((csr_val == NULL) && (weighted == 1)) || + if ((csr_ptr == NULL) || (csr_ind == NULL) || ((csr_val == NULL) && (weighted == 1)) || ((init_cluster == NULL) && (has_init_cluster == 1)) || (final_modularity == NULL) || (best_cluster_vec == NULL) || (num_level == NULL)) return NVGRAPH_STATUS_INVALID_VALUE; @@ -4125,71 +3998,72 @@ nvgraphStatus_t NVGRAPH_API nvgraphLouvain (cudaDataType_t index_type, cudaDataT bool weighted_b = weighted; bool has_init_cluster_b = has_init_cluster; if (val_type == CUDA_R_32F) - status = nvlouvain::louvain ((int*)csr_ptr, (int*)csr_ind, (float*)csr_val, num_vertex, num_edges, - weighted_b, has_init_cluster_b, (int*)init_cluster, *((float*)final_modularity), + status = nvlouvain::louvain ((int*)csr_ptr, (int*)csr_ind, (float*)csr_val, num_vertex, num_edges, + weighted_b, has_init_cluster_b, (int*)init_cluster, *((float*)final_modularity), (int*)best_cluster_vec,*((int*)num_level), log); else - status = nvlouvain::louvain ((int*)csr_ptr, (int*)csr_ind, (double*)csr_val, num_vertex, num_edges, - weighted_b, has_init_cluster_b, (int*)init_cluster, *((double*)final_modularity), + status = nvlouvain::louvain ((int*)csr_ptr, (int*)csr_ind, (double*)csr_val, num_vertex, num_edges, + weighted_b, has_init_cluster_b, 
(int*)init_cluster, *((double*)final_modularity), (int*)best_cluster_vec,*((int*)num_level), log); if (status != NVLOUVAIN_OK) return NVGRAPH_STATUS_INTERNAL_ERROR; - return NVGRAPH_STATUS_SUCCESS; + return NVGRAPH_STATUS_SUCCESS; } -nvgraphStatus_t NVGRAPH_API nvgraphJaccard (cudaDataType_t index_type, cudaDataType_t val_type, const size_t n, +nvgraphStatus_t NVGRAPH_API nvgraphJaccard (cudaDataType_t index_type, cudaDataType_t val_type, const size_t n, const size_t e, void* csr_ptr, void* csr_ind, void* csr_val, int weighted, void* v, void* gamma, void* weight_j) { int status = 0; - if ((csr_ptr == NULL) || (csr_ind == NULL) || ((csr_val == NULL) && (weighted == 1)) || (gamma == NULL) || (weight_j == NULL)) + if ((csr_ptr == NULL) || (csr_ind == NULL) || ((csr_val == NULL) && (weighted == 1)) || (gamma == NULL) || (weight_j == NULL)) return NVGRAPH_STATUS_INVALID_VALUE; bool weighted_b = weighted; + cudaStream_t stream{nullptr}; if (val_type == CUDA_R_32F) { - float* weight_i = NULL, *weight_s = NULL, *work = NULL; - NVG_CUDA_TRY(cudaMalloc ((void**)&weight_i, sizeof(float) * e)); - NVG_CUDA_TRY(cudaMalloc ((void**)&weight_s, sizeof(float) * e)); + float* weight_i = NULL, *weight_s = NULL, *work = NULL; + NVG_RMM_TRY(RMM_ALLOC((void**)&weight_i, sizeof(float) * e, stream)); + NVG_RMM_TRY(RMM_ALLOC((void**)&weight_s, sizeof(float) * e, stream)); if (weighted_b == true) { - NVG_CUDA_TRY(cudaMalloc ((void**)&work, sizeof(float) * n)); + NVG_RMM_TRY(RMM_ALLOC((void**)&work, sizeof(float) * n, stream)); status = nvlouvain::jaccard (n, e, (int*) csr_ptr, (int*) csr_ind, (float*) csr_val, (float*) v, work, *((float*) gamma), weight_i, weight_s, (float*)weight_j); - NVG_CUDA_TRY(cudaFree (work)); + NVG_RMM_TRY(RMM_FREE(work, stream)); } else { - NVG_CUDA_TRY(cudaMalloc ((void**)&work, sizeof(float) * n)); + NVG_RMM_TRY(RMM_ALLOC((void**)&work, sizeof(float) * n, stream)); nvlouvain::fill(e, (float*)weight_j, (float)1.0); status = nvlouvain::jaccard (n, e, (int*) csr_ptr, (int*) csr_ind, (float*) csr_val, (float*) v, work, *((float*) gamma), weight_i, weight_s, (float*)weight_j); - NVG_CUDA_TRY(cudaFree (work)); + NVG_RMM_TRY(RMM_FREE(work, stream)); } - NVG_CUDA_TRY(cudaFree (weight_s)); - NVG_CUDA_TRY(cudaFree (weight_i)); + NVG_RMM_TRY(RMM_FREE(weight_s, stream)); + NVG_RMM_TRY(RMM_FREE(weight_i, stream)); } else { - double* weight_i = NULL, *weight_s = NULL, *work = NULL; - NVG_CUDA_TRY(cudaMalloc ((void**)&weight_i, sizeof(double) * e)); - NVG_CUDA_TRY(cudaMalloc ((void**)&weight_s, sizeof(double) * e)); + double* weight_i = NULL, *weight_s = NULL, *work = NULL; + NVG_RMM_TRY(RMM_ALLOC((void**)&weight_i, sizeof(double) * e, stream)); + NVG_RMM_TRY(RMM_ALLOC((void**)&weight_s, sizeof(double) * e, stream)); if (weighted_b == true) { - NVG_CUDA_TRY(cudaMalloc ((void**)&work, sizeof(double) * n)); + NVG_RMM_TRY(RMM_ALLOC((void**)&work, sizeof(double) * n, stream)); status = nvlouvain::jaccard (n, e, (int*) csr_ptr, (int*) csr_ind, (double*) csr_val, (double*) v, work, *((double*) gamma), weight_i, weight_s, (double*)weight_j); - NVG_CUDA_TRY(cudaFree (work)); + NVG_RMM_TRY(RMM_FREE(work, stream)); } else { - NVG_CUDA_TRY(cudaMalloc ((void**)&work, sizeof(double) * n)); + NVG_RMM_TRY(RMM_ALLOC((void**)&work, sizeof(double) * n, stream)); nvlouvain::fill(e, (double*)weight_j, (double)1.0); status = nvlouvain::jaccard (n, e, (int*) csr_ptr, (int*) csr_ind, (double*) csr_val, (double*) v, work, *((double*) gamma), weight_i, weight_s, (double*)weight_j); - NVG_CUDA_TRY(cudaFree (work)); + 
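Note: nvgraphLouvain above is a thin C wrapper over nvlouvain::louvain that only switches on val_type. A hedged usage sketch, assuming its declaration is visible from nvgraph.h, that the CSR arrays and the cluster output live on the device, and that final_modularity and num_level are host scalars (the wrapper dereferences them directly):

// Sketch only: d_offsets / d_indices / d_weights form a device CSR graph and
// d_best_cluster is a device int array of length num_vertex.
#include <nvgraph.h>
#include <cstddef>

nvgraphStatus_t louvain_float(std::size_t num_vertex, std::size_t num_edges,
                              int* d_offsets, int* d_indices, float* d_weights,
                              int* d_best_cluster) {
    float final_modularity = 0.0f;   // written on the host by the wrapper
    int   num_level        = 0;

    return nvgraphLouvain(CUDA_R_32I, CUDA_R_32F, num_vertex, num_edges,
                          d_offsets, d_indices, d_weights,
                          /*weighted=*/1, /*has_init_cluster=*/0, /*init_cluster=*/nullptr,
                          &final_modularity, d_best_cluster, &num_level);
}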
NVG_RMM_TRY(RMM_FREE(work, stream)); } - NVG_CUDA_TRY(cudaFree (weight_s)); - NVG_CUDA_TRY(cudaFree (weight_i)); + NVG_RMM_TRY(RMM_FREE(weight_s, stream)); + NVG_RMM_TRY(RMM_FREE(weight_i, stream)); } if (status != 0) @@ -4198,27 +4072,25 @@ nvgraphStatus_t NVGRAPH_API nvgraphJaccard (cudaDataType_t index_type, cudaDataT return NVGRAPH_STATUS_SUCCESS; } - nvgraphStatus_t NVGRAPH_API nvgraphAttachGraphStructure(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - void* topologyData, - nvgraphTopologyType_t TT) { - return nvgraph::nvgraphAttachGraphStructure_impl( handle, descrG, topologyData, TT); + nvgraphGraphDescr_t descrG, + void* topologyData, + nvgraphTopologyType_t TT) { + return nvgraph::nvgraphAttachGraphStructure_impl( handle, descrG, topologyData, TT); } nvgraphStatus_t NVGRAPH_API nvgraphAttachVertexData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t setnum, - cudaDataType_t settype, - void *vertexData) { - return nvgraph::nvgraphAttachVertexData_impl( handle, descrG, setnum, settype, vertexData); + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *vertexData) { + return nvgraph::nvgraphAttachVertexData_impl( handle, descrG, setnum, settype, vertexData); } nvgraphStatus_t NVGRAPH_API nvgraphAttachEdgeData(nvgraphHandle_t handle, - nvgraphGraphDescr_t descrG, - size_t setnum, - cudaDataType_t settype, - void *edgeData) { - return nvgraph::nvgraphAttachEdgeData_impl( handle, descrG, setnum, settype, edgeData); + nvgraphGraphDescr_t descrG, + size_t setnum, + cudaDataType_t settype, + void *edgeData) { + return nvgraph::nvgraphAttachEdgeData_impl( handle, descrG, setnum, settype, edgeData); } - diff --git a/cpp/nvgraph/cpp/src/pagerank.cu b/cpp/nvgraph/cpp/src/pagerank.cu index 30ecc3165f5..479bd2326d9 100644 --- a/cpp/nvgraph/cpp/src/pagerank.cu +++ b/cpp/nvgraph/cpp/src/pagerank.cu @@ -30,12 +30,6 @@ #include #include - -#include "debug_macros.h" -#ifdef DEBUG - #define PR_VERBOSE -#endif - namespace nvgraph { template @@ -167,18 +161,6 @@ bool Pagerank::solve_it() template NVGRAPH_ERROR Pagerank::solve(ValueType damping_factor, Vector& initial_guess, Vector& pagerank_vector, float tolerance, int max_it) { - - #ifdef PR_VERBOSE - std::stringstream ss; - ss.str(std::string()); - size_t used_mem, free_mem, total_mem; - ss <<" ------------------PageRank------------------"<< std::endl; - ss <<" --------------------------------------------"<< std::endl; - ss << std::setw(10) << "Iteration" << std::setw(20) << " Mem Usage (MB)" << std::setw(15) << "Residual" << std::endl; - ss <<" --------------------------------------------"<< std::endl; - COUT()<(tolerance); setup(damping_factor, initial_guess, pagerank_vector); @@ -190,25 +172,9 @@ NVGRAPH_ERROR Pagerank::solve(ValueType damping_factor, m_iterations = i; converged = solve_it(); i++; - #ifdef PR_VERBOSE - ss.str(std::string()); - cnmemMemGetInfo(&free_mem, &total_mem, NULL); - used_mem=total_mem-free_mem; - ss << std::setw(10) << i ; - ss.precision(3); - ss << std::setw(20) << std::fixed << used_mem/1024.0/1024.0; - ss << std::setw(15) << std::scientific << m_residual << std::endl; - COUT()< -#include "debug_macros.h" -#ifdef DEBUG - #define SP_VERBOSE 0 -#endif + namespace nvgraph { template @@ -136,41 +133,12 @@ NVGRAPH_ERROR Sssp::solve(IndexType source_index, Vector bool converged = false; int max_it = static_cast(m_network.get_num_edges()), i = 0; - - #ifdef SP_VERBOSE - //int n = static_cast(m_network.get_num_vertices()), nnz = static_cast(m_network.get_num_edges()); - 
//dump_raw_vec(m_network.get_raw_row_offsets(), n, 0); - //dump_raw_vec(m_network.get_raw_column_indices(),n, 0); - //dump_raw_vec(m_network.get_raw_values(), nnz, 0); - - std::stringstream ss; - ss.str(std::string()); - size_t used_mem, free_mem, total_mem; - ss <<" --------------------Sssp--------------------"<< std::endl; - ss <<" --------------------------------------------"<< std::endl; - ss << std::setw(10) << "Iteration" << std::setw(20) << " Mem Usage (MB)" << std::setw(15) << "Residual" << std::endl; - ss <<" --------------------------------------------"<< std::endl; - COUT()<; diff --git a/cpp/nvgraph/cpp/src/triangles_counting_kernels.cu b/cpp/nvgraph/cpp/src/triangles_counting_kernels.cu index d2a961ebd84..740a2afd341 100644 --- a/cpp/nvgraph/cpp/src/triangles_counting_kernels.cu +++ b/cpp/nvgraph/cpp/src/triangles_counting_kernels.cu @@ -33,7 +33,7 @@ #include "sm_utils.h" using namespace cub; -#include "cnmem.h" +#include "rmm/rmm.h" #define TH_CENT_K_LOCLEN (34) #define WP_LEN_TH1 (24) @@ -58,29 +58,25 @@ namespace nvgraph namespace triangles_counting { - -// hide behind - void* tmp_get(size_t size, cudaStream_t stream) - { - void *t = NULL; - cnmemStatus_t status = cnmemMalloc(&t, size, stream); - if (status == CNMEM_STATUS_OUT_OF_MEMORY) - { + // Better return std::unique_ptr than a raw pointer, but we haven't decide + // whether to create our own unique_ptr with RMM's deleter or to implement + // this in librmm. So, we may wait till this decision is made. + void* get_temp_storage(size_t size, cudaStream_t stream) { + auto t = static_cast(nullptr); + auto status = RMM_ALLOC(&t, size, stream); + if (status == RMM_ERROR_OUT_OF_MEMORY) { FatalError("Not enough memory", NVGRAPH_ERR_NO_MEMORY); } - else if (status != CNMEM_STATUS_SUCCESS) - { + else if (status != RMM_SUCCESS) { FatalError("Memory manager internal error (alloc)", NVGRAPH_ERR_UNKNOWN); } return t; } - void tmp_release(void* ptr, cudaStream_t stream) - { - cnmemStatus_t status = cnmemFree(ptr, stream); - if (status != CNMEM_STATUS_SUCCESS) - { + void free_temp_storage(void* ptr, cudaStream_t stream) { + auto status = RMM_FREE(ptr, stream); + if (status != RMM_SUCCESS) { FatalError("Memory manager internal error (release)", NVGRAPH_ERR_UNKNOWN); } } @@ -107,7 +103,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, @@ -115,7 +111,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -135,14 +131,14 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -165,7 +161,7 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items, @@ -174,7 +170,7 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - 
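The comment on get_temp_storage() above leaves open whether a std::unique_ptr carrying an RMM deleter should replace the raw pointer. One hypothetical shape for such a wrapper, built only on the RMM_ALLOC/RMM_FREE macros introduced in this patch; rmm_deleter, temp_storage_ptr and make_temp_storage are illustrative names, not librmm or nvgraph API:

#include <memory>
#include <cuda_runtime.h>
#include "rmm/rmm.h"

// Deleter that returns pool memory on the stream it was allocated on.
struct rmm_deleter {
    cudaStream_t stream;
    void operator()(void* p) const { (void)RMM_FREE(p, stream); }
};

using temp_storage_ptr = std::unique_ptr<void, rmm_deleter>;

// Allocates `size` bytes from the RMM pool; the returned owner frees them
// automatically. A real implementation would mirror the FatalError() paths
// of get_temp_storage() instead of handing back an empty pointer.
inline temp_storage_ptr make_temp_storage(size_t size, cudaStream_t stream)
{
    void* p = nullptr;
    if (RMM_ALLOC(&p, size, stream) != RMM_SUCCESS)
        return temp_storage_ptr{nullptr, rmm_deleter{stream}};
    return temp_storage_ptr{p, rmm_deleter{stream}};
}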
tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -200,7 +196,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, @@ -210,7 +206,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -235,7 +231,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, @@ -243,7 +239,7 @@ namespace nvgraph num_items, begin_bit, end_bit, stream, debug_synchronous); - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -267,7 +263,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, @@ -275,7 +271,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -301,7 +297,7 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, @@ -310,7 +306,7 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -331,14 +327,14 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -359,14 +355,14 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -392,7 +388,7 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, @@ -401,7 +397,7 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -427,7 +423,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, 
stream); cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, @@ -435,7 +431,7 @@ namespace nvgraph stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -456,14 +452,14 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -484,14 +480,14 @@ namespace nvgraph debug_synchronous); cudaCheckError() ; - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, stream, debug_synchronous); cudaCheckError() ; - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -525,7 +521,7 @@ namespace nvgraph num_items, stream, debug_synchronous); cudaCheckError(); - d_temp_storage = tmp_get(temp_storage_bytes, stream); + d_temp_storage = get_temp_storage(temp_storage_bytes, stream); cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, @@ -536,7 +532,7 @@ namespace nvgraph num_items, stream, debug_synchronous); cudaCheckError(); - tmp_release(d_temp_storage, stream); + free_temp_storage(d_temp_storage, stream); return; } @@ -1175,12 +1171,12 @@ namespace nvgraph return; thrust::counting_iterator it(0); NonEmptyRow temp_func(roff); - T* d_out_num = (T*) tmp_get(sizeof(*n_nonempty), stream); + T* d_out_num = (T*) get_temp_storage(sizeof(*n_nonempty), stream); cubIf(it, p_nonempty, d_out_num, n, temp_func, stream); cudaMemcpy(n_nonempty, d_out_num, sizeof(*n_nonempty), cudaMemcpyDeviceToHost); cudaCheckError(); - tmp_release(d_out_num, stream); + free_temp_storage(d_out_num, stream); cudaCheckError(); } @@ -1188,13 +1184,13 @@ namespace nvgraph uint64_t reduce(uint64_t *v_d, T n, cudaStream_t stream) { uint64_t n_h; - uint64_t *n_d = (uint64_t *) tmp_get(sizeof(*n_d), stream); + uint64_t *n_d = (uint64_t *) get_temp_storage(sizeof(*n_d), stream); cubSum(v_d, n_d, n, stream); cudaCheckError(); cudaMemcpy(&n_h, n_d, sizeof(*n_d), cudaMemcpyDeviceToHost); cudaCheckError(); - tmp_release(n_d, stream); + free_temp_storage(n_d, stream); return n_h; } diff --git a/cpp/nvgraph/cpp/src/valued_csr_graph.cpp b/cpp/nvgraph/cpp/src/valued_csr_graph.cpp index 3882c1607c2..abc46d80524 100644 --- a/cpp/nvgraph/cpp/src/valued_csr_graph.cpp +++ b/cpp/nvgraph/cpp/src/valued_csr_graph.cpp @@ -15,7 +15,6 @@ */ #include "valued_csr_graph.hxx" -#include "cnmem_shared_ptr.hxx" // interface with CuMem (memory pool lib) for shared ptr namespace nvgraph { diff --git a/cpp/nvgraph/cpp/src/widest_path.cu b/cpp/nvgraph/cpp/src/widest_path.cu index 4da42856574..101dde6a4e0 100644 --- a/cpp/nvgraph/cpp/src/widest_path.cu +++ b/cpp/nvgraph/cpp/src/widest_path.cu @@ -30,10 +30,6 @@ #include "nvgraph_csrmv.hxx" #include "widest_path.hxx" -#include "debug_macros.h" -#ifdef DEBUG -#define MF_VERBOSE 0 -#endif namespace nvgraph { template @@ -157,35 +153,12 @@ NVGRAPH_ERROR WidestPath::solve(IndexType source_index, setup(source_index, source_connection, widest_path_result); bool converged = false; int max_it = 100000, i = 0; - 
#ifdef MF_VERBOSE - std::stringstream ss; - ss.str(std::string()); - size_t used_mem, free_mem, total_mem; - ss <<" ------------------WidestPath------------------"<< std::endl; - ss <<" --------------------------------------------"<< std::endl; - ss << std::setw(10) << "Iteration" << std::setw(20) << " Mem Usage (MB)" << std::setw(15) << "Residual" << std::endl; - ss <<" --------------------------------------------"<< std::endl; - COUT()<; diff --git a/cpp/nvgraph/cpp/tests/CMakeLists.txt b/cpp/nvgraph/cpp/tests/CMakeLists.txt index eda3443f398..2db70117943 100644 --- a/cpp/nvgraph/cpp/tests/CMakeLists.txt +++ b/cpp/nvgraph/cpp/tests/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.12 FATAL_ERROR) -project(CUDF_TESTS LANGUAGES C CXX CUDA) +project(NVGRAPH_TESTS LANGUAGES C CXX CUDA) ################################################################################################### # - compiler function ----------------------------------------------------------------------------- @@ -8,7 +8,7 @@ project(CUDF_TESTS LANGUAGES C CXX CUDA) function(ConfigureTest CMAKE_TEST_NAME CMAKE_TEST_SRC) add_executable(${CMAKE_TEST_NAME} ${CMAKE_TEST_SRC}) set_target_properties(${CMAKE_TEST_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_link_libraries(${CMAKE_TEST_NAME} gmock gtest gmock_main gtest_main pthread nvgraph_rapids cublas cusparse curand cusolver cudart) + target_link_libraries(${CMAKE_TEST_NAME} gmock gtest gmock_main gtest_main pthread nvgraph_rapids) set_target_properties(${CMAKE_TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gtests") add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) @@ -20,10 +20,10 @@ endfunction(ConfigureTest) include_directories( "${CMAKE_BINARY_DIR}/include" "${CMAKE_SOURCE_DIR}/include" - "${CMAKE_SOURCE_DIR}/thirdparty/cnmem/include" "${CMAKE_SOURCE_DIR}/thirdparty/cub" "${CMAKE_SOURCE_DIR}/../external" "${CMAKE_SOURCE_DIR}/../external/cusp" + "${RMM_INCLUDE}" "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" ) @@ -32,7 +32,8 @@ include_directories( link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link directories for nvcc "${CMAKE_BINARY_DIR}/lib" - "${GTEST_LIBRARY_DIR}") + "${GTEST_LIBRARY_DIR}" + "${RMM_LIBRARY_DIR}") ################################################################################################### ### test sources ################################################################################## diff --git a/cpp/nvgraph/cpp/thirdparty/cnmem b/cpp/nvgraph/cpp/thirdparty/cnmem deleted file mode 160000 index 37896cc9bfc..00000000000 --- a/cpp/nvgraph/cpp/thirdparty/cnmem +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 37896cc9bfc6536a8c878a1e675835c22d827821 diff --git a/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_spmv_orig.cuh b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_spmv_orig.cuh index 7432d58d69a..7a4691b55c3 100644 --- a/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_spmv_orig.cuh +++ b/cpp/nvgraph/external/cub_semiring/device/dispatch/dispatch_spmv_orig.cuh @@ -762,7 +762,7 @@ struct DispatchSpmv #if (CUB_PTX_ARCH == 0) // Init textures - if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break; + if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x, spmv_params.num_cols * sizeof(ValueT)))) break; #endif if (search_grid_size < sm_count) diff --git 
a/cpp/nvgraph/external/cub_semiring/iterator/tex_obj_input_iterator.cuh b/cpp/nvgraph/external/cub_semiring/iterator/tex_obj_input_iterator.cuh index 623609452fd..d47b214ca82 100644 --- a/cpp/nvgraph/external/cub_semiring/iterator/tex_obj_input_iterator.cuh +++ b/cpp/nvgraph/external/cub_semiring/iterator/tex_obj_input_iterator.cuh @@ -161,7 +161,7 @@ public: template cudaError_t BindTexture( QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment - size_t bytes = size_t(-1), ///< Number of bytes in the range + size_t bytes, ///< Number of bytes in the range size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator { this->ptr = const_cast::Type *>(ptr); diff --git a/cpp/nvgraph/external/cub_semiring/iterator/tex_ref_input_iterator.cuh b/cpp/nvgraph/external/cub_semiring/iterator/tex_ref_input_iterator.cuh index 0305c78cd2c..e67b52c07f0 100644 --- a/cpp/nvgraph/external/cub_semiring/iterator/tex_ref_input_iterator.cuh +++ b/cpp/nvgraph/external/cub_semiring/iterator/tex_ref_input_iterator.cuh @@ -91,13 +91,13 @@ struct IteratorTexRef static TexRef ref; /// Bind texture - static cudaError_t BindTexture(void *d_in, size_t &offset) + static cudaError_t BindTexture(void *d_in, size_t &bytes, size_t &offset) { if (d_in) { cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc(); ref.channelDesc = tex_desc; - return (CubDebug(cudaBindTexture(&offset, ref, d_in))); + return (CubDebug(cudaBindTexture(&offset, ref, d_in, bytes))); } return cudaSuccess; @@ -245,12 +245,12 @@ public: template cudaError_t BindTexture( QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment - size_t bytes = size_t(-1), ///< Number of bytes in the range + size_t bytes, ///< Number of bytes in the range size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator { this->ptr = const_cast::Type *>(ptr); size_t offset; - cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, offset); + cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, bytes, offset); this->tex_offset = (difference_type) (offset / sizeof(QualifiedT)); return retval; } diff --git a/cpp/src/bfs.cu b/cpp/src/bfs.cu deleted file mode 100644 index 903a514018d..00000000000 --- a/cpp/src/bfs.cu +++ /dev/null @@ -1,471 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. 
- * - */ - -#include -#include -#include "bfs.cuh" -#include -#include "rmm_utils.h" - -#include "graph_utils.cuh" -#include "bfs_kernels.cuh" - -using namespace bfs_kernels; - -namespace cugraph { - enum BFS_ALGO_STATE { - TOPDOWN, BOTTOMUP - }; - - template - void Bfs::setup() { - - // Determinism flag, false by default - deterministic = false; - //Working data - //Each vertex can be in the frontier at most once - ALLOC_MANAGED_TRY(&frontier, n * sizeof(IndexType), nullptr); - - //We will update frontier during the execution - //We need the orig to reset frontier, or cudaFree - original_frontier = frontier; - - //size of bitmaps for vertices - vertices_bmap_size = (n / (8 * sizeof(int)) + 1); - //ith bit of visited_bmap is set <=> ith vertex is visited - ALLOC_MANAGED_TRY(&visited_bmap, sizeof(int) * vertices_bmap_size, nullptr); - - //ith bit of isolated_bmap is set <=> degree of ith vertex = 0 - ALLOC_MANAGED_TRY(&isolated_bmap, sizeof(int) * vertices_bmap_size, nullptr); - - //vertices_degree[i] = degree of vertex i - ALLOC_MANAGED_TRY(&vertex_degree, sizeof(IndexType) * n, nullptr); - - //Cub working data - cub_exclusive_sum_alloc(n + 1, d_cub_exclusive_sum_storage, cub_exclusive_sum_storage_bytes); - - //We will need (n+1) ints buffer for two differents things (bottom up or top down) - sharing it since those uses are mutually exclusive - ALLOC_MANAGED_TRY(&buffer_np1_1, (n + 1) * sizeof(IndexType), nullptr); - ALLOC_MANAGED_TRY(&buffer_np1_2, (n + 1) * sizeof(IndexType), nullptr); - - //Using buffers : top down - - //frontier_vertex_degree[i] is the degree of vertex frontier[i] - frontier_vertex_degree = buffer_np1_1; - //exclusive sum of frontier_vertex_degree - exclusive_sum_frontier_vertex_degree = buffer_np1_2; - - //Using buffers : bottom up - //contains list of unvisited vertices - unvisited_queue = buffer_np1_1; - //size of the "last" unvisited queue : size_last_unvisited_queue - //refers to the size of unvisited_queue - //which may not be up to date (the queue may contains vertices that are now visited) - - //We may leave vertices unvisited after bottom up main kernels - storing them here - left_unvisited_queue = buffer_np1_2; - - //We use buckets of edges (32 edges per bucket for now, see exact macro in bfs_kernels). 
frontier_vertex_degree_buckets_offsets[i] is the index k such as frontier[k] is the source of the first edge of the bucket - //See top down kernels for more details - ALLOC_MANAGED_TRY(&exclusive_sum_frontier_vertex_buckets_offsets, - ((nnz / TOP_DOWN_EXPAND_DIMX + 1) * NBUCKETS_PER_BLOCK + 2) * sizeof(IndexType), nullptr); - - //Init device-side counters - //Those counters must be/can be reset at each bfs iteration - //Keeping them adjacent in memory allow use call only one cudaMemset - launch latency is the current bottleneck - ALLOC_MANAGED_TRY(&d_counters_pad, 4 * sizeof(IndexType), nullptr); - - d_new_frontier_cnt = &d_counters_pad[0]; - d_mu = &d_counters_pad[1]; - d_unvisited_cnt = &d_counters_pad[2]; - d_left_unvisited_cnt = &d_counters_pad[3]; - - //Lets use this int* for the next 3 lines - //Its dereferenced value is not initialized - so we dont care about what we put in it - IndexType * d_nisolated = d_new_frontier_cnt; - cudaMemsetAsync(d_nisolated, 0, sizeof(IndexType), stream); - - //Computing isolated_bmap - //Only dependent on graph - not source vertex - done once - flag_isolated_vertices(n, isolated_bmap, row_offsets, vertex_degree, d_nisolated, stream); - cudaMemcpyAsync(&nisolated, d_nisolated, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - - //We need nisolated to be ready to use - cudaStreamSynchronize(stream); - } - - template - void Bfs::configure(IndexType *_distances, - IndexType *_predecessors, - int *_edge_mask) - { - distances = _distances; - predecessors = _predecessors; - edge_mask = _edge_mask; - - useEdgeMask = (edge_mask != NULL); - computeDistances = (distances != NULL); - computePredecessors = (predecessors != NULL); - - //We need distances to use bottom up - if (directed && !computeDistances) - ALLOC_MANAGED_TRY(&distances, n * sizeof(IndexType), nullptr); - } - - template - void Bfs::traverse(IndexType source_vertex) { - - //Init visited_bmap - //If the graph is undirected, we not that - //we will never discover isolated vertices (in degree = out degree = 0) - //we avoid a lot of work by flagging them now - //in g500 graphs they represent ~25% of total vertices - //more than that for wiki and twitter graphs - - if (directed) { - cudaMemsetAsync(visited_bmap, 0, vertices_bmap_size * sizeof(int), stream); - } - else { - cudaMemcpyAsync(visited_bmap, - isolated_bmap, - vertices_bmap_size * sizeof(int), - cudaMemcpyDeviceToDevice, - stream); - } - - //If needed, setting all vertices as undiscovered (inf distance) - //We dont use computeDistances here - //if the graph is undirected, we may need distances even if - //computeDistances is false - if (distances) - fill_vec(distances, n, vec_t::max, stream); - - //If needed, setting all predecessors to non-existent (-1) - if (computePredecessors) { - cudaMemsetAsync(predecessors, -1, n * sizeof(IndexType), stream); - } - - // - //Initial frontier - // - - frontier = original_frontier; - - if (distances) { - cudaMemsetAsync(&distances[source_vertex], 0, sizeof(IndexType), stream); - } - - //Setting source_vertex as visited - //There may be bit already set on that bmap (isolated vertices) - if the graph is undirected - int current_visited_bmap_source_vert = 0; - - if (!directed) { - cudaMemcpyAsync(¤t_visited_bmap_source_vert, - &visited_bmap[source_vertex / INT_SIZE], - sizeof(int), - cudaMemcpyDeviceToHost); - //We need current_visited_bmap_source_vert - cudaStreamSynchronize(stream); - } - - int m = (1 << (source_vertex % INT_SIZE)); - - //In that case, source is isolated, done now - if (!directed && (m & 
current_visited_bmap_source_vert)) { - //Init distances and predecessors are done, (cf Streamsync in previous if) - return; - } - - m |= current_visited_bmap_source_vert; - - cudaMemcpyAsync(&visited_bmap[source_vertex / INT_SIZE], - &m, - sizeof(int), - cudaMemcpyHostToDevice, - stream); - - //Adding source_vertex to init frontier - cudaMemcpyAsync(&frontier[0], - &source_vertex, - sizeof(IndexType), - cudaMemcpyHostToDevice, - stream); - - //mf : edges in frontier - //nf : vertices in frontier - //mu : edges undiscovered - //nu : nodes undiscovered - //lvl : current frontier's depth - IndexType mf, nf, mu, nu; - bool growing; - IndexType lvl = 1; - - //Frontier has one vertex - nf = 1; - - //all edges are undiscovered (by def isolated vertices have 0 edges) - mu = nnz; - - //all non isolated vertices are undiscovered (excepted source vertex, which is in frontier) - //That number is wrong if source_vertex is also isolated - but it's not important - nu = n - nisolated - nf; - - //Last frontier was 0, now it is 1 - growing = true; - - IndexType size_last_left_unvisited_queue = n; //we just need value > 0 - IndexType size_last_unvisited_queue = 0; //queue empty - - //Typical pre-top down workflow. set_frontier_degree + exclusive-scan - set_frontier_degree(frontier_vertex_degree, frontier, vertex_degree, nf, stream); - exclusive_sum(d_cub_exclusive_sum_storage, - cub_exclusive_sum_storage_bytes, - frontier_vertex_degree, - exclusive_sum_frontier_vertex_degree, - nf + 1, - stream); - - cudaMemcpyAsync(&mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - - //We need mf - cudaStreamSynchronize(stream); - - //At first we know we have to use top down - BFS_ALGO_STATE algo_state = TOPDOWN; - - //useDistances : we check if a vertex is a parent using distances in bottom up - distances become working data - //undirected g : need parents to be in children's neighbors - bool can_use_bottom_up = !directed && distances; - - while (nf > 0) { - //Each vertices can appear only once in the frontierer array - we know it will fit - new_frontier = frontier + nf; - IndexType old_nf = nf; - resetDevicePointers(); - - if (can_use_bottom_up) { - //Choosing algo - //Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf - - switch (algo_state) { - case TOPDOWN: - if (mf > mu / alpha) - algo_state = BOTTOMUP; - break; - case BOTTOMUP: - if (!growing && nf < n / beta) { - - //We need to prepare the switch back to top down - //We couldnt keep track of mu during bottom up - because we dont know what mf is. Computing mu here - count_unvisited_edges(unvisited_queue, - size_last_unvisited_queue, - visited_bmap, - vertex_degree, - d_mu, - stream); - - //Typical pre-top down workflow. 
set_frontier_degree + exclusive-scan - set_frontier_degree(frontier_vertex_degree, - frontier, - vertex_degree, - nf, - stream); - exclusive_sum(d_cub_exclusive_sum_storage, - cub_exclusive_sum_storage_bytes, - frontier_vertex_degree, - exclusive_sum_frontier_vertex_degree, - nf + 1, - stream); - - cudaMemcpyAsync(&mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - - cudaMemcpyAsync(&mu, d_mu, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); - - //We will need mf and mu - cudaStreamSynchronize(stream); - algo_state = TOPDOWN; - } - break; - } - } - - //Executing algo - - switch (algo_state) { - case TOPDOWN: - compute_bucket_offsets(exclusive_sum_frontier_vertex_degree, - exclusive_sum_frontier_vertex_buckets_offsets, - nf, - mf, - stream); - frontier_expand(row_offsets, - col_indices, - frontier, - nf, - mf, - lvl, - new_frontier, - d_new_frontier_cnt, - exclusive_sum_frontier_vertex_degree, - exclusive_sum_frontier_vertex_buckets_offsets, - visited_bmap, - distances, - predecessors, - edge_mask, - isolated_bmap, - directed, - stream, - deterministic); - - mu -= mf; - - cudaMemcpyAsync(&nf, - d_new_frontier_cnt, - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError(); - - //We need nf - cudaStreamSynchronize(stream); - - if (nf) { - //Typical pre-top down workflow. set_frontier_degree + exclusive-scan - set_frontier_degree(frontier_vertex_degree, - new_frontier, - vertex_degree, - nf, - stream); - exclusive_sum(d_cub_exclusive_sum_storage, - cub_exclusive_sum_storage_bytes, - frontier_vertex_degree, - exclusive_sum_frontier_vertex_degree, - nf + 1, - stream); - cudaMemcpyAsync(&mf, - &exclusive_sum_frontier_vertex_degree[nf], - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - - //We need mf - cudaStreamSynchronize(stream); - } - break; - - case BOTTOMUP: - fill_unvisited_queue(visited_bmap, - vertices_bmap_size, - n, - unvisited_queue, - d_unvisited_cnt, - stream, - deterministic); - - size_last_unvisited_queue = nu; - - bottom_up_main(unvisited_queue, - size_last_unvisited_queue, - left_unvisited_queue, - d_left_unvisited_cnt, - visited_bmap, - row_offsets, - col_indices, - lvl, - new_frontier, - d_new_frontier_cnt, - distances, - predecessors, - edge_mask, - stream, - deterministic); - - //The number of vertices left unvisited decreases - //If it wasnt necessary last time, it wont be this time - if (size_last_left_unvisited_queue) { - cudaMemcpyAsync(&size_last_left_unvisited_queue, - d_left_unvisited_cnt, - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError() - //We need last_left_unvisited_size - cudaStreamSynchronize(stream); - bottom_up_large(left_unvisited_queue, - size_last_left_unvisited_queue, - visited_bmap, - row_offsets, - col_indices, - lvl, - new_frontier, - d_new_frontier_cnt, - distances, - predecessors, - edge_mask, - stream, - deterministic); - } - cudaMemcpyAsync(&nf, - d_new_frontier_cnt, - sizeof(IndexType), - cudaMemcpyDeviceToHost, - stream); - cudaCheckError() - - //We will need nf - cudaStreamSynchronize(stream); - break; - } - - //Updating undiscovered edges count - nu -= nf; - - //Using new frontier - frontier = new_frontier; - growing = (nf > old_nf); - - ++lvl; - } - } - - template - void Bfs::resetDevicePointers() { - cudaMemsetAsync(d_counters_pad, 0, 4 * sizeof(IndexType), stream); - } - - template - void Bfs::clean() { - //the vectors have a destructor that takes care of cleaning - ALLOC_FREE_TRY(original_frontier, nullptr); - 
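The traverse() loop removed above implements Beamer-style direction-optimizing BFS: it switches to bottom-up once the frontier's outgoing edges (mf) dominate the remaining undiscovered edges (mu), and returns to top-down when the frontier stops growing and becomes small relative to n. A compact restatement of that switching rule, with alpha and beta kept as the tuning constants the deleted file used:

// Sketch of the finite state machine in the deleted traverse(); mf, mu, nf,
// n and growing carry the same meaning as in that code, and alpha/beta are
// the heuristic constants from the removed bfs sources.
enum class Direction { TopDown, BottomUp };

template <typename IndexType>
Direction next_direction(Direction current,
                         IndexType mf,        // edges incident to the frontier
                         IndexType mu,        // edges still undiscovered
                         IndexType nf,        // vertices in the frontier
                         IndexType n,         // total vertices
                         bool growing,        // frontier grew since last level
                         IndexType alpha,
                         IndexType beta)
{
    if (current == Direction::TopDown && mf > mu / alpha)
        return Direction::BottomUp;   // frontier touches too many edges
    if (current == Direction::BottomUp && !growing && nf < n / beta)
        return Direction::TopDown;    // frontier shrank enough again
    return current;
}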
ALLOC_FREE_TRY(visited_bmap, nullptr); - ALLOC_FREE_TRY(isolated_bmap, nullptr); - ALLOC_FREE_TRY(vertex_degree, nullptr); - ALLOC_FREE_TRY(d_cub_exclusive_sum_storage, nullptr); - ALLOC_FREE_TRY(buffer_np1_1, nullptr); - ALLOC_FREE_TRY(buffer_np1_2, nullptr); - ALLOC_FREE_TRY(exclusive_sum_frontier_vertex_buckets_offsets, nullptr); - ALLOC_FREE_TRY(d_counters_pad, nullptr); - - //In that case, distances is a working data - if (directed && !computeDistances) - ALLOC_FREE_TRY(distances, nullptr); - } - - template class Bfs ; -} // end namespace cugraph diff --git a/cpp/src/bfs_kernels.cuh b/cpp/src/bfs_kernels.cuh deleted file mode 100644 index c12ac40f533..00000000000 --- a/cpp/src/bfs_kernels.cuh +++ /dev/null @@ -1,1575 +0,0 @@ -/* - * Copyright (c) 2018 NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include - -#include -#include - -#define MAXBLOCKS 65535 -#define WARP_SIZE 32 -#define INT_SIZE 32 - -// -// Bottom up macros -// - -#define FILL_UNVISITED_QUEUE_DIMX 256 - -#define COUNT_UNVISITED_EDGES_DIMX 256 - -#define MAIN_BOTTOMUP_DIMX 256 -#define MAIN_BOTTOMUP_NWARPS (MAIN_BOTTOMUP_DIMX/WARP_SIZE) - -#define LARGE_BOTTOMUP_DIMX 256 - -//Number of edges processed in the main bottom up kernel -#define MAIN_BOTTOMUP_MAX_EDGES 6 - -//Power of 2 < 32 (strict <) -#define BOTTOM_UP_LOGICAL_WARP_SIZE 4 - -// -// Top down macros -// - -// We will precompute the results the binsearch_maxle every TOP_DOWN_BUCKET_SIZE edges -#define TOP_DOWN_BUCKET_SIZE 32 - -// DimX of the kernel -#define TOP_DOWN_EXPAND_DIMX 256 - -// TOP_DOWN_EXPAND_DIMX edges -> NBUCKETS_PER_BLOCK buckets -#define NBUCKETS_PER_BLOCK (TOP_DOWN_EXPAND_DIMX/TOP_DOWN_BUCKET_SIZE) - -// How many items_per_thread we can process with one bucket_offset loading -// the -1 is here because we need the +1 offset -#define MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD (TOP_DOWN_BUCKET_SIZE - 1) - -// instruction parallelism -// for how many edges will we create instruction parallelism -#define TOP_DOWN_BATCH_SIZE 2 - -#define COMPUTE_BUCKET_OFFSETS_DIMX 512 - -//Other macros - -#define FLAG_ISOLATED_VERTICES_DIMX 128 - -//Number of vertices handled by one thread -//Must be power of 2, lower than 32 -#define FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD 4 - -//Number of threads involved in the "construction" of one int in the bitset -#define FLAG_ISOLATED_VERTICES_THREADS_PER_INT (INT_SIZE/FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD) - -// -// Parameters of the heuristic to switch between bottomup/topdown -//Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf -// - -using namespace cugraph; - -namespace bfs_kernels { - // - // gives the equivalent vectors from a type - // for the max val, would be better to use numeric_limits<>::max() once - // cpp11 is allowed in nvgraph - // - - template - struct vec_t { - typedef int4 vec4; - typedef int2 vec2; - }; - - template<> - struct vec_t { - typedef int4 vec4; - typedef int2 vec2; - static const int max = INT_MAX; - }; - - 
template<> - struct vec_t { - typedef longlong4 vec4; - typedef longlong2 vec2; - static const long long int max = LLONG_MAX; - }; - - // - // ------------------------- Helper device functions ------------------- - // - - __forceinline__ __device__ int getMaskNRightmostBitSet(int n) { - if (n == INT_SIZE) - return (~0); - int mask = (1 << n) - 1; - return mask; - } - - __forceinline__ __device__ int getMaskNLeftmostBitSet(int n) { - if (n == 0) - return 0; - int mask = ~((1 << (INT_SIZE - n)) - 1); - return mask; - } - - __forceinline__ __device__ int getNextZeroBit(int& val) { - int ibit = __ffs(~val) - 1; - val |= (1 << ibit); - - return ibit; - } - - struct BitwiseAnd - { - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return (a & b); - } - }; - - struct BitwiseOr - { - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return (a | b); - } - }; - - template - __device__ IndexType binsearch_maxle( const IndexType *vec, - const IndexType val, - IndexType low, - IndexType high) { - while (true) { - if (low == high) - return low; //we know it exists - if ((low + 1) == high) - return (vec[high] <= val) ? high : low; - - IndexType mid = low + (high - low) / 2; - - if (vec[mid] > val) - high = mid - 1; - else - low = mid; - - } - } - - // - // ------------------------- Bottom up ------------------------- - // - - // - // fill_unvisited_queue_kernel - // - // Finding unvisited vertices in the visited_bmap, and putting them in the queue - // Vertices represented by the same int in the bitmap are adjacent in the queue, and sorted - // For instance, the queue can look like this : - // 34 38 45 58 61 4 18 24 29 71 84 85 90 - // Because they are represented by those ints in the bitmap : - // [34 38 45 58 61] [4 18 24 29] [71 84 85 90] - - //visited_bmap_nints = the visited_bmap is made of that number of ints - - template - __global__ void fill_unvisited_queue_kernel( int *visited_bmap, - IndexType visited_bmap_nints, - IndexType n, - IndexType *unvisited, - IndexType *unvisited_cnt) { - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage scan_temp_storage; - - //When filling the "unvisited" queue, we use "unvisited_cnt" to know where to write in the queue (equivalent of int off = atomicAddd(unvisited_cnt, 1) ) - //We will actually do only one atomicAdd per block - we first do a scan, then call one atomicAdd, and store the common offset for the block in - //unvisited_common_block_offset - __shared__ IndexType unvisited_common_block_offset; - - //We don't want threads divergence in the loop (we're going to call __syncthreads) - //Using a block-only dependent in the condition of the loop - for (IndexType block_v_idx = blockIdx.x * blockDim.x; - block_v_idx < visited_bmap_nints; - block_v_idx += blockDim.x * gridDim.x) { - - //Index of visited_bmap that this thread will compute - IndexType v_idx = block_v_idx + threadIdx.x; - - int thread_visited_int = (v_idx < visited_bmap_nints) - ? 
visited_bmap[v_idx] - : - (~0); //will be neutral in the next lines (virtual vertices all visited) - - //The last int can only be partially valid - //If we are indeed taking care of the last visited int in this thread, - //We need to first disable (ie set as "visited") the inactive bits (vertices >= n) - if (v_idx == (visited_bmap_nints - 1)) { - int active_bits = n - (INT_SIZE * v_idx); - int inactive_bits = INT_SIZE - active_bits; - int mask = getMaskNLeftmostBitSet(inactive_bits); - thread_visited_int |= mask; //Setting inactive bits as visited - } - - //Counting number of unvisited vertices represented by this int - int n_unvisited_in_int = __popc(~thread_visited_int); - int unvisited_thread_offset; - - //We will need to write n_unvisited_in_int unvisited vertices to the unvisited queue - //We ask for that space when computing the block scan, that will tell where to write those - //vertices in the queue, using the common offset of the block (see below) - BlockScan(scan_temp_storage).ExclusiveSum(n_unvisited_in_int, unvisited_thread_offset); - - //Last thread knows how many vertices will be written to the queue by this block - //Asking for that space in the queue using the global count, and saving the common offset - if (threadIdx.x == (FILL_UNVISITED_QUEUE_DIMX - 1)) { - IndexType total = unvisited_thread_offset + n_unvisited_in_int; - unvisited_common_block_offset = atomicAdd(unvisited_cnt, total); - } - - //syncthreads for two reasons : - // - we need to broadcast unvisited_common_block_offset - // - we will reuse scan_temp_storage (cf CUB doc) - __syncthreads(); - - IndexType current_unvisited_index = unvisited_common_block_offset - + unvisited_thread_offset; - int nvertices_to_write = n_unvisited_in_int; - - // getNextZeroBit uses __ffs, which gives least significant bit set - // which means that as long as n_unvisited_in_int is valid, - // we will use valid bits - - while (nvertices_to_write > 0) { - if (nvertices_to_write >= 4 && (current_unvisited_index % 4) == 0) { - typename vec_t::vec4 vec_v; - - vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - vec_v.z = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - vec_v.w = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - - typename vec_t::vec4 *unvisited_i4 = reinterpret_cast::vec4*>(&unvisited[current_unvisited_index]); - *unvisited_i4 = vec_v; - - current_unvisited_index += 4; - nvertices_to_write -= 4; - } - else if (nvertices_to_write >= 2 && (current_unvisited_index % 2) == 0) { - typename vec_t::vec2 vec_v; - - vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - - typename vec_t::vec2 *unvisited_i2 = reinterpret_cast::vec2*>(&unvisited[current_unvisited_index]); - *unvisited_i2 = vec_v; - - current_unvisited_index += 2; - nvertices_to_write -= 2; - } else { - IndexType v = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); - - unvisited[current_unvisited_index] = v; - - current_unvisited_index += 1; - nvertices_to_write -= 1; - } - - } - } - } - - //Wrapper - template - void fill_unvisited_queue( int *visited_bmap, - IndexType visited_bmap_nints, - IndexType n, - IndexType *unvisited, - IndexType *unvisited_cnt, - cudaStream_t m_stream, - bool deterministic) { - dim3 grid, block; - block.x = FILL_UNVISITED_QUEUE_DIMX; - - grid.x = min((IndexType) MAXBLOCKS, (visited_bmap_nints + block.x - 1) / block.x); - - fill_unvisited_queue_kernel<<>>( 
visited_bmap, - visited_bmap_nints, - n, - unvisited, - unvisited_cnt); - cudaCheckError() - ; - } - - // - // count_unvisited_edges_kernel - // Couting the total number of unvisited edges in the graph - using an potentially unvisited queue - // We need the current unvisited vertices to be in the unvisited queue - // But visited vertices can be in the potentially_unvisited queue - // We first check if the vertex is still unvisited before using it - // Useful when switching from "Bottom up" to "Top down" - // - - template - __global__ void count_unvisited_edges_kernel(const IndexType *potentially_unvisited, - const IndexType potentially_unvisited_size, - const int *visited_bmap, - IndexType *degree_vertices, - IndexType *mu) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage reduce_temp_storage; - - //number of undiscovered edges counted by this thread - IndexType thread_unvisited_edges_count = 0; - - for (IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < potentially_unvisited_size; - idx += blockDim.x * gridDim.x) { - - IndexType u = potentially_unvisited[idx]; - int u_visited_bmap = visited_bmap[u / INT_SIZE]; - int is_visited = u_visited_bmap & (1 << (u % INT_SIZE)); - - if (!is_visited) - thread_unvisited_edges_count += degree_vertices[u]; - - } - - //We need all thread_unvisited_edges_count to be ready before reducing - __syncthreads(); - - IndexType block_unvisited_edges_count = - BlockReduce(reduce_temp_storage).Sum(thread_unvisited_edges_count); - - //block_unvisited_edges_count is only defined is th.x == 0 - if (threadIdx.x == 0) - atomicAdd(mu, block_unvisited_edges_count); - } - - //Wrapper - template - void count_unvisited_edges(const IndexType *potentially_unvisited, - const IndexType potentially_unvisited_size, - const int *visited_bmap, - IndexType *node_degree, - IndexType *mu, - cudaStream_t m_stream) { - dim3 grid, block; - block.x = COUNT_UNVISITED_EDGES_DIMX; - grid.x = min((IndexType) MAXBLOCKS, (potentially_unvisited_size + block.x - 1) / block.x); - - count_unvisited_edges_kernel<<>>( potentially_unvisited, - potentially_unvisited_size, - visited_bmap, - node_degree, - mu); - cudaCheckError() - ; - } - - // - // Main Bottom Up kernel - // Here we will start to process unvisited vertices in the unvisited queue - // We will only consider the first MAIN_BOTTOMUP_MAX_EDGES edges - // If it's not possible to define a valid parent using only those edges, - // add it to the "left_unvisited_queue" - // - - // - // We will use the "vertices represented by the same int in the visited bmap are adjacents and sorted in the unvisited queue" property - // It is used to do a reduction locally and fully build the new visited_bmap - // - - template - __global__ void main_bottomup_kernel( const IndexType *unvisited, - const IndexType unvisited_size, - IndexType *left_unvisited, - IndexType *left_unvisited_cnt, - int *visited_bmap, - const IndexType *row_ptr, - const IndexType *col_ind, - IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - IndexType *distances, - IndexType *predecessors, - int *edge_mask) { - typedef cub::BlockDiscontinuity BlockDiscontinuity; - typedef cub::WarpReduce WarpReduce; - typedef cub::BlockScan BlockScan; - - __shared__ typename BlockDiscontinuity::TempStorage discontinuity_temp_storage; - __shared__ typename WarpReduce::TempStorage reduce_temp_storage; - __shared__ typename BlockScan::TempStorage scan_temp_storage; - - //To write vertices in the frontier, - //We will use a block scan to 
locally compute the offsets - //frontier_common_block_offset contains the common offset for the block - __shared__ IndexType frontier_common_block_offset; - - // When building the new visited_bmap, we reduce (using a bitwise and) the visited_bmap ints - // from the vertices represented by the same int (for instance vertices 1, 5, 9, 13, 23) - // vertices represented by the same int will be designed as part of the same "group" - // To detect the deliminations between those groups, we use BlockDiscontinuity - // Then we need to create the new "visited_bmap" within those group. - // We use a warp reduction that takes into account limits between groups to do it - // But a group can be cut in two different warps : in that case, the second warp - // put the result of its local reduction in local_visited_bmap_warp_head - // the first warp will then read it and finish the reduction - - __shared__ int local_visited_bmap_warp_head[MAIN_BOTTOMUP_NWARPS]; - - const int warpid = threadIdx.x / WARP_SIZE; - const int laneid = threadIdx.x % WARP_SIZE; - - // we will call __syncthreads inside the loop - // we need to keep complete block active - for (IndexType block_off = blockIdx.x * blockDim.x; - block_off < unvisited_size; - block_off += blockDim.x * gridDim.x) - { - IndexType idx = block_off + threadIdx.x; - - // This thread will take care of unvisited_vertex - // in the visited_bmap, it is represented by the int at index - // visited_bmap_index = unvisited_vertex/INT_SIZE - // it will be used by BlockDiscontinuity - // to flag the separation between groups of vertices (vertices represented by different in in visited_bmap) - IndexType visited_bmap_index[1]; //this is an array of size 1 because CUB needs one - visited_bmap_index[0] = -1; - IndexType unvisited_vertex = -1; - - // local_visited_bmap gives info on the visited bit of unvisited_vertex - // - // By default, everything is visited - // This is because we only take care of unvisited vertices here, - // The other are by default unvisited - // If a vertex remain unvisited, we will notice it here - // That's why by default we consider everything visited ( ie ~0 ) - // If we fail to assign one parent to an unvisited vertex, we will - // explicitly unset the bit - int local_visited_bmap = (~0); - int found = 0; - int more_to_visit = 0; - IndexType valid_parent; - IndexType left_unvisited_off; - - if (idx < unvisited_size) - { - //Processing first STPV edges of unvisited v - //If bigger than that, push to left_unvisited queue - unvisited_vertex = unvisited[idx]; - - IndexType edge_begin = row_ptr[unvisited_vertex]; - IndexType edge_end = row_ptr[unvisited_vertex + 1]; - - visited_bmap_index[0] = unvisited_vertex / INT_SIZE; - - IndexType degree = edge_end - edge_begin; - - for (IndexType edge = edge_begin; - edge < min(edge_end, edge_begin + MAIN_BOTTOMUP_MAX_EDGES); ++edge) - { - if (edge_mask && !edge_mask[edge]) - continue; - - IndexType parent_candidate = col_ind[edge]; - - if (distances[parent_candidate] == (lvl - 1)) - { - found = 1; - valid_parent = parent_candidate; - break; - } - } - - // This vertex will remain unvisited at the end of this kernel - // Explicitly say it - if (!found) - local_visited_bmap &= ~(1 << (unvisited_vertex % INT_SIZE)); //let this one unvisited - else - { - if (distances) - distances[unvisited_vertex] = lvl; - if (predecessors) - predecessors[unvisited_vertex] = valid_parent; - } - - //If we haven't found a parent and there's more edge to check - if (!found && degree > MAIN_BOTTOMUP_MAX_EDGES) - { - 
left_unvisited_off = atomicAdd(left_unvisited_cnt, (IndexType) 1); - more_to_visit = 1; - } - - } - - // - // We will separate vertices in group - // Two vertices are in the same group if represented by same int in visited_bmap - // ie u and v in same group <=> u/32 == v/32 - // - // We will now flag the head of those group (first element of each group) - // - // 1) All vertices within the same group are adjacent in the queue (cf fill_unvisited_queue) - // 2) A group is of size <= 32, so a warp will contain at least one head, and a group will be contained - // at most by two warps - - int is_head_a[1]; //CUB need an array - BlockDiscontinuity(discontinuity_temp_storage).FlagHeads(is_head_a, - visited_bmap_index, - cub::Inequality()); - int is_head = is_head_a[0]; - - // Computing the warp reduce within group - // This primitive uses the is_head flags to know where the limits of the groups are - // We use bitwise and as operator, because of the fact that 1 is the default value - // If a vertex is unvisited, we have to explicitly ask for it - int local_bmap_agg = - WarpReduce(reduce_temp_storage).HeadSegmentedReduce( local_visited_bmap, - is_head, - BitwiseAnd()); - - // We need to take care of the groups cut in two in two different warps - // Saving second part of the reduce here, then applying it on the first part bellow - // Corner case : if the first thread of the warp is a head, then this group is not cut in two - // and then we have to be neutral (for an bitwise and, it's an ~0) - if (laneid == 0) - { - local_visited_bmap_warp_head[warpid] = (is_head) ? (~0) : local_bmap_agg; - } - - //broadcasting local_visited_bmap_warp_head - __syncthreads(); - - int head_ballot = cugraph::utils::ballot(is_head); - - //As long as idx < unvisited_size, we know there's at least one head per warp - int laneid_last_head_in_warp = INT_SIZE - 1 - __clz(head_ballot); - - int is_last_head_in_warp = (laneid == laneid_last_head_in_warp); - - // if laneid == 0 && is_last_head_in_warp, it's a special case where - // a group of size 32 starts exactly at lane 0 - // in that case, nothing to do (this group is not cut by a warp delimitation) - // we also have to make sure that a warp actually exists after this one (this corner case is handled after) - if (laneid != 0 && (is_last_head_in_warp & (warpid + 1) < MAIN_BOTTOMUP_NWARPS)) - { - local_bmap_agg &= local_visited_bmap_warp_head[warpid + 1]; - } - - //Three cases : - // -> This is the first group of the block - it may be cut in two (with previous block) - // -> This is the last group of the block - same thing - // -> This group is completely contained in this block - - if (warpid == 0 && laneid == 0) - { - //The first elt of this group considered in this block is unvisited_vertex - //We know that's the case because elts are sorted in a group, and we are at laneid == 0 - //We will do an atomicOr - we have to be neutral about elts < unvisited_vertex - int iv = unvisited_vertex % INT_SIZE; // we know that this unvisited_vertex is valid - int mask = getMaskNLeftmostBitSet(INT_SIZE - iv); - local_bmap_agg &= mask; //we have to be neutral for elts < unvisited_vertex - atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); - } - else if (warpid == (MAIN_BOTTOMUP_NWARPS - 1) && - laneid >= laneid_last_head_in_warp && // We need the other ones to go in else case - idx < unvisited_size //we could be out - ) - { - //Last head of the block - //We don't know if this group is complete - - //last_v is the last unvisited_vertex of the group IN THIS block - //we 
dont know about the rest - we have to be neutral about elts > last_v - - //the destination thread of the __shfl is active - int laneid_max = min((IndexType) (WARP_SIZE - 1), - (unvisited_size - (block_off + 32 * warpid))); - IndexType last_v = cugraph::utils::shfl( unvisited_vertex, - laneid_max, - WARP_SIZE, - __activemask()); - - if (is_last_head_in_warp) - { - int ilast_v = last_v % INT_SIZE + 1; - int mask = getMaskNRightmostBitSet(ilast_v); - local_bmap_agg &= mask; //we have to be neutral for elts > last_unvisited_vertex - atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); - } - } - else - { - //group completely in block - if (is_head && idx < unvisited_size) { - visited_bmap[unvisited_vertex / INT_SIZE] = local_bmap_agg; //no atomics needed, we know everything about this int - } - } - - //Saving in frontier - - int thread_frontier_offset; - BlockScan(scan_temp_storage).ExclusiveSum(found, thread_frontier_offset); - IndexType inclusive_sum = thread_frontier_offset + found; - if (threadIdx.x == (MAIN_BOTTOMUP_DIMX - 1) && inclusive_sum) - { - frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); - } - - //1) Broadcasting frontier_common_block_offset - //2) we want to reuse the *_temp_storage - __syncthreads(); - - if (found) - new_frontier[frontier_common_block_offset + thread_frontier_offset] = unvisited_vertex; - if (more_to_visit) - left_unvisited[left_unvisited_off] = unvisited_vertex; - - } - } - - template - void bottom_up_main( IndexType *unvisited, - IndexType unvisited_size, - IndexType *left_unvisited, - IndexType *d_left_unvisited_idx, - int *visited, - const IndexType *row_ptr, - const IndexType *col_ind, - IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_idx, - IndexType *distances, - IndexType *predecessors, - int *edge_mask, - cudaStream_t m_stream, - bool deterministic) { - dim3 grid, block; - block.x = MAIN_BOTTOMUP_DIMX; - - grid.x = min((IndexType) MAXBLOCKS, ((unvisited_size + block.x - 1)) / block.x); - - main_bottomup_kernel<<>>(unvisited, - unvisited_size, - left_unvisited, - d_left_unvisited_idx, - visited, - row_ptr, - col_ind, - lvl, - new_frontier, - new_frontier_idx, - distances, - predecessors, - edge_mask); - cudaCheckError() - ; - } - - // - // bottom_up_large_degree_kernel - // finishing the work started in main_bottomup_kernel for vertex with degree > MAIN_BOTTOMUP_MAX_EDGES && no parent found - // - template - __global__ void bottom_up_large_degree_kernel( IndexType *left_unvisited, - IndexType left_unvisited_size, - int *visited, - const IndexType *row_ptr, - const IndexType *col_ind, - IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - IndexType *distances, - IndexType *predecessors, - int *edge_mask) { - - int logical_lane_id = threadIdx.x % BOTTOM_UP_LOGICAL_WARP_SIZE; - int logical_warp_id = threadIdx.x / BOTTOM_UP_LOGICAL_WARP_SIZE; - int logical_warps_per_block = blockDim.x / BOTTOM_UP_LOGICAL_WARP_SIZE; - - //Inactive threads are not a pb for __ballot (known behaviour) - for (IndexType idx = logical_warps_per_block * blockIdx.x + logical_warp_id; - idx < left_unvisited_size; - idx += gridDim.x * logical_warps_per_block) { - - //Unvisited vertices - potentially in the next frontier - IndexType v = left_unvisited[idx]; - - //Used only with symmetric graphs - //Parents are included in v's neighbors - IndexType first_i_edge = row_ptr[v] + MAIN_BOTTOMUP_MAX_EDGES; //we already have checked the first MAIN_BOTTOMUP_MAX_EDGES edges in find_unvisited - - IndexType 
end_i_edge = row_ptr[v + 1]; - - //We can have warp divergence in the next loop - //It's not a pb because the behaviour of __ballot - //is know with inactive threads - for (IndexType i_edge = first_i_edge + logical_lane_id; - i_edge < end_i_edge; - i_edge += BOTTOM_UP_LOGICAL_WARP_SIZE) { - - IndexType valid_parent = -1; - - if (!edge_mask || edge_mask[i_edge]) { - IndexType u = col_ind[i_edge]; - IndexType lvl_u = distances[u]; - - if (lvl_u == (lvl - 1)) { - valid_parent = u; - } - } - - unsigned int warp_valid_p_ballot = cugraph::utils::ballot((valid_parent != -1)); - - int logical_warp_id_in_warp = (threadIdx.x % WARP_SIZE) / BOTTOM_UP_LOGICAL_WARP_SIZE; - unsigned int mask = (1 << BOTTOM_UP_LOGICAL_WARP_SIZE) - 1; - unsigned int logical_warp_valid_p_ballot = warp_valid_p_ballot - >> (BOTTOM_UP_LOGICAL_WARP_SIZE * logical_warp_id_in_warp); - logical_warp_valid_p_ballot &= mask; - - int chosen_thread = __ffs(logical_warp_valid_p_ballot) - 1; - - if (chosen_thread == logical_lane_id) { - //Using only one valid parent (reduce bw) - IndexType off = atomicAdd(new_frontier_cnt, (IndexType) 1); - int m = 1 << (v % INT_SIZE); - atomicOr(&visited[v / INT_SIZE], m); - distances[v] = lvl; - - if (predecessors) - predecessors[v] = valid_parent; - - new_frontier[off] = v; - } - - if (logical_warp_valid_p_ballot) { - break; - } - } - - } - } - - template - void bottom_up_large(IndexType *left_unvisited, - IndexType left_unvisited_size, - int *visited, - const IndexType *row_ptr, - const IndexType *col_ind, - IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_idx, - IndexType *distances, - IndexType *predecessors, - int *edge_mask, - cudaStream_t m_stream, - bool deterministic) { - dim3 grid, block; - block.x = LARGE_BOTTOMUP_DIMX; - grid.x = min( (IndexType) MAXBLOCKS, - ((left_unvisited_size + block.x - 1) * BOTTOM_UP_LOGICAL_WARP_SIZE) / block.x); - - bottom_up_large_degree_kernel<<>>(left_unvisited, - left_unvisited_size, - visited, - row_ptr, - col_ind, - lvl, - new_frontier, - new_frontier_idx, - distances, - predecessors, - edge_mask); - cudaCheckError() - ; - } - - // - // - // ------------------------------ Top down ------------------------------ - // - // - - // - // compute_bucket_offsets_kernel - // simply compute the position in the frontier corresponding all valid edges with index=TOP_DOWN_BUCKET_SIZE * k, k integer - // - - template - __global__ void compute_bucket_offsets_kernel( const IndexType *frontier_degrees_exclusive_sum, - IndexType *bucket_offsets, - const IndexType frontier_size, - IndexType total_degree) { - IndexType end = ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX - * NBUCKETS_PER_BLOCK + 1); - - for (IndexType bid = blockIdx.x * blockDim.x + threadIdx.x; - bid <= end; - bid += gridDim.x * blockDim.x) { - - IndexType eid = min(bid * TOP_DOWN_BUCKET_SIZE, total_degree - 1); - - bucket_offsets[bid] = binsearch_maxle( frontier_degrees_exclusive_sum, - eid, - (IndexType) 0, - frontier_size - 1); - - } - } - - template - void compute_bucket_offsets( IndexType *cumul, - IndexType *bucket_offsets, - IndexType frontier_size, - IndexType total_degree, - cudaStream_t m_stream) { - dim3 grid, block; - block.x = COMPUTE_BUCKET_OFFSETS_DIMX; - - grid.x = min( (IndexType) MAXBLOCKS, - ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX - * NBUCKETS_PER_BLOCK + 1 + block.x - 1) / block.x); - - compute_bucket_offsets_kernel<<>>(cumul, - bucket_offsets, - frontier_size, - total_degree); - cudaCheckError() - ; - } - - // - // 
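In main_bottomup_kernel above, each unvisited vertex scans at most MAIN_BOTTOMUP_MAX_EDGES of its neighbors for one sitting on the previous BFS level; if none is found and edges remain, the vertex is deferred to the left_unvisited queue and finished by bottom_up_large_degree_kernel. A serial sketch of that per-vertex rule under the same array names (the CUDA version additionally updates the visited bitmap with warp-level reductions, which this sketch leaves out):

#include <algorithm>

// Returns true if a parent on level lvl-1 was found within the first
// max_edges edges of v; sets defer_to_large when the remaining edges still
// need the large-degree pass. Illustrative host-side restatement only.
template <typename IndexType>
bool bottom_up_try_vertex(IndexType v, IndexType lvl,
                          const IndexType* row_ptr, const IndexType* col_ind,
                          const int* edge_mask,
                          IndexType* distances, IndexType* predecessors,
                          IndexType max_edges, bool& defer_to_large)
{
    IndexType edge_begin = row_ptr[v];
    IndexType edge_end   = row_ptr[v + 1];
    defer_to_large = false;

    for (IndexType e = edge_begin; e < std::min(edge_end, edge_begin + max_edges); ++e) {
        if (edge_mask && !edge_mask[e])
            continue;                              // masked-out edge
        IndexType candidate = col_ind[e];
        if (distances[candidate] == lvl - 1) {     // neighbor is on the previous level
            distances[v] = lvl;
            if (predecessors)
                predecessors[v] = candidate;
            return true;
        }
    }
    // No parent among the first max_edges edges; high-degree vertices are
    // retried by bottom_up_large_degree_kernel.
    defer_to_large = (edge_end - edge_begin) > max_edges;
    return false;
}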
topdown_expand_kernel - // Read current frontier and compute new one with top down paradigm - // One thread = One edge - // To know origin of edge, we have to find where is index_edge in the values of frontier_degrees_exclusive_sum (using a binary search, max less or equal than) - // This index k will give us the origin of this edge, which is frontier[k] - // This thread will then process the (linear_idx_thread - frontier_degrees_exclusive_sum[k])-ith edge of vertex frontier[k] - // - // To process blockDim.x = TOP_DOWN_EXPAND_DIMX edges, we need to first load NBUCKETS_PER_BLOCK bucket offsets - those will help us do the binary searches - // We can load up to TOP_DOWN_EXPAND_DIMX of those bucket offsets - that way we prepare for the next MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x edges - // - // Once we have those offsets, we may still need a few values from frontier_degrees_exclusive_sum to compute exact index k - // To be able to do it, we will load the values that we need from frontier_degrees_exclusive_sum in shared memory - // We know that it will fit because we never add node with degree == 0 in the frontier, so we have an upper bound on the number of value to load (see below) - // - // We will then look which vertices are not visited yet : - // 1) if the unvisited vertex is isolated (=> degree == 0), we mark it as visited, update distances and predecessors, and move on - // 2) if the unvisited vertex has degree > 0, we add it to the "frontier_candidates" queue - // - // We then treat the candidates queue using the threadIdx.x < ncandidates - // If we are indeed the first thread to discover that vertex (result of atomicOr(visited)) - // We add it to the new frontier - // - - template - __global__ void topdown_expand_kernel( const IndexType *row_ptr, - const IndexType *col_ind, - const IndexType *frontier, - const IndexType frontier_size, - const IndexType totaldegree, - const IndexType max_items_per_thread, - const IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - const IndexType *frontier_degrees_exclusive_sum, - const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, - int *bmap, - IndexType *distances, - IndexType *predecessors, - const int *edge_mask, - const int *isolated_bmap, - bool directed) { - //BlockScan - typedef cub::BlockScan BlockScan; - __shared__ typename BlockScan::TempStorage scan_storage; - - // We will do a scan to know where to write in frontier - // This will contain the common offset of the block - __shared__ IndexType frontier_common_block_offset; - - __shared__ IndexType shared_buckets_offsets[TOP_DOWN_EXPAND_DIMX - NBUCKETS_PER_BLOCK + 1]; - __shared__ IndexType shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX + 1]; - - // - // Frontier candidates local queue - // We process TOP_DOWN_BATCH_SIZE vertices in parallel, so we need to be able to store everything - // We also save the predecessors here, because we will not be able to retrieve it after - // - __shared__ IndexType shared_local_new_frontier_candidates[TOP_DOWN_BATCH_SIZE - * TOP_DOWN_EXPAND_DIMX]; - __shared__ IndexType shared_local_new_frontier_predecessors[TOP_DOWN_BATCH_SIZE - * TOP_DOWN_EXPAND_DIMX]; - __shared__ IndexType block_n_frontier_candidates; - - IndexType block_offset = (blockDim.x * blockIdx.x) * max_items_per_thread; - IndexType n_items_per_thread_left = (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1) - / TOP_DOWN_EXPAND_DIMX; - - n_items_per_thread_left = min(max_items_per_thread, n_items_per_thread_left); - - for (; - 
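// Editor's sketch (not part of the patch): the comment block above reduces top-down
// expansion to one question per edge: find the largest k with
// frontier_degrees_exclusive_sum[k] <= edge index. A host-side version of that
// max-less-or-equal search; the device binsearch_maxle used here is assumed to behave
// the same way.
template <typename IndexType>
IndexType binsearch_maxle_host(const IndexType *prefix, IndexType val,
                               IndexType left, IndexType right) {
  // assumes prefix is non-decreasing and prefix[left] <= val
  while (left < right) {
    IndexType mid = left + (right - left + 1) / 2;  // upper mid so the range shrinks
    if (prefix[mid] <= val)
      left = mid;        // mid still satisfies the predicate, search to its right
    else
      right = mid - 1;   // mid is too far, search to its left
  }
  return left;
}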
(n_items_per_thread_left > 0) && (block_offset < totaldegree); - - block_offset += MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x, - n_items_per_thread_left -= MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD) { - - // In this loop, we will process batch_set_size batches - IndexType nitems_per_thread = min( n_items_per_thread_left, - (IndexType) MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD); - - // Loading buckets offset (see compute_bucket_offsets_kernel) - - if (threadIdx.x < (nitems_per_thread * NBUCKETS_PER_BLOCK + 1)) - shared_buckets_offsets[threadIdx.x] = - frontier_degrees_exclusive_sum_buckets_offsets[block_offset / TOP_DOWN_BUCKET_SIZE - + threadIdx.x]; - - // We will use shared_buckets_offsets - __syncthreads(); - - // - // shared_buckets_offsets gives us a range of the possible indexes - // for edge of linear_threadx, we are looking for the value k such as - // k is the max value such as frontier_degrees_exclusive_sum[k] <= linear_threadx - // - // we have 0 <= k < frontier_size - // but we also have : - // - // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE] - // <= k - // <= frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE + 1] - // - // To find the exact value in that range, we need a few values from frontier_degrees_exclusive_sum (see below) - // We will load them here - // We will load as much as we can - if it doesn't fit we will make multiple iteration of the next loop - // Because all vertices in frontier have degree > 0, we know it will fits if left + 1 = right (see below) - - //We're going to load values in frontier_degrees_exclusive_sum for batch [left; right[ - //If it doesn't fit, --right until it does, then loop - //It is excepted to fit on the first try, that's why we start right = nitems_per_thread - - IndexType left = 0; - IndexType right = nitems_per_thread; - - while (left < nitems_per_thread) { - // - // Values that are necessary to compute the local binary searches - // We only need those with indexes between extremes indexes of buckets_offsets - // We need the next val for the binary search, hence the +1 - // - - IndexType nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] - - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; - - //If left = right + 1 we are sure to have nvalues_to_load < TOP_DOWN_EXPAND_DIMX+1 - while (nvalues_to_load > (TOP_DOWN_EXPAND_DIMX + 1)) { - --right; - - nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] - - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; - } - - IndexType nitems_per_thread_for_this_load = right - left; - - IndexType frontier_degrees_exclusive_sum_block_offset = shared_buckets_offsets[left - * NBUCKETS_PER_BLOCK]; - - if (threadIdx.x < nvalues_to_load) { - shared_frontier_degrees_exclusive_sum[threadIdx.x] = - frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset - + threadIdx.x]; - } - - if (nvalues_to_load == (TOP_DOWN_EXPAND_DIMX + 1) && threadIdx.x == 0) { - shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX] = - frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset - + TOP_DOWN_EXPAND_DIMX]; - } - - //shared_frontier_degrees_exclusive_sum is in shared mem, we will use it, sync - __syncthreads(); - - // Now we will process the edges - // Here each thread will process nitems_per_thread_for_this_load - for (IndexType item_index = 0; - item_index < nitems_per_thread_for_this_load; - item_index += TOP_DOWN_BATCH_SIZE) { - - // We process TOP_DOWN_BATCH_SIZE 
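// Editor's sketch (not part of the patch): a tiny host-only illustration, with made-up
// frontier and degrees, of how the exclusive degree sum maps a linear edge id to its
// source vertex and local edge offset. The kernel does the same thing on the
// shared-memory slices loaded above, using the binary search.
#include <cstdio>
int main() {
  int frontier[3] = {7, 2, 9};     // example frontier vertices
  int exsum[4]    = {0, 3, 4, 6};  // exclusive sum of their degrees {3, 1, 2}
  for (int gid = 0; gid < 6; ++gid) {
    int k = 0;
    while (k + 1 < 3 && exsum[k + 1] <= gid) ++k;  // max k with exsum[k] <= gid
    printf("edge %d -> source %d, local edge %d\n", gid, frontier[k], gid - exsum[k]);
  }
  return 0;
}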
edge in parallel (instruction parallism) - // Reduces latency - - IndexType current_max_edge_index = min(block_offset - + (left - + nitems_per_thread_for_this_load) - * blockDim.x, - totaldegree); - - //We will need vec_u (source of the edge) until the end if we need to save the predecessors - //For others informations, we will reuse pointers on the go (nvcc does not color well the registers in that case) - - IndexType vec_u[TOP_DOWN_BATCH_SIZE]; - IndexType local_buf1[TOP_DOWN_BATCH_SIZE]; - IndexType local_buf2[TOP_DOWN_BATCH_SIZE]; - - IndexType *vec_frontier_degrees_exclusive_sum_index = &local_buf2[0]; - -#pragma unroll - for (IndexType iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - - IndexType ibatch = left + item_index + iv; - IndexType gid = block_offset + ibatch * blockDim.x + threadIdx.x; - - if (gid < current_max_edge_index) { - IndexType start_off_idx = (ibatch * blockDim.x + threadIdx.x) - / TOP_DOWN_BUCKET_SIZE; - IndexType bucket_start = shared_buckets_offsets[start_off_idx] - - frontier_degrees_exclusive_sum_block_offset; - IndexType bucket_end = shared_buckets_offsets[start_off_idx + 1] - - frontier_degrees_exclusive_sum_block_offset; - - IndexType k = binsearch_maxle(shared_frontier_degrees_exclusive_sum, - gid, - bucket_start, - bucket_end) - + frontier_degrees_exclusive_sum_block_offset; - vec_u[iv] = frontier[k]; // origin of this edge - vec_frontier_degrees_exclusive_sum_index[iv] = - frontier_degrees_exclusive_sum[k]; - } else { - vec_u[iv] = -1; - vec_frontier_degrees_exclusive_sum_index[iv] = -1; - } - - } - - IndexType *vec_row_ptr_u = &local_buf1[0]; -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType u = vec_u[iv]; - //row_ptr for this vertex origin u - vec_row_ptr_u[iv] = (u != -1) - ? row_ptr[u] - : - -1; - } - - //We won't need row_ptr after that, reusing pointer - IndexType *vec_dest_v = vec_row_ptr_u; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType thread_item_index = left + item_index + iv; - IndexType gid = block_offset + thread_item_index * blockDim.x + threadIdx.x; - - IndexType row_ptr_u = vec_row_ptr_u[iv]; - IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv]; - - if (edge_mask && !edge_mask[edge]) - row_ptr_u = -1; //disabling edge - - //Destination of this edge - vec_dest_v[iv] = (row_ptr_u != -1) - ? col_ind[edge] - : - -1; - } - - //We don't need vec_frontier_degrees_exclusive_sum_index anymore - IndexType *vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index; -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_dest_v[iv]; - vec_v_visited_bmap[iv] = (v != -1) - ? bmap[v / INT_SIZE] - : - (~0); //will look visited - } - - // From now on we will consider v as a frontier candidate - // If for some reason vec_candidate[iv] should be put in the new_frontier - // Then set vec_candidate[iv] = -1 - IndexType *vec_frontier_candidate = vec_dest_v; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - int m = 1 << (v % INT_SIZE); - - int is_visited = vec_v_visited_bmap[iv] & m; - - if (is_visited) - vec_frontier_candidate[iv] = -1; - } - - if (directed) { - //vec_v_visited_bmap is available - - IndexType *vec_is_isolated_bmap = vec_v_visited_bmap; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - vec_is_isolated_bmap[iv] = (v != -1) - ? 
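// Editor's sketch (not part of the patch): the TOP_DOWN_BATCH_SIZE vectors above
// (vec_u, local_buf1, local_buf2) exist for instruction-level parallelism: each thread
// issues a batch of independent loads before consuming any of them. Reduced version of
// that pattern, with an illustrative batch size and names not taken from the source.
constexpr int BATCH = 4;   // illustrative; the real TOP_DOWN_BATCH_SIZE may differ
__global__ void batched_gather(const int *col_ind, const int *edge_of_thread,
                               int *out, int n) {
  int base = (blockIdx.x * blockDim.x + threadIdx.x) * BATCH;
  int vals[BATCH];
#pragma unroll
  for (int i = 0; i < BATCH; ++i)          // issue all independent loads first
    vals[i] = (base + i < n) ? col_ind[edge_of_thread[base + i]] : -1;
#pragma unroll
  for (int i = 0; i < BATCH; ++i)          // then consume the results
    if (base + i < n) out[base + i] = vals[i];
}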
isolated_bmap[v / INT_SIZE] - : - -1; - } - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - int m = 1 << (v % INT_SIZE); - int is_isolated = vec_is_isolated_bmap[iv] & m; - - //If v is isolated, we will not add it to the frontier (it's not a frontier candidate) - // 1st reason : it's useless - // 2nd reason : it will make top down algo fail - // we need each node in frontier to have a degree > 0 - // If it is isolated, we just need to mark it as visited, and save distance and predecessor here. Not need to check return value of atomicOr - - if (is_isolated && v != -1) { - int m = 1 << (v % INT_SIZE); - atomicOr(&bmap[v / INT_SIZE], m); - if (distances) - distances[v] = lvl; - - if (predecessors) - predecessors[v] = vec_u[iv]; - - //This is no longer a candidate, neutralize it - vec_frontier_candidate[iv] = -1; - } - - } - } - - //Number of successor candidate hold by this thread - IndexType thread_n_frontier_candidates = 0; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - IndexType v = vec_frontier_candidate[iv]; - if (v != -1) - ++thread_n_frontier_candidates; - } - - // We need to have all nfrontier_candidates to be ready before doing the scan - __syncthreads(); - - // We will put the frontier candidates in a local queue - // Computing offsets - IndexType thread_frontier_candidate_offset = 0; //offset inside block - BlockScan(scan_storage).ExclusiveSum( thread_n_frontier_candidates, - thread_frontier_candidate_offset); - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - //May have bank conflicts - IndexType frontier_candidate = vec_frontier_candidate[iv]; - - if (frontier_candidate != -1) { - shared_local_new_frontier_candidates[thread_frontier_candidate_offset] = - frontier_candidate; - shared_local_new_frontier_predecessors[thread_frontier_candidate_offset] = - vec_u[iv]; - ++thread_frontier_candidate_offset; - } - } - - if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { - //No need to add nsuccessor_candidate, even if its an - //exclusive sum - //We incremented the thread_frontier_candidate_offset - block_n_frontier_candidates = thread_frontier_candidate_offset; - } - - //broadcast block_n_frontier_candidates - __syncthreads(); - - IndexType naccepted_vertices = 0; - //We won't need vec_frontier_candidate after that - IndexType *vec_frontier_accepted_vertex = vec_frontier_candidate; - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - const int idx_shared = iv * blockDim.x + threadIdx.x; - vec_frontier_accepted_vertex[iv] = -1; - - if (idx_shared < block_n_frontier_candidates) { - IndexType v = shared_local_new_frontier_candidates[idx_shared]; //popping queue - int m = 1 << (v % INT_SIZE); - int q = atomicOr(&bmap[v / INT_SIZE], m); //atomicOr returns old - - if (!(m & q)) { //if this thread was the first to discover this node - if (distances) - distances[v] = lvl; - - if (predecessors) { - IndexType pred = shared_local_new_frontier_predecessors[idx_shared]; - predecessors[v] = pred; - } - - vec_frontier_accepted_vertex[iv] = v; - ++naccepted_vertices; - } - } - - } - - //We need naccepted_vertices to be ready - __syncthreads(); - - IndexType thread_new_frontier_offset; - - BlockScan(scan_storage).ExclusiveSum(naccepted_vertices, thread_new_frontier_offset); - - if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { - - IndexType inclusive_sum = thread_new_frontier_offset + naccepted_vertices; - //for this thread, thread_new_frontier_offset + has_successor 
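// Editor's sketch (not part of the patch): the candidate queue above is the standard
// block-level compaction recipe: an exclusive cub::BlockScan gives each thread a slot,
// the last thread reserves space in the global output with one atomicAdd, and everyone
// writes after a barrier. Self-contained version compacting positive values; names are
// not from the source.
#include <cub/cub.cuh>
template <int DIMX>
__global__ void compact_positive(const int *in, int n, int *out, int *out_count) {
  typedef cub::BlockScan<int, DIMX> BlockScan;
  __shared__ typename BlockScan::TempStorage scan_storage;
  __shared__ int block_base;

  int gid  = blockIdx.x * DIMX + threadIdx.x;
  int flag = (gid < n && in[gid] > 0) ? 1 : 0;
  int offset;
  BlockScan(scan_storage).ExclusiveSum(flag, offset);

  if (threadIdx.x == DIMX - 1)                        // last thread knows the block total
    block_base = atomicAdd(out_count, offset + flag); // reserve space once per block
  __syncthreads();                                    // broadcast block_base

  if (flag)
    out[block_base + offset] = in[gid];
}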
(exclusive sum) - if (inclusive_sum) - frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); - } - - //Broadcasting frontier_common_block_offset - __syncthreads(); - -#pragma unroll - for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { - const int idx_shared = iv * blockDim.x + threadIdx.x; - if (idx_shared < block_n_frontier_candidates) { - - IndexType new_frontier_vertex = vec_frontier_accepted_vertex[iv]; - - if (new_frontier_vertex != -1) { - IndexType off = frontier_common_block_offset + thread_new_frontier_offset++; - new_frontier[off] = new_frontier_vertex; - } - } - } - - } - - //We need to keep shared_frontier_degrees_exclusive_sum coherent - __syncthreads(); - - //Preparing for next load - left = right; - right = nitems_per_thread; - } - - //we need to keep shared_buckets_offsets coherent - __syncthreads(); - } - - } - - template - void frontier_expand(const IndexType *row_ptr, - const IndexType *col_ind, - const IndexType *frontier, - const IndexType frontier_size, - const IndexType totaldegree, - const IndexType lvl, - IndexType *new_frontier, - IndexType *new_frontier_cnt, - const IndexType *frontier_degrees_exclusive_sum, - const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, - int *visited_bmap, - IndexType *distances, - IndexType *predecessors, - const int *edge_mask, - const int *isolated_bmap, - bool directed, - cudaStream_t m_stream, - bool deterministic) { - if (!totaldegree) - return; - - dim3 block; - block.x = TOP_DOWN_EXPAND_DIMX; - - IndexType max_items_per_thread = (totaldegree + MAXBLOCKS * block.x - 1) - / (MAXBLOCKS * block.x); - - dim3 grid; - grid.x = min( (totaldegree + max_items_per_thread * block.x - 1) - / (max_items_per_thread * block.x), - (IndexType) MAXBLOCKS); - - topdown_expand_kernel<<>>( row_ptr, - col_ind, - frontier, - frontier_size, - totaldegree, - max_items_per_thread, - lvl, - new_frontier, - new_frontier_cnt, - frontier_degrees_exclusive_sum, - frontier_degrees_exclusive_sum_buckets_offsets, - visited_bmap, - distances, - predecessors, - edge_mask, - isolated_bmap, - directed); - cudaCheckError() - ; - } - - template - __global__ void flag_isolated_vertices_kernel( IndexType n, - int *isolated_bmap, - const IndexType *row_ptr, - IndexType *degrees, - IndexType *nisolated) { - typedef cub::BlockLoad BlockLoad; - typedef cub::BlockStore BlockStore; - typedef cub::BlockReduce BlockReduce; - typedef cub::WarpReduce WarpReduce; - - __shared__ typename BlockLoad::TempStorage load_temp_storage; - __shared__ typename BlockStore::TempStorage store_temp_storage; - __shared__ typename BlockReduce::TempStorage block_reduce_temp_storage; - - __shared__ typename WarpReduce::TempStorage warp_reduce_temp_storage[FLAG_ISOLATED_VERTICES_DIMX - / FLAG_ISOLATED_VERTICES_THREADS_PER_INT]; - - __shared__ IndexType row_ptr_tail[FLAG_ISOLATED_VERTICES_DIMX]; - - for (IndexType block_off = FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - * (blockDim.x * blockIdx.x); - block_off < n; - block_off += FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * gridDim.x)) { - - IndexType thread_off = block_off - + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * threadIdx.x; - IndexType last_node_thread = thread_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1; - - IndexType thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; - IndexType block_valid_items = n - block_off + 1; //+1, we need row_ptr[last_node+1] - - BlockLoad(load_temp_storage).Load( row_ptr + block_off, - thread_row_ptr, - block_valid_items, - -1); - - //To compute 4 
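// Editor's sketch (not part of the patch): frontier_expand sizes the launch so the grid
// never exceeds MAXBLOCKS: it ceiling-divides the total degree over MAXBLOCKS * blockDim
// to get items per thread, then ceiling-divides again to get the grid. Host-only
// illustration with made-up constants.
#include <algorithm>
#include <cstdio>
int main() {
  const long long MAXBLOCKS = 65535, DIMX = 256;   // illustrative values
  long long totaldegree = 10000000;
  long long items = (totaldegree + MAXBLOCKS * DIMX - 1) / (MAXBLOCKS * DIMX);
  long long grid  = std::min((totaldegree + items * DIMX - 1) / (items * DIMX), MAXBLOCKS);
  printf("items per thread = %lld, grid = %lld blocks of %lld threads\n", items, grid, DIMX);
  return 0;
}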
degrees, we need 5 values of row_ptr - //Saving the "5th" value in shared memory for previous thread to use - if (threadIdx.x > 0) { - row_ptr_tail[threadIdx.x - 1] = thread_row_ptr[0]; - } - - //If this is the last thread, it needs to load its row ptr tail value - if (threadIdx.x == (FLAG_ISOLATED_VERTICES_DIMX - 1) && last_node_thread < n) { - row_ptr_tail[threadIdx.x] = row_ptr[last_node_thread + 1]; - - } - __syncthreads(); // we may reuse temp_storage - - int local_isolated_bmap = 0; - - IndexType imax = (n - thread_off); - - IndexType local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; - -#pragma unroll - for (int i = 0; i < (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1); ++i) { - IndexType degree = local_degree[i] = thread_row_ptr[i + 1] - thread_row_ptr[i]; - - if (i < imax) - local_isolated_bmap |= ((degree == 0) << i); - } - - if (last_node_thread < n) { - IndexType degree = local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1] = - row_ptr_tail[threadIdx.x] - - thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1]; - - local_isolated_bmap |= ((degree == 0) - << (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1)); - - } - - local_isolated_bmap <<= (thread_off % INT_SIZE); - - IndexType local_nisolated = __popc(local_isolated_bmap); - - //We need local_nisolated and local_isolated_bmap to be ready for next steps - __syncthreads(); - - IndexType total_nisolated = BlockReduce(block_reduce_temp_storage).Sum(local_nisolated); - - if (threadIdx.x == 0 && total_nisolated) { - atomicAdd(nisolated, total_nisolated); - } - - int logicalwarpid = threadIdx.x / FLAG_ISOLATED_VERTICES_THREADS_PER_INT; - - //Building int for bmap - int int_aggregate_isolated_bmap = - WarpReduce(warp_reduce_temp_storage[logicalwarpid]).Reduce( local_isolated_bmap, - BitwiseOr()); - - int is_head_of_visited_int = - ((threadIdx.x % (FLAG_ISOLATED_VERTICES_THREADS_PER_INT)) == 0); - if (is_head_of_visited_int) { - isolated_bmap[thread_off / INT_SIZE] = int_aggregate_isolated_bmap; - } - - BlockStore(store_temp_storage).Store(degrees + block_off, local_degree, block_valid_items); - } - } - - template - void flag_isolated_vertices( IndexType n, - int *isolated_bmap, - const IndexType *row_ptr, - IndexType *degrees, - IndexType *nisolated, - cudaStream_t m_stream) { - dim3 grid, block; - block.x = FLAG_ISOLATED_VERTICES_DIMX; - - grid.x = min( (IndexType) MAXBLOCKS, - (n / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + 1 + block.x - 1) / block.x); - - flag_isolated_vertices_kernel<<>>(n, - isolated_bmap, - row_ptr, - degrees, - nisolated); - cudaCheckError() - ; - } - - // - // - // - // Some utils functions - // - // - - //Creates CUB data for graph size n - template - void cub_exclusive_sum_alloc(IndexType n, void*& d_temp_storage, size_t &temp_storage_bytes) { - // Determine temporary device storage requirements for exclusive prefix scan - d_temp_storage = NULL; - temp_storage_bytes = 0; - IndexType *d_in = NULL, *d_out = NULL; - cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, n); - // Allocate temporary storage for exclusive prefix scan - cudaMalloc(&d_temp_storage, temp_storage_bytes); - } - - template - __global__ void fill_kernel(IndexType *vec, IndexType n, IndexType val) { - for (IndexType u = blockDim.x * blockIdx.x + threadIdx.x; - u < n; - u += gridDim.x * blockDim.x) - vec[u] = val; - - } - - template - void fill(IndexType *vec, IndexType n, IndexType val, cudaStream_t m_stream) { - dim3 grid, block; - block.x = 256; - grid.x = min((n + block.x - 1) / 
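// Editor's sketch (not part of the patch): flag_isolated_vertices_kernel derives each
// degree as the difference of two consecutive row_ptr entries and folds the zero-degree
// flags into the packed bitmap. The degree part alone is an adjacent difference over the
// CSR offsets; a Thrust-only version of that step (the kernel fuses it with the bitmap
// packing and the block reduction).
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
void degrees_from_offsets(const thrust::device_vector<int> &row_ptr,
                          thrust::device_vector<int> &degree) {
  int n = static_cast<int>(row_ptr.size()) - 1;
  degree.resize(n);
  // degree[i] = row_ptr[i + 1] - row_ptr[i]
  thrust::transform(row_ptr.begin() + 1, row_ptr.end(),
                    row_ptr.begin(), degree.begin(),
                    thrust::minus<int>());
}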
block.x, (IndexType) MAXBLOCKS); - fill_kernel<<>>(vec, n, val); - cudaCheckError() - ; - } - - template - __global__ void set_frontier_degree_kernel( IndexType *frontier_degree, - IndexType *frontier, - const IndexType *degree, - IndexType n) { - for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x; - idx < n; - idx += gridDim.x * blockDim.x) { - IndexType u = frontier[idx]; - frontier_degree[idx] = degree[u]; - } - } - - template - void set_frontier_degree( IndexType *frontier_degree, - IndexType *frontier, - const IndexType *degree, - IndexType n, - cudaStream_t m_stream) { - dim3 grid, block; - block.x = 256; - grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); - set_frontier_degree_kernel<<>>(frontier_degree, - frontier, - degree, - n); - cudaCheckError() - ; - } - - template - void exclusive_sum( void *d_temp_storage, - size_t temp_storage_bytes, - IndexType *d_in, - IndexType *d_out, - IndexType num_items, - cudaStream_t m_stream) { - if (num_items <= 1) - return; //DeviceScan fails if n==1 - cub::DeviceScan::ExclusiveSum(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - m_stream); - } - - template - __global__ void fill_vec_kernel(T *vec, T n, T val) { - for (T idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < n; - idx += blockDim.x * gridDim.x) - vec[idx] = val; - } - - template - void fill_vec(T *vec, T n, T val, cudaStream_t stream) { - dim3 grid, block; - block.x = 256; - grid.x = (n + block.x - 1) / block.x; - - fill_vec_kernel<<>>(vec, n, val); - cudaCheckError() - ; - } -} -// diff --git a/cpp/src/nvgraph_gdf.cu b/cpp/src/community/nvgraph_gdf.cu similarity index 54% rename from cpp/src/nvgraph_gdf.cu rename to cpp/src/community/nvgraph_gdf.cu index 7f493ecafe9..4e605fb91f3 100644 --- a/cpp/src/nvgraph_gdf.cu +++ b/cpp/src/community/nvgraph_gdf.cu @@ -21,248 +21,19 @@ * @file nvgraph_gdf.cu * ---------------------------------------------------------------------------**/ +#include #include #include #include #include #include "utilities/error_utils.h" +#include "converters/nvgraph.cuh" //RMM: // #include -template -using Vector = thrust::device_vector>; - -gdf_error nvgraph2gdf_error(nvgraphStatus_t nvg_stat) { - switch (nvg_stat) { - case NVGRAPH_STATUS_SUCCESS: - return GDF_SUCCESS; - case NVGRAPH_STATUS_NOT_INITIALIZED: - return GDF_INVALID_API_CALL; - case NVGRAPH_STATUS_INVALID_VALUE: - return GDF_INVALID_API_CALL; - case NVGRAPH_STATUS_TYPE_NOT_SUPPORTED: - return GDF_UNSUPPORTED_DTYPE; - case NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED: - return GDF_INVALID_API_CALL; - default: - return GDF_CUDA_ERROR; - } -} - -gdf_error nvgraph2gdf_error_verbose(nvgraphStatus_t nvg_stat) { - switch (nvg_stat) { - case NVGRAPH_STATUS_NOT_INITIALIZED: - std::cerr << "nvGRAPH not initialized"; - return GDF_CUDA_ERROR; - case NVGRAPH_STATUS_ALLOC_FAILED: - std::cerr << "nvGRAPH alloc failed"; - return GDF_CUDA_ERROR; - case NVGRAPH_STATUS_INVALID_VALUE: - std::cerr << "nvGRAPH invalid value"; - return GDF_CUDA_ERROR; - case NVGRAPH_STATUS_ARCH_MISMATCH: - std::cerr << "nvGRAPH arch mismatch"; - return GDF_CUDA_ERROR; - case NVGRAPH_STATUS_MAPPING_ERROR: - std::cerr << "nvGRAPH mapping error"; - return GDF_CUDA_ERROR; - case NVGRAPH_STATUS_EXECUTION_FAILED: - std::cerr << "nvGRAPH execution failed"; - return GDF_CUDA_ERROR; - case NVGRAPH_STATUS_INTERNAL_ERROR: - std::cerr << "nvGRAPH internal error"; - return GDF_CUDA_ERROR; - case NVGRAPH_STATUS_TYPE_NOT_SUPPORTED: - std::cerr << "nvGRAPH type not supported"; - return GDF_CUDA_ERROR; - case 
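// Editor's sketch (not part of the patch): cub_exclusive_sum_alloc and exclusive_sum
// above split the usual two-phase CUB protocol across two helpers: calling
// DeviceScan::ExclusiveSum with a null temp pointer only reports the required byte
// count, and the second call does the scan. Same protocol in one function, using plain
// cudaMalloc for brevity (the file routes allocation through its own helpers).
#include <cub/cub.cuh>
void exclusive_sum_once(const int *d_in, int *d_out, int n, cudaStream_t stream) {
  void  *d_temp     = nullptr;
  size_t temp_bytes = 0;
  // pass 1: d_temp == nullptr, so CUB only fills in temp_bytes
  cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n, stream);
  cudaMalloc(&d_temp, temp_bytes);
  // pass 2: the actual scan
  cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, n, stream);
  cudaFree(d_temp);
}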
NVGRAPH_STATUS_NOT_CONVERGED: - std::cerr << "nvGRAPH algorithm failed to converge"; - return GDF_CUDA_ERROR; - case NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED: - std::cerr << "nvGRAPH graph type not supported"; - return GDF_CUDA_ERROR; - default: - std::cerr << "Unknown nvGRAPH Status"; - return GDF_CUDA_ERROR; - } -} - -#ifdef VERBOSE -#define NVG_TRY(call) \ -{ \ - if ((call)!=NVGRAPH_STATUS_SUCCESS) \ - return nvgraph2gdf_error_verbose((call)); \ -} -#else -#define NVG_TRY(call) \ -{ \ - nvgraphStatus_t err_code = (call); \ - if (err_code != NVGRAPH_STATUS_SUCCESS) \ - return nvgraph2gdf_error(err_code); \ -} -#endif - -gdf_error gdf_createGraph_nvgraph(nvgraphHandle_t nvg_handle, - gdf_graph* gdf_G, - nvgraphGraphDescr_t* nvg_G, - bool use_transposed) { - - // check input - GDF_REQUIRE(!((gdf_G->edgeList == nullptr) && - (gdf_G->adjList == nullptr) && - (gdf_G->transposedAdjList == nullptr)), - GDF_INVALID_API_CALL); - nvgraphTopologyType_t TT; - cudaDataType_t settype; - // create an nvgraph graph handle - NVG_TRY(nvgraphCreateGraphDescr(nvg_handle, nvg_G)); - // setup nvgraph variables - if (use_transposed) { - // convert edgeList to transposedAdjList - if (gdf_G->transposedAdjList == nullptr) { - GDF_TRY(gdf_add_transposed_adj_list(gdf_G)); - } - // using exiting transposedAdjList if it exisits and if adjList is missing - TT = NVGRAPH_CSC_32; - nvgraphCSCTopology32I_st topoData; - topoData.nvertices = gdf_G->transposedAdjList->offsets->size - 1; - topoData.nedges = gdf_G->transposedAdjList->indices->size; - topoData.destination_offsets = (int *) gdf_G->transposedAdjList->offsets->data; - topoData.source_indices = (int *) gdf_G->transposedAdjList->indices->data; - // attach the transposed adj list - NVG_TRY(nvgraphAttachGraphStructure(nvg_handle, *nvg_G, (void * )&topoData, TT)); - //attach edge values - if (gdf_G->transposedAdjList->edge_data) { - switch (gdf_G->transposedAdjList->edge_data->dtype) { - case GDF_FLOAT32: - settype = CUDA_R_32F; - NVG_TRY(nvgraphAttachEdgeData(nvg_handle, - *nvg_G, - 0, - settype, - (float * ) gdf_G->transposedAdjList->edge_data->data)) - break; - case GDF_FLOAT64: - settype = CUDA_R_64F; - NVG_TRY(nvgraphAttachEdgeData(nvg_handle, - *nvg_G, - 0, - settype, - (double * ) gdf_G->transposedAdjList->edge_data->data)) - break; - default: - return GDF_UNSUPPORTED_DTYPE; - } - } - - } - else { - // convert edgeList to adjList - if (gdf_G->adjList == nullptr) { - GDF_TRY(gdf_add_adj_list(gdf_G)); - } - TT = NVGRAPH_CSR_32; - nvgraphCSRTopology32I_st topoData; - topoData.nvertices = gdf_G->adjList->offsets->size - 1; - topoData.nedges = gdf_G->adjList->indices->size; - topoData.source_offsets = (int *) gdf_G->adjList->offsets->data; - topoData.destination_indices = (int *) gdf_G->adjList->indices->data; - - // attach adj list - NVG_TRY(nvgraphAttachGraphStructure(nvg_handle, *nvg_G, (void * )&topoData, TT)); - //attach edge values - if (gdf_G->adjList->edge_data) { - switch (gdf_G->adjList->edge_data->dtype) { - case GDF_FLOAT32: - settype = CUDA_R_32F; - NVG_TRY(nvgraphAttachEdgeData(nvg_handle, - *nvg_G, - 0, - settype, - (float * ) gdf_G->adjList->edge_data->data)) - break; - case GDF_FLOAT64: - settype = CUDA_R_64F; - NVG_TRY(nvgraphAttachEdgeData(nvg_handle, - *nvg_G, - 0, - settype, - (double * ) gdf_G->adjList->edge_data->data)) - break; - default: - return GDF_UNSUPPORTED_DTYPE; - } - } - } - return GDF_SUCCESS; -} - -gdf_error gdf_sssp_nvgraph(gdf_graph *gdf_G, - const int *source_vert, - gdf_column *sssp_distances) { - - GDF_REQUIRE(gdf_G != nullptr, 
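// Editor's note (not part of the patch): the VERBOSE branch of NVG_TRY above expands
// `call` twice, so the nvGraph call runs a second time just to fetch the status for the
// error message. A single-evaluation variant is sketched below; the macro name is
// hypothetical and not from the source.
#define NVG_TRY_ONCE(call)                                   \
{                                                            \
  nvgraphStatus_t nvg_status = (call);   /* evaluate once */ \
  if (nvg_status != NVGRAPH_STATUS_SUCCESS)                  \
    return nvgraph2gdf_error_verbose(nvg_status);            \
}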
GDF_INVALID_API_CALL); - GDF_REQUIRE(*source_vert >= 0, GDF_INVALID_API_CALL); - GDF_REQUIRE(*source_vert < sssp_distances->size, GDF_INVALID_API_CALL); - GDF_REQUIRE(sssp_distances != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(sssp_distances->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!sssp_distances->valid, GDF_VALIDITY_UNSUPPORTED); - GDF_REQUIRE(sssp_distances->size > 0, GDF_INVALID_API_CALL); - - // init nvgraph - // TODO : time this call - nvgraphHandle_t nvg_handle = 0; - nvgraphGraphDescr_t nvgraph_G = 0; - cudaDataType_t settype; - - NVG_TRY(nvgraphCreate(&nvg_handle)); - GDF_TRY(gdf_createGraph_nvgraph(nvg_handle, gdf_G, &nvgraph_G, true)); - - int sssp_index = 0; - int weight_index = 0; - Vector d_val; - - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - if (gdf_G->transposedAdjList->edge_data == nullptr) { - // use a fp32 vector [1,...,1] - settype = CUDA_R_32F; - d_val.resize(gdf_G->transposedAdjList->indices->size); - thrust::fill(thrust::cuda::par(allocator).on(stream), d_val.begin(), d_val.end(), 1.0); - NVG_TRY(nvgraphAttachEdgeData(nvg_handle, - nvgraph_G, - weight_index, - settype, - (void * ) thrust::raw_pointer_cast(d_val.data()))); - } - else { - switch (gdf_G->transposedAdjList->edge_data->dtype) { - case GDF_FLOAT32: - settype = CUDA_R_32F; - break; - case GDF_FLOAT64: - settype = CUDA_R_64F; - break; - default: - return GDF_UNSUPPORTED_DTYPE; - } - } - - NVG_TRY(nvgraphAttachVertexData(nvg_handle, nvgraph_G, 0, settype, sssp_distances->data)); - - NVG_TRY(nvgraphSssp(nvg_handle, nvgraph_G, weight_index, source_vert, sssp_index)); - - NVG_TRY(nvgraphDestroyGraphDescr(nvg_handle, nvgraph_G)); - NVG_TRY(nvgraphDestroy(nvg_handle)); - - return GDF_SUCCESS; -} - gdf_error gdf_balancedCutClustering_nvgraph(gdf_graph* gdf_G, const int num_clusters, const int num_eigen_vects, @@ -285,37 +56,37 @@ gdf_error gdf_balancedCutClustering_nvgraph(gdf_graph* gdf_G, nvgraphHandle_t nvg_handle = nullptr; nvgraphGraphDescr_t nvgraph_G = nullptr; cudaDataType_t settype; - Vector d_val; + rmm::device_vector d_val; NVG_TRY(nvgraphCreate(&nvg_handle)); GDF_TRY(gdf_createGraph_nvgraph(nvg_handle, gdf_G, &nvgraph_G, false)); int weight_index = 0; - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - if (gdf_G->adjList->edge_data == nullptr) { - // use a fp64 vector [1,...,1] - settype = CUDA_R_64F; - d_val.resize(gdf_G->adjList->indices->size); - thrust::fill(thrust::cuda::par(allocator).on(stream), d_val.begin(), d_val.end(), 1.0); - NVG_TRY(nvgraphAttachEdgeData(nvg_handle, - nvgraph_G, - weight_index, - settype, - (void * ) thrust::raw_pointer_cast(d_val.data()))); - } - else { - switch (gdf_G->adjList->edge_data->dtype) { - case GDF_FLOAT32: - settype = CUDA_R_32F; - break; - case GDF_FLOAT64: - settype = CUDA_R_64F; - break; - default: - return GDF_UNSUPPORTED_DTYPE; - } - } + cudaStream_t stream{nullptr}; + + if (gdf_G->adjList->edge_data == nullptr) { + // use a fp64 vector [1,...,1] + settype = CUDA_R_64F; + d_val.resize(gdf_G->adjList->indices->size); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_val.begin(), d_val.end(), 1.0); + NVG_TRY(nvgraphAttachEdgeData(nvg_handle, + nvgraph_G, + weight_index, + settype, + (void * ) thrust::raw_pointer_cast(d_val.data()))); + } + else { + switch (gdf_G->adjList->edge_data->dtype) { + case GDF_FLOAT32: + settype = CUDA_R_32F; + break; + case GDF_FLOAT64: + settype = CUDA_R_64F; + break; + default: + return GDF_UNSUPPORTED_DTYPE; + } + } // Pack parameters 
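// Editor's sketch (not part of the patch): when edge_data is absent, the clustering path
// above fabricates unit weights: resize an rmm::device_vector, fill it with 1.0 through
// the RMM execution policy, and hand the raw pointer to nvgraphAttachEdgeData. Just that
// setup, assuming the rmm::device_vector / rmm::exec_policy utilities this file already
// includes.
#include <thrust/fill.h>
void make_unit_weights(size_t nnz, cudaStream_t stream,
                       rmm::device_vector<double> &d_val) {
  // d_val must outlive the nvGraph call that consumes its raw pointer
  d_val.resize(nnz);
  thrust::fill(rmm::exec_policy(stream)->on(stream), d_val.begin(), d_val.end(), 1.0);
  // void *weights = (void *) thrust::raw_pointer_cast(d_val.data());
}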
for call to Nvgraph @@ -442,36 +213,36 @@ gdf_error gdf_AnalyzeClustering_edge_cut_nvgraph(gdf_graph* gdf_G, nvgraphHandle_t nvg_handle = nullptr; nvgraphGraphDescr_t nvgraph_G = nullptr; cudaDataType_t settype; - Vector d_val; + rmm::device_vector d_val; NVG_TRY(nvgraphCreate(&nvg_handle)); GDF_TRY(gdf_createGraph_nvgraph(nvg_handle, gdf_G, &nvgraph_G, false)); int weight_index = 0; - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - if (gdf_G->adjList->edge_data == nullptr) { - // use a fp64 vector [1,...,1] - settype = CUDA_R_64F; - d_val.resize(gdf_G->adjList->indices->size); - thrust::fill(thrust::cuda::par(allocator).on(stream), d_val.begin(), d_val.end(), 1.0); - NVG_TRY(nvgraphAttachEdgeData(nvg_handle, - nvgraph_G, - weight_index, - settype, - (void * ) thrust::raw_pointer_cast(d_val.data()))); - } - else { - switch (gdf_G->adjList->edge_data->dtype) { - case GDF_FLOAT32: - settype = CUDA_R_32F; - break; - case GDF_FLOAT64: - settype = CUDA_R_64F; - break; - default: - return GDF_UNSUPPORTED_DTYPE; - } + cudaStream_t stream{nullptr}; + + if (gdf_G->adjList->edge_data == nullptr) { + // use a fp64 vector [1,...,1] + settype = CUDA_R_64F; + d_val.resize(gdf_G->adjList->indices->size); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_val.begin(), d_val.end(), 1.0); + NVG_TRY(nvgraphAttachEdgeData(nvg_handle, + nvgraph_G, + weight_index, + settype, + (void * ) thrust::raw_pointer_cast(d_val.data()))); + } + else { + switch (gdf_G->adjList->edge_data->dtype) { + case GDF_FLOAT32: + settype = CUDA_R_32F; + break; + case GDF_FLOAT64: + settype = CUDA_R_64F; + break; + default: + return GDF_UNSUPPORTED_DTYPE; + } } // Make Nvgraph call @@ -560,8 +331,8 @@ gdf_error gdf_extract_subgraph_vertex_nvgraph(gdf_graph* gdf_G, cudaStream_t stream { nullptr }; - ALLOC_MANAGED_TRY((void**) &offsets, sizeof(int32_t) * (num_verts + 1), stream); - ALLOC_MANAGED_TRY((void**) &indices, sizeof(int32_t) * num_edges, stream); + ALLOC_TRY((void**) &offsets, sizeof(int32_t) * (num_verts + 1), stream); + ALLOC_TRY((void**) &indices, sizeof(int32_t) * num_edges, stream); gdf_column_view(result->adjList->offsets, offsets, @@ -598,3 +369,48 @@ gdf_error gdf_triangle_count_nvgraph(gdf_graph* G, uint64_t* result) { NVG_TRY(nvgraphTriangleCount(nvg_handle, nvg_G, result)); return GDF_SUCCESS; } + +gdf_error gdf_louvain(gdf_graph *graph, void *final_modularity, void *num_level, gdf_column *louvain_parts) { + GDF_REQUIRE(graph->adjList != nullptr || graph->edgeList != nullptr, GDF_INVALID_API_CALL); + gdf_error err = gdf_add_adj_list(graph); + if (err != GDF_SUCCESS) + return err; + + size_t n = graph->adjList->offsets->size - 1; + size_t e = graph->adjList->indices->size; + + void* offsets_ptr = graph->adjList->offsets->data; + void* indices_ptr = graph->adjList->indices->data; + + void* value_ptr; + rmm::device_vector d_values; + if(graph->adjList->edge_data) { + value_ptr = graph->adjList->edge_data->data; + } + else { + cudaStream_t stream {nullptr}; + d_values.resize(graph->adjList->indices->size); + thrust::fill(rmm::exec_policy(stream)->on(stream), d_values.begin(), d_values.end(), 1.0); + value_ptr = (void * ) thrust::raw_pointer_cast(d_values.data()); + } + + void* louvain_parts_ptr = louvain_parts->data; + + auto gdf_to_cudadtype= [](gdf_column *col){ + cudaDataType_t cuda_dtype; + switch(col->dtype){ + case GDF_INT8: cuda_dtype = CUDA_R_8I; break; + case GDF_INT32: cuda_dtype = CUDA_R_32I; break; + case GDF_FLOAT32: cuda_dtype = CUDA_R_32F; break; + case GDF_FLOAT64: 
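// Editor's sketch (not part of the patch): gdf_louvain's gdf_to_cudadtype lambda is the
// bridge between gdf dtypes and cudaDataType_t codes. The same mapping as a standalone
// helper; note the lambda in the patch throws a pointer (throw new std::invalid_argument),
// whereas throwing by value, as here, is the conventional C++ form.
#include <stdexcept>
cudaDataType_t gdf_to_cuda_dtype(const gdf_column *col) {
  switch (col->dtype) {
    case GDF_INT8:    return CUDA_R_8I;
    case GDF_INT32:   return CUDA_R_32I;
    case GDF_FLOAT32: return CUDA_R_32F;
    case GDF_FLOAT64: return CUDA_R_64F;
    default: throw std::invalid_argument("Cannot convert data type");
  }
}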
cuda_dtype = CUDA_R_64F; break; + default: throw new std::invalid_argument("Cannot convert data type"); + }return cuda_dtype; + }; + + cudaDataType_t index_type = gdf_to_cudadtype(graph->adjList->indices); + cudaDataType_t val_type = graph->adjList->edge_data? gdf_to_cudadtype(graph->adjList->edge_data): CUDA_R_32F; + + nvgraphLouvain(index_type, val_type, n, e, offsets_ptr, indices_ptr, value_ptr, 1, 0, NULL, + final_modularity, louvain_parts_ptr, num_level); + return GDF_SUCCESS; +} diff --git a/cpp/src/COOtoCSR.cuh b/cpp/src/converters/COOtoCSR.cuh similarity index 63% rename from cpp/src/COOtoCSR.cuh rename to cpp/src/converters/COOtoCSR.cuh index 2ed2da4cd50..f00b352f0e4 100644 --- a/cpp/src/COOtoCSR.cuh +++ b/cpp/src/converters/COOtoCSR.cuh @@ -38,33 +38,33 @@ template struct CSR_Result { - std::int64_t size; - std::int64_t nnz; - T* rowOffsets; - T* colIndices; + std::int64_t size; + std::int64_t nnz; + T* rowOffsets; + T* colIndices; - CSR_Result() : size(0), nnz(0), rowOffsets(nullptr), colIndices(nullptr){} + CSR_Result() : size(0), nnz(0), rowOffsets(nullptr), colIndices(nullptr){} }; template struct CSR_Result_Weighted { - std::int64_t size; - std::int64_t nnz; - T* rowOffsets; - T* colIndices; - W* edgeWeights; + std::int64_t size; + std::int64_t nnz; + T* rowOffsets; + T* colIndices; + W* edgeWeights; - CSR_Result_Weighted() : size(0), nnz(0), rowOffsets(nullptr), colIndices(nullptr), edgeWeights(nullptr){} + CSR_Result_Weighted() : size(0), nnz(0), rowOffsets(nullptr), colIndices(nullptr), edgeWeights(nullptr){} }; // Define kernel for copying run length encoded values into offset slots. template __global__ void offsetsKernel(T runCounts, T* unique, T* counts, T* offsets) { - uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid < runCounts) - offsets[unique[tid]] = counts[tid]; + uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < runCounts) + offsets[unique[tid]] = counts[tid]; } // Method for constructing CSR from COO @@ -73,13 +73,11 @@ gdf_error ConvertCOOtoCSR(T* sources, T* destinations, int64_t nnz, CSR_Resulton(stream), dests, dests + nnz, srcs); + thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), srcs, srcs + nnz, dests); - // Find max id (since this may be in the dests array but not the srcs array we need to check both) + // Find max id (since this may be in the dests array but not the srcs array we need to check both) T maxId = -1; // Max from srcs after sorting is just the last element CUDA_TRY(cudaMemcpy(&maxId, &(srcs[nnz-1]), sizeof(T), cudaMemcpyDefault)); - auto maxId_it = thrust::max_element(thrust::cuda::par(allocator).on(stream), dests, dests + nnz); + auto maxId_it = thrust::max_element(rmm::exec_policy(stream)->on(stream), dests, dests + nnz); T maxId2; CUDA_TRY(cudaMemcpy(&maxId2, maxId_it, sizeof(T), cudaMemcpyDefault)); maxId = maxId > maxId2 ? 
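// Editor's sketch (not part of the patch): ConvertCOOtoCSR's core is to sort the edge
// list by source, count edges per row, then exclusive-scan the counts into row offsets.
// Host-only illustration of those three steps on a made-up 3-vertex, 4-edge graph; the
// real code does them on the GPU with Thrust and CUB.
#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>
int main() {
  std::vector<std::pair<int, int>> coo = {{2, 0}, {0, 1}, {1, 2}, {0, 2}};  // (src, dst)
  std::stable_sort(coo.begin(), coo.end());      // order by source (then destination)

  int n = 3;
  std::vector<int> offsets(n + 1, 0), indices;
  for (auto &e : coo) { ++offsets[e.first + 1]; indices.push_back(e.second); }
  for (int i = 0; i < n; ++i) offsets[i + 1] += offsets[i];   // exclusive scan of counts

  for (int i = 0; i <= n; ++i) printf("offsets[%d] = %d\n", i, offsets[i]);  // 0 2 3 4
  for (size_t i = 0; i < indices.size(); ++i) printf("indices[%zu] = %d\n", i, indices[i]);
  return 0;
}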
maxId : maxId2; result.size = maxId + 1; // Allocate offsets array - ALLOC_MANAGED_TRY((void**)&result.rowOffsets, (maxId + 2) * sizeof(T), stream); + ALLOC_TRY((void**)&result.rowOffsets, (maxId + 2) * sizeof(T), stream); // Set all values in offsets array to zeros CUDA_TRY(cudaMemset(result.rowOffsets, 0,(maxId + 2) * sizeof(int))); // Allocate temporary arrays same size as sources array, and single value to get run counts T* unique{nullptr}, *counts{nullptr}, *runCount{nullptr}; - ALLOC_MANAGED_TRY((void**)&unique, (maxId + 1) * sizeof(T), stream); - ALLOC_MANAGED_TRY((void**)&counts, (maxId + 1) * sizeof(T), stream); - ALLOC_MANAGED_TRY((void**)&runCount, sizeof(T), stream); + ALLOC_TRY((void**)&unique, (maxId + 1) * sizeof(T), stream); + ALLOC_TRY((void**)&counts, (maxId + 1) * sizeof(T), stream); + ALLOC_TRY((void**)&runCount, sizeof(T), stream); // Use CUB run length encoding to get unique values and run lengths tmpStorage = nullptr; - cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); - ALLOC_MANAGED_TRY((void**)&tmpStorage, tmpBytes, stream); - cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); + CUDA_TRY(cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz)); + ALLOC_TRY((void**)&tmpStorage, tmpBytes, stream); + CUDA_TRY(cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz)); ALLOC_FREE_TRY(tmpStorage, stream); // Set offsets to run sizes for each index @@ -128,7 +126,7 @@ gdf_error ConvertCOOtoCSR(T* sources, T* destinations, int64_t nnz, CSR_Result>>(runCount_h, unique, counts, result.rowOffsets); // Scan offsets to get final offsets - thrust::exclusive_scan(thrust::cuda::par(allocator).on(stream), result.rowOffsets, result.rowOffsets + maxId + 2, result.rowOffsets); + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), result.rowOffsets, result.rowOffsets + maxId + 2, result.rowOffsets); // Clean up temporary allocations result.nnz = nnz; @@ -149,26 +147,24 @@ gdf_error ConvertCOOtoCSR_weighted(T* sources, T* destinations, W* edgeWeights, T* dests{nullptr}; W* weights{nullptr}; - //RMM: - // - cudaStream_t stream{nullptr}; - rmm_temp_allocator allocator(stream); - ALLOC_MANAGED_TRY((void**)&srcs, sizeof(T) * nnz, stream); - ALLOC_MANAGED_TRY((void**)&dests, sizeof(T) * nnz, stream); - ALLOC_MANAGED_TRY((void**)&weights, sizeof(W) * nnz, stream); + cudaStream_t stream {nullptr}; + + ALLOC_TRY((void**)&srcs, sizeof(T) * nnz, stream); + ALLOC_TRY((void**)&dests, sizeof(T) * nnz, stream); + ALLOC_TRY((void**)&weights, sizeof(W) * nnz, stream); CUDA_TRY(cudaMemcpy(srcs, sources, sizeof(T) * nnz, cudaMemcpyDefault)); CUDA_TRY(cudaMemcpy(dests, destinations, sizeof(T) * nnz, cudaMemcpyDefault)); CUDA_TRY(cudaMemcpy(weights, edgeWeights, sizeof(W) * nnz, cudaMemcpyDefault)); // Call Thrust::sort_by_key to sort the arrays with srcs as keys: - thrust::stable_sort_by_key(thrust::cuda::par(allocator).on(stream), dests, dests + nnz, thrust::make_zip_iterator(thrust::make_tuple(srcs, weights))); - thrust::stable_sort_by_key(thrust::cuda::par(allocator).on(stream), srcs, srcs + nnz, thrust::make_zip_iterator(thrust::make_tuple(dests, weights))); + thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), dests, dests + nnz, thrust::make_zip_iterator(thrust::make_tuple(srcs, weights))); + thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream), srcs, srcs + nnz, 
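// Editor's sketch (not part of the patch): for the sorted source array,
// DeviceRunLengthEncode::Encode produces the distinct row ids, the length of each run,
// and the number of runs; offsetsKernel then scatters counts[i] to rowOffsets[unique[i]]
// so rows with no outgoing edges keep their zero before the scan. Host-only illustration
// of what those outputs look like.
#include <cstdio>
#include <vector>
int main() {
  std::vector<int> srcs = {0, 0, 3, 3, 3};   // sorted sources; rows 1 and 2 are empty
  std::vector<int> unique, counts;
  for (int v : srcs) {
    if (unique.empty() || unique.back() != v) { unique.push_back(v); counts.push_back(1); }
    else ++counts.back();
  }
  // unique = {0, 3}, counts = {2, 3}, runCount = 2; scattering gives
  // rowOffsets = {2, 0, 0, 3, 0} before the exclusive scan
  for (size_t i = 0; i < unique.size(); ++i)
    printf("row %d has %d outgoing edges\n", unique[i], counts[i]);
  return 0;
}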
thrust::make_zip_iterator(thrust::make_tuple(dests, weights))); - // Find max id (since this may be in the dests array but not the srcs array we need to check both) + // Find max id (since this may be in the dests array but not the srcs array we need to check both) T maxId = -1; // Max from srcs after sorting is just the last element CUDA_TRY(cudaMemcpy(&maxId, &(srcs[nnz-1]), sizeof(T), cudaMemcpyDefault)); - auto maxId_it = thrust::max_element(thrust::cuda::par(allocator).on(stream), dests, dests + nnz); + auto maxId_it = thrust::max_element(rmm::exec_policy(stream)->on(stream), dests, dests + nnz); // Max from dests requires a scan to find T maxId2; CUDA_TRY(cudaMemcpy(&maxId2, maxId_it, sizeof(T), cudaMemcpyDefault)); @@ -176,7 +172,7 @@ gdf_error ConvertCOOtoCSR_weighted(T* sources, T* destinations, W* edgeWeights, result.size = maxId + 1; // Allocate offsets array - ALLOC_MANAGED_TRY((void**)&result.rowOffsets, (maxId + 2) * sizeof(T), stream); + ALLOC_TRY((void**)&result.rowOffsets, (maxId + 2) * sizeof(T), stream); // Set all values in offsets array to zeros // /CUDA_TRY( @@ -186,16 +182,16 @@ gdf_error ConvertCOOtoCSR_weighted(T* sources, T* destinations, W* edgeWeights, // Allocate temporary arrays same size as sources array, and single value to get run counts T* unique, *counts, *runCount; - ALLOC_MANAGED_TRY((void**)&unique, (maxId + 1) * sizeof(T), stream); - ALLOC_MANAGED_TRY((void**)&counts, (maxId + 1) * sizeof(T), stream); - ALLOC_MANAGED_TRY((void**)&runCount, sizeof(T), stream); + ALLOC_TRY((void**)&unique, (maxId + 1) * sizeof(T), stream); + ALLOC_TRY((void**)&counts, (maxId + 1) * sizeof(T), stream); + ALLOC_TRY((void**)&runCount, sizeof(T), stream); // Use CUB run length encoding to get unique values and run lengths void *tmpStorage = nullptr; size_t tmpBytes = 0; - cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); - ALLOC_MANAGED_TRY(&tmpStorage, tmpBytes, stream); - cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz); + CUDA_TRY(cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz)); + ALLOC_TRY(&tmpStorage, tmpBytes, stream); + CUDA_TRY(cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz)); ALLOC_FREE_TRY(tmpStorage, stream); // Set offsets to run sizes for each index @@ -206,7 +202,7 @@ gdf_error ConvertCOOtoCSR_weighted(T* sources, T* destinations, W* edgeWeights, offsetsKernel<<>>(runCount_h, unique, counts, result.rowOffsets); // Scan offsets to get final offsets - thrust::exclusive_scan(thrust::cuda::par(allocator).on(stream), result.rowOffsets, result.rowOffsets + maxId + 2, result.rowOffsets); + thrust::exclusive_scan(rmm::exec_policy(stream)->on(stream), result.rowOffsets, result.rowOffsets + maxId + 2, result.rowOffsets); // Clean up temporary allocations result.nnz = nnz; diff --git a/cpp/src/converters/nvgraph.cu b/cpp/src/converters/nvgraph.cu new file mode 100644 index 00000000000..cc448b54494 --- /dev/null +++ b/cpp/src/converters/nvgraph.cu @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
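// Editor's sketch (not part of the patch): the weighted conversion keeps (dest, weight)
// glued to each source during the sort by passing a zip iterator as the values of
// stable_sort_by_key. Minimal Thrust version of that key/value sort on device_vectors;
// the file works on raw pointers with the RMM execution policy.
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>
void sort_weighted_coo(thrust::device_vector<int>   &srcs,
                       thrust::device_vector<int>   &dests,
                       thrust::device_vector<float> &weights) {
  auto vals = thrust::make_zip_iterator(thrust::make_tuple(dests.begin(), weights.begin()));
  // sorting by source drags each (dest, weight) pair along with its key
  thrust::stable_sort_by_key(srcs.begin(), srcs.end(), vals);
}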
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** ---------------------------------------------------------------------------* + * @brief Wrapper functions for Nvgraph + * + * @file nvgraph_gdf.cu + * ---------------------------------------------------------------------------**/ + +#include +#include +#include "utilities/error_utils.h" +#include "converters/nvgraph.cuh" + +gdf_error gdf_createGraph_nvgraph(nvgraphHandle_t nvg_handle, + gdf_graph* gdf_G, + nvgraphGraphDescr_t* nvg_G, + bool use_transposed) { + + // check input + GDF_REQUIRE(!((gdf_G->edgeList == nullptr) && + (gdf_G->adjList == nullptr) && + (gdf_G->transposedAdjList == nullptr)), + GDF_INVALID_API_CALL); + nvgraphTopologyType_t TT; + cudaDataType_t settype; + // create an nvgraph graph handle + NVG_TRY(nvgraphCreateGraphDescr(nvg_handle, nvg_G)); + // setup nvgraph variables + if (use_transposed) { + // convert edgeList to transposedAdjList + if (gdf_G->transposedAdjList == nullptr) { + GDF_TRY(gdf_add_transposed_adj_list(gdf_G)); + } + // using exiting transposedAdjList if it exisits and if adjList is missing + TT = NVGRAPH_CSC_32; + nvgraphCSCTopology32I_st topoData; + topoData.nvertices = gdf_G->transposedAdjList->offsets->size - 1; + topoData.nedges = gdf_G->transposedAdjList->indices->size; + topoData.destination_offsets = (int *) gdf_G->transposedAdjList->offsets->data; + topoData.source_indices = (int *) gdf_G->transposedAdjList->indices->data; + // attach the transposed adj list + NVG_TRY(nvgraphAttachGraphStructure(nvg_handle, *nvg_G, (void * )&topoData, TT)); + //attach edge values + if (gdf_G->transposedAdjList->edge_data) { + switch (gdf_G->transposedAdjList->edge_data->dtype) { + case GDF_FLOAT32: + settype = CUDA_R_32F; + NVG_TRY(nvgraphAttachEdgeData(nvg_handle, + *nvg_G, + 0, + settype, + (float * ) gdf_G->transposedAdjList->edge_data->data)) + break; + case GDF_FLOAT64: + settype = CUDA_R_64F; + NVG_TRY(nvgraphAttachEdgeData(nvg_handle, + *nvg_G, + 0, + settype, + (double * ) gdf_G->transposedAdjList->edge_data->data)) + break; + default: + return GDF_UNSUPPORTED_DTYPE; + } + } + + } + else { + // convert edgeList to adjList + if (gdf_G->adjList == nullptr) { + GDF_TRY(gdf_add_adj_list(gdf_G)); + } + TT = NVGRAPH_CSR_32; + nvgraphCSRTopology32I_st topoData; + topoData.nvertices = gdf_G->adjList->offsets->size - 1; + topoData.nedges = gdf_G->adjList->indices->size; + topoData.source_offsets = (int *) gdf_G->adjList->offsets->data; + topoData.destination_indices = (int *) gdf_G->adjList->indices->data; + + // attach adj list + NVG_TRY(nvgraphAttachGraphStructure(nvg_handle, *nvg_G, (void * )&topoData, TT)); + //attach edge values + if (gdf_G->adjList->edge_data) { + switch (gdf_G->adjList->edge_data->dtype) { + case GDF_FLOAT32: + settype = CUDA_R_32F; + NVG_TRY(nvgraphAttachEdgeData(nvg_handle, + *nvg_G, + 0, + settype, + (float * ) gdf_G->adjList->edge_data->data)) + break; + case GDF_FLOAT64: + settype = CUDA_R_64F; + NVG_TRY(nvgraphAttachEdgeData(nvg_handle, + *nvg_G, + 0, + settype, + (double * ) gdf_G->adjList->edge_data->data)) + break; + default: + return GDF_UNSUPPORTED_DTYPE; + } + } + } + return GDF_SUCCESS; +} diff 
--git a/cpp/src/converters/nvgraph.cuh b/cpp/src/converters/nvgraph.cuh new file mode 100644 index 00000000000..76c1ff97b69 --- /dev/null +++ b/cpp/src/converters/nvgraph.cuh @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +/** + * Takes a GDF graph and wraps its data with an Nvgraph graph object. + * @param nvg_handle The Nvgraph handle + * @param gdf_G Pointer to GDF graph object + * @param nvgraph_G Pointer to the Nvgraph graph descriptor + * @param use_transposed True if we are transposing the input graph while wrapping + * @return Error code + */ +gdf_error gdf_createGraph_nvgraph(nvgraphHandle_t nvg_handle, + gdf_graph* gdf_G, + nvgraphGraphDescr_t * nvgraph_G, +bool use_transposed = false); diff --git a/cpp/src/converters/renumber.cu b/cpp/src/converters/renumber.cu new file mode 100644 index 00000000000..d7821ab6f55 --- /dev/null +++ b/cpp/src/converters/renumber.cu @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Renumber vertices +// Author: Chuck Hastings charlesh@nvidia.com + +#include "renumber.cuh" + +gdf_error gdf_renumber_vertices(const gdf_column *src, const gdf_column *dst, + gdf_column *src_renumbered, gdf_column *dst_renumbered, + gdf_column *numbering_map) { + GDF_REQUIRE( src->size == dst->size, GDF_COLUMN_SIZE_MISMATCH ); + GDF_REQUIRE( src->dtype == dst->dtype, GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( ((src->dtype == GDF_INT32) || (src->dtype == GDF_INT64)), GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( src->size > 0, GDF_DATASET_EMPTY ); + + // + // TODO: we're currently renumbering without using valid. We need to + // worry about that at some point, but for now we'll just + // copy the valid pointers to the new columns and go from there. + // + cudaStream_t stream{nullptr}; + + size_t src_size = src->size; + size_t new_size; + + // + // TODO: I assume int64_t for output. A few thoughts: + // + // * I could match src->dtype - since if the raw values fit in an int32_t, + // then the renumbered values must fit within an int32_t + // * If new_size < (2^31 - 1) then I could allocate 32-bit integers + // and copy them in order to make the final footprint smaller. 
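// Editor's sketch (not part of the patch): gdf_renumber_vertices maps arbitrary, possibly
// sparse or 64-bit, vertex ids onto a dense 0..n-1 range and returns the map back to the
// original ids. The GPU implementation below does this with hash bins; this host-only
// illustration only shows what the outputs mean. Helper name and id ordering are
// illustrative, not the library's.
#include <cstdio>
#include <unordered_map>
#include <vector>
int main() {
  std::vector<long long> src = {100, 5, 100}, dst = {5, 42, 42};
  std::unordered_map<long long, int> to_new;
  std::vector<long long> numbering_map;            // new id -> original id
  auto renumber = [&](long long v) {
    auto it = to_new.find(v);
    if (it != to_new.end()) return it->second;
    int id = (int)numbering_map.size();
    numbering_map.push_back(v);
    to_new[v] = id;
    return id;
  };
  for (size_t i = 0; i < src.size(); ++i) {
    int s = renumber(src[i]);
    int d = renumber(dst[i]);
    printf("edge (%lld,%lld) -> (%d,%d)\n", src[i], dst[i], s, d);
  }
  return 0;
}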
+ // + // + // NOTE: Forcing match right now - it appears that cugraph is artficially + // forcing the type to be 32 + if (src->dtype == GDF_INT32) { + int32_t *tmp; + + ALLOC_TRY((void**) &tmp, sizeof(int32_t) * src->size, stream); + gdf_column_view(src_renumbered, tmp, src->valid, src->size, src->dtype); + + ALLOC_TRY((void**) &tmp, sizeof(int32_t) * src->size, stream); + gdf_column_view(dst_renumbered, tmp, dst->valid, dst->size, dst->dtype); + + gdf_error err = cugraph::renumber_vertices(src_size, + (const int32_t *) src->data, + (const int32_t *) dst->data, + (int32_t *) src_renumbered->data, + (int32_t *) dst_renumbered->data, + &new_size, &tmp); + if (err != GDF_SUCCESS) + return err; + + gdf_column_view(numbering_map, tmp, nullptr, new_size, src->dtype); + } else if (src->dtype == GDF_INT64) { + + // + // NOTE: At the moment, we force the renumbered graph to use + // 32-bit integer ids. Since renumbering is going to make + // the vertex range dense, this limits us to 2 billion + // vertices. + // + // The renumbering code supports 64-bit integer generation + // so we can run this with int64_t output if desired... + // but none of the algorithms support that. + // + int64_t *tmp; + ALLOC_TRY((void**) &tmp, sizeof(int32_t) * src->size, stream); + gdf_column_view(src_renumbered, tmp, src->valid, src->size, GDF_INT32); + + ALLOC_TRY((void**) &tmp, sizeof(int32_t) * src->size, stream); + gdf_column_view(dst_renumbered, tmp, dst->valid, dst->size, GDF_INT32); + + gdf_error err = cugraph::renumber_vertices(src_size, + (const int64_t *) src->data, + (const int64_t *) dst->data, + (int32_t *) src_renumbered->data, + (int32_t *) dst_renumbered->data, + &new_size, &tmp); + if (err != GDF_SUCCESS) + return err; + + // + // If there are too many vertices then the renumbering overflows so we'll + // return an error. 
+ // + if (new_size > 0x7fffffff) { + ALLOC_FREE_TRY(src_renumbered, stream); + ALLOC_FREE_TRY(dst_renumbered, stream); + return GDF_COLUMN_SIZE_TOO_BIG; + } + + gdf_column_view(numbering_map, tmp, nullptr, new_size, src->dtype); + } else { + return GDF_UNSUPPORTED_DTYPE; + } + + return GDF_SUCCESS; +} diff --git a/cpp/src/renumber.cuh b/cpp/src/converters/renumber.cuh similarity index 90% rename from cpp/src/renumber.cuh rename to cpp/src/converters/renumber.cuh index 0b05135e3cc..5e2fa069267 100644 --- a/cpp/src/renumber.cuh +++ b/cpp/src/converters/renumber.cuh @@ -34,8 +34,8 @@ #include #include "utilities/error_utils.h" -#include "graph_utils.cuh" -#include "heap.cuh" +#include "utilities/graph_utils.cuh" +#include "utilities/heap.cuh" #include "rmm_utils.h" namespace cugraph { @@ -134,25 +134,27 @@ namespace cugraph { } - __global__ void SetupHash(hash_type hash_size, index_type *hash_bins_start, index_type *hash_bins_end) { + template + __global__ void SetupHash(H hash_size, I *hash_bins_start, I *hash_bins_end) { hash_bins_end[0] = 0; - for (hash_type i = 0 ; i < hash_size ; ++i) { + for (H i = 0 ; i < hash_size ; ++i) { hash_bins_end[i+1] = hash_bins_end[i] + hash_bins_start[i]; } - for (hash_type i = 0 ; i < (hash_size + 1) ; ++i) { + for (H i = 0 ; i < (hash_size + 1) ; ++i) { hash_bins_start[i] = hash_bins_end[i]; } } - __global__ void ComputeBase(hash_type hash_size, index_type *hash_bins_base) { - index_type sum = 0; - for (hash_type i = 0 ; i < hash_size ; ++i) { + template + __global__ void ComputeBase(H hash_size, I *hash_bins_base) { + I sum = 0; + for (H i = 0 ; i < hash_size ; ++i) { sum += hash_bins_base[i]; } hash_bins_base[hash_size] = sum; - for (hash_type i = hash_size ; i > 0 ; --i) { + for (H i = hash_size ; i > 0 ; --i) { hash_bins_base[i-1] = hash_bins_base[i] - hash_bins_base[i-1]; } } @@ -202,9 +204,9 @@ namespace cugraph { // // We need 3 for hashing, and one array for data // - cudaStream_t stream{nullptr}; - rmm_temp_allocator allocator(stream); - + + cudaStream_t stream {nullptr}; + T_in *hash_data; detail::HashFunctionObject hash(hash_size); @@ -218,10 +220,10 @@ namespace cugraph { int hash_threads_per_block = min((int) hash_size, max_threads_per_block); int hash_thread_blocks = min(((int) hash_size + hash_threads_per_block - 1) / hash_threads_per_block, max_blocks); - ALLOC_TRY(&hash_data, 2 * size * sizeof(T_in), nullptr); - ALLOC_TRY(&hash_bins_start, (1 + hash_size) * sizeof(detail::index_type), nullptr); - ALLOC_TRY(&hash_bins_end, (1 + hash_size) * sizeof(detail::index_type), nullptr); - ALLOC_TRY(&hash_bins_base, (1 + hash_size) * sizeof(detail::index_type), nullptr); + ALLOC_TRY(&hash_data, 2 * size * sizeof(T_in), stream); + ALLOC_TRY(&hash_bins_start, (1 + hash_size) * sizeof(detail::index_type), stream); + ALLOC_TRY(&hash_bins_end, (1 + hash_size) * sizeof(detail::index_type), stream); + ALLOC_TRY(&hash_bins_base, (1 + hash_size) * sizeof(detail::index_type), stream); // // Pass 1: count how many vertex ids end up in each hash bin @@ -229,13 +231,13 @@ namespace cugraph { CUDA_TRY(cudaMemset(hash_bins_start, 0, (1 + hash_size) * sizeof(detail::index_type))); CUDA_TRY(cudaMemset(hash_bins_base, 0, (1 + hash_size) * sizeof(detail::index_type))); - thrust::for_each(thrust::cuda::par(allocator).on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream), src, src + size, [hash_bins_start, hash] __device__ (T_in vid) { atomicAdd(hash_bins_start + hash(vid), detail::index_type{1}); }); - 
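// Editor's sketch (not part of the patch): pass 1 of the renumbering above is a
// histogram: a thrust::for_each over the vertex ids where each element atomically bumps
// its hash bin. Reduced version with plain Thrust; the file uses the RMM execution
// policy and its own hash functor, and device lambdas need nvcc's extended-lambda flag,
// as in the real build.
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
void count_hash_bins(const int *d_ids, size_t n, unsigned int *d_bins,
                     unsigned int num_bins, cudaStream_t stream) {
  thrust::for_each(thrust::cuda::par.on(stream), d_ids, d_ids + n,
                   [d_bins, num_bins] __device__ (int id) {
                     // each id increments the counter of its bin
                     atomicAdd(d_bins + (static_cast<unsigned int>(id) % num_bins), 1u);
                   });
}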
thrust::for_each(thrust::cuda::par(allocator).on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream), dst, dst + size, [hash_bins_start, hash] __device__ (T_in vid) { atomicAdd(hash_bins_start + hash(vid), detail::index_type{1}); @@ -252,7 +254,7 @@ namespace cugraph { // Pass 2: Populate hash_data with data from the hash bins. This implementation // will do some partial deduplication, but we'll need to fully dedupe later. // - thrust::for_each(thrust::cuda::par(allocator).on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream), src, src + size, [hash_bins_end, hash_data, hash] __device__ (T_in vid) { uint32_t hash_index = hash(vid); @@ -260,7 +262,7 @@ namespace cugraph { hash_data[hash_offset] = vid; }); - thrust::for_each(thrust::cuda::par(allocator).on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream), dst, dst + size, [hash_bins_end, hash_data, hash] __device__ (T_in vid) { uint32_t hash_index = hash(vid); @@ -282,7 +284,7 @@ namespace cugraph { // Finally, we'll iterate over src and dst and populate src_renumbered // and dst_renumbered. // - thrust::for_each(thrust::cuda::par(allocator).on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(size), [hash_data, hash_bins_start, hash_bins_end, @@ -293,7 +295,7 @@ namespace cugraph { src_renumbered[idx] = hash_bins_base[tmp] + (id - (hash_data + hash_bins_start[tmp])); }); - thrust::for_each(thrust::cuda::par(allocator).on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(size), [hash_data, hash_bins_start, hash_bins_end, @@ -312,7 +314,7 @@ namespace cugraph { T_in * local_numbering_map = *numbering_map; - thrust::for_each(thrust::cuda::par(allocator).on(stream), + thrust::for_each(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(hash_size), [hash_data, hash_bins_start, hash_bins_end, diff --git a/cpp/src/cugraph.cu b/cpp/src/cugraph.cu deleted file mode 100644 index 80975930de7..00000000000 --- a/cpp/src/cugraph.cu +++ /dev/null @@ -1,671 +0,0 @@ -// -*-c++-*- - - /* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. - * - */ - -// Graph analytics features -// Author: Alex Fender afender@nvidia.com - -#include -#include "graph_utils.cuh" -#include "pagerank.cuh" -#include "COOtoCSR.cuh" -#include "utilities/error_utils.h" -#include "bfs.cuh" -#include "renumber.cuh" -#include "snmg/spmv.cuh" -#include -#include -#include - -#include - -template -using Vector = thrust::device_vector>; - -/* - * cudf has gdf_column_free and using this is, in general, better design than - * creating our own, but we will keep this as cudf is planning to remove the - * function. cudf plans to redesign cudf::column to fundamentally solve this - * problem, so once they finished the redesign, we need to update this code to - * use their new features. Until that time, we may rely on this as a temporary - * solution. 
- */ -void gdf_col_delete(gdf_column* col) { - if (col != nullptr) { - auto stream = cudaStream_t{nullptr}; - if (col->data != nullptr) { - ALLOC_FREE_TRY(col->data, stream); - } - if (col->valid != nullptr) { - ALLOC_FREE_TRY(col->valid, stream); - } -#if 0/* Currently, gdf_column_view does not set col_name, and col_name can have - an arbitrary value, so freeing col_name can lead to freeing a ranodom - address. This problem should be cleaned up once cudf finishes - redesigning cudf::column. */ - if (col->col_name != nullptr) { - free(col->col_name); - } -#endif - delete col; - } -} - -void gdf_col_release(gdf_column* col) { - delete col; -} - -void cpy_column_view(const gdf_column *in, gdf_column *out) { - if (in != nullptr && out !=nullptr) { - gdf_column_view(out, in->data, in->valid, in->size, in->dtype); - } -} - -gdf_error gdf_adj_list_view(gdf_graph *graph, const gdf_column *offsets, - const gdf_column *indices, const gdf_column *edge_data) { - //This function returns an error if this graph object has at least one graph - //representation to prevent a single object storing two different graphs. - GDF_REQUIRE( ((graph->edgeList == nullptr) && (graph->adjList == nullptr) && - (graph->transposedAdjList == nullptr)), GDF_INVALID_API_CALL); - GDF_REQUIRE( offsets->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); - GDF_REQUIRE( indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); - GDF_REQUIRE( (offsets->dtype == indices->dtype), GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( ((offsets->dtype == GDF_INT32) || (offsets->dtype == GDF_INT64)), GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( (offsets->size > 0), GDF_DATASET_EMPTY ); - - graph->adjList = new gdf_adj_list; - graph->adjList->offsets = new gdf_column; - graph->adjList->indices = new gdf_column; - graph->adjList->ownership = 0; - - cpy_column_view(offsets, graph->adjList->offsets); - cpy_column_view(indices, graph->adjList->indices); - if (edge_data) { - GDF_REQUIRE( indices->size == edge_data->size, GDF_COLUMN_SIZE_MISMATCH ); - graph->adjList->edge_data = new gdf_column; - cpy_column_view(edge_data, graph->adjList->edge_data); - } - else { - graph->adjList->edge_data = nullptr; - } - return GDF_SUCCESS; -} - -gdf_error gdf_adj_list::get_vertex_identifiers(gdf_column *identifiers) { - GDF_REQUIRE( offsets != nullptr , GDF_INVALID_API_CALL); - GDF_REQUIRE( offsets->data != nullptr , GDF_INVALID_API_CALL); - cugraph::sequence((int)offsets->size-1, (int*)identifiers->data); - return GDF_SUCCESS; -} - -gdf_error gdf_adj_list::get_source_indices (gdf_column *src_indices) { - GDF_REQUIRE( offsets != nullptr , GDF_INVALID_API_CALL); - GDF_REQUIRE( offsets->data != nullptr , GDF_INVALID_API_CALL); - GDF_REQUIRE( src_indices->size == indices->size, GDF_COLUMN_SIZE_MISMATCH ); - GDF_REQUIRE( src_indices->dtype == indices->dtype, GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( src_indices->size > 0, GDF_DATASET_EMPTY ); - cugraph::offsets_to_indices((int*)offsets->data, offsets->size-1, (int*)src_indices->data); - - return GDF_SUCCESS; -} - -gdf_error gdf_renumber_vertices(const gdf_column *src, const gdf_column *dst, - gdf_column *src_renumbered, gdf_column *dst_renumbered, - gdf_column *numbering_map) { - - GDF_REQUIRE( src->size == dst->size, GDF_COLUMN_SIZE_MISMATCH ); - GDF_REQUIRE( src->dtype == dst->dtype, GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( ((src->dtype == GDF_INT32) || (src->dtype == GDF_INT64)), GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( src->size > 0, GDF_DATASET_EMPTY ); - - // - // TODO: we're currently renumbering without using valid. 
We need to - // worry about that at some point, but for now we'll just - // copy the valid pointers to the new columns and go from there. - // - cudaStream_t stream{nullptr}; - - size_t src_size = src->size; - size_t new_size; - - // - // TODO: I assume int64_t for output. A few thoughts: - // - // * I could match src->dtype - since if the raw values fit in an int32_t, - // then the renumbered values must fit within an int32_t - // * If new_size < (2^31 - 1) then I could allocate 32-bit integers - // and copy them in order to make the final footprint smaller. - // - // - // NOTE: Forcing match right now - it appears that cugraph is artficially - // forcing the type to be 32 - if (src->dtype == GDF_INT32) { - int32_t *tmp; - - ALLOC_MANAGED_TRY((void**) &tmp, sizeof(int32_t) * src->size, stream); - gdf_column_view(src_renumbered, tmp, src->valid, src->size, src->dtype); - - ALLOC_MANAGED_TRY((void**) &tmp, sizeof(int32_t) * src->size, stream); - gdf_column_view(dst_renumbered, tmp, dst->valid, dst->size, dst->dtype); - - gdf_error err = cugraph::renumber_vertices(src_size, - (const int32_t *) src->data, - (const int32_t *) dst->data, - (int32_t *) src_renumbered->data, - (int32_t *) dst_renumbered->data, - &new_size, &tmp); - if (err != GDF_SUCCESS) - return err; - - gdf_column_view(numbering_map, tmp, nullptr, new_size, src->dtype); - } else if (src->dtype == GDF_INT64) { - - // - // NOTE: At the moment, we force the renumbered graph to use - // 32-bit integer ids. Since renumbering is going to make - // the vertex range dense, this limits us to 2 billion - // vertices. - // - // The renumbering code supports 64-bit integer generation - // so we can run this with int64_t output if desired... - // but none of the algorithms support that. - // - int64_t *tmp; - ALLOC_MANAGED_TRY((void**) &tmp, sizeof(int32_t) * src->size, stream); - gdf_column_view(src_renumbered, tmp, src->valid, src->size, GDF_INT32); - - ALLOC_MANAGED_TRY((void**) &tmp, sizeof(int32_t) * src->size, stream); - gdf_column_view(dst_renumbered, tmp, dst->valid, dst->size, GDF_INT32); - - gdf_error err = cugraph::renumber_vertices(src_size, - (const int64_t *) src->data, - (const int64_t *) dst->data, - (int32_t *) src_renumbered->data, - (int32_t *) dst_renumbered->data, - &new_size, &tmp); - if (err != GDF_SUCCESS) - return err; - - // - // If there are too many vertices then the renumbering overflows so we'll - // return an error. - // - if (new_size > 0x7fffffff) { - ALLOC_FREE_TRY(src_renumbered, stream); - ALLOC_FREE_TRY(dst_renumbered, stream); - return GDF_COLUMN_SIZE_TOO_BIG; - } - - gdf_column_view(numbering_map, tmp, nullptr, new_size, src->dtype); - } else { - return GDF_UNSUPPORTED_DTYPE; - } - - return GDF_SUCCESS; -} - -gdf_error gdf_edge_list_view(gdf_graph *graph, const gdf_column *src_indices, - const gdf_column *dest_indices, const gdf_column *edge_data) { - //This function returns an error if this graph object has at least one graph - //representation to prevent a single object storing two different graphs. 
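// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch itself): wrapping existing device
// buffers as gdf_columns and registering them as the edge list of a freshly
// constructed gdf_graph, which is what the check described above requires.
// Buffer and function names are hypothetical; the calls themselves appear in
// this file.
// ---------------------------------------------------------------------------
#include "cugraph.h"   // assumed to declare gdf_column_view and gdf_edge_list_view

gdf_error build_graph_from_coo(int32_t* d_src, int32_t* d_dst, int num_edges,
                               gdf_graph* graph /* must hold no representation yet */) {
  gdf_column src{}, dst{};
  gdf_column_view(&src, d_src, nullptr, num_edges, GDF_INT32);
  gdf_column_view(&dst, d_dst, nullptr, num_edges, GDF_INT32);

  gdf_error err = gdf_edge_list_view(graph, &src, &dst, nullptr /* unweighted */);
  if (err != GDF_SUCCESS) return err;

  // The CSR form can then be materialized on demand.
  return gdf_add_adj_list(graph);
}
// ---------------------------------------------------------------------------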
- GDF_REQUIRE( ((graph->edgeList == nullptr) && (graph->adjList == nullptr) && - (graph->transposedAdjList == nullptr)), GDF_INVALID_API_CALL); - GDF_REQUIRE( src_indices->size == dest_indices->size, GDF_COLUMN_SIZE_MISMATCH ); - GDF_REQUIRE( src_indices->dtype == dest_indices->dtype, GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( ((src_indices->dtype == GDF_INT32) || (src_indices->dtype == GDF_INT64)), GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( src_indices->size > 0, GDF_DATASET_EMPTY ); - GDF_REQUIRE( src_indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); - GDF_REQUIRE( dest_indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); - - graph->edgeList = new gdf_edge_list; - graph->edgeList->src_indices = new gdf_column; - graph->edgeList->dest_indices = new gdf_column; - graph->edgeList->ownership = 0; - - cpy_column_view(src_indices, graph->edgeList->src_indices); - cpy_column_view(dest_indices, graph->edgeList->dest_indices); - if (edge_data) { - GDF_REQUIRE( src_indices->size == edge_data->size, GDF_COLUMN_SIZE_MISMATCH ); - graph->edgeList->edge_data = new gdf_column; - cpy_column_view(edge_data, graph->edgeList->edge_data); - } - else { - graph->edgeList->edge_data = nullptr; - } - - return GDF_SUCCESS; -} - -template -gdf_error gdf_add_adj_list_impl (gdf_graph *graph) { - if (graph->adjList == nullptr) { - GDF_REQUIRE( graph->edgeList != nullptr , GDF_INVALID_API_CALL); - int nnz = graph->edgeList->src_indices->size, status = 0; - graph->adjList = new gdf_adj_list; - graph->adjList->offsets = new gdf_column; - graph->adjList->indices = new gdf_column; - graph->adjList->ownership = 1; - - if (graph->edgeList->edge_data!= nullptr) { - graph->adjList->edge_data = new gdf_column; - - CSR_Result_Weighted adj_list; - status = ConvertCOOtoCSR_weighted((int*)graph->edgeList->src_indices->data, (int*)graph->edgeList->dest_indices->data, (WT*)graph->edgeList->edge_data->data, nnz, adj_list); - - gdf_column_view(graph->adjList->offsets, adj_list.rowOffsets, - nullptr, adj_list.size+1, graph->edgeList->src_indices->dtype); - gdf_column_view(graph->adjList->indices, adj_list.colIndices, - nullptr, adj_list.nnz, graph->edgeList->src_indices->dtype); - gdf_column_view(graph->adjList->edge_data, adj_list.edgeWeights, - nullptr, adj_list.nnz, graph->edgeList->edge_data->dtype); - } - else { - CSR_Result adj_list; - status = ConvertCOOtoCSR((int*)graph->edgeList->src_indices->data,(int*)graph->edgeList->dest_indices->data, nnz, adj_list); - gdf_column_view(graph->adjList->offsets, adj_list.rowOffsets, - nullptr, adj_list.size+1, graph->edgeList->src_indices->dtype); - gdf_column_view(graph->adjList->indices, adj_list.colIndices, - nullptr, adj_list.nnz, graph->edgeList->src_indices->dtype); - } - if (status !=0) { - std::cerr << "Could not generate the adj_list" << std::endl; - return GDF_CUDA_ERROR; - } - } - return GDF_SUCCESS; -} - -gdf_error gdf_add_edge_list (gdf_graph *graph) { - if (graph->edgeList == nullptr) { - GDF_REQUIRE( graph->adjList != nullptr , GDF_INVALID_API_CALL); - int *d_src; - graph->edgeList = new gdf_edge_list; - graph->edgeList->src_indices = new gdf_column; - graph->edgeList->dest_indices = new gdf_column; - graph->edgeList->ownership = 2; - - CUDA_TRY(cudaMallocManaged ((void**)&d_src, sizeof(int) * graph->adjList->indices->size)); - - cugraph::offsets_to_indices((int*)graph->adjList->offsets->data, - graph->adjList->offsets->size-1, - (int*)d_src); - - gdf_column_view(graph->edgeList->src_indices, d_src, - nullptr, graph->adjList->indices->size, graph->adjList->indices->dtype); - 
cpy_column_view(graph->adjList->indices, graph->edgeList->dest_indices); - - if (graph->adjList->edge_data != nullptr) { - graph->edgeList->edge_data = new gdf_column; - cpy_column_view(graph->adjList->edge_data, graph->edgeList->edge_data); - } - } - return GDF_SUCCESS; -} - - -template -gdf_error gdf_add_transposed_adj_list_impl (gdf_graph *graph) { - if (graph->transposedAdjList == nullptr ) { - GDF_REQUIRE( graph->edgeList != nullptr , GDF_INVALID_API_CALL); - int nnz = graph->edgeList->src_indices->size, status = 0; - graph->transposedAdjList = new gdf_adj_list; - graph->transposedAdjList->offsets = new gdf_column; - graph->transposedAdjList->indices = new gdf_column; - graph->transposedAdjList->ownership = 1; - - if (graph->edgeList->edge_data) { - graph->transposedAdjList->edge_data = new gdf_column; - CSR_Result_Weighted adj_list; - status = ConvertCOOtoCSR_weighted( (int*)graph->edgeList->dest_indices->data, (int*)graph->edgeList->src_indices->data, (WT*)graph->edgeList->edge_data->data, nnz, adj_list); - gdf_column_view(graph->transposedAdjList->offsets, adj_list.rowOffsets, - nullptr, adj_list.size+1, graph->edgeList->src_indices->dtype); - gdf_column_view(graph->transposedAdjList->indices, adj_list.colIndices, - nullptr, adj_list.nnz, graph->edgeList->src_indices->dtype); - gdf_column_view(graph->transposedAdjList->edge_data, adj_list.edgeWeights, - nullptr, adj_list.nnz, graph->edgeList->edge_data->dtype); - } - else { - - CSR_Result adj_list; - status = ConvertCOOtoCSR((int*)graph->edgeList->dest_indices->data, (int*)graph->edgeList->src_indices->data, nnz, adj_list); - gdf_column_view(graph->transposedAdjList->offsets, adj_list.rowOffsets, - nullptr, adj_list.size+1, graph->edgeList->src_indices->dtype); - gdf_column_view(graph->transposedAdjList->indices, adj_list.colIndices, - nullptr, adj_list.nnz, graph->edgeList->src_indices->dtype); - } - if (status !=0) { - std::cerr << "Could not generate the adj_list" << std::endl; - return GDF_CUDA_ERROR; - } - } - return GDF_SUCCESS; -} - -gdf_error gdf_degree_impl(int n, int e, gdf_column* col_ptr, gdf_column* degree, bool offsets) { - if(offsets == true) { - dim3 nthreads, nblocks; - nthreads.x = min(n, CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((n + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - - switch (col_ptr->dtype) { - case GDF_INT32: cugraph::degree_offsets <<>>(n, e, static_cast(col_ptr->data), static_cast(degree->data));break; - default: return GDF_UNSUPPORTED_DTYPE; - } - } - else { - dim3 nthreads, nblocks; - nthreads.x = min(e, CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - - switch (col_ptr->dtype) { - case GDF_INT32: cugraph::degree_coo <<>>(n, e, static_cast(col_ptr->data), static_cast(degree->data));break; - default: return GDF_UNSUPPORTED_DTYPE; - } - } - return GDF_SUCCESS; -} - - -gdf_error gdf_degree(gdf_graph *graph, gdf_column *degree, int x) { - // Calculates the degree of all vertices of the graph - // x = 0: in+out degree - // x = 1: in-degree - // x = 2: out-degree - GDF_REQUIRE(graph->adjList != nullptr || graph->transposedAdjList != nullptr, GDF_INVALID_API_CALL); - int n; - int e; - if(graph->adjList != nullptr) { - n = graph->adjList->offsets->size -1; - e = graph->adjList->indices->size; - } - else { - n = graph->transposedAdjList->offsets->size - 1; - e = graph->transposedAdjList->indices->size; - } - - 
if(x!=1) { - // Computes out-degree for x=0 and x=2 - if(graph->adjList) - gdf_degree_impl(n, e, graph->adjList->offsets, degree, true); - else - gdf_degree_impl(n, e, graph->transposedAdjList->indices, degree, false); - } - - if(x!=2) { - // Computes in-degree for x=0 and x=1 - if(graph->adjList) - gdf_degree_impl(n, e, graph->adjList->indices, degree, false); - else - gdf_degree_impl(n, e, graph->transposedAdjList->offsets, degree, true); - } - return GDF_SUCCESS; -} - - -template -gdf_error gdf_pagerank_impl (gdf_graph *graph, - gdf_column *pagerank, float alpha = 0.85, - float tolerance = 1e-4, int max_iter = 200, - bool has_guess = false) { - GDF_REQUIRE( graph->edgeList != nullptr, GDF_VALIDITY_UNSUPPORTED ); - GDF_REQUIRE( graph->edgeList->src_indices->size == graph->edgeList->dest_indices->size, GDF_COLUMN_SIZE_MISMATCH ); - GDF_REQUIRE( graph->edgeList->src_indices->dtype == graph->edgeList->dest_indices->dtype, GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( graph->edgeList->src_indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); - GDF_REQUIRE( graph->edgeList->dest_indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); - GDF_REQUIRE( pagerank != nullptr , GDF_INVALID_API_CALL ); - GDF_REQUIRE( pagerank->data != nullptr , GDF_INVALID_API_CALL ); - GDF_REQUIRE( pagerank->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); - GDF_REQUIRE( pagerank->size > 0 , GDF_INVALID_API_CALL ); - - int m=pagerank->size, nnz = graph->edgeList->src_indices->size, status = 0; - WT *d_pr, *d_val = nullptr, *d_leaf_vector = nullptr; - WT res = 1.0; - WT *residual = &res; - - if (graph->transposedAdjList == nullptr) { - gdf_add_transposed_adj_list(graph); - } - cudaStream_t stream{nullptr}; - ALLOC_MANAGED_TRY((void**)&d_leaf_vector, sizeof(WT) * m, stream); - ALLOC_MANAGED_TRY((void**)&d_val, sizeof(WT) * nnz , stream); - ALLOC_MANAGED_TRY((void**)&d_pr, sizeof(WT) * m, stream); - - // The templating for HT_matrix_csc_coo assumes that m, nnz and data are all the same type - cugraph::HT_matrix_csc_coo(m, nnz, (int *)graph->transposedAdjList->offsets->data, (int *)graph->transposedAdjList->indices->data, d_val, d_leaf_vector); - - if (has_guess) - { - GDF_REQUIRE( pagerank->data != nullptr, GDF_VALIDITY_UNSUPPORTED ); - cugraph::copy(m, (WT*)pagerank->data, d_pr); - } - - status = cugraph::pagerank( m,nnz, (int*)graph->transposedAdjList->offsets->data, (int*)graph->transposedAdjList->indices->data, - d_val, alpha, d_leaf_vector, false, tolerance, max_iter, d_pr, residual); - - if (status !=0) - switch ( status ) { - case -1: std::cerr<< "Error : bad parameters in Pagerank"<(m, d_pr, (WT*)pagerank->data); - - ALLOC_FREE_TRY(d_val, stream); - ALLOC_FREE_TRY(d_pr, stream); - ALLOC_FREE_TRY(d_leaf_vector, stream); - - return GDF_SUCCESS; -} - -gdf_error gdf_add_adj_list(gdf_graph *graph) { - if (graph->adjList != nullptr) - return GDF_SUCCESS; - - GDF_REQUIRE( graph->edgeList != nullptr , GDF_INVALID_API_CALL); - GDF_REQUIRE( graph->edgeList->src_indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE ); - - if (graph->edgeList->edge_data != nullptr) { - switch (graph->edgeList->edge_data->dtype) { - case GDF_FLOAT32: return gdf_add_adj_list_impl(graph); - case GDF_FLOAT64: return gdf_add_adj_list_impl(graph); - default: return GDF_UNSUPPORTED_DTYPE; - } - } - else { - return gdf_add_adj_list_impl(graph); - } -} - -gdf_error gdf_add_transposed_adj_list(gdf_graph *graph) { - if (graph->edgeList == nullptr) - gdf_add_edge_list(graph); - - GDF_REQUIRE(graph->edgeList->src_indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); 
- GDF_REQUIRE(graph->edgeList->dest_indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); - - if (graph->edgeList->edge_data != nullptr) { - switch (graph->edgeList->edge_data->dtype) { - case GDF_FLOAT32: return gdf_add_transposed_adj_list_impl(graph); - case GDF_FLOAT64: return gdf_add_transposed_adj_list_impl(graph); - default: return GDF_UNSUPPORTED_DTYPE; - } - } - else { - return gdf_add_transposed_adj_list_impl(graph); - } -} - -gdf_error gdf_delete_adj_list(gdf_graph *graph) { - if (graph->adjList) { - delete graph->adjList; - } - graph->adjList = nullptr; - return GDF_SUCCESS; -} - -gdf_error gdf_delete_edge_list(gdf_graph *graph) { - if (graph->edgeList) { - delete graph->edgeList; - } - graph->edgeList = nullptr; - return GDF_SUCCESS; -} - -gdf_error gdf_delete_transposed_adj_list(gdf_graph *graph) { - if (graph->transposedAdjList) { - delete graph->transposedAdjList; - } - graph->transposedAdjList = nullptr; - return GDF_SUCCESS; -} - -gdf_error gdf_pagerank(gdf_graph *graph, gdf_column *pagerank, float alpha, float tolerance, int max_iter, bool has_guess) { - // - // page rank operates on CSR and can't currently support 64-bit integers. - // - // If csr doesn't exist, create it. Then check type to make sure it is 32-bit. - // - GDF_REQUIRE(graph->adjList != nullptr || graph->edgeList != nullptr, GDF_INVALID_API_CALL); - gdf_error err = gdf_add_adj_list(graph); - if (err != GDF_SUCCESS) - return err; - - GDF_REQUIRE(graph->adjList->offsets->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); - GDF_REQUIRE(graph->adjList->indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); - - switch (pagerank->dtype) { - case GDF_FLOAT32: return gdf_pagerank_impl(graph, pagerank, alpha, tolerance, max_iter, has_guess); - case GDF_FLOAT64: return gdf_pagerank_impl(graph, pagerank, alpha, tolerance, max_iter, has_guess); - default: return GDF_UNSUPPORTED_DTYPE; - } -} - -gdf_error gdf_bfs(gdf_graph *graph, gdf_column *distances, gdf_column *predecessors, int start_vertex, bool directed) { - GDF_REQUIRE(graph->adjList != nullptr || graph->edgeList != nullptr, GDF_INVALID_API_CALL); - gdf_error err = gdf_add_adj_list(graph); - if (err != GDF_SUCCESS) - return err; - GDF_REQUIRE(graph->adjList->offsets->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); - GDF_REQUIRE(graph->adjList->indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); - GDF_REQUIRE(distances->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); - GDF_REQUIRE(predecessors->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); - - int n = graph->adjList->offsets->size - 1; - int e = graph->adjList->indices->size; - int* offsets_ptr = (int*)graph->adjList->offsets->data; - int* indices_ptr = (int*)graph->adjList->indices->data; - int* distances_ptr = (int*)distances->data; - int* predecessors_ptr = (int*)predecessors->data; - int alpha = 15; - int beta = 18; - - cugraph::Bfs bfs(n, e, offsets_ptr, indices_ptr, directed, alpha, beta); - bfs.configure(distances_ptr, predecessors_ptr, nullptr); - bfs.traverse(start_vertex); - return GDF_SUCCESS; -} - -gdf_error gdf_louvain(gdf_graph *graph, void *final_modularity, void *num_level, gdf_column *louvain_parts) { - GDF_REQUIRE(graph->adjList != nullptr || graph->edgeList != nullptr, GDF_INVALID_API_CALL); - gdf_error err = gdf_add_adj_list(graph); - if (err != GDF_SUCCESS) - return err; - - size_t n = graph->adjList->offsets->size - 1; - size_t e = graph->adjList->indices->size; - - void* offsets_ptr = graph->adjList->offsets->data; - void* indices_ptr = graph->adjList->indices->data; - - void* value_ptr; - Vector d_values; - 
if(graph->adjList->edge_data) { - value_ptr = graph->adjList->edge_data->data; - } - else { - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - d_values.resize(graph->adjList->indices->size); - thrust::fill(thrust::cuda::par(allocator).on(stream), d_values.begin(), d_values.end(), 1.0); - value_ptr = (void * ) thrust::raw_pointer_cast(d_values.data()); - } - - void* louvain_parts_ptr = louvain_parts->data; - - auto gdf_to_cudadtype= [](gdf_column *col){ - cudaDataType_t cuda_dtype; - switch(col->dtype){ - case GDF_INT8: cuda_dtype = CUDA_R_8I; break; - case GDF_INT32: cuda_dtype = CUDA_R_32I; break; - case GDF_FLOAT32: cuda_dtype = CUDA_R_32F; break; - case GDF_FLOAT64: cuda_dtype = CUDA_R_64F; break; - default: throw new std::invalid_argument("Cannot convert data type"); - }return cuda_dtype; - }; - - cudaDataType_t index_type = gdf_to_cudadtype(graph->adjList->indices); - cudaDataType_t val_type = graph->adjList->edge_data? gdf_to_cudadtype(graph->adjList->edge_data): CUDA_R_32F; - - nvgraphLouvain(index_type, val_type, n, e, offsets_ptr, indices_ptr, value_ptr, 1, 0, NULL, - final_modularity, louvain_parts_ptr, num_level); - return GDF_SUCCESS; -} - -template -gdf_error gdf_snmg_csrmv_impl (size_t * part_offsets, gdf_column * off, gdf_column * ind, gdf_column * val, gdf_column ** x_cols){ - - GDF_REQUIRE( part_offsets != nullptr, GDF_INVALID_API_CALL ); - GDF_REQUIRE( off != nullptr, GDF_INVALID_API_CALL ); - GDF_REQUIRE( ind != nullptr, GDF_INVALID_API_CALL ); - GDF_REQUIRE( val != nullptr, GDF_INVALID_API_CALL ); - GDF_REQUIRE( x_cols != nullptr, GDF_INVALID_API_CALL ); - GDF_REQUIRE( off->size > 0, GDF_INVALID_API_CALL ); - GDF_REQUIRE( ind->size > 0, GDF_INVALID_API_CALL ); - GDF_REQUIRE( val->size > 0, GDF_INVALID_API_CALL ); - GDF_REQUIRE( ind->size == val->size, GDF_COLUMN_SIZE_MISMATCH ); - GDF_REQUIRE( off->dtype == ind->dtype, GDF_UNSUPPORTED_DTYPE ); - GDF_REQUIRE( off->null_count + ind->null_count + val->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); - - gdf_error status; - auto p = omp_get_num_threads(); - - val_t* x[p]; - for (auto i = 0; i < p; ++i) - { - GDF_REQUIRE( x_cols[i] != nullptr, GDF_INVALID_API_CALL ); - GDF_REQUIRE( x_cols[i]->size > 0, GDF_INVALID_API_CALL ); - x[i]= static_cast(x_cols[i]->data); - } - status = cugraph::snmg_csrmv(part_offsets, - static_cast(off->data), - static_cast(ind->data), - static_cast(val->data), - x); - return status; -} - -gdf_error gdf_snmg_csrmv (size_t * part_offsets, gdf_column * off, gdf_column * ind, gdf_column * val, gdf_column ** x_cols){ - switch (val->dtype) { - case GDF_FLOAT32: return gdf_snmg_csrmv_impl(part_offsets, off, ind, val, x_cols); - case GDF_FLOAT64: return gdf_snmg_csrmv_impl(part_offsets, off, ind, val, x_cols); - default: return GDF_UNSUPPORTED_DTYPE; - } -} diff --git a/cpp/src/graph_utils.cuh b/cpp/src/graph_utils.cuh deleted file mode 100644 index 190c71d9926..00000000000 --- a/cpp/src/graph_utils.cuh +++ /dev/null @@ -1,602 +0,0 @@ -/* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. 
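// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch itself): driving the analytics
// entry points defined above on a graph that already holds an edge list or
// adjacency list. Output columns are assumed to be preallocated with one value
// per vertex; parameter values mirror the defaults of gdf_pagerank_impl.
// ---------------------------------------------------------------------------
#include "cugraph.h"   // assumed to declare gdf_degree, gdf_pagerank and gdf_bfs

gdf_error run_basic_analytics(gdf_graph* graph,
                              gdf_column* degree,        // GDF_INT32
                              gdf_column* pagerank,      // GDF_FLOAT32 or GDF_FLOAT64
                              gdf_column* distances,     // GDF_INT32
                              gdf_column* predecessors)  // GDF_INT32
{
  // x = 0: in+out degree, x = 1: in-degree, x = 2: out-degree (see gdf_degree).
  gdf_error err = gdf_degree(graph, degree, 0);
  if (err != GDF_SUCCESS) return err;

  // Damping factor, convergence tolerance, iteration cap, and no initial guess.
  err = gdf_pagerank(graph, pagerank, 0.85f, 1e-4f, 200, false);
  if (err != GDF_SUCCESS) return err;

  // BFS from vertex 0, treating the graph as directed.
  return gdf_bfs(graph, distances, predecessors, 0, true);
}
// ---------------------------------------------------------------------------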
- * - */ - -// Interanl helper functions -// Author: Alex Fender afender@nvidia.com -#pragma once - -#include -#include -//#include -//#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#define USE_CG 1 -//#define DEBUG 1 - -namespace cugraph -{ - -#define CUDA_MAX_BLOCKS 65535 -#define CUDA_MAX_KERNEL_THREADS 256 //kernel will launch at most 256 threads per block -#define DEFAULT_MASK 0xffffffff -#define US - -//error check -#ifdef DEBUG -#define WHERE " at: " << __FILE__ << ':' << __LINE__ -#define cudaCheckError() { \ - cudaError_t e=cudaGetLastError(); \ - if(e!=cudaSuccess) { \ - std::cerr << "Cuda failure: " << cudaGetErrorString(e) << WHERE << std::endl; \ - } \ - } -#else -#define cudaCheckError() -#define WHERE "" -#endif - - template - static __device__ __forceinline__ T shfl_up(T r, int offset, int bound = 32, int mask = - DEFAULT_MASK) - { -#if __CUDA_ARCH__ >= 300 -#if USE_CG - return __shfl_up_sync( mask, r, offset, bound ); -#else - return __shfl_up( r, offset, bound ); -#endif -#else - return 0.0f; -#endif - } - - template - static __device__ __forceinline__ T shfl(T r, int lane, int bound = 32, int mask = DEFAULT_MASK) - { -#if __CUDA_ARCH__ >= 300 -#if USE_CG - return __shfl_sync(mask, r, lane, bound ); -#else - return __shfl(r, lane, bound ); -#endif -#else - return 0.0f; -#endif - } - - template - __inline__ __device__ - ValType parallel_prefix_sum(IdxType n, IdxType *ind, ValType *w) { - IdxType i, j, mn; - ValType v, last; - ValType sum = 0.0; - bool valid; - - //Parallel prefix sum (using __shfl) - mn = (((n + blockDim.x - 1) / blockDim.x) * blockDim.x); //n in multiple of blockDim.x - for (i = threadIdx.x; i < mn; i += blockDim.x) { - //All threads (especially the last one) must always participate - //in the shfl instruction, otherwise their sum will be undefined. - //So, the loop stopping condition is based on multiple of n in loop increments, - //so that all threads enter into the loop and inside we make sure we do not - //read out of bounds memory checking for the actual size n. - - //check if the thread is valid - valid = i < n; - - //Notice that the last thread is used to propagate the prefix sum. - //For all the threads, in the first iteration the last is 0, in the following - //iterations it is the value at the last thread of the previous iterations. - - //get the value of the last thread - last = shfl(sum, blockDim.x - 1, blockDim.x); - - //if you are valid read the value from memory, otherwise set your value to 0 - sum = (valid) ? 
w[ind[i]] : 0.0; - - //do prefix sum (of size warpSize=blockDim.x =< 32) - for (j = 1; j < blockDim.x; j *= 2) { - v = shfl_up(sum, j, blockDim.x); - if (threadIdx.x >= j) - sum += v; - } - //shift by last - sum += last; - //notice that no __threadfence or __syncthreads are needed in this implementation - } - //get the value of the last thread (to all threads) - last = shfl(sum, blockDim.x - 1, blockDim.x); - - return last; - } - -//dot - template - T dot(size_t n, T* x, T* y) { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - T result = thrust::inner_product(thrust::cuda::par(allocator).on(stream), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::device_pointer_cast(y), - 0.0f); - cudaCheckError(); - return result; - } - -//axpy - template - struct axpy_functor: public thrust::binary_function { - const T a; - axpy_functor(T _a) : - a(_a) { - } - __host__ __device__ - T operator()(const T& x, const T& y) const { - return a * x + y; - } - }; - - template - void axpy(size_t n, T a, T* x, T* y) { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - thrust::transform(thrust::cuda::par(allocator).on(stream), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::device_pointer_cast(y), - thrust::device_pointer_cast(y), - axpy_functor(a)); - cudaCheckError(); - } - -//norm - template - struct square { - __host__ __device__ - T operator()(const T& x) const { - return x * x; - } - }; - - template - T nrm2(size_t n, T* x) { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - T init = 0; - T result = std::sqrt(thrust::transform_reduce(thrust::cuda::par(allocator).on(stream), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - square(), - init, - thrust::plus())); - cudaCheckError(); - return result; - } - - template - T nrm1(size_t n, T* x) { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - - T result = thrust::reduce(thrust::cuda::par(allocator).on(stream), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n)); - cudaCheckError(); - return result; - } - - template - void scal(size_t n, T val, T* x) { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - - thrust::transform(thrust::cuda::par(allocator).on(stream), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), - thrust::make_constant_iterator(val), - thrust::device_pointer_cast(x), - thrust::multiplies()); - cudaCheckError(); - } - - template - void fill(size_t n, T* x, T value) { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - - thrust::fill(thrust::cuda::par(allocator).on(stream), - thrust::device_pointer_cast(x), - thrust::device_pointer_cast(x + n), value); - cudaCheckError(); - } - - template - void printv(size_t n, T* vec, int offset) { - thrust::device_ptr dev_ptr(vec); - std::cout.precision(15); - std::cout << "sample size = " << n << ", offset = " << offset << std::endl; - thrust::copy(dev_ptr + offset, dev_ptr + offset + n, std::ostream_iterator(std::cout, " ")); //Assume no RMM dependency; TODO: check / test (potential BUG !!!!!) 
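// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch itself): the helpers above obtain
// a thrust execution policy through rmm_temp_allocator, while this patch moves
// converters/renumber.cuh to rmm::exec_policy(stream)->on(stream). A minimal
// helper written against the new pattern, assuming rmm_utils.h exposes
// rmm::exec_policy as it does in the files touched above, would look like this:
// ---------------------------------------------------------------------------
#include <cuda_runtime.h>
#include <thrust/fill.h>
#include <thrust/device_ptr.h>
#include "rmm_utils.h"   // assumed to provide rmm::exec_policy

template <typename T>
void fill_rmm(size_t n, T* x, T value, cudaStream_t stream = nullptr) {
  // Same operation as fill() above, but any temporary storage thrust needs is
  // drawn from the RMM allocator associated with this stream.
  thrust::fill(rmm::exec_policy(stream)->on(stream),
               thrust::device_pointer_cast(x),
               thrust::device_pointer_cast(x + n),
               value);
}
// ---------------------------------------------------------------------------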
- cudaCheckError(); - std::cout << std::endl; - } - - template - void copy(size_t n, T *x, T *res) - { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - - thrust::device_ptr dev_ptr(x); - thrust::device_ptr res_ptr(res); - thrust::copy_n(thrust::cuda::par(allocator).on(stream), dev_ptr, n, res_ptr); - cudaCheckError(); - } - - template - struct is_zero { - __host__ __device__ - bool operator()(const T x) { - return x == 0; - } - }; - - template - struct dangling_functor: public thrust::unary_function { - const T val; - dangling_functor(T _val) : - val(_val) { - } - __host__ __device__ - T operator()(const T& x) const { - return val + x; - } - }; - - template - void update_dangling_nodes(size_t n, T* dangling_nodes, T damping_factor) { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - - thrust::transform_if(thrust::cuda::par(allocator).on(stream), - thrust::device_pointer_cast(dangling_nodes), - thrust::device_pointer_cast(dangling_nodes + n), - thrust::device_pointer_cast(dangling_nodes), - dangling_functor(1.0 - damping_factor), - is_zero()); - cudaCheckError(); - } - -//google matrix kernels - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - degree_coo(const IndexType n, const IndexType e, const IndexType *ind, IndexType *degree) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) - atomicAdd(°ree[ind[i]], 1.0); - } - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - equi_prob(const IndexType n, - const IndexType e, - const IndexType *ind, - ValueType *val, - IndexType *degree) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) - val[i] = 1.0 / degree[ind[i]]; - } - - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - flag_leafs(const IndexType n, IndexType *degree, ValueType *bookmark) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) - if (degree[i] == 0) - bookmark[i] = 1.0; - } - - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - degree_offsets(const IndexType n, const IndexType e, const IndexType *ind, IndexType *degree) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) - degree[i] += ind[i+1]-ind[i]; - } - - -//notice that in the transposed matrix/csc a dangling node is a node without incomming edges -//just swap coo src and dest arrays after that to interpret it as HT - template - void HT_matrix_coo( const IndexType n, - const IndexType e, - const IndexType *src, - ValueType *cooVal, - ValueType *bookmark) { - IndexType *degree { nullptr }; - cudaStream_t stream { nullptr }; - ALLOC_MANAGED_TRY((void** )°ree, sizeof(IndexType) * n, stream); - - cudaMemset(degree, 0, sizeof(IndexType) * n); - - dim3 nthreads, nblocks; - nthreads.x = min(e, CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - degree_coo <<>>(n, e, src, degree); - equi_prob <<>>(n, e, src, cooVal, degree); - ValueType val = 0.0; - fill(n, bookmark, val); - nthreads.x = min(n, CUDA_MAX_KERNEL_THREADS); - nblocks.x = min((n + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - flag_leafs <<>>(n, degree, bookmark); - - //printv(n, degree , 0); - //printv(n, bookmark , 0); - //printv(e, cooVal , 0); - - //this was missing: TODO: check if okay - ALLOC_FREE_TRY(degree, stream); - } 
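// ---------------------------------------------------------------------------
// Host-side sketch (not part of the patch itself) of what HT_matrix_coo above
// computes for a toy COO input: per-vertex degree, uniform transition weights
// 1/degree, and a bookmark vector flagging dangling (degree-zero) vertices.
// ---------------------------------------------------------------------------
#include <vector>

void ht_matrix_coo_host(int n, const std::vector<int>& src,
                        std::vector<float>& cooVal, std::vector<float>& bookmark) {
  std::vector<int> degree(n, 0);
  for (int s : src) ++degree[s];                    // degree_coo
  cooVal.resize(src.size());
  for (std::size_t i = 0; i < src.size(); ++i)
    cooVal[i] = 1.0f / degree[src[i]];              // equi_prob
  bookmark.assign(n, 0.0f);                         // fill(n, bookmark, 0.0)
  for (int v = 0; v < n; ++v)
    if (degree[v] == 0) bookmark[v] = 1.0f;         // flag_leafs
}
// ---------------------------------------------------------------------------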
- - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - equi_prob3( const IndexType n, - const IndexType e, - const IndexType *csrPtr, - const IndexType *csrInd, - ValueType *val, - IndexType *degree) { - int j, row, col; - for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; j < csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - val[j] = 1.0 / degree[col]; - //val[j] = 999; - } - } - } - - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - equi_prob2( const IndexType n, - const IndexType e, - const IndexType *csrPtr, - const IndexType *csrInd, - ValueType *val, - IndexType *degree) { - int row = blockIdx.x * blockDim.x + threadIdx.x; - if (row < n) - { - int row_begin = csrPtr[row]; - int row_end = csrPtr[row + 1]; - int col; - for (int i = row_begin; i < row_end; i++) { - col = csrInd[i]; - val[i] = 1.0 / degree[col]; - } - } - } - -// compute the H^T values for an already transposed adjacency matrix, leveraging coo info - template - void HT_matrix_csc_coo( const IndexType n, - const IndexType e, - const IndexType *csrPtr, - const IndexType *csrInd, - ValueType *val, - ValueType *bookmark) { - IndexType *degree; - cudaStream_t stream { nullptr }; - ALLOC_MANAGED_TRY((void** )°ree, sizeof(IndexType) * n, stream); - cudaMemset(degree, 0, sizeof(IndexType) * n); - - dim3 nthreads, nblocks; - nthreads.x = min(e, CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - degree_coo <<>>(n, e, csrInd, degree); - cudaCheckError(); - - int y = 4; - nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, CUDA_MAX_BLOCKS); //1; - equi_prob3 <<>>(n, e, csrPtr, csrInd, val, degree); - //printv(e, val , 0); - cudaCheckError(); - - ValueType a = 0.0; - fill(n, bookmark, a); - cudaCheckError(); - - nthreads.x = min(n, CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((n + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - flag_leafs <<>>(n, degree, bookmark); - cudaCheckError(); - - //this was missing! TODO: check if okay. - ALLOC_FREE_TRY(degree, stream); - } - - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - permute_vals_kernel(const IndexType e, IndexType *perm, ValueType *in, ValueType *out) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x) - out[i] = in[perm[i]]; - } - - template - void permute_vals(const IndexType e, IndexType *perm, ValueType *in, ValueType *out) { - int nthreads = min(e, CUDA_MAX_KERNEL_THREADS); - int nblocks = min((e + nthreads - 1) / nthreads, CUDA_MAX_BLOCKS); - permute_vals_kernel<<>>(e, perm, in, out); - //printv(e, in , 0); - //printv(e, perm , 0); - //printv(e, out , 0); - } - -// This will remove duplicate along with sorting -// This will sort the COO Matrix, row will be sorted and each column of same row will be sorted. 
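// ---------------------------------------------------------------------------
// Simplified host-side sketch (not part of the patch itself) of the
// sort-then-unique deduplication described in the comment above, for an
// unweighted COO edge list; the device version below does the equivalent with
// thrust stable sorts and zip iterators, and also carries the value array.
// ---------------------------------------------------------------------------
#include <algorithm>
#include <utility>
#include <vector>

size_t remove_duplicate_host(std::vector<int>& src, std::vector<int>& dest) {
  std::vector<std::pair<int, int>> edges(src.size());
  for (std::size_t i = 0; i < src.size(); ++i) edges[i] = {src[i], dest[i]};

  std::sort(edges.begin(), edges.end());   // rows sorted, columns sorted per row
  edges.erase(std::unique(edges.begin(), edges.end()), edges.end());

  for (std::size_t i = 0; i < edges.size(); ++i) {
    src[i]  = edges[i].first;
    dest[i] = edges[i].second;
  }
  return edges.size();                     // the new nnz
}
// ---------------------------------------------------------------------------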
- template - void remove_duplicate(IndexType* src, IndexType* dest, ValueType* val, SizeT &nnz) - { - //RMM: - // - cudaStream_t stream { nullptr }; - rmm_temp_allocator allocator(stream); - if (val != NULL) - { - thrust::stable_sort_by_key( thrust::cuda::par(allocator).on(stream), - thrust::raw_pointer_cast(val), - thrust::raw_pointer_cast(val) + nnz, - thrust::make_zip_iterator(thrust::make_tuple( thrust::raw_pointer_cast(src), - thrust::raw_pointer_cast(dest)))); - thrust::stable_sort_by_key( thrust::cuda::par(allocator).on(stream), - thrust::raw_pointer_cast(dest), - thrust::raw_pointer_cast(dest + nnz), - thrust::make_zip_iterator(thrust::make_tuple( thrust::raw_pointer_cast(src), - thrust::raw_pointer_cast(val)))); - thrust::stable_sort_by_key( thrust::cuda::par(allocator).on(stream), - thrust::raw_pointer_cast(src), - thrust::raw_pointer_cast(src + nnz), - thrust::make_zip_iterator(thrust::make_tuple( thrust::raw_pointer_cast(dest), - thrust::raw_pointer_cast(val)))); - - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - typedef thrust::tuple ZipIteratorTuple; - typedef thrust::zip_iterator ZipZipIterator; - - ZipZipIterator newEnd = - thrust::unique( thrust::cuda::par(allocator).on(stream), - thrust::make_zip_iterator(thrust::make_tuple( thrust::raw_pointer_cast(src), - thrust::make_zip_iterator(thrust::make_tuple( thrust::raw_pointer_cast(dest), - thrust::raw_pointer_cast(val))))), - thrust::make_zip_iterator(thrust::make_tuple( thrust::raw_pointer_cast(src - + nnz), - thrust::make_zip_iterator(thrust::make_tuple( dest - + nnz, - val - + nnz))))); - - ZipIteratorTuple endTuple = newEnd.get_iterator_tuple(); - IndexType* row_end = thrust::get<0>(endTuple); - - nnz = ((size_t) row_end - (size_t) src) / sizeof(IndexType); - } - else - { - thrust::stable_sort_by_key( thrust::cuda::par(allocator).on(stream), - thrust::raw_pointer_cast(dest), - thrust::raw_pointer_cast(dest + nnz), - thrust::raw_pointer_cast(src)); - thrust::stable_sort_by_key( thrust::cuda::par(allocator).on(stream), - thrust::raw_pointer_cast(src), - thrust::raw_pointer_cast(src + nnz), - thrust::raw_pointer_cast(dest)); - - typedef thrust::tuple IteratorTuple; - typedef thrust::zip_iterator ZipIterator; - - ZipIterator newEnd = - thrust::unique( thrust::cuda::par(allocator).on(stream), - thrust::make_zip_iterator(thrust::make_tuple( thrust::raw_pointer_cast(src), - thrust::raw_pointer_cast(dest))), - thrust::make_zip_iterator(thrust::make_tuple( thrust::raw_pointer_cast(src - + nnz), - thrust::raw_pointer_cast(dest - + nnz)))); - - IteratorTuple endTuple = newEnd.get_iterator_tuple(); - IndexType* row_end = thrust::get<0>(endTuple); - - nnz = ((size_t) row_end - (size_t) src) / sizeof(IndexType); - } - } - - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) offsets_to_indices_kernel( const IndexType *offsets, - IndexType v, - IndexType *indices) { - - int tid, ctaStart; - tid = threadIdx.x; - ctaStart = blockIdx.x; - - for (int j = ctaStart; j < v; j += gridDim.x) { - IndexType colStart = offsets[j]; - IndexType colEnd = offsets[j + 1]; - IndexType rowNnz = colEnd - colStart; - - for (int i = 0; i < rowNnz; i += blockDim.x) { - if ((colStart + tid + i) < colEnd) { - indices[colStart + tid + i] = j; - } - } - } - } - - template - void offsets_to_indices(const IndexType *offsets, IndexType v, IndexType *indices) - { - int nthreads = min(v, CUDA_MAX_KERNEL_THREADS); - int nblocks = min((v + nthreads - 1) / nthreads, CUDA_MAX_BLOCKS); - 
offsets_to_indices_kernel<<>>(offsets, v, indices); - cudaCheckError(); - } - - template - void sequence(IndexType n, IndexType *vec, IndexType init = 0) - { - thrust::sequence( thrust::device, - thrust::device_pointer_cast(vec), - thrust::device_pointer_cast(vec + n), - init); - cudaCheckError(); - } - -} //namespace cugraph diff --git a/cpp/src/jaccard.cu b/cpp/src/jaccard.cu deleted file mode 100644 index 91d6206a7e6..00000000000 --- a/cpp/src/jaccard.cu +++ /dev/null @@ -1,710 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** ---------------------------------------------------------------------------* - * @brief The cugraph Jaccard core functionality - * - * @file jaccard.cu - * ---------------------------------------------------------------------------**/ - -#include "graph_utils.cuh" -#include "cugraph.h" -#include "rmm_utils.h" -#include "utilities/error_utils.h" - -namespace cugraph { - // Volume of neighboors (*weight_s) - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - jaccard_row_sum(IdxType n, - IdxType *csrPtr, - IdxType *csrInd, - ValType *v, - ValType *work) { - IdxType row, start, end, length; - ValType sum; - for (row = threadIdx.y + blockIdx.y * blockDim.y; - row < n; - row += gridDim.y * blockDim.y) { - start = csrPtr[row]; - end = csrPtr[row + 1]; - length = end - start; - //compute row sums - if (weighted) { - sum = parallel_prefix_sum(length, csrInd + start, v); - if (threadIdx.x == 0) - work[row] = sum; - } - else { - work[row] = (ValType) length; - } - } - } - - // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - jaccard_is(IdxType n, - IdxType *csrPtr, - IdxType *csrInd, - ValType *v, - ValType *work, - ValType *weight_i, - ValType *weight_s) { - IdxType i, j, row, col, Ni, Nj; - IdxType ref, cur, ref_col, cur_col, match; - ValType ref_val; - - for (row = threadIdx.z + blockIdx.z * blockDim.z; - row < n; - row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; - j < csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - //find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - //compute new sum weights - weight_s[j] = work[row] + work[col]; - - //compute new intersection weights - //search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } - else { - ref_val = 1.0; - } - - //binary search (column indices are sorted within each row) - IdxType left = csrPtr[cur]; - IdxType right = csrPtr[cur + 1] - 1; - while (left <= right) { - IdxType middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } - else if (cur_col < ref_col) { - left = middle + 1; - } - else { - match = middle; - break; - } - } - - //if the element with the same column index in the reference row has been found - if (match != -1) { - atomicAdd(&weight_i[j], ref_val); - } - } - } - } - } - - // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) - // Using list of node pairs - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - jaccard_is_pairs(IdxType num_pairs, - IdxType *csrPtr, - IdxType *csrInd, - IdxType *first_pair, - IdxType *second_pair, - ValType *v, - ValType *work, - ValType *weight_i, - ValType *weight_s) { - IdxType i, idx, row, col, Ni, Nj; - IdxType ref, cur, ref_col, cur_col, match; - ValType ref_val; - - for (idx = threadIdx.z + blockIdx.z * blockDim.z; - idx < num_pairs; - idx += gridDim.z * blockDim.z) { - row = first_pair[idx]; - col = second_pair[idx]; - //find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - //compute new sum weights - weight_s[idx] = work[row] + work[col]; - - //compute new intersection weights - //search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; - i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } - else { - ref_val = 1.0; - } - - //binary search (column indices are sorted within each row) - IdxType left = csrPtr[cur]; - IdxType right = csrPtr[cur + 1] - 1; - while (left <= right) { - IdxType middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } - else if (cur_col < ref_col) { - left = middle + 1; - } - else { - match = middle; - break; - } - } - - //if the element with the same column index in the reference row has been found - if (match != -1) { - atomicAdd(&weight_i[idx], ref_val); - } - } - } - } - - //Jaccard weights (*weight) - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - jaccard_jw(IdxType e, - IdxType *csrPtr, - IdxType *csrInd, - ValType *weight_i, - ValType *weight_s, - ValType *weight_j) { - IdxType j; - ValType Wi, Ws, Wu; - - for (j = threadIdx.x + blockIdx.x * blockDim.x; - j < e; - j += gridDim.x * blockDim.x) { - Wi = weight_i[j]; - Ws = weight_s[j]; - Wu = Ws - Wi; - weight_j[j] = (Wi / Wu); - } - } - - template - int jaccard(IdxType n, - IdxType e, - IdxType *csrPtr, - IdxType *csrInd, - ValType *weight_in, - ValType *work, - ValType *weight_i, - ValType *weight_s, - ValType *weight_j) { - dim3 nthreads, nblocks; - int y = 4; - - //setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, (IdxType) CUDA_MAX_BLOCKS); - nblocks.z = 1; - //launch kernel - jaccard_row_sum <<>>(n, - csrPtr, - csrInd, - weight_in, - work); - cudaDeviceSynchronize(); - fill(e, weight_i, (ValType) 0.0); - //setup launch configuration - nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, (IdxType) CUDA_MAX_BLOCKS); //1; - //launch kernel - jaccard_is <<>>(n, - csrPtr, - csrInd, - weight_in, - work, - weight_i, - weight_s); - - //setup launch configuration - nthreads.x = min(e, (IdxType) CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, (IdxType) CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - //launch kernel - jaccard_jw <<>>(e, - csrPtr, - csrInd, - weight_i, - weight_s, - weight_j); - - return 0; - } - - template - int jaccard_pairs(IdxType n, - IdxType num_pairs, - IdxType *csrPtr, - IdxType *csrInd, - IdxType *first_pair, - IdxType *second_pair, - ValType *weight_in, - ValType *work, - ValType *weight_i, - ValType *weight_s, - ValType *weight_j) { - dim3 nthreads, nblocks; - int y = 4; - - //setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, (IdxType) CUDA_MAX_BLOCKS); - nblocks.z = 1; - //launch kernel - jaccard_row_sum <<>>(n, - csrPtr, - csrInd, - weight_in, - work); - cudaDeviceSynchronize(); - fill(num_pairs, weight_i, (ValType) 0.0); - //setup launch configuration - nthreads.x = 32; - nthreads.y = 1; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, (IdxType) CUDA_MAX_BLOCKS); //1; - //launch kernel - 
jaccard_is_pairs <<>>(num_pairs, - csrPtr, - csrInd, - first_pair, - second_pair, - weight_in, - work, - weight_i, - weight_s); - - //setup launch configuration - nthreads.x = min(num_pairs, (IdxType) CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((num_pairs + nthreads.x - 1) / nthreads.x, (IdxType) CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - //launch kernel - jaccard_jw <<>>(num_pairs, - csrPtr, - csrInd, - weight_i, - weight_s, - weight_j); - - return 0; - } -} // End cugraph namespace - -gdf_error gdf_jaccard(gdf_graph *graph, gdf_column *weights, gdf_column *result) { - GDF_REQUIRE(graph != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE((graph->adjList != nullptr) || (graph->edgeList != nullptr), GDF_INVALID_API_CALL); - GDF_REQUIRE(result != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(result->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!result->valid, GDF_VALIDITY_UNSUPPORTED); - - GDF_TRY(gdf_add_adj_list(graph)); - GDF_REQUIRE(graph->adjList != nullptr, GDF_INVALID_API_CALL); - - bool weighted = (weights != nullptr); - - gdf_dtype ValueType = result->dtype; - gdf_dtype IndexType = graph->adjList->offsets->dtype; - - void *csrPtr = graph->adjList->offsets->data; - void *csrInd = graph->adjList->indices->data; - void *weight_i = nullptr; - void *weight_s = nullptr; - void *weight_j = result->data; - void *work = nullptr; - void *weight_in = nullptr; - if (weighted) - weight_in = weights->data; - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::jaccard(n, - e, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && !weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::jaccard(n, - e, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::jaccard(n, - e, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && !weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::jaccard(n, - e, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) 
weight_j); - } - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::jaccard(n, - e, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && !weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::jaccard(n, - e, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::jaccard(n, - e, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && !weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::jaccard(n, - e, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - -// Clean up temp arrays - ALLOC_FREE_TRY(weight_i, nullptr); - ALLOC_FREE_TRY(weight_s, nullptr); - ALLOC_FREE_TRY(work, nullptr); - - return GDF_SUCCESS; -} - -gdf_error gdf_jaccard_list(gdf_graph* graph, - gdf_column* weights, - gdf_column* first, - gdf_column* second, - gdf_column* result) { - GDF_REQUIRE(graph != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE((graph->adjList != nullptr) || (graph->edgeList != nullptr), GDF_INVALID_API_CALL); - GDF_REQUIRE(result != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(result->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!result->valid, GDF_VALIDITY_UNSUPPORTED); - - GDF_REQUIRE(first != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(first->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!first->valid, GDF_VALIDITY_UNSUPPORTED); - - GDF_REQUIRE(second != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(second->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!second->valid, GDF_VALIDITY_UNSUPPORTED); - - GDF_TRY(gdf_add_adj_list(graph)); - GDF_REQUIRE(graph->adjList != nullptr, GDF_INVALID_API_CALL); - - bool weighted = (weights != nullptr); - - gdf_dtype ValueType = result->dtype; - gdf_dtype IndexType = graph->adjList->offsets->dtype; - GDF_REQUIRE(first->dtype == IndexType, GDF_INVALID_API_CALL); - GDF_REQUIRE(second->dtype == IndexType, GDF_INVALID_API_CALL); - - void *first_pair = first->data; - void *second_pair = second->data; - void *csrPtr = 
graph->adjList->offsets->data; - void *csrInd = graph->adjList->indices->data; - void *weight_i = nullptr; - void *weight_s = nullptr; - void *weight_j = result->data; - void *work = nullptr; - void *weight_in = nullptr; - if (weighted) - weight_in = weights->data; - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::jaccard_pairs(n, - num_pairs, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (int32_t*) first_pair, - (int32_t*) second_pair, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && !weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::jaccard_pairs(n, - num_pairs, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (int32_t*) first_pair, - (int32_t*) second_pair, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::jaccard_pairs(n, - num_pairs, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (int32_t*) first_pair, - (int32_t*) second_pair, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && !weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::jaccard_pairs(n, - num_pairs, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (int32_t*) first_pair, - (int32_t*) second_pair, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::jaccard_pairs(n, - num_pairs, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (int64_t*) first_pair, - (int64_t*) second_pair, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && !weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); - 
ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::jaccard_pairs(n, - num_pairs, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (int64_t*) first_pair, - (int64_t*) second_pair, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::jaccard_pairs(n, - num_pairs, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (int64_t*) first_pair, - (int64_t*) second_pair, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && !weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::jaccard_pairs(n, - num_pairs, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (int64_t*) first_pair, - (int64_t*) second_pair, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - - // Clean up temp arrays - ALLOC_FREE_TRY(weight_i, nullptr); - ALLOC_FREE_TRY(weight_s, nullptr); - ALLOC_FREE_TRY(work, nullptr); - - return GDF_SUCCESS; -} - diff --git a/cpp/src/link_analysis/pagerank.cu b/cpp/src/link_analysis/pagerank.cu new file mode 100644 index 00000000000..1943ba9f22b --- /dev/null +++ b/cpp/src/link_analysis/pagerank.cu @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ * + */ + +// Pagerank solver +// Author: Alex Fender afender@nvidia.com + +#include +#include +#include +#include + #include +#include +#include +#include "cub/cub.cuh" +#include +#include + +#include + +#include "utilities/graph_utils.cuh" +#include "utilities/error_utils.h" +#include + +namespace cugraph +{ + +#ifdef DEBUG + #define PR_VERBOSE +#endif +template +bool pagerankIteration(IndexType n, IndexType e, IndexType *cscPtr, IndexType *cscInd,ValueType *cscVal, + ValueType alpha, ValueType *a, ValueType *b, float tolerance, int iter, int max_iter, + ValueType * &tmp, void* cub_d_temp_storage, size_t cub_temp_storage_bytes, + ValueType * &pr, ValueType *residual) { + ValueType dot_res; + CUDA_TRY(cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, cscVal, + cscPtr, cscInd, tmp, pr, n, n, e)); + + scal(n, alpha, pr); + dot_res = dot( n, a, tmp); + axpy(n, dot_res, b, pr); + scal(n, (ValueType)1.0/nrm2(n, pr) , pr); + axpy(n, (ValueType)-1.0, pr, tmp); + *residual = nrm2(n, tmp); + if (*residual < tolerance) + { + scal(n, (ValueType)1.0/nrm1(n,pr), pr); + return true; + } + else + { + if (iter< max_iter) + { + std::swap(pr, tmp); + } + else + { + scal(n, (ValueType)1.0/nrm1(n,pr), pr); + } + return false; + } +} + +template +int pagerank(IndexType n, IndexType e, IndexType *cscPtr, IndexType *cscInd, ValueType *cscVal, + ValueType alpha, ValueType *a, bool has_guess, float tolerance, int max_iter, + ValueType * &pagerank_vector, ValueType * &residual) { + int max_it, i = 0 ; + float tol; + bool converged = false; + ValueType randomProbability = static_cast( 1.0/n); + ValueType *b=0, *tmp=0; + void* cub_d_temp_storage = NULL; + size_t cub_temp_storage_bytes = 0; + + if (max_iter > 0) + max_it = max_iter; + else + max_it = 500; + + if (tolerance == 0.0f) + tol = 1.0E-6f; + else if (tolerance < 1.0f && tolerance > 0.0f) + tol = tolerance; + else + return -1; + + if (alpha <= 0.0f || alpha >= 1.0f) + return -1; + + cudaStream_t stream{nullptr}; + + ALLOC_TRY((void**)&b, sizeof(ValueType) * n, stream); +#if 1/* temporary solution till https://github.com/NVlabs/cub/issues/162 is resolved */ + CUDA_TRY(cudaMalloc((void**)&tmp, sizeof(ValueType) * n)); +#else + ALLOC_TRY((void**)&tmp, sizeof(ValueType) * n, stream); +#endif + cudaCheckError(); + + if (!has_guess) { + fill(n, pagerank_vector, randomProbability); + fill(n, tmp, randomProbability); + } + else { + copy(n, pagerank_vector, tmp); + } + + fill(n, b, randomProbability); + update_dangling_nodes(n, a, alpha); + + CUDA_TRY(cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, cscVal, + cscPtr, cscInd, tmp, pagerank_vector, n, n, e)); + // Allocate temporary storage + ALLOC_TRY ((void**)&cub_d_temp_storage, cub_temp_storage_bytes, stream); + cudaCheckError() +#ifdef PR_VERBOSE + std::stringstream ss; + ss.str(std::string()); + ss <<" ------------------PageRank------------------"<< std::endl; + ss <<" --------------------------------------------"<< std::endl; + ss << std::setw(10) << "Iteration" << std::setw(15) << "Residual" << std::endl; + ss <<" --------------------------------------------"<< std::endl; + std::cout< ( int n, int e, int *cscPtr, int *cscInd,half *cscVal, half alpha, half *a, bool has_guess, float tolerance, int max_iter, half * &pagerank_vector, half * &residual); +template int pagerank ( int n, int e, int *cscPtr, int *cscInd,float *cscVal, float alpha, float *a, bool has_guess, float tolerance, int max_iter, float * &pagerank_vector, float * &residual); +template int pagerank ( int 
n, int e, int *cscPtr, int *cscInd,double *cscVal, double alpha, double *a, bool has_guess, float tolerance, int max_iter, double * &pagerank_vector, double * &residual); + +} //namespace cugraph + +template +gdf_error gdf_pagerank_impl (gdf_graph *graph, + gdf_column *pagerank, float alpha = 0.85, + float tolerance = 1e-4, int max_iter = 200, + bool has_guess = false) { + GDF_REQUIRE( graph->edgeList != nullptr, GDF_VALIDITY_UNSUPPORTED ); + GDF_REQUIRE( graph->edgeList->src_indices->size == graph->edgeList->dest_indices->size, GDF_COLUMN_SIZE_MISMATCH ); + GDF_REQUIRE( graph->edgeList->src_indices->dtype == graph->edgeList->dest_indices->dtype, GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( graph->edgeList->src_indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); + GDF_REQUIRE( graph->edgeList->dest_indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); + GDF_REQUIRE( pagerank != nullptr , GDF_INVALID_API_CALL ); + GDF_REQUIRE( pagerank->data != nullptr , GDF_INVALID_API_CALL ); + GDF_REQUIRE( pagerank->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); + GDF_REQUIRE( pagerank->size > 0 , GDF_INVALID_API_CALL ); + + int m=pagerank->size, nnz = graph->edgeList->src_indices->size, status = 0; + WT *d_pr, *d_val = nullptr, *d_leaf_vector = nullptr; + WT res = 1.0; + WT *residual = &res; + + if (graph->transposedAdjList == nullptr) { + gdf_add_transposed_adj_list(graph); + } + cudaStream_t stream{nullptr}; + ALLOC_TRY((void**)&d_leaf_vector, sizeof(WT) * m, stream); + ALLOC_TRY((void**)&d_val, sizeof(WT) * nnz , stream); +#if 1/* temporary solution till https://github.com/NVlabs/cub/issues/162 is resolved */ + CUDA_TRY(cudaMalloc((void**)&d_pr, sizeof(WT) * m)); +#else + ALLOC_TRY((void**)&d_pr, sizeof(WT) * m, stream); +#endif + + // The templating for HT_matrix_csc_coo assumes that m, nnz and data are all the same type + cugraph::HT_matrix_csc_coo(m, nnz, (int *)graph->transposedAdjList->offsets->data, (int *)graph->transposedAdjList->indices->data, d_val, d_leaf_vector); + + if (has_guess) + { + GDF_REQUIRE( pagerank->data != nullptr, GDF_VALIDITY_UNSUPPORTED ); + cugraph::copy(m, (WT*)pagerank->data, d_pr); + } + + status = cugraph::pagerank( m,nnz, (int*)graph->transposedAdjList->offsets->data, (int*)graph->transposedAdjList->indices->data, + d_val, alpha, d_leaf_vector, false, tolerance, max_iter, d_pr, residual); + + if (status !=0) + switch ( status ) { + case -1: std::cerr<< "Error : bad parameters in Pagerank"<(m, d_pr, (WT*)pagerank->data); + + ALLOC_FREE_TRY(d_val, stream); +#if 1/* temporary solution till https://github.com/NVlabs/cub/issues/162 is resolved */ + CUDA_TRY(cudaFree(d_pr)); +#else + ALLOC_FREE_TRY(d_pr, stream); +#endif + ALLOC_FREE_TRY(d_leaf_vector, stream); + + return GDF_SUCCESS; +} + +gdf_error gdf_pagerank(gdf_graph *graph, gdf_column *pagerank, float alpha, float tolerance, int max_iter, bool has_guess) { + // + // page rank operates on CSR and can't currently support 64-bit integers. + // + // If csr doesn't exist, create it. Then check type to make sure it is 32-bit. 
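+ //
+ // A minimal, illustrative call sketch (hedged; construction of the gdf_graph and
+ // allocation of the `pagerank` column follow the usual gdf_column conventions and
+ // are assumed rather than shown):
+ //
+ //   gdf_column pr;   // preallocated GDF_FLOAT32 column of length |V|
+ //   gdf_error err = gdf_pagerank(&graph, &pr, 0.85f, 1e-4f, 200, false);
+ //   // returns GDF_UNSUPPORTED_DTYPE unless the adjacency offsets/indices are GDF_INT32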
+ // + GDF_REQUIRE(graph->adjList != nullptr || graph->edgeList != nullptr, GDF_INVALID_API_CALL); + gdf_error err = gdf_add_adj_list(graph); + if (err != GDF_SUCCESS) + return err; + + GDF_REQUIRE(graph->adjList->offsets->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); + GDF_REQUIRE(graph->adjList->indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); + + switch (pagerank->dtype) { + case GDF_FLOAT32: return gdf_pagerank_impl(graph, pagerank, alpha, tolerance, max_iter, has_guess); + case GDF_FLOAT64: return gdf_pagerank_impl(graph, pagerank, alpha, tolerance, max_iter, has_guess); + default: return GDF_UNSUPPORTED_DTYPE; + } +} diff --git a/cpp/src/link_prediction/jaccard.cu b/cpp/src/link_prediction/jaccard.cu new file mode 100644 index 00000000000..fe3502e4356 --- /dev/null +++ b/cpp/src/link_prediction/jaccard.cu @@ -0,0 +1,710 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** ---------------------------------------------------------------------------* + * @brief The cugraph Jaccard core functionality + * + * @file jaccard.cu + * ---------------------------------------------------------------------------**/ + +#include "utilities/graph_utils.cuh" +#include "cugraph.h" +#include "rmm_utils.h" +#include "utilities/error_utils.h" + +namespace cugraph { + // Volume of neighboors (*weight_s) + template + __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) + jaccard_row_sum(IdxType n, + IdxType *csrPtr, + IdxType *csrInd, + ValType *v, + ValType *work) { + IdxType row, start, end, length; + ValType sum; + for (row = threadIdx.y + blockIdx.y * blockDim.y; + row < n; + row += gridDim.y * blockDim.y) { + start = csrPtr[row]; + end = csrPtr[row + 1]; + length = end - start; + //compute row sums + if (weighted) { + sum = parallel_prefix_sum(length, csrInd + start, v); + if (threadIdx.x == 0) + work[row] = sum; + } + else { + work[row] = (ValType) length; + } + } + } + + // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) + template + __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) + jaccard_is(IdxType n, + IdxType *csrPtr, + IdxType *csrInd, + ValType *v, + ValType *work, + ValType *weight_i, + ValType *weight_s) { + IdxType i, j, row, col, Ni, Nj; + IdxType ref, cur, ref_col, cur_col, match; + ValType ref_val; + + for (row = threadIdx.z + blockIdx.z * blockDim.z; + row < n; + row += gridDim.z * blockDim.z) { + for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; + j < csrPtr[row + 1]; + j += gridDim.y * blockDim.y) { + col = csrInd[j]; + //find which row has least elements (and call it reference row) + Ni = csrPtr[row + 1] - csrPtr[row]; + Nj = csrPtr[col + 1] - csrPtr[col]; + ref = (Ni < Nj) ? row : col; + cur = (Ni < Nj) ? 
col : row; + + //compute new sum weights + weight_s[j] = work[row] + work[col]; + + //compute new intersection weights + //search for the element with the same column index in the reference row + for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; + i += gridDim.x * blockDim.x) { + match = -1; + ref_col = csrInd[i]; + if (weighted) { + ref_val = v[ref_col]; + } + else { + ref_val = 1.0; + } + + //binary search (column indices are sorted within each row) + IdxType left = csrPtr[cur]; + IdxType right = csrPtr[cur + 1] - 1; + while (left <= right) { + IdxType middle = (left + right) >> 1; + cur_col = csrInd[middle]; + if (cur_col > ref_col) { + right = middle - 1; + } + else if (cur_col < ref_col) { + left = middle + 1; + } + else { + match = middle; + break; + } + } + + //if the element with the same column index in the reference row has been found + if (match != -1) { + atomicAdd(&weight_i[j], ref_val); + } + } + } + } + } + + // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) + // Using list of node pairs + template + __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) + jaccard_is_pairs(IdxType num_pairs, + IdxType *csrPtr, + IdxType *csrInd, + IdxType *first_pair, + IdxType *second_pair, + ValType *v, + ValType *work, + ValType *weight_i, + ValType *weight_s) { + IdxType i, idx, row, col, Ni, Nj; + IdxType ref, cur, ref_col, cur_col, match; + ValType ref_val; + + for (idx = threadIdx.z + blockIdx.z * blockDim.z; + idx < num_pairs; + idx += gridDim.z * blockDim.z) { + row = first_pair[idx]; + col = second_pair[idx]; + //find which row has least elements (and call it reference row) + Ni = csrPtr[row + 1] - csrPtr[row]; + Nj = csrPtr[col + 1] - csrPtr[col]; + ref = (Ni < Nj) ? row : col; + cur = (Ni < Nj) ? 
col : row; + + //compute new sum weights + weight_s[idx] = work[row] + work[col]; + + //compute new intersection weights + //search for the element with the same column index in the reference row + for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; + i < csrPtr[ref + 1]; + i += gridDim.x * blockDim.x) { + match = -1; + ref_col = csrInd[i]; + if (weighted) { + ref_val = v[ref_col]; + } + else { + ref_val = 1.0; + } + + //binary search (column indices are sorted within each row) + IdxType left = csrPtr[cur]; + IdxType right = csrPtr[cur + 1] - 1; + while (left <= right) { + IdxType middle = (left + right) >> 1; + cur_col = csrInd[middle]; + if (cur_col > ref_col) { + right = middle - 1; + } + else if (cur_col < ref_col) { + left = middle + 1; + } + else { + match = middle; + break; + } + } + + //if the element with the same column index in the reference row has been found + if (match != -1) { + atomicAdd(&weight_i[idx], ref_val); + } + } + } + } + + //Jaccard weights (*weight) + template + __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) + jaccard_jw(IdxType e, + IdxType *csrPtr, + IdxType *csrInd, + ValType *weight_i, + ValType *weight_s, + ValType *weight_j) { + IdxType j; + ValType Wi, Ws, Wu; + + for (j = threadIdx.x + blockIdx.x * blockDim.x; + j < e; + j += gridDim.x * blockDim.x) { + Wi = weight_i[j]; + Ws = weight_s[j]; + Wu = Ws - Wi; + weight_j[j] = (Wi / Wu); + } + } + + template + int jaccard(IdxType n, + IdxType e, + IdxType *csrPtr, + IdxType *csrInd, + ValType *weight_in, + ValType *work, + ValType *weight_i, + ValType *weight_s, + ValType *weight_j) { + dim3 nthreads, nblocks; + int y = 4; + + //setup launch configuration + nthreads.x = 32; + nthreads.y = y; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = min((n + nthreads.y - 1) / nthreads.y, (IdxType) CUDA_MAX_BLOCKS); + nblocks.z = 1; + //launch kernel + jaccard_row_sum <<>>(n, + csrPtr, + csrInd, + weight_in, + work); + cudaDeviceSynchronize(); + fill(e, weight_i, (ValType) 0.0); + //setup launch configuration + nthreads.x = 32 / y; + nthreads.y = y; + nthreads.z = 8; + nblocks.x = 1; + nblocks.y = 1; + nblocks.z = min((n + nthreads.z - 1) / nthreads.z, (IdxType) CUDA_MAX_BLOCKS); //1; + //launch kernel + jaccard_is <<>>(n, + csrPtr, + csrInd, + weight_in, + work, + weight_i, + weight_s); + + //setup launch configuration + nthreads.x = min(e, (IdxType) CUDA_MAX_KERNEL_THREADS); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min((e + nthreads.x - 1) / nthreads.x, (IdxType) CUDA_MAX_BLOCKS); + nblocks.y = 1; + nblocks.z = 1; + //launch kernel + jaccard_jw <<>>(e, + csrPtr, + csrInd, + weight_i, + weight_s, + weight_j); + + return 0; + } + + template + int jaccard_pairs(IdxType n, + IdxType num_pairs, + IdxType *csrPtr, + IdxType *csrInd, + IdxType *first_pair, + IdxType *second_pair, + ValType *weight_in, + ValType *work, + ValType *weight_i, + ValType *weight_s, + ValType *weight_j) { + dim3 nthreads, nblocks; + int y = 4; + + //setup launch configuration + nthreads.x = 32; + nthreads.y = y; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = min((n + nthreads.y - 1) / nthreads.y, (IdxType) CUDA_MAX_BLOCKS); + nblocks.z = 1; + //launch kernel + jaccard_row_sum <<>>(n, + csrPtr, + csrInd, + weight_in, + work); + cudaDeviceSynchronize(); + fill(num_pairs, weight_i, (ValType) 0.0); + //setup launch configuration + nthreads.x = 32; + nthreads.y = 1; + nthreads.z = 8; + nblocks.x = 1; + nblocks.y = 1; + nblocks.z = min((n + nthreads.z - 1) / nthreads.z, (IdxType) CUDA_MAX_BLOCKS); //1; + //launch kernel + 
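+ // (grid mapping: the z-dimension of the launch indexes the vertex pairs, the
+ //  x-dimension strides through the shorter "reference" adjacency list, and a
+ //  binary search over the other sorted row accumulates the intersection volume
+ //  into weight_i via atomicAdd)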
jaccard_is_pairs <<>>(num_pairs, + csrPtr, + csrInd, + first_pair, + second_pair, + weight_in, + work, + weight_i, + weight_s); + + //setup launch configuration + nthreads.x = min(num_pairs, (IdxType) CUDA_MAX_KERNEL_THREADS); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min((num_pairs + nthreads.x - 1) / nthreads.x, (IdxType) CUDA_MAX_BLOCKS); + nblocks.y = 1; + nblocks.z = 1; + //launch kernel + jaccard_jw <<>>(num_pairs, + csrPtr, + csrInd, + weight_i, + weight_s, + weight_j); + + return 0; + } +} // End cugraph namespace + +gdf_error gdf_jaccard(gdf_graph *graph, gdf_column *weights, gdf_column *result) { + GDF_REQUIRE(graph != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE((graph->adjList != nullptr) || (graph->edgeList != nullptr), GDF_INVALID_API_CALL); + GDF_REQUIRE(result != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(result->data != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(!result->valid, GDF_VALIDITY_UNSUPPORTED); + + GDF_TRY(gdf_add_adj_list(graph)); + GDF_REQUIRE(graph->adjList != nullptr, GDF_INVALID_API_CALL); + + bool weighted = (weights != nullptr); + + gdf_dtype ValueType = result->dtype; + gdf_dtype IndexType = graph->adjList->offsets->dtype; + + void *csrPtr = graph->adjList->offsets->data; + void *csrInd = graph->adjList->indices->data; + void *weight_i = nullptr; + void *weight_s = nullptr; + void *weight_j = result->data; + void *work = nullptr; + void *weight_in = nullptr; + if (weighted) + weight_in = weights->data; + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(float) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * e, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::jaccard(n, + e, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && !weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(float) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * e, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::jaccard(n, + e, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(double) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * e, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::jaccard(n, + e, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && !weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(double) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * e, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::jaccard(n, + e, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && weighted) { + int64_t 
n = graph->adjList->offsets->size - 1; + int64_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(float) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * e, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::jaccard(n, + e, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && !weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(float) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * e, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::jaccard(n, + e, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(double) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * e, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::jaccard(n, + e, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && !weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(double) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * e, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::jaccard(n, + e, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + +// Clean up temp arrays + ALLOC_FREE_TRY(weight_i, nullptr); + ALLOC_FREE_TRY(weight_s, nullptr); + ALLOC_FREE_TRY(work, nullptr); + + return GDF_SUCCESS; +} + +gdf_error gdf_jaccard_list(gdf_graph* graph, + gdf_column* weights, + gdf_column* first, + gdf_column* second, + gdf_column* result) { + GDF_REQUIRE(graph != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE((graph->adjList != nullptr) || (graph->edgeList != nullptr), GDF_INVALID_API_CALL); + GDF_REQUIRE(result != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(result->data != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(!result->valid, GDF_VALIDITY_UNSUPPORTED); + + GDF_REQUIRE(first != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(first->data != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(!first->valid, GDF_VALIDITY_UNSUPPORTED); + + GDF_REQUIRE(second != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(second->data != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(!second->valid, GDF_VALIDITY_UNSUPPORTED); + + GDF_TRY(gdf_add_adj_list(graph)); + GDF_REQUIRE(graph->adjList != nullptr, GDF_INVALID_API_CALL); + + bool weighted = (weights != nullptr); + + gdf_dtype ValueType = result->dtype; + gdf_dtype IndexType = graph->adjList->offsets->dtype; + GDF_REQUIRE(first->dtype == IndexType, GDF_INVALID_API_CALL); + GDF_REQUIRE(second->dtype == IndexType, GDF_INVALID_API_CALL); + + void *first_pair = first->data; + void *second_pair = second->data; + void *csrPtr = graph->adjList->offsets->data; + void *csrInd = graph->adjList->indices->data; + void *weight_i = nullptr; + void *weight_s = nullptr; + void *weight_j = result->data; + void *work = nullptr; + void 
*weight_in = nullptr; + if (weighted) + weight_in = weights->data; + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::jaccard_pairs(n, + num_pairs, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (int32_t*) first_pair, + (int32_t*) second_pair, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && !weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::jaccard_pairs(n, + num_pairs, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (int32_t*) first_pair, + (int32_t*) second_pair, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::jaccard_pairs(n, + num_pairs, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (int32_t*) first_pair, + (int32_t*) second_pair, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && !weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::jaccard_pairs(n, + num_pairs, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (int32_t*) first_pair, + (int32_t*) second_pair, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::jaccard_pairs(n, + num_pairs, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (int64_t*) first_pair, + (int64_t*) second_pair, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && !weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::jaccard_pairs(n, + num_pairs, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (int64_t*) first_pair, + (int64_t*) second_pair, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && 
weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::jaccard_pairs(n, + num_pairs, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (int64_t*) first_pair, + (int64_t*) second_pair, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && !weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::jaccard_pairs(n, + num_pairs, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (int64_t*) first_pair, + (int64_t*) second_pair, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + + // Clean up temp arrays + ALLOC_FREE_TRY(weight_i, nullptr); + ALLOC_FREE_TRY(weight_s, nullptr); + ALLOC_FREE_TRY(work, nullptr); + + return GDF_SUCCESS; +} + diff --git a/cpp/src/link_prediction/overlap.cu b/cpp/src/link_prediction/overlap.cu new file mode 100644 index 00000000000..cce0ac99752 --- /dev/null +++ b/cpp/src/link_prediction/overlap.cu @@ -0,0 +1,709 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/** ---------------------------------------------------------------------------* + * @brief The cugraph Jaccard core functionality + * + * @file jaccard.cu + * ---------------------------------------------------------------------------**/ + +#include "utilities/graph_utils.cuh" +#include "cugraph.h" +#include "rmm_utils.h" +#include "utilities/error_utils.h" + +namespace cugraph { + // Volume of neighboors (*weight_s) + template + __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) + overlap_row_sum(IdxType n, + IdxType *csrPtr, + IdxType *csrInd, + ValType *v, + ValType *work) { + IdxType row, start, end, length; + ValType sum; + for (row = threadIdx.y + blockIdx.y * blockDim.y; + row < n; + row += gridDim.y * blockDim.y) { + start = csrPtr[row]; + end = csrPtr[row + 1]; + length = end - start; + //compute row sums + if (weighted) { + sum = parallel_prefix_sum(length, csrInd + start, v); + if (threadIdx.x == 0) + work[row] = sum; + } + else { + work[row] = (ValType) length; + } + } + } + + // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) + template + __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) + overlap_is(IdxType n, + IdxType *csrPtr, + IdxType *csrInd, + ValType *v, + ValType *work, + ValType *weight_i, + ValType *weight_s) { + IdxType i, j, row, col, Ni, Nj; + IdxType ref, cur, ref_col, cur_col, match; + ValType ref_val; + + for (row = threadIdx.z + blockIdx.z * blockDim.z; + row < n; + row += gridDim.z * blockDim.z) { + for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; + j < csrPtr[row + 1]; + j += gridDim.y * blockDim.y) { + col = csrInd[j]; + //find which row has least elements (and call it reference row) + Ni = csrPtr[row + 1] - csrPtr[row]; + Nj = csrPtr[col + 1] - csrPtr[col]; + ref = (Ni < Nj) ? row : col; + cur = (Ni < Nj) ? 
col : row; + + //compute new sum weights + weight_s[j] = min(work[row], work[col]); + + //compute new intersection weights + //search for the element with the same column index in the reference row + for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; + i += gridDim.x * blockDim.x) { + match = -1; + ref_col = csrInd[i]; + if (weighted) { + ref_val = v[ref_col]; + } + else { + ref_val = 1.0; + } + + //binary search (column indices are sorted within each row) + IdxType left = csrPtr[cur]; + IdxType right = csrPtr[cur + 1] - 1; + while (left <= right) { + IdxType middle = (left + right) >> 1; + cur_col = csrInd[middle]; + if (cur_col > ref_col) { + right = middle - 1; + } + else if (cur_col < ref_col) { + left = middle + 1; + } + else { + match = middle; + break; + } + } + + //if the element with the same column index in the reference row has been found + if (match != -1) { + atomicAdd(&weight_i[j], ref_val); + } + } + } + } + } + + // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) + // Using list of node pairs + template + __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) + overlap_is_pairs(IdxType num_pairs, + IdxType *csrPtr, + IdxType *csrInd, + IdxType *first_pair, + IdxType *second_pair, + ValType *v, + ValType *work, + ValType *weight_i, + ValType *weight_s) { + IdxType i, idx, row, col, Ni, Nj; + IdxType ref, cur, ref_col, cur_col, match; + ValType ref_val; + + for (idx = threadIdx.z + blockIdx.z * blockDim.z; + idx < num_pairs; + idx += gridDim.z * blockDim.z) { + row = first_pair[idx]; + col = second_pair[idx]; + //find which row has least elements (and call it reference row) + Ni = csrPtr[row + 1] - csrPtr[row]; + Nj = csrPtr[col + 1] - csrPtr[col]; + ref = (Ni < Nj) ? row : col; + cur = (Ni < Nj) ? 
col : row; + + //compute new sum weights + weight_s[idx] = min(work[row], work[col]); + + //compute new intersection weights + //search for the element with the same column index in the reference row + for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; + i < csrPtr[ref + 1]; + i += gridDim.x * blockDim.x) { + match = -1; + ref_col = csrInd[i]; + if (weighted) { + ref_val = v[ref_col]; + } + else { + ref_val = 1.0; + } + + //binary search (column indices are sorted within each row) + IdxType left = csrPtr[cur]; + IdxType right = csrPtr[cur + 1] - 1; + while (left <= right) { + IdxType middle = (left + right) >> 1; + cur_col = csrInd[middle]; + if (cur_col > ref_col) { + right = middle - 1; + } + else if (cur_col < ref_col) { + left = middle + 1; + } + else { + match = middle; + break; + } + } + + //if the element with the same column index in the reference row has been found + if (match != -1) { + atomicAdd(&weight_i[idx], ref_val); + } + } + } + } + + //Jaccard weights (*weight) + template + __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) + overlap_jw(IdxType e, + IdxType *csrPtr, + IdxType *csrInd, + ValType *weight_i, + ValType *weight_s, + ValType *weight_j) { + IdxType j; + ValType Wi, Wu; + + for (j = threadIdx.x + blockIdx.x * blockDim.x; + j < e; + j += gridDim.x * blockDim.x) { + Wi = weight_i[j]; + Wu = weight_s[j]; + weight_j[j] = (Wi / Wu); + } + } + + template + int overlap(IdxType n, + IdxType e, + IdxType *csrPtr, + IdxType *csrInd, + ValType *weight_in, + ValType *work, + ValType *weight_i, + ValType *weight_s, + ValType *weight_j) { + dim3 nthreads, nblocks; + int y = 4; + + //setup launch configuration + nthreads.x = 32; + nthreads.y = y; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = min((n + nthreads.y - 1) / nthreads.y, (IdxType) CUDA_MAX_BLOCKS); + nblocks.z = 1; + //launch kernel + overlap_row_sum <<>>(n, + csrPtr, + csrInd, + weight_in, + work); + cudaDeviceSynchronize(); + fill(e, weight_i, (ValType) 0.0); + //setup launch configuration + nthreads.x = 32 / y; + nthreads.y = y; + nthreads.z = 8; + nblocks.x = 1; + nblocks.y = 1; + nblocks.z = min((n + nthreads.z - 1) / nthreads.z, (IdxType) CUDA_MAX_BLOCKS); //1; + //launch kernel + overlap_is <<>>(n, + csrPtr, + csrInd, + weight_in, + work, + weight_i, + weight_s); + + //setup launch configuration + nthreads.x = min(e, (IdxType) CUDA_MAX_KERNEL_THREADS); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min((e + nthreads.x - 1) / nthreads.x, (IdxType) CUDA_MAX_BLOCKS); + nblocks.y = 1; + nblocks.z = 1; + //launch kernel + overlap_jw <<>>(e, + csrPtr, + csrInd, + weight_i, + weight_s, + weight_j); + + return 0; + } + + template + int overlap_pairs(IdxType n, + IdxType num_pairs, + IdxType *csrPtr, + IdxType *csrInd, + IdxType *first_pair, + IdxType *second_pair, + ValType *weight_in, + ValType *work, + ValType *weight_i, + ValType *weight_s, + ValType *weight_j) { + dim3 nthreads, nblocks; + int y = 4; + + //setup launch configuration + nthreads.x = 32; + nthreads.y = y; + nthreads.z = 1; + nblocks.x = 1; + nblocks.y = min((n + nthreads.y - 1) / nthreads.y, (IdxType) CUDA_MAX_BLOCKS); + nblocks.z = 1; + //launch kernel + overlap_row_sum <<>>(n, + csrPtr, + csrInd, + weight_in, + work); + cudaDeviceSynchronize(); + fill(num_pairs, weight_i, (ValType) 0.0); + //setup launch configuration + nthreads.x = 32; + nthreads.y = 1; + nthreads.z = 8; + nblocks.x = 1; + nblocks.y = 1; + nblocks.z = min((n + nthreads.z - 1) / nthreads.z, (IdxType) CUDA_MAX_BLOCKS); //1; + //launch kernel + overlap_is_pairs 
<<>>(num_pairs, + csrPtr, + csrInd, + first_pair, + second_pair, + weight_in, + work, + weight_i, + weight_s); + + //setup launch configuration + nthreads.x = min(num_pairs, (IdxType) CUDA_MAX_KERNEL_THREADS); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min((num_pairs + nthreads.x - 1) / nthreads.x, (IdxType) CUDA_MAX_BLOCKS); + nblocks.y = 1; + nblocks.z = 1; + //launch kernel + overlap_jw <<>>(num_pairs, + csrPtr, + csrInd, + weight_i, + weight_s, + weight_j); + + return 0; + } +} // End cugraph namespace + +gdf_error gdf_overlap(gdf_graph *graph, gdf_column *weights, gdf_column *result) { + GDF_REQUIRE(graph != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE((graph->adjList != nullptr) || (graph->edgeList != nullptr), GDF_INVALID_API_CALL); + GDF_REQUIRE(result != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(result->data != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(!result->valid, GDF_VALIDITY_UNSUPPORTED); + + GDF_TRY(gdf_add_adj_list(graph)); + GDF_REQUIRE(graph->adjList != nullptr, GDF_INVALID_API_CALL); + + bool weighted = (weights != nullptr); + + gdf_dtype ValueType = result->dtype; + gdf_dtype IndexType = graph->adjList->offsets->dtype; + + void *csrPtr = graph->adjList->offsets->data; + void *csrInd = graph->adjList->indices->data; + void *weight_i = nullptr; + void *weight_s = nullptr; + void *weight_j = result->data; + void *work = nullptr; + void *weight_in = nullptr; + if (weighted) + weight_in = weights->data; + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(float) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * e, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::overlap(n, + e, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && !weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(float) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * e, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::overlap(n, + e, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(double) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * e, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::overlap(n, + e, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && !weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(double) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * e, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::overlap(n, + e, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && weighted) { + int64_t n = 
graph->adjList->offsets->size - 1; + int64_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(float) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * e, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::overlap(n, + e, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && !weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(float) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * e, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::overlap(n, + e, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(double) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * e, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::overlap(n, + e, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && !weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t e = graph->adjList->indices->size; + ALLOC_TRY(&weight_i, sizeof(double) * e, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * e, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::overlap(n, + e, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + +// Clean up temp arrays + ALLOC_FREE_TRY(weight_i, nullptr); + ALLOC_FREE_TRY(weight_s, nullptr); + ALLOC_FREE_TRY(work, nullptr); + + return GDF_SUCCESS; +} + +gdf_error gdf_overlap_list(gdf_graph* graph, + gdf_column* weights, + gdf_column* first, + gdf_column* second, + gdf_column* result) { + GDF_REQUIRE(graph != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE((graph->adjList != nullptr) || (graph->edgeList != nullptr), GDF_INVALID_API_CALL); + GDF_REQUIRE(result != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(result->data != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(!result->valid, GDF_VALIDITY_UNSUPPORTED); + + GDF_REQUIRE(first != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(first->data != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(!first->valid, GDF_VALIDITY_UNSUPPORTED); + + GDF_REQUIRE(second != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(second->data != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(!second->valid, GDF_VALIDITY_UNSUPPORTED); + + GDF_TRY(gdf_add_adj_list(graph)); + GDF_REQUIRE(graph->adjList != nullptr, GDF_INVALID_API_CALL); + + bool weighted = (weights != nullptr); + + gdf_dtype ValueType = result->dtype; + gdf_dtype IndexType = graph->adjList->offsets->dtype; + GDF_REQUIRE(first->dtype == IndexType, GDF_INVALID_API_CALL); + GDF_REQUIRE(second->dtype == IndexType, GDF_INVALID_API_CALL); + + void *first_pair = first->data; + void *second_pair = second->data; + void *csrPtr = graph->adjList->offsets->data; + void *csrInd = graph->adjList->indices->data; + void *weight_i = nullptr; + void *weight_s = nullptr; + void *weight_j = result->data; + void *work = nullptr; + void *weight_in 
= nullptr; + if (weighted) + weight_in = weights->data; + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::overlap_pairs(n, + num_pairs, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (int32_t*) first_pair, + (int32_t*) second_pair, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && !weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::overlap_pairs(n, + num_pairs, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (int32_t*) first_pair, + (int32_t*) second_pair, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::overlap_pairs(n, + num_pairs, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (int32_t*) first_pair, + (int32_t*) second_pair, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && !weighted) { + int32_t n = graph->adjList->offsets->size - 1; + int32_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::overlap_pairs(n, + num_pairs, + (int32_t*) csrPtr, + (int32_t*) csrInd, + (int32_t*) first_pair, + (int32_t*) second_pair, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::overlap_pairs(n, + num_pairs, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (int64_t*) first_pair, + (int64_t*) second_pair, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + + if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && !weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(float) * n, nullptr); + cugraph::overlap_pairs(n, + num_pairs, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (int64_t*) first_pair, + (int64_t*) second_pair, + (float*) weight_in, + (float*) work, + (float*) weight_i, + (float*) weight_s, + (float*) weight_j); + } + + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && weighted) { + 
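+ // double-precision overlap coefficients, 64-bit vertex indices, weighted graph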
int64_t n = graph->adjList->offsets->size - 1; + int64_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::overlap_pairs(n, + num_pairs, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (int64_t*) first_pair, + (int64_t*) second_pair, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + + if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && !weighted) { + int64_t n = graph->adjList->offsets->size - 1; + int64_t num_pairs = first->size; + ALLOC_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); + ALLOC_TRY(&work, sizeof(double) * n, nullptr); + cugraph::overlap_pairs(n, + num_pairs, + (int64_t*) csrPtr, + (int64_t*) csrInd, + (int64_t*) first_pair, + (int64_t*) second_pair, + (double*) weight_in, + (double*) work, + (double*) weight_i, + (double*) weight_s, + (double*) weight_j); + } + + // Clean up temp arrays + ALLOC_FREE_TRY(weight_i, nullptr); + ALLOC_FREE_TRY(weight_s, nullptr); + ALLOC_FREE_TRY(work, nullptr); + + return GDF_SUCCESS; +} + diff --git a/cpp/src/overlap.cu b/cpp/src/overlap.cu deleted file mode 100644 index 315baf1dac8..00000000000 --- a/cpp/src/overlap.cu +++ /dev/null @@ -1,709 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -/** ---------------------------------------------------------------------------* - * @brief The cugraph Jaccard core functionality - * - * @file jaccard.cu - * ---------------------------------------------------------------------------**/ - -#include "graph_utils.cuh" -#include "cugraph.h" -#include "rmm_utils.h" -#include "utilities/error_utils.h" - -namespace cugraph { - // Volume of neighboors (*weight_s) - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - overlap_row_sum(IdxType n, - IdxType *csrPtr, - IdxType *csrInd, - ValType *v, - ValType *work) { - IdxType row, start, end, length; - ValType sum; - for (row = threadIdx.y + blockIdx.y * blockDim.y; - row < n; - row += gridDim.y * blockDim.y) { - start = csrPtr[row]; - end = csrPtr[row + 1]; - length = end - start; - //compute row sums - if (weighted) { - sum = parallel_prefix_sum(length, csrInd + start, v); - if (threadIdx.x == 0) - work[row] = sum; - } - else { - work[row] = (ValType) length; - } - } - } - - // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - overlap_is(IdxType n, - IdxType *csrPtr, - IdxType *csrInd, - ValType *v, - ValType *work, - ValType *weight_i, - ValType *weight_s) { - IdxType i, j, row, col, Ni, Nj; - IdxType ref, cur, ref_col, cur_col, match; - ValType ref_val; - - for (row = threadIdx.z + blockIdx.z * blockDim.z; - row < n; - row += gridDim.z * blockDim.z) { - for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; - j < csrPtr[row + 1]; - j += gridDim.y * blockDim.y) { - col = csrInd[j]; - //find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - //compute new sum weights - weight_s[j] = min(work[row], work[col]); - - //compute new intersection weights - //search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } - else { - ref_val = 1.0; - } - - //binary search (column indices are sorted within each row) - IdxType left = csrPtr[cur]; - IdxType right = csrPtr[cur + 1] - 1; - while (left <= right) { - IdxType middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } - else if (cur_col < ref_col) { - left = middle + 1; - } - else { - match = middle; - break; - } - } - - //if the element with the same column index in the reference row has been found - if (match != -1) { - atomicAdd(&weight_i[j], ref_val); - } - } - } - } - } - - // Volume of intersections (*weight_i) and cumulated volume of neighboors (*weight_s) - // Using list of node pairs - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - overlap_is_pairs(IdxType num_pairs, - IdxType *csrPtr, - IdxType *csrInd, - IdxType *first_pair, - IdxType *second_pair, - ValType *v, - ValType *work, - ValType *weight_i, - ValType *weight_s) { - IdxType i, idx, row, col, Ni, Nj; - IdxType ref, cur, ref_col, cur_col, match; - ValType ref_val; - - for (idx = threadIdx.z + blockIdx.z * blockDim.z; - idx < num_pairs; - idx += gridDim.z * blockDim.z) { - row = first_pair[idx]; - col = second_pair[idx]; - //find which row has least elements (and call it reference row) - Ni = csrPtr[row + 1] - csrPtr[row]; - Nj = csrPtr[col + 1] - csrPtr[col]; - ref = (Ni < Nj) ? row : col; - cur = (Ni < Nj) ? 
col : row; - - //compute new sum weights - weight_s[idx] = min(work[row], work[col]); - - //compute new intersection weights - //search for the element with the same column index in the reference row - for (i = csrPtr[ref] + threadIdx.x + blockIdx.x * blockDim.x; - i < csrPtr[ref + 1]; - i += gridDim.x * blockDim.x) { - match = -1; - ref_col = csrInd[i]; - if (weighted) { - ref_val = v[ref_col]; - } - else { - ref_val = 1.0; - } - - //binary search (column indices are sorted within each row) - IdxType left = csrPtr[cur]; - IdxType right = csrPtr[cur + 1] - 1; - while (left <= right) { - IdxType middle = (left + right) >> 1; - cur_col = csrInd[middle]; - if (cur_col > ref_col) { - right = middle - 1; - } - else if (cur_col < ref_col) { - left = middle + 1; - } - else { - match = middle; - break; - } - } - - //if the element with the same column index in the reference row has been found - if (match != -1) { - atomicAdd(&weight_i[idx], ref_val); - } - } - } - } - - //Jaccard weights (*weight) - template - __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) - overlap_jw(IdxType e, - IdxType *csrPtr, - IdxType *csrInd, - ValType *weight_i, - ValType *weight_s, - ValType *weight_j) { - IdxType j; - ValType Wi, Wu; - - for (j = threadIdx.x + blockIdx.x * blockDim.x; - j < e; - j += gridDim.x * blockDim.x) { - Wi = weight_i[j]; - Wu = weight_s[j]; - weight_j[j] = (Wi / Wu); - } - } - - template - int overlap(IdxType n, - IdxType e, - IdxType *csrPtr, - IdxType *csrInd, - ValType *weight_in, - ValType *work, - ValType *weight_i, - ValType *weight_s, - ValType *weight_j) { - dim3 nthreads, nblocks; - int y = 4; - - //setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, (IdxType) CUDA_MAX_BLOCKS); - nblocks.z = 1; - //launch kernel - overlap_row_sum <<>>(n, - csrPtr, - csrInd, - weight_in, - work); - cudaDeviceSynchronize(); - fill(e, weight_i, (ValType) 0.0); - //setup launch configuration - nthreads.x = 32 / y; - nthreads.y = y; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, (IdxType) CUDA_MAX_BLOCKS); //1; - //launch kernel - overlap_is <<>>(n, - csrPtr, - csrInd, - weight_in, - work, - weight_i, - weight_s); - - //setup launch configuration - nthreads.x = min(e, (IdxType) CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((e + nthreads.x - 1) / nthreads.x, (IdxType) CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - //launch kernel - overlap_jw <<>>(e, - csrPtr, - csrInd, - weight_i, - weight_s, - weight_j); - - return 0; - } - - template - int overlap_pairs(IdxType n, - IdxType num_pairs, - IdxType *csrPtr, - IdxType *csrInd, - IdxType *first_pair, - IdxType *second_pair, - ValType *weight_in, - ValType *work, - ValType *weight_i, - ValType *weight_s, - ValType *weight_j) { - dim3 nthreads, nblocks; - int y = 4; - - //setup launch configuration - nthreads.x = 32; - nthreads.y = y; - nthreads.z = 1; - nblocks.x = 1; - nblocks.y = min((n + nthreads.y - 1) / nthreads.y, (IdxType) CUDA_MAX_BLOCKS); - nblocks.z = 1; - //launch kernel - overlap_row_sum <<>>(n, - csrPtr, - csrInd, - weight_in, - work); - cudaDeviceSynchronize(); - fill(num_pairs, weight_i, (ValType) 0.0); - //setup launch configuration - nthreads.x = 32; - nthreads.y = 1; - nthreads.z = 8; - nblocks.x = 1; - nblocks.y = 1; - nblocks.z = min((n + nthreads.z - 1) / nthreads.z, (IdxType) CUDA_MAX_BLOCKS); //1; - //launch kernel - overlap_is_pairs 
<<>>(num_pairs, - csrPtr, - csrInd, - first_pair, - second_pair, - weight_in, - work, - weight_i, - weight_s); - - //setup launch configuration - nthreads.x = min(num_pairs, (IdxType) CUDA_MAX_KERNEL_THREADS); - nthreads.y = 1; - nthreads.z = 1; - nblocks.x = min((num_pairs + nthreads.x - 1) / nthreads.x, (IdxType) CUDA_MAX_BLOCKS); - nblocks.y = 1; - nblocks.z = 1; - //launch kernel - overlap_jw <<>>(num_pairs, - csrPtr, - csrInd, - weight_i, - weight_s, - weight_j); - - return 0; - } -} // End cugraph namespace - -gdf_error gdf_overlap(gdf_graph *graph, gdf_column *weights, gdf_column *result) { - GDF_REQUIRE(graph != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE((graph->adjList != nullptr) || (graph->edgeList != nullptr), GDF_INVALID_API_CALL); - GDF_REQUIRE(result != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(result->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!result->valid, GDF_VALIDITY_UNSUPPORTED); - - GDF_TRY(gdf_add_adj_list(graph)); - GDF_REQUIRE(graph->adjList != nullptr, GDF_INVALID_API_CALL); - - bool weighted = (weights != nullptr); - - gdf_dtype ValueType = result->dtype; - gdf_dtype IndexType = graph->adjList->offsets->dtype; - - void *csrPtr = graph->adjList->offsets->data; - void *csrInd = graph->adjList->indices->data; - void *weight_i = nullptr; - void *weight_s = nullptr; - void *weight_j = result->data; - void *work = nullptr; - void *weight_in = nullptr; - if (weighted) - weight_in = weights->data; - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::overlap(n, - e, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && !weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::overlap(n, - e, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::overlap(n, - e, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && !weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::overlap(n, - e, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - 
if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::overlap(n, - e, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && !weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::overlap(n, - e, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::overlap(n, - e, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && !weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t e = graph->adjList->indices->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * e, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::overlap(n, - e, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - -// Clean up temp arrays - ALLOC_FREE_TRY(weight_i, nullptr); - ALLOC_FREE_TRY(weight_s, nullptr); - ALLOC_FREE_TRY(work, nullptr); - - return GDF_SUCCESS; -} - -gdf_error gdf_overlap_list(gdf_graph* graph, - gdf_column* weights, - gdf_column* first, - gdf_column* second, - gdf_column* result) { - GDF_REQUIRE(graph != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE((graph->adjList != nullptr) || (graph->edgeList != nullptr), GDF_INVALID_API_CALL); - GDF_REQUIRE(result != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(result->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!result->valid, GDF_VALIDITY_UNSUPPORTED); - - GDF_REQUIRE(first != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(first->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!first->valid, GDF_VALIDITY_UNSUPPORTED); - - GDF_REQUIRE(second != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(second->data != nullptr, GDF_INVALID_API_CALL); - GDF_REQUIRE(!second->valid, GDF_VALIDITY_UNSUPPORTED); - - GDF_TRY(gdf_add_adj_list(graph)); - GDF_REQUIRE(graph->adjList != nullptr, GDF_INVALID_API_CALL); - - bool weighted = (weights != nullptr); - - gdf_dtype ValueType = result->dtype; - gdf_dtype IndexType = graph->adjList->offsets->dtype; - GDF_REQUIRE(first->dtype == IndexType, GDF_INVALID_API_CALL); - GDF_REQUIRE(second->dtype == IndexType, GDF_INVALID_API_CALL); - - void *first_pair = first->data; - void *second_pair = second->data; - void *csrPtr = 
graph->adjList->offsets->data; - void *csrInd = graph->adjList->indices->data; - void *weight_i = nullptr; - void *weight_s = nullptr; - void *weight_j = result->data; - void *work = nullptr; - void *weight_in = nullptr; - if (weighted) - weight_in = weights->data; - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::overlap_pairs(n, - num_pairs, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (int32_t*) first_pair, - (int32_t*) second_pair, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT32 && !weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::overlap_pairs(n, - num_pairs, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (int32_t*) first_pair, - (int32_t*) second_pair, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::overlap_pairs(n, - num_pairs, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (int32_t*) first_pair, - (int32_t*) second_pair, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT32 && !weighted) { - int32_t n = graph->adjList->offsets->size - 1; - int32_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::overlap_pairs(n, - num_pairs, - (int32_t*) csrPtr, - (int32_t*) csrInd, - (int32_t*) first_pair, - (int32_t*) second_pair, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::overlap_pairs(n, - num_pairs, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (int64_t*) first_pair, - (int64_t*) second_pair, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - - if (ValueType == GDF_FLOAT32 && IndexType == GDF_INT64 && !weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(float) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(float) * num_pairs, nullptr); - 
ALLOC_MANAGED_TRY(&work, sizeof(float) * n, nullptr); - cugraph::overlap_pairs(n, - num_pairs, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (int64_t*) first_pair, - (int64_t*) second_pair, - (float*) weight_in, - (float*) work, - (float*) weight_i, - (float*) weight_s, - (float*) weight_j); - } - - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::overlap_pairs(n, - num_pairs, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (int64_t*) first_pair, - (int64_t*) second_pair, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - - if (ValueType == GDF_FLOAT64 && IndexType == GDF_INT64 && !weighted) { - int64_t n = graph->adjList->offsets->size - 1; - int64_t num_pairs = first->size; - ALLOC_MANAGED_TRY(&weight_i, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&weight_s, sizeof(double) * num_pairs, nullptr); - ALLOC_MANAGED_TRY(&work, sizeof(double) * n, nullptr); - cugraph::overlap_pairs(n, - num_pairs, - (int64_t*) csrPtr, - (int64_t*) csrInd, - (int64_t*) first_pair, - (int64_t*) second_pair, - (double*) weight_in, - (double*) work, - (double*) weight_i, - (double*) weight_s, - (double*) weight_j); - } - - // Clean up temp arrays - ALLOC_FREE_TRY(weight_i, nullptr); - ALLOC_FREE_TRY(weight_s, nullptr); - ALLOC_FREE_TRY(work, nullptr); - - return GDF_SUCCESS; -} - diff --git a/cpp/src/pagerank.cu b/cpp/src/pagerank.cu deleted file mode 100644 index 668e19d1bf3..00000000000 --- a/cpp/src/pagerank.cu +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. 
- * - */ - -// Pagerank solver -// Author: Alex Fender afender@nvidia.com - -#include -#include -#include -#include - #include -#include -#include -#include "graph_utils.cuh" -#include "pagerank.cuh" -#include "cub/cub.cuh" -#include -#include - -#include - -namespace cugraph -{ - -#ifdef DEBUG - #define PR_VERBOSE -#endif -template -bool pagerankIteration( IndexType n, IndexType e, IndexType *cscPtr, IndexType *cscInd,ValueType *cscVal, - ValueType alpha, ValueType *a, ValueType *b, float tolerance, int iter, int max_iter, - ValueType * &tmp, void* cub_d_temp_storage, size_t cub_temp_storage_bytes, - ValueType * &pr, ValueType *residual) { - - ValueType dot_res; - cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, cscVal, - cscPtr, cscInd, tmp, pr, - n, n, e); - - scal(n, alpha, pr); - dot_res = dot( n, a, tmp); - axpy(n, dot_res, b, pr); - scal(n, (ValueType)1.0/nrm2(n, pr) , pr); - axpy(n, (ValueType)-1.0, pr, tmp); - *residual = nrm2(n, tmp); - if (*residual < tolerance) - { - scal(n, (ValueType)1.0/nrm1(n,pr), pr); - return true; - } - else - { - if (iter< max_iter) - { - std::swap(pr, tmp); - } - else - { - scal(n, (ValueType)1.0/nrm1(n,pr), pr); - } - return false; - } -} - -template -int pagerank ( IndexType n, IndexType e, IndexType *cscPtr, IndexType *cscInd, ValueType *cscVal, - ValueType alpha, ValueType *a, bool has_guess, float tolerance, int max_iter, - ValueType * &pagerank_vector, ValueType * &residual) { - int max_it, i = 0 ; - float tol; - bool converged = false; - ValueType randomProbability = static_cast( 1.0/n); - ValueType *b=0, *tmp=0; - void* cub_d_temp_storage = NULL; - size_t cub_temp_storage_bytes = 0; - - if (max_iter > 0 ) - max_it = max_iter; - else - max_it = 500; - - if (tolerance == 0.0f) - tol = 1.0E-6f; - else if (tolerance < 1.0f && tolerance > 0.0f) - tol = tolerance; - else - return -1; - - if (alpha <= 0.0f || alpha >= 1.0f) - return -1; - - cudaStream_t stream{nullptr}; - - ALLOC_MANAGED_TRY ((void**)&b, sizeof(ValueType) * n, stream); - ALLOC_MANAGED_TRY ((void**)&tmp, sizeof(ValueType) * n, stream); - cudaCheckError(); - - if (!has_guess) { - fill(n, pagerank_vector, randomProbability); - fill(n, tmp, randomProbability); - } - else { - copy(n, pagerank_vector, tmp); - } - - - fill(n, b, randomProbability); - update_dangling_nodes(n, a, alpha); - - cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, cscVal, - cscPtr, cscInd, tmp, pagerank_vector, n, n, e); - // Allocate temporary storage - ALLOC_MANAGED_TRY ((void**)&cub_d_temp_storage, cub_temp_storage_bytes, stream); - cudaCheckError() - #ifdef PR_VERBOSE - std::stringstream ss; - ss.str(std::string()); - ss <<" ------------------PageRank------------------"<< std::endl; - ss <<" --------------------------------------------"<< std::endl; - ss << std::setw(10) << "Iteration" << std::setw(15) << "Residual" << std::endl; - ss <<" --------------------------------------------"<< std::endl; - std::cout< ( int n, int e, int *cscPtr, int *cscInd,half *cscVal, half alpha, half *a, bool has_guess, float tolerance, int max_iter, half * &pagerank_vector, half * &residual); -template int pagerank ( int n, int e, int *cscPtr, int *cscInd,float *cscVal, float alpha, float *a, bool has_guess, float tolerance, int max_iter, float * &pagerank_vector, float * &residual); -template int pagerank ( int n, int e, int *cscPtr, int *cscInd,double *cscVal, double alpha, double *a, bool has_guess, float tolerance, int max_iter, double * &pagerank_vector, double * &residual); - -} //namespace 
cugraph diff --git a/cpp/src/pagerank.cuh b/cpp/src/pagerank.cuh deleted file mode 100644 index d3e1572d3bd..00000000000 --- a/cpp/src/pagerank.cuh +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. - * - */ - -// Pagerank solver -// Author: Alex Fender afender@nvidia.com - -#pragma once -namespace cugraph -{ - -template -int pagerank ( IndexType n, IndexType e, IndexType *cscPtr, IndexType *cscInd,ValueType *cscVal, - ValueType alpha, ValueType *a, bool has_guess, float tolerance, int max_iter, ValueType * &pagerank_vector, ValueType * &residual); - -} //namespace cugraph diff --git a/cpp/src/snmg/blas/spmv.cu b/cpp/src/snmg/blas/spmv.cu new file mode 100644 index 00000000000..c5b369396c7 --- /dev/null +++ b/cpp/src/snmg/blas/spmv.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// snmg spmv +// Author: Alex Fender afender@nvidia.com + +#include "spmv.cuh" + +template +gdf_error gdf_snmg_csrmv_impl (size_t * part_offsets, gdf_column * off, gdf_column * ind, gdf_column * val, gdf_column ** x_cols){ + + GDF_REQUIRE( part_offsets != nullptr, GDF_INVALID_API_CALL ); + GDF_REQUIRE( off != nullptr, GDF_INVALID_API_CALL ); + GDF_REQUIRE( ind != nullptr, GDF_INVALID_API_CALL ); + GDF_REQUIRE( val != nullptr, GDF_INVALID_API_CALL ); + GDF_REQUIRE( x_cols != nullptr, GDF_INVALID_API_CALL ); + GDF_REQUIRE( off->size > 0, GDF_INVALID_API_CALL ); + GDF_REQUIRE( ind->size > 0, GDF_INVALID_API_CALL ); + GDF_REQUIRE( val->size > 0, GDF_INVALID_API_CALL ); + GDF_REQUIRE( ind->size == val->size, GDF_COLUMN_SIZE_MISMATCH ); + GDF_REQUIRE( off->dtype == ind->dtype, GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( off->null_count + ind->null_count + val->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); + + auto p = omp_get_num_threads(); + + val_t* x[p]; + for (auto i = 0; i < p; ++i) + { + GDF_REQUIRE( x_cols[i] != nullptr, GDF_INVALID_API_CALL ); + GDF_REQUIRE( x_cols[i]->size > 0, GDF_INVALID_API_CALL ); + x[i]= static_cast(x_cols[i]->data); + } + cugraph::SNMGinfo snmg_env; + cugraph::SNMGcsrmv spmv_solver(snmg_env, part_offsets, + static_cast(off->data), + static_cast(ind->data), + static_cast(val->data), + x); + spmv_solver.run(x); + return GDF_SUCCESS; +} + +gdf_error gdf_snmg_csrmv (size_t * part_offsets, gdf_column * off, gdf_column * ind, gdf_column * val, gdf_column ** x_cols){ + switch (val->dtype) { + case GDF_FLOAT32: return gdf_snmg_csrmv_impl(part_offsets, off, ind, val, x_cols); + case GDF_FLOAT64: return gdf_snmg_csrmv_impl(part_offsets, off, ind, val, x_cols); + default: return GDF_UNSUPPORTED_DTYPE; + } +} diff --git a/cpp/src/snmg/blas/spmv.cuh b/cpp/src/snmg/blas/spmv.cuh new file mode 100644 index 00000000000..8b7120a8e65 --- /dev/null +++ b/cpp/src/snmg/blas/spmv.cuh @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// snmg spmv +// Author: Alex Fender afender@nvidia.com + +#pragma once +#include "cub/cub.cuh" +#include +#include "utilities/graph_utils.cuh" +#include "snmg/utils.cuh" +//#define SNMG_DEBUG + +namespace cugraph +{ + +template +class SNMGcsrmv +{ + + private: + size_t v_glob; + size_t v_loc; + size_t e_loc; + SNMGinfo env; + size_t* part_off; + int i; + int p; + IndexType * off; + IndexType * ind; + ValueType * val; + ValueType * y_loc; + cudaStream_t stream; + void* cub_d_temp_storage; + size_t cub_temp_storage_bytes; + + public: + SNMGcsrmv(SNMGinfo & env_, size_t* part_off_, + IndexType * off_, IndexType * ind_, ValueType * val_, ValueType ** x) : + env(env_), part_off(part_off_), off(off_), ind(ind_), val(val_) { + sync_all(); + cub_d_temp_storage = NULL; + cub_temp_storage_bytes = 0; + stream = nullptr; + i = env.get_thread_num(); + p = env.get_num_threads(); + v_glob = part_off[p]; + v_loc = part_off[i+1]-part_off[i]; + IndexType tmp; + cudaMemcpy(&tmp, &off[v_loc], sizeof(IndexType),cudaMemcpyDeviceToHost); + cudaCheckError(); + e_loc = tmp; + + // Allocate the local result + ALLOC_TRY ((void**)&y_loc, v_loc*sizeof(ValueType), stream); + + // get temporary storage size for CUB + cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, + val, off, ind, x[i], y_loc, v_loc, v_glob, e_loc); + cudaCheckError(); + // Allocate CUB's temporary storage + ALLOC_TRY ((void**)&cub_d_temp_storage, cub_temp_storage_bytes, stream); + } + + ~SNMGcsrmv() { + ALLOC_FREE_TRY(cub_d_temp_storage, stream); + ALLOC_FREE_TRY(y_loc, stream); + } + + // run the power iteration + void run (ValueType ** x) { + // Local SPMV + cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, + val, off, ind, x[i], y_loc, v_loc, v_glob, e_loc); + cudaCheckError() + sync_all(); + + #ifdef SNMG_DEBUG + print_mem_usage(); + #pragma omp master + {std::cout << omp_get_wtime() - t << " ";} + Wait for all local spmv + t = omp_get_wtime(); + sync_all(); + #pragma omp master + {std::cout << omp_get_wtime() - t << " ";} + Update the output vector +#endif + + allgather (env, part_off, y_loc, x); + } +}; + + +} //namespace cugraph diff --git a/cpp/src/snmg/degree/degree.cu b/cpp/src/snmg/degree/degree.cu new file mode 100644 index 00000000000..514228e7fd2 --- /dev/null +++ b/cpp/src/snmg/degree/degree.cu @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "degree.cuh" + +template +gdf_error gdf_snmg_degree_impl(int x, + size_t* part_offsets, + gdf_column* off, + gdf_column* ind, + gdf_column** x_cols) { + GDF_REQUIRE(off->size > 0, GDF_INVALID_API_CALL); + GDF_REQUIRE(ind->size > 0, GDF_INVALID_API_CALL); + GDF_REQUIRE(off->dtype == ind->dtype, GDF_UNSUPPORTED_DTYPE); + GDF_REQUIRE(off->null_count + ind->null_count == 0, GDF_VALIDITY_UNSUPPORTED); + + gdf_error status; + auto p = omp_get_num_threads(); + + idx_t* degree[p]; + for (auto i = 0; i < p; ++i) { + GDF_REQUIRE(x_cols[i] != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(x_cols[i]->size > 0, GDF_INVALID_API_CALL); + degree[i] = static_cast(x_cols[i]->data); + } + + status = cugraph::snmg_degree(x, + part_offsets, + static_cast(off->data), + static_cast(ind->data), + degree); + return status; +} + +gdf_error gdf_snmg_degree(int x, + size_t* part_offsets, + gdf_column* off, + gdf_column* ind, + gdf_column** x_cols) { + GDF_REQUIRE(part_offsets != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(off != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(ind != nullptr, GDF_INVALID_API_CALL); + GDF_REQUIRE(x_cols != nullptr, GDF_INVALID_API_CALL); + switch (off->dtype) { + case GDF_INT32: + return gdf_snmg_degree_impl(x, part_offsets, off, ind, x_cols); + case GDF_INT64: + return gdf_snmg_degree_impl(x, part_offsets, off, ind, x_cols); + default: + return GDF_INVALID_API_CALL; + } +} diff --git a/cpp/src/snmg/degree/degree.cuh b/cpp/src/snmg/degree/degree.cuh new file mode 100644 index 00000000000..1e22da4ef4b --- /dev/null +++ b/cpp/src/snmg/degree/degree.cuh @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +#include "utilities/graph_utils.cuh" +#include "snmg/utils.cuh" +#include "rmm_utils.h" + +namespace cugraph { + /** + * Single node multi-GPU method for degree calculation on a partitioned graph. + * @param x Indicates whether to compute in degree, out degree, or the sum of both. + * 0 = in + out degree + * 1 = in-degree + * 2 = out-degree + * @param part_off The vertex partitioning of the global graph + * @param off The offsets array of the local partition + * @param ind The indices array of the local partition + * @param degree Pointer to pointers to memory on each GPU for the result + * @return Error code + */ + template + gdf_error snmg_degree(int x, size_t* part_off, idx_t* off, idx_t* ind, idx_t** degree) { + sync_all(); + SNMGinfo env; + auto i = env.get_thread_num(); + auto p = env.get_num_threads(); + + // Getting the global and local vertices and edges + size_t glob_v = part_off[p]; + size_t loc_v = part_off[i + 1] - part_off[i]; + idx_t tmp; + CUDA_TRY(cudaMemcpy(&tmp, &off[loc_v], sizeof(idx_t), cudaMemcpyDeviceToHost)); + size_t loc_e = tmp; + + // Allocating the local result array, and setting all entries to zero. 
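+  // Note: local_result is sized for the *global* vertex count (glob_v), not just this
+  // GPU's partition, because a local edge partition can reference any destination vertex;
+  // each GPU accumulates partial degrees for every global vertex here, and the partial
+  // results are combined across GPUs below with treeReduce and redistributed with treeBroadcast.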
+ idx_t* local_result; + ALLOC_TRY((void** )&local_result, glob_v * sizeof(idx_t), nullptr); + thrust::fill(rmm::exec_policy(nullptr)->on(nullptr), local_result, local_result + glob_v, 0); + + // In-degree + if (x == 1 || x == 0) { + dim3 nthreads, nblocks; + nthreads.x = min(static_cast(loc_e), static_cast(CUDA_MAX_KERNEL_THREADS)); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min(static_cast((loc_e + nthreads.x - 1) / nthreads.x), + static_cast(env.get_num_sm() * 32)); + nblocks.y = 1; + nblocks.z = 1; + degree_coo <<>>(static_cast(loc_e), + static_cast(loc_e), + ind, + local_result); + cudaCheckError(); + } + + // Out-degree + if (x == 2 || x == 0) { + dim3 nthreads, nblocks; + nthreads.x = min(static_cast(loc_v), static_cast(CUDA_MAX_KERNEL_THREADS)); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min(static_cast((loc_v + nthreads.x - 1) / nthreads.x), + static_cast(env.get_num_sm() * 32)); + nblocks.y = 1; + nblocks.z = 1; + degree_offsets <<>>(static_cast(loc_v), + static_cast(loc_e), + off, + local_result + part_off[i]); + cudaCheckError(); + } + + // Combining the local results into global results + sync_all(); + treeReduce >(env, glob_v, local_result, degree); + + // Broadcasting the global result to all GPUs + treeBroadcast(env, glob_v, local_result, degree); + + return GDF_SUCCESS; + } + + template<> + gdf_error snmg_degree(int x, + size_t* part_off, + int64_t* off, + int64_t* ind, + int64_t** degree) { + sync_all(); + SNMGinfo env; + auto i = env.get_thread_num(); + auto p = env.get_num_threads(); + + // Getting the global and local vertices and edges + size_t glob_v = part_off[p]; + size_t loc_v = part_off[i + 1] - part_off[i]; + int64_t tmp; + CUDA_TRY(cudaMemcpy(&tmp, &off[loc_v], sizeof(int64_t), cudaMemcpyDeviceToHost)); + size_t loc_e = tmp; + + // Allocating the local result array, and setting all entries to zero. 
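+  // int64_t specialization: the shared degree kernels accumulate into this buffer
+  // reinterpreted as double (see the reinterpret_cast below, presumably to reuse the
+  // floating-point atomics), and type_convert turns the totals back into int64_t before
+  // the tree reduction; otherwise the flow matches the generic version above.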
+ int64_t* local_result; + ALLOC_TRY((void** )&local_result, glob_v * sizeof(int64_t), nullptr); + thrust::fill(rmm::exec_policy(nullptr)->on(nullptr), local_result, local_result + glob_v, 0); + + // In-degree + if (x == 1 || x == 0) { + dim3 nthreads, nblocks; + nthreads.x = min(static_cast(loc_e), static_cast(CUDA_MAX_KERNEL_THREADS)); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min(static_cast((loc_e + nthreads.x - 1) / nthreads.x), + static_cast(env.get_num_sm() * 32)); + nblocks.y = 1; + nblocks.z = 1; + degree_coo <<>>(static_cast(loc_e), + static_cast(loc_e), + ind, + reinterpret_cast(local_result)); + cudaCheckError(); + } + + // Out-degree + if (x == 2 || x == 0) { + dim3 nthreads, nblocks; + nthreads.x = min(static_cast(loc_v), static_cast(CUDA_MAX_KERNEL_THREADS)); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min(static_cast((loc_v + nthreads.x - 1) / nthreads.x), + static_cast(env.get_num_sm() * 32)); + nblocks.y = 1; + nblocks.z = 1; + degree_offsets <<>>(static_cast(loc_v), + static_cast(loc_e), + off, + reinterpret_cast(local_result + + part_off[i])); + cudaCheckError(); + } + + // Convert the values written as doubles back to int64: + dim3 nthreads, nblocks; + nthreads.x = min(static_cast(glob_v), static_cast(CUDA_MAX_KERNEL_THREADS)); + nthreads.y = 1; + nthreads.z = 1; + nblocks.x = min(static_cast((glob_v + nthreads.x - 1) / nthreads.x), + static_cast(env.get_num_sm() * 32)); + nblocks.y = 1; + nblocks.z = 1; + type_convert <<>>(reinterpret_cast(local_result), glob_v); + cudaCheckError(); + + // Combining the local results into global results + treeReduce >(env, glob_v, local_result, degree); + + // Broadcasting the global result to all GPUs + treeBroadcast(env, glob_v, local_result, degree); + + return GDF_SUCCESS; + } +} diff --git a/cpp/src/snmg/link_analysis/pagerank.cuh b/cpp/src/snmg/link_analysis/pagerank.cuh new file mode 100644 index 00000000000..7d2af4491ef --- /dev/null +++ b/cpp/src/snmg/link_analysis/pagerank.cuh @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// snmg pagerank +// Author: Alex Fender afender@nvidia.com + +#pragma once +#include "cub/cub.cuh" +#include +#include "utilities/graph_utils.cuh" +#include "snmg/utils.cuh" +#include "snmg/blas/spmv.cuh" +//#define SNMG_DEBUG + +namespace cugraph +{ + + template +__global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) +transition_kernel(const size_t e, + const IndexType *ind, + const IndexType *degree, + ValueType *val) { + for (auto i = threadIdx.x + blockIdx.x * blockDim.x; + i < e; + i += gridDim.x * blockDim.x) + val[i] = 1.0 / degree[ind[i]]; +} + +template +class SNMGpagerank +{ + private: + size_t v_glob; //global number of vertices + size_t v_loc; //local number of vertices + size_t e_loc; //local number of edges + int id; // thread id + int nt; // number of threads + ValueType alpha; // damping factor + SNMGinfo env; //info about the snmg env setup + cudaStream_t stream; + + //Vertex offsets for each partition. + //This information should be available on all threads/devices + //part_offsets[device_id] contains the global ID + //of the first vertex of the partion owned by device_id. + //part_offsets[num_devices] contains the global number of vertices + size_t* part_off; + + // local CSR matrix + IndexType * off; + IndexType * ind; + ValueType * val; + + // vectors of size v_glob + ValueType * bookmark; // constant vector with dangling node info + + bool is_setup; + + public: + SNMGpagerank(SNMGinfo & env_, size_t* part_off_, + IndexType * off_, IndexType * ind_) : + env(env_), part_off(part_off_), off(off_), ind(ind_) { + id = env.get_thread_num(); + nt = env.get_num_threads(); + v_glob = part_off[nt]; + v_loc = part_off[id+1]-part_off[id]; + IndexType tmp_e; + cudaMemcpy(&tmp_e, &off[v_loc], sizeof(IndexType),cudaMemcpyDeviceToHost); + cudaCheckError(); + e_loc = tmp_e; + stream = nullptr; + is_setup = false; + ALLOC_TRY ((void**)&bookmark, sizeof(ValueType) * v_glob, stream); + ALLOC_TRY ((void**)&val, sizeof(ValueType) * e_loc, stream); + } + ~SNMGpagerank() { + ALLOC_FREE_TRY(bookmark, stream); + ALLOC_FREE_TRY(val, stream); + } + + void transition_vals(const IndexType *degree) { + int threads = min(static_cast(e_loc), 256); + int blocks = min(static_cast(32*env.get_num_sm()), CUDA_MAX_BLOCKS); + transition_kernel <<>> (e_loc, ind, degree, val); + cudaCheckError(); + } + + void flag_leafs(const IndexType *degree) { + int threads = min(static_cast(v_glob), 256); + int blocks = min(static_cast(32*env.get_num_sm()), CUDA_MAX_BLOCKS); + flag_leafs_kernel <<>> (v_glob, degree, bookmark); + cudaCheckError(); + } + + + // Artificially create the google matrix by setting val and bookmark + void setup(ValueType _alpha) { + if (!is_setup) { + alpha=_alpha; + ValueType zero = 0.0; + IndexType *degree; + ALLOC_TRY ((void**)°ree, sizeof(IndexType) * v_glob, stream); + + // TODO snmg degree + int nthreads = min(static_cast(e_loc), 256); + int nblocks = min(static_cast(32*env.get_num_sm()), CUDA_MAX_BLOCKS); + degree_coo<<>>(v_glob, e_loc, ind, degree); + + // Update dangling node vector + fill(v_glob, bookmark, zero); + flag_leafs(degree); + update_dangling_nodes(v_glob, bookmark, alpha); + + // Transition matrix + transition_vals(degree); + + //exit + ALLOC_FREE_TRY(degree, stream); + is_setup = true; + } + else + throw std::string("Setup can be called only once"); + } + + // run the power iteration on the google matrix + void solve (int max_iter, ValueType ** pagerank) { + if (is_setup) { + ValueType dot_res; + ValueType one = 1.0; + ValueType *pr = pagerank[id]; + 
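+      // Power iteration on the pre-built transition matrix: start from the uniform
+      // vector 1/V, then each iteration runs the distributed SpMV, scales by the damping
+      // factor alpha, adds the dangling-node contribution (dot of bookmark with the
+      // current ranks), and re-normalizes; a final L1 normalization produces the ranks.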
fill(v_glob, pagerank[id], one/v_glob); + dot_res = dot( v_glob, bookmark, pr); + SNMGcsrmv spmv_solver(env, part_off, off, ind, val, pagerank); + for (auto i = 0; i < max_iter; ++i) { + spmv_solver.run(pagerank); + scal(v_glob, alpha, pr); + addv(v_glob, dot_res * (one/v_glob) , pr); + dot_res = dot( v_glob, bookmark, pr); + scal(v_glob, one/nrm2(v_glob, pr) , pr); + } + scal(v_glob, one/nrm1(v_glob,pr), pr); + } + else { + throw std::string("Solve was called before setup"); + } + } +}; + +} //namespace cugraph diff --git a/cpp/src/snmg/snmg_utils.cuh b/cpp/src/snmg/snmg_utils.cuh deleted file mode 100644 index eea6a43053d..00000000000 --- a/cpp/src/snmg/snmg_utils.cuh +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// snmg utils -// Author: Alex Fender afender@nvidia.com - -#pragma once -#include - -namespace cugraph -{ - -// Wait for all host threads -void sync_all() { - cudaDeviceSynchronize(); - #pragma omp barrier -} - -// enable peer access (all to all) -gdf_error setup_peer_access() { - auto i = omp_get_thread_num(); - auto p = omp_get_num_threads(); - for (int j = 0; j < p; ++j) { - if (i != j) { - int canAccessPeer = 0; - CUDA_TRY(cudaDeviceCanAccessPeer(&canAccessPeer, i, j)); - if (canAccessPeer) { - cudaDeviceEnablePeerAccess(j, 0); - cudaError_t status = cudaGetLastError(); - if (!(status == cudaSuccess || status == cudaErrorPeerAccessAlreadyEnabled)) { - std::cerr << "Could not Enable Peer Access from" << i << " to " << j << std::endl; - return GDF_CUDA_ERROR; - } - } - else { - std::cerr << "P2P access required from " << i << " to " << j << std::endl; - return GDF_CUDA_ERROR; - } - } - } - return GDF_SUCCESS; -} - -// Each GPU copies its x_loc to x_glob[offset[device]] on all GPU -template -gdf_error allgather (size_t* offset, val_t* x_loc, val_t ** x_glob) { - auto i = omp_get_thread_num(); - auto p = omp_get_num_threads(); - size_t n_loc= offset[i+1]-offset[i]; - - GDF_TRY(setup_peer_access()); - // this causes issues with CUB. TODO : verify the impact on performance. 
- - // send the local spmv output (x_loc) to all peers to reconstruct the global vector x_glob - // After this call each peer has a full, updated, copy of x_glob - for (int j = 0; j < p; ++j) - CUDA_TRY(cudaMemcpyPeer(x_glob[j]+offset[i],j, x_loc,i, n_loc*sizeof(val_t))); - //CUDA_TRY(cudaMemcpy(x_glob[j]+offset[i], x_loc, n_loc*sizeof(val_t),cudaMemcpyDeviceToDevice)); - - //Make sure everyone has finished copying before returning - sync_all(); - - return GDF_SUCCESS; -} - -void print_mem_usage() -{ - size_t free,total; - cudaMemGetInfo(&free, &total); - std::cout<< std::endl<< "Mem used: "< -#include "graph_utils.cuh" -#include "snmg_utils.cuh" -//#define SNMG_DEBUG - -namespace cugraph -{ - -template -gdf_error snmg_csrmv (size_t* part_off, idx_t * off, idx_t * ind, val_t * val, val_t ** x) { - sync_all(); - void* cub_d_temp_storage = NULL; - size_t cub_temp_storage_bytes = 0; - cudaStream_t stream{nullptr}; - auto i = omp_get_thread_num(); - auto p = omp_get_num_threads(); - size_t v_glob = part_off[p]; - size_t v_loc = part_off[i+1]-part_off[i]; - idx_t tmp; - CUDA_TRY(cudaMemcpy(&tmp, &off[v_loc], sizeof(idx_t),cudaMemcpyDeviceToHost)); - size_t e_loc = tmp; - val_t* y_loc; - //double t = omp_get_wtime(); - - // Allocate the local result - ALLOC_MANAGED_TRY ((void**)&y_loc, v_loc*sizeof(val_t), stream); - - // get temporary storage size for CUB - CUDA_TRY(cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, - val, off, ind, x[i], y_loc, v_loc, v_glob, e_loc)); - // Allocate CUB's temporary storage - ALLOC_MANAGED_TRY ((void**)&cub_d_temp_storage, cub_temp_storage_bytes, stream); - - // Local SPMV - CUDA_TRY(cub::DeviceSpmv::CsrMV(cub_d_temp_storage, cub_temp_storage_bytes, - val, off, ind, x[i], y_loc, v_loc, v_glob, e_loc)); - print_mem_usage(); - // Free CUB's temporary storage - ALLOC_FREE_TRY(cub_d_temp_storage, stream); - //#pragma omp master - //{std::cout << omp_get_wtime() - t << " ";} - - // Wait for all local spmv - //t = omp_get_wtime(); - sync_all(); - //#pragma omp master - //{std::cout << omp_get_wtime() - t << " ";} - - //Update the output vector - allgather (part_off, y_loc, x); - - return GDF_SUCCESS; -} - -} //namespace cugraph diff --git a/cpp/src/snmg/utils.cu b/cpp/src/snmg/utils.cu new file mode 100644 index 00000000000..ebee5976de5 --- /dev/null +++ b/cpp/src/snmg/utils.cu @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +namespace cugraph { + void sync_all() { + cudaDeviceSynchronize(); + #pragma omp barrier + } + + void print_mem_usage() { + size_t free,total; + cudaMemGetInfo(&free, &total); + std::cout<< std::endl<< "Mem used: "< +#include "rmm_utils.h" + +namespace cugraph +{ + +// basic info about the snmg env setup +class SNMGinfo +{ + private: + int i, p, n_sm; + + public: + SNMGinfo() { + int tmp_p, tmp_i; + //get info from cuda + cudaGetDeviceCount(&tmp_p); + cudaGetDevice(&tmp_i); + + //get info from omp + i = omp_get_thread_num(); + p = omp_get_num_threads(); + + // check that thread_num and num_threads are compatible with the device ID and the number of device + if (tmp_i != i) { + std::cerr << "Thread ID and GPU ID do not match" << std::endl; + } + if (p > tmp_p) { + std::cerr << "More threads than GPUs" << std::endl; + } + // number of SM, usefull for kernels paramters + cudaDeviceGetAttribute(&n_sm, cudaDevAttrMultiProcessorCount, i); + cudaCheckError(); + } + ~SNMGinfo() { } + + int get_thread_num() { + return i; + } + int get_num_threads() { + return p; + } + int get_num_sm() { + return n_sm; + } + // enable peer access (all to all) + void setup_peer_access() { + for (int j = 0; j < p; ++j) { + if (i != j) { + int canAccessPeer = 0; + cudaDeviceCanAccessPeer(&canAccessPeer, i, j); + cudaCheckError(); + if (canAccessPeer) { + cudaDeviceEnablePeerAccess(j, 0); + cudaError_t status = cudaGetLastError(); + if (!(status == cudaSuccess || status == cudaErrorPeerAccessAlreadyEnabled)) { + std::cerr << "Could not Enable Peer Access from" << i << " to " << j << std::endl; + } + } + else { + std::cerr << "P2P access required from " << i << " to " << j << std::endl; + } + } + } + } +}; + +// Wait for all host threads +void sync_all(); + +// Each GPU copies its x_loc to x_glob[offset[device]] on all GPU +template +void allgather (SNMGinfo & env, size_t* offset, val_t* x_loc, val_t ** x_glob) { + auto i = env.get_thread_num(); + auto p = env.get_num_threads(); + size_t n_loc= offset[i+1]-offset[i]; + + env.setup_peer_access(); + // this causes issues with CUB. TODO : verify the impact on performance. + + // send the local spmv output (x_loc) to all peers to reconstruct the global vector x_glob + // After this call each peer has a full, updated, copy of x_glob + for (int j = 0; j < p; ++j) { + cudaMemcpyPeer(x_glob[j]+offset[i],j, x_loc,i, n_loc*sizeof(val_t)); + cudaCheckError(); + } + + //Make sure everyone has finished copying before returning + sync_all(); + +} + +/** + * @tparam val_t The value type + * @tparam func_t The reduce functor type + * @param length The length of each array being combined + * @param x_loc Pointer to the local array + * @param x_glob Pointer to global array pointers + * @return Error code + */ +template +gdf_error treeReduce(SNMGinfo& env, size_t length, val_t* x_loc, val_t** x_glob){ + auto i = env.get_thread_num(); + auto p = env.get_num_threads(); + env.setup_peer_access(); + int rank = 1; + while(rank < p){ + // Copy local data to the receiver's global buffer + if((i - rank) % (rank * 2) == 0){ + int receiver = i - rank; + cudaMemcpyPeer(x_glob[receiver], receiver, x_loc, i, length*sizeof(val_t)); + cudaCheckError(); + } + + // Sync everything now. This shouldn't be required as cudaMemcpyPeer is supposed to synchronize... 
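+  // One round of a binomial-tree reduction: senders are the GPUs sitting at odd multiples
+  // of rank, each receiver (rank positions below its sender) folds the copied buffer into
+  // its own local result after the barrier, and rank doubles each round, so GPU 0 ends up
+  // holding the fully reduced vector.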
+ sync_all(); + + // Reduce the data from the receiver's global buffer with its local one + if(i % (rank * 2) == 0 && i + rank < p){ + func_t op; + thrust::transform(rmm::exec_policy(nullptr)->on(nullptr), + x_glob[i], + x_glob[i] + length, + x_loc, + x_loc, + op); + cudaCheckError(); + } + rank *= 2; + } + + // Thread 0 copies it's local result into it's global space + if (i == 0) { + cudaMemcpy(x_glob[i], x_loc, sizeof(val_t) * length, cudaMemcpyDefault); + cudaCheckError(); + } + + // Sync everything before returning + sync_all(); + + return GDF_SUCCESS; +} + +/** + * @tparam val_t The value type + * @param length The length of the array being broadcast + * @param x_loc The local array for each node + * @param x_glob Pointer to the global array pointers + * @return Error code + */ +template +gdf_error treeBroadcast(SNMGinfo& env, size_t length, val_t* x_loc, val_t** x_glob){ + auto i = env.get_thread_num(); + auto p = env.get_num_threads(); + env.setup_peer_access(); + int rank = 1; + while(rank * 2 < p) + rank *= 2; + for(; rank >= 1; rank /= 2){ + if(i % (rank * 2) == 0 and i + rank < p){ + int receiver = i + rank; + cudaMemcpyPeer(x_glob[receiver], receiver, x_glob[i], i, sizeof(val_t) * length); + cudaCheckError(); + } + } + + // Sync everything before returning + sync_all(); + + return GDF_SUCCESS; +} + +void print_mem_usage(); + +} //namespace cugraph diff --git a/cpp/src/structure/cugraph.cu b/cpp/src/structure/cugraph.cu new file mode 100644 index 00000000000..a5b1dd0e4ab --- /dev/null +++ b/cpp/src/structure/cugraph.cu @@ -0,0 +1,313 @@ +// -*-c++-*- + + /* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + * + */ + +// Graph analytics features +// Author: Alex Fender afender@nvidia.com + +#include +#include "utilities/graph_utils.cuh" +#include "converters/COOtoCSR.cuh" +#include "utilities/error_utils.h" +#include "converters/renumber.cuh" +#include +#include +#include + +#include + +/* + * cudf has gdf_column_free and using this is, in general, better design than + * creating our own, but we will keep this as cudf is planning to remove the + * function. cudf plans to redesign cudf::column to fundamentally solve this + * problem, so once they finished the redesign, we need to update this code to + * use their new features. Until that time, we may rely on this as a temporary + * solution. + */ +void gdf_col_delete(gdf_column* col) { + if (col != nullptr) { + cudaStream_t stream {nullptr}; + if (col->data != nullptr) { + ALLOC_FREE_TRY(col->data, stream); + } + if (col->valid != nullptr) { + ALLOC_FREE_TRY(col->valid, stream); + } +#if 0/* Currently, gdf_column_view does not set col_name, and col_name can have + an arbitrary value, so freeing col_name can lead to freeing a ranodom + address. This problem should be cleaned up once cudf finishes + redesigning cudf::column. 
*/ + if (col->col_name != nullptr) { + free(col->col_name); + } +#endif + delete col; + } +} + +void gdf_col_release(gdf_column* col) { + delete col; +} + +void cpy_column_view(const gdf_column *in, gdf_column *out) { + if (in != nullptr && out !=nullptr) { + gdf_column_view(out, in->data, in->valid, in->size, in->dtype); + } +} + +gdf_error gdf_adj_list_view(gdf_graph *graph, const gdf_column *offsets, + const gdf_column *indices, const gdf_column *edge_data) { + //This function returns an error if this graph object has at least one graph + //representation to prevent a single object storing two different graphs. + GDF_REQUIRE( ((graph->edgeList == nullptr) && (graph->adjList == nullptr) && + (graph->transposedAdjList == nullptr)), GDF_INVALID_API_CALL); + GDF_REQUIRE( offsets->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); + GDF_REQUIRE( indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); + GDF_REQUIRE( (offsets->dtype == indices->dtype), GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( ((offsets->dtype == GDF_INT32) || (offsets->dtype == GDF_INT64)), GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( (offsets->size > 0), GDF_DATASET_EMPTY ); + + graph->adjList = new gdf_adj_list; + graph->adjList->offsets = new gdf_column; + graph->adjList->indices = new gdf_column; + graph->adjList->ownership = 0; + + cpy_column_view(offsets, graph->adjList->offsets); + cpy_column_view(indices, graph->adjList->indices); + if (edge_data) { + GDF_REQUIRE( indices->size == edge_data->size, GDF_COLUMN_SIZE_MISMATCH ); + graph->adjList->edge_data = new gdf_column; + cpy_column_view(edge_data, graph->adjList->edge_data); + } + else { + graph->adjList->edge_data = nullptr; + } + return GDF_SUCCESS; +} + +gdf_error gdf_adj_list::get_vertex_identifiers(gdf_column *identifiers) { + GDF_REQUIRE( offsets != nullptr , GDF_INVALID_API_CALL); + GDF_REQUIRE( offsets->data != nullptr , GDF_INVALID_API_CALL); + cugraph::sequence((int)offsets->size-1, (int*)identifiers->data); + return GDF_SUCCESS; +} + +gdf_error gdf_adj_list::get_source_indices (gdf_column *src_indices) { + GDF_REQUIRE( offsets != nullptr , GDF_INVALID_API_CALL); + GDF_REQUIRE( offsets->data != nullptr , GDF_INVALID_API_CALL); + GDF_REQUIRE( src_indices->size == indices->size, GDF_COLUMN_SIZE_MISMATCH ); + GDF_REQUIRE( src_indices->dtype == indices->dtype, GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( src_indices->size > 0, GDF_DATASET_EMPTY ); + cugraph::offsets_to_indices((int*)offsets->data, offsets->size-1, (int*)src_indices->data); + + return GDF_SUCCESS; +} + +gdf_error gdf_edge_list_view(gdf_graph *graph, const gdf_column *src_indices, + const gdf_column *dest_indices, const gdf_column *edge_data) { + //This function returns an error if this graph object has at least one graph + //representation to prevent a single object storing two different graphs. 
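+  // Illustrative caller sketch (not part of this patch; d_src, d_dst and nnz are
+  // hypothetical names): wrap existing device arrays with gdf_column_view and pass the
+  // views in, e.g.
+  //   gdf_column src, dst;
+  //   gdf_column_view(&src, d_src, nullptr, nnz, GDF_INT32);
+  //   gdf_column_view(&dst, d_dst, nullptr, nnz, GDF_INT32);
+  //   gdf_edge_list_view(&graph, &src, &dst, nullptr);
+  // The graph stores only column views (ownership = 0 below), so the caller retains
+  // ownership of the underlying device memory.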
+ GDF_REQUIRE( ((graph->edgeList == nullptr) && (graph->adjList == nullptr) && + (graph->transposedAdjList == nullptr)), GDF_INVALID_API_CALL); + GDF_REQUIRE( src_indices->size == dest_indices->size, GDF_COLUMN_SIZE_MISMATCH ); + GDF_REQUIRE( src_indices->dtype == dest_indices->dtype, GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( ((src_indices->dtype == GDF_INT32) || (src_indices->dtype == GDF_INT64)), GDF_UNSUPPORTED_DTYPE ); + GDF_REQUIRE( src_indices->size > 0, GDF_DATASET_EMPTY ); + GDF_REQUIRE( src_indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); + GDF_REQUIRE( dest_indices->null_count == 0 , GDF_VALIDITY_UNSUPPORTED ); + + graph->edgeList = new gdf_edge_list; + graph->edgeList->src_indices = new gdf_column; + graph->edgeList->dest_indices = new gdf_column; + graph->edgeList->ownership = 0; + + cpy_column_view(src_indices, graph->edgeList->src_indices); + cpy_column_view(dest_indices, graph->edgeList->dest_indices); + if (edge_data) { + GDF_REQUIRE( src_indices->size == edge_data->size, GDF_COLUMN_SIZE_MISMATCH ); + graph->edgeList->edge_data = new gdf_column; + cpy_column_view(edge_data, graph->edgeList->edge_data); + } + else { + graph->edgeList->edge_data = nullptr; + } + + return GDF_SUCCESS; +} + +template +gdf_error gdf_add_adj_list_impl (gdf_graph *graph) { + if (graph->adjList == nullptr) { + GDF_REQUIRE( graph->edgeList != nullptr , GDF_INVALID_API_CALL); + int nnz = graph->edgeList->src_indices->size, status = 0; + graph->adjList = new gdf_adj_list; + graph->adjList->offsets = new gdf_column; + graph->adjList->indices = new gdf_column; + graph->adjList->ownership = 1; + + if (graph->edgeList->edge_data!= nullptr) { + graph->adjList->edge_data = new gdf_column; + + CSR_Result_Weighted adj_list; + status = ConvertCOOtoCSR_weighted((int*)graph->edgeList->src_indices->data, (int*)graph->edgeList->dest_indices->data, (WT*)graph->edgeList->edge_data->data, nnz, adj_list); + + gdf_column_view(graph->adjList->offsets, adj_list.rowOffsets, + nullptr, adj_list.size+1, graph->edgeList->src_indices->dtype); + gdf_column_view(graph->adjList->indices, adj_list.colIndices, + nullptr, adj_list.nnz, graph->edgeList->src_indices->dtype); + gdf_column_view(graph->adjList->edge_data, adj_list.edgeWeights, + nullptr, adj_list.nnz, graph->edgeList->edge_data->dtype); + } + else { + CSR_Result adj_list; + status = ConvertCOOtoCSR((int*)graph->edgeList->src_indices->data,(int*)graph->edgeList->dest_indices->data, nnz, adj_list); + gdf_column_view(graph->adjList->offsets, adj_list.rowOffsets, + nullptr, adj_list.size+1, graph->edgeList->src_indices->dtype); + gdf_column_view(graph->adjList->indices, adj_list.colIndices, + nullptr, adj_list.nnz, graph->edgeList->src_indices->dtype); + } + if (status !=0) { + std::cerr << "Could not generate the adj_list" << std::endl; + return GDF_CUDA_ERROR; + } + } + return GDF_SUCCESS; +} + +gdf_error gdf_add_edge_list (gdf_graph *graph) { + if (graph->edgeList == nullptr) { + GDF_REQUIRE( graph->adjList != nullptr , GDF_INVALID_API_CALL); + int *d_src; + graph->edgeList = new gdf_edge_list; + graph->edgeList->src_indices = new gdf_column; + graph->edgeList->dest_indices = new gdf_column; + graph->edgeList->ownership = 2; + + cudaStream_t stream{nullptr}; + ALLOC_TRY((void**)&d_src, sizeof(int) * graph->adjList->indices->size, stream); + + cugraph::offsets_to_indices((int*)graph->adjList->offsets->data, + graph->adjList->offsets->size-1, + (int*)d_src); + + gdf_column_view(graph->edgeList->src_indices, d_src, + nullptr, graph->adjList->indices->size, 
graph->adjList->indices->dtype); + cpy_column_view(graph->adjList->indices, graph->edgeList->dest_indices); + + if (graph->adjList->edge_data != nullptr) { + graph->edgeList->edge_data = new gdf_column; + cpy_column_view(graph->adjList->edge_data, graph->edgeList->edge_data); + } + } + return GDF_SUCCESS; +} + + +template +gdf_error gdf_add_transposed_adj_list_impl (gdf_graph *graph) { + if (graph->transposedAdjList == nullptr ) { + GDF_REQUIRE( graph->edgeList != nullptr , GDF_INVALID_API_CALL); + int nnz = graph->edgeList->src_indices->size, status = 0; + graph->transposedAdjList = new gdf_adj_list; + graph->transposedAdjList->offsets = new gdf_column; + graph->transposedAdjList->indices = new gdf_column; + graph->transposedAdjList->ownership = 1; + + if (graph->edgeList->edge_data) { + graph->transposedAdjList->edge_data = new gdf_column; + CSR_Result_Weighted adj_list; + status = ConvertCOOtoCSR_weighted( (int*)graph->edgeList->dest_indices->data, (int*)graph->edgeList->src_indices->data, (WT*)graph->edgeList->edge_data->data, nnz, adj_list); + gdf_column_view(graph->transposedAdjList->offsets, adj_list.rowOffsets, + nullptr, adj_list.size+1, graph->edgeList->src_indices->dtype); + gdf_column_view(graph->transposedAdjList->indices, adj_list.colIndices, + nullptr, adj_list.nnz, graph->edgeList->src_indices->dtype); + gdf_column_view(graph->transposedAdjList->edge_data, adj_list.edgeWeights, + nullptr, adj_list.nnz, graph->edgeList->edge_data->dtype); + } + else { + + CSR_Result adj_list; + status = ConvertCOOtoCSR((int*)graph->edgeList->dest_indices->data, (int*)graph->edgeList->src_indices->data, nnz, adj_list); + gdf_column_view(graph->transposedAdjList->offsets, adj_list.rowOffsets, + nullptr, adj_list.size+1, graph->edgeList->src_indices->dtype); + gdf_column_view(graph->transposedAdjList->indices, adj_list.colIndices, + nullptr, adj_list.nnz, graph->edgeList->src_indices->dtype); + } + if (status !=0) { + std::cerr << "Could not generate the adj_list" << std::endl; + return GDF_CUDA_ERROR; + } + } + return GDF_SUCCESS; +} + +gdf_error gdf_add_adj_list(gdf_graph *graph) { + if (graph->adjList != nullptr) + return GDF_SUCCESS; + + GDF_REQUIRE( graph->edgeList != nullptr , GDF_INVALID_API_CALL); + GDF_REQUIRE( graph->edgeList->src_indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE ); + + if (graph->edgeList->edge_data != nullptr) { + switch (graph->edgeList->edge_data->dtype) { + case GDF_FLOAT32: return gdf_add_adj_list_impl(graph); + case GDF_FLOAT64: return gdf_add_adj_list_impl(graph); + default: return GDF_UNSUPPORTED_DTYPE; + } + } + else { + return gdf_add_adj_list_impl(graph); + } +} + +gdf_error gdf_add_transposed_adj_list(gdf_graph *graph) { + if (graph->edgeList == nullptr) + gdf_add_edge_list(graph); + + GDF_REQUIRE(graph->edgeList->src_indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); + GDF_REQUIRE(graph->edgeList->dest_indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); + + if (graph->edgeList->edge_data != nullptr) { + switch (graph->edgeList->edge_data->dtype) { + case GDF_FLOAT32: return gdf_add_transposed_adj_list_impl(graph); + case GDF_FLOAT64: return gdf_add_transposed_adj_list_impl(graph); + default: return GDF_UNSUPPORTED_DTYPE; + } + } + else { + return gdf_add_transposed_adj_list_impl(graph); + } +} + +gdf_error gdf_delete_adj_list(gdf_graph *graph) { + if (graph->adjList) { + delete graph->adjList; + } + graph->adjList = nullptr; + return GDF_SUCCESS; +} + +gdf_error gdf_delete_edge_list(gdf_graph *graph) { + if (graph->edgeList) { + delete 
graph->edgeList; + } + graph->edgeList = nullptr; + return GDF_SUCCESS; +} + +gdf_error gdf_delete_transposed_adj_list(gdf_graph *graph) { + if (graph->transposedAdjList) { + delete graph->transposedAdjList; + } + graph->transposedAdjList = nullptr; + return GDF_SUCCESS; +} diff --git a/cpp/src/tests/CMakeLists.txt b/cpp/src/tests/CMakeLists.txt index ee3418fa4c8..e2e47ac3e97 100644 --- a/cpp/src/tests/CMakeLists.txt +++ b/cpp/src/tests/CMakeLists.txt @@ -46,7 +46,7 @@ function(configure_test TEST_NAME Tests_SRCS) # message(STATUS "${TEST_NAME} will link against: gdf, cugraph") add_executable(${TEST_NAME} ${Tests_SRCS}) - target_link_libraries(${TEST_NAME} OpenMP::OpenMP_CXX gmock_main gmock GTest::GTest cudart cudf cugraph nvgraph) + target_link_libraries(${TEST_NAME} OpenMP::OpenMP_CXX gmock_main gmock GTest::GTest cugraph nvgraph cudf cudart) set_target_properties(${TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/gtests/") @@ -129,13 +129,27 @@ set(RENUMBERING_TEST_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/renumber/renumber_test.cu") configure_test(RENUMBERING_TEST "${RENUMBERING_TEST_SRCS}") - +################################################################################################### +#-SNMG_SPMV tests -------------------------------------------------------------------------------- set(SNMG_SPMV_TEST_SRCS "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" "${CMAKE_CURRENT_SOURCE_DIR}/snmg_spmv/snmg_spmv_test.cu") configure_test(SNMG_SPMV_TEST "${SNMG_SPMV_TEST_SRCS}") +################################################################################################### +#-SNMG_DEGREE tests -------------------------------------------------------------------------------- +set(SNMG_DEGREE_TEST_SRCS + "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" + "${CMAKE_CURRENT_SOURCE_DIR}/snmg_degree/snmg_degree_test.cu") + +configure_test(SNMG_DEGREE_TEST "${SNMG_DEGREE_TEST_SRCS}") + +set(SNMG_PR_TEST_SRCS + "${CMAKE_SOURCE_DIR}/../thirdparty/mmio/mmio.c" + "${CMAKE_CURRENT_SOURCE_DIR}/snmg_pagerank/snmg_pagerank_test.cu") + +configure_test(SNMG_PR_TEST "${SNMG_PR_TEST_SRCS}") message(STATUS "******** Tests are ready ********") diff --git a/cpp/src/tests/nvgraph_plugin/nvgraph_gdf_jaccard.cpp b/cpp/src/tests/nvgraph_plugin/nvgraph_gdf_jaccard.cpp index 28e96f1f1a1..c6c612bbfa5 100644 --- a/cpp/src/tests/nvgraph_plugin/nvgraph_gdf_jaccard.cpp +++ b/cpp/src/tests/nvgraph_plugin/nvgraph_gdf_jaccard.cpp @@ -98,7 +98,7 @@ TEST(nvgraph_jaccard, success) float gamma = 1.0; cudaStream_t stream{nullptr}; - ALLOC_MANAGED_TRY((void**)&weight_j, sizeof(float)*edges, stream); + ALLOC_TRY((void**)&weight_j, sizeof(float)*edges, stream); ASSERT_EQ(nvgraphJaccard (CUDA_R_32I, CUDA_R_32F, no_vertex, edges, (void*)G.adjList->offsets->data, @@ -164,7 +164,7 @@ TEST(nvgraph_jaccard_grmat, success) cudaMemcpy ((void*) &ind_h[0], G.adjList->indices->data, sizeof(int)*edges, cudaMemcpyDeviceToHost); cudaStream_t stream{nullptr}; - ALLOC_MANAGED_TRY((void**)&weight_j, sizeof(float)*edges, stream); + ALLOC_TRY((void**)&weight_j, sizeof(float)*edges, stream); ASSERT_EQ(nvgraphJaccard (CUDA_R_32I, CUDA_R_32F, vertices, edges, (void*)G.adjList->offsets->data, diff --git a/cpp/src/tests/nvgraph_plugin/nvgraph_gdf_louvain.cpp b/cpp/src/tests/nvgraph_plugin/nvgraph_gdf_louvain.cpp index fba7b4b8c6c..932d4d99a4f 100644 --- a/cpp/src/tests/nvgraph_plugin/nvgraph_gdf_louvain.cpp +++ b/cpp/src/tests/nvgraph_plugin/nvgraph_gdf_louvain.cpp @@ -51,7 +51,7 @@ TEST(nvgraph_louvain, success) int* best_cluster_vec = NULL; 
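// Editorial aside (not part of the diff): the allocation pattern these tests
// are switching to. ALLOC_MANAGED_TRY requested managed (unified) memory,
// while ALLOC_TRY hands out device memory from the RMM pool on the given
// stream; a matching ALLOC_FREE_TRY returns it. Minimal sketch, with
// hypothetical names, assuming rmm_utils.h is included:
//   int* buf = nullptr;
//   cudaStream_t stream{nullptr};
//   ALLOC_TRY((void**)&buf, sizeof(int) * n, stream);  // draw from the pool
//   ...                                                // device-side work
//   ALLOC_FREE_TRY(buf, stream);                       // return to the pool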
cudaStream_t stream{nullptr}; - ALLOC_MANAGED_TRY((void**)&best_cluster_vec, sizeof(int) * no_vertex, stream); + ALLOC_TRY((void**)&best_cluster_vec, sizeof(int) * no_vertex, stream); ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, nvgraphLouvain (CUDA_R_32I, CUDA_R_32F, no_vertex, ind_h.size(), G.adjList->offsets->data, G.adjList->indices->data, G.adjList->edge_data->data, weighted, has_init_cluster, nullptr, @@ -94,7 +94,7 @@ TEST(nvgraph_louvain_grmat, success) ASSERT_EQ(gdf_grmat_gen(argv, vertices, edges, &col_src, &col_dest, nullptr), GDF_SUCCESS); cudaStream_t stream{nullptr}; - ALLOC_MANAGED_TRY ((void**)&col_weights.data, sizeof(int) * edges, stream); + ALLOC_TRY ((void**)&col_weights.data, sizeof(int) * edges, stream); col_weights.size = edges; std::vector w_h (edges, (float)1.0); cudaMemcpy (col_weights.data, (void*) &(w_h[0]), sizeof(float)*edges, cudaMemcpyHostToDevice); @@ -110,7 +110,7 @@ TEST(nvgraph_louvain_grmat, success) int num_level = 0; int* best_cluster_vec = NULL; - ALLOC_MANAGED_TRY ((void**)&best_cluster_vec, sizeof(int) * vertices, stream); + ALLOC_TRY ((void**)&best_cluster_vec, sizeof(int) * vertices, stream); ASSERT_EQ(NVGRAPH_STATUS_SUCCESS, nvgraphLouvain (CUDA_R_32I, CUDA_R_32F, vertices, edges, G.adjList->offsets->data, G.adjList->indices->data, G.adjList->edge_data->data, weighted, has_init_cluster, nullptr, (void*) &modularity, (void*) best_cluster_vec, (void *)(&num_level))); diff --git a/cpp/src/tests/pagerank/pagerank_test.cu b/cpp/src/tests/pagerank/pagerank_test.cu index 46c1150f292..5ed111fd5cb 100644 --- a/cpp/src/tests/pagerank/pagerank_test.cu +++ b/cpp/src/tests/pagerank/pagerank_test.cu @@ -106,12 +106,6 @@ class Tests_Pagerank : public ::testing::TestWithParam { // Read ASSERT_EQ( (mm_to_coo(fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], &cooVal[0], NULL)) , 0)<< "could not read matrix data"<< "\n"; ASSERT_EQ(fclose(fpin),0); - - //std::cout<< *std::min_element(cooRowInd.begin(), cooRowInd.end()) < { fclose(fpin); T err; int n_err = 0; - for (int i = 0; i < m; i++) - { - //if(i > (m-10)) - // std::cout << expected_res[i] << " " << calculated_res[i] < tol*1.1) - { - n_err++; + if (err> tol*1.1) { + n_err++; // count the number of mismatches } } - if (n_err) - { - //EXPECT_NEAR(tot_err/n_err, cugraph_Const::tol, cugraph_Const::tol*9.99); // Network x used n*1e-10 for precision + if (n_err) { EXPECT_LE(n_err, 0.001*m); // we tolerate 0.1% of values with a litte difference - //printf("number of incorrect entries: %d\n", n_err); - //if (n_err > 0.001*m) - //{ - // eq(calculated_res,expected_res); - //} } } } diff --git a/cpp/src/tests/renumber/renumber_test.cu b/cpp/src/tests/renumber/renumber_test.cu index cd70e631f3c..c982ec71ec1 100644 --- a/cpp/src/tests/renumber/renumber_test.cu +++ b/cpp/src/tests/renumber/renumber_test.cu @@ -21,7 +21,7 @@ #include "cuda_profiler_api.h" -#include "renumber.cuh" +#include "converters/renumber.cuh" #include "rmm_utils.h" #include @@ -93,8 +93,10 @@ TEST_F(RenumberingTest, SmallFixedVertexList) uint32_t tmp_results[length]; uint32_t tmp_map[2 * length]; - EXPECT_EQ(cudaMalloc(&src_d, sizeof(uint32_t) * length), cudaSuccess); - EXPECT_EQ(cudaMalloc(&dst_d, sizeof(uint32_t) * length), cudaSuccess); + cudaStream_t stream{nullptr}; + + EXPECT_EQ(RMM_ALLOC(&src_d, sizeof(uint32_t) * length, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_ALLOC(&dst_d, sizeof(uint32_t) * length, stream), RMM_SUCCESS); EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint32_t) * length, cudaMemcpyHostToDevice), cudaSuccess); EXPECT_EQ(cudaMemcpy(dst_d, dst_data, 
sizeof(uint32_t) * length, cudaMemcpyHostToDevice), cudaSuccess); @@ -116,8 +118,8 @@ TEST_F(RenumberingTest, SmallFixedVertexList) EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]); } - EXPECT_EQ(cudaFree(src_d), cudaSuccess); - EXPECT_EQ(cudaFree(dst_d), cudaSuccess); + EXPECT_EQ(RMM_FREE(src_d, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_FREE(dst_d, stream), RMM_SUCCESS); EXPECT_EQ(test_free(number_map_d), cudaSuccess); } @@ -138,8 +140,10 @@ TEST_F(RenumberingTest, SmallFixedVertexList64Bit) uint64_t tmp_results[length]; uint64_t tmp_map[2 * length]; - EXPECT_EQ(cudaMalloc(&src_d, sizeof(uint64_t) * length), cudaSuccess); - EXPECT_EQ(cudaMalloc(&dst_d, sizeof(uint64_t) * length), cudaSuccess); + cudaStream_t stream{nullptr}; + + EXPECT_EQ(RMM_ALLOC(&src_d, sizeof(uint64_t) * length, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_ALLOC(&dst_d, sizeof(uint64_t) * length, stream), RMM_SUCCESS); EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice), cudaSuccess); EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice), cudaSuccess); @@ -161,8 +165,8 @@ TEST_F(RenumberingTest, SmallFixedVertexList64Bit) EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]); } - EXPECT_EQ(cudaFree(src_d), cudaSuccess); - EXPECT_EQ(cudaFree(dst_d), cudaSuccess); + EXPECT_EQ(RMM_FREE(src_d, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_FREE(dst_d, stream), RMM_SUCCESS); EXPECT_EQ(test_free(number_map_d), cudaSuccess); } @@ -185,10 +189,12 @@ TEST_F(RenumberingTest, SmallFixedVertexList64BitTo32Bit) uint32_t tmp_results[length]; uint64_t tmp_map[2 * length]; - EXPECT_EQ(cudaMalloc(&src_d, sizeof(uint64_t) * length), cudaSuccess); - EXPECT_EQ(cudaMalloc(&dst_d, sizeof(uint64_t) * length), cudaSuccess); - EXPECT_EQ(cudaMalloc(&src_renumbered_d, sizeof(uint32_t) * length), cudaSuccess); - EXPECT_EQ(cudaMalloc(&dst_renumbered_d, sizeof(uint32_t) * length), cudaSuccess); + cudaStream_t stream{nullptr}; + + EXPECT_EQ(RMM_ALLOC(&src_d, sizeof(uint64_t) * length, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_ALLOC(&dst_d, sizeof(uint64_t) * length, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_ALLOC(&src_renumbered_d, sizeof(uint32_t) * length, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_ALLOC(&dst_renumbered_d, sizeof(uint32_t) * length, stream), RMM_SUCCESS); EXPECT_EQ(cudaMemcpy(src_d, src_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice), cudaSuccess); EXPECT_EQ(cudaMemcpy(dst_d, dst_data, sizeof(uint64_t) * length, cudaMemcpyHostToDevice), cudaSuccess); @@ -210,8 +216,8 @@ TEST_F(RenumberingTest, SmallFixedVertexList64BitTo32Bit) EXPECT_EQ(tmp_map[tmp_results[i]], dst_data[i]); } - EXPECT_EQ(cudaFree(src_d), cudaSuccess); - EXPECT_EQ(cudaFree(dst_d), cudaSuccess); + EXPECT_EQ(RMM_FREE(src_d, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_FREE(dst_d, stream), RMM_SUCCESS); EXPECT_EQ(test_free(number_map_d), cudaSuccess); } @@ -228,8 +234,10 @@ TEST_F(RenumberingTest, Random100KVertexSet) uint64_t *tmp_results = (uint64_t *) malloc(num_verts * sizeof(uint64_t)); uint64_t *tmp_map = (uint64_t *) malloc(2 * num_verts * sizeof(uint64_t)); - EXPECT_EQ(cudaMalloc(&src_d, sizeof(uint64_t) * num_verts), cudaSuccess); - EXPECT_EQ(cudaMalloc(&dst_d, sizeof(uint64_t) * num_verts), cudaSuccess); + cudaStream_t stream{nullptr}; + + EXPECT_EQ(RMM_ALLOC(&src_d, sizeof(uint64_t) * num_verts, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_ALLOC(&dst_d, sizeof(uint64_t) * num_verts, stream), RMM_SUCCESS); // // Generate random source and vertex values @@ -296,8 +304,8 @@ TEST_F(RenumberingTest, Random100KVertexSet) 
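// Editorial aside (not part of the diff): the invariant every renumbering
// test here checks, written as host pseudocode. The renumbered ids index
// into the returned number_map, which recovers the original vertex ids.
// Variable names below are illustrative, not the tests' actual ones:
//   for (size_t i = 0; i < length; ++i) {
//     assert(number_map[src_renumbered[i]] == src_original[i]);
//     assert(number_map[dst_renumbered[i]] == dst_original[i]);
//   }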
EXPECT_EQ(min_id, 0); EXPECT_EQ(max_id, (unique_verts - 1)); - EXPECT_EQ(cudaFree(src_d), cudaSuccess); - EXPECT_EQ(cudaFree(dst_d), cudaSuccess); + EXPECT_EQ(RMM_FREE(src_d, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_FREE(dst_d, stream), RMM_SUCCESS); EXPECT_EQ(test_free(number_map_d), cudaSuccess); free(src_data); free(dst_data); @@ -317,8 +325,10 @@ TEST_F(RenumberingTest, Random10MVertexSet) uint32_t *dst_d; uint32_t *number_map_d; - EXPECT_EQ(cudaMalloc(&src_d, sizeof(uint32_t) * num_verts), cudaSuccess); - EXPECT_EQ(cudaMalloc(&dst_d, sizeof(uint32_t) * num_verts), cudaSuccess); + cudaStream_t stream{nullptr}; + + EXPECT_EQ(RMM_ALLOC(&src_d, sizeof(uint32_t) * num_verts, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_ALLOC(&dst_d, sizeof(uint32_t) * num_verts, stream), RMM_SUCCESS); // // Init the random number generate @@ -326,7 +336,7 @@ TEST_F(RenumberingTest, Random10MVertexSet) const int num_threads{64}; curandState *state; - EXPECT_EQ(cudaMalloc(&state, sizeof(curandState) * num_threads), cudaSuccess); + EXPECT_EQ(RMM_ALLOC(&state, sizeof(curandState) * num_threads, stream), RMM_SUCCESS); setup_generator<<>>(state); generate_sources<<>>(state, num_verts, src_d); generate_destinations<<>>(state, num_verts, src_d, dst_d); @@ -346,8 +356,8 @@ TEST_F(RenumberingTest, Random10MVertexSet) std::cout << " unique verts = " << unique_verts << std::endl; std::cout << " hash size = " << hash_size << std::endl; - EXPECT_EQ(cudaFree(src_d), cudaSuccess); - EXPECT_EQ(cudaFree(dst_d), cudaSuccess); + EXPECT_EQ(RMM_FREE(src_d, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_FREE(dst_d, stream), RMM_SUCCESS); EXPECT_EQ(test_free(number_map_d), cudaSuccess); } @@ -364,8 +374,10 @@ TEST_F(RenumberingTest, Random100MVertexSet) uint32_t *dst_d; uint32_t *number_map_d; - EXPECT_EQ(cudaMalloc(&src_d, sizeof(uint32_t) * num_verts), cudaSuccess); - EXPECT_EQ(cudaMalloc(&dst_d, sizeof(uint32_t) * num_verts), cudaSuccess); + cudaStream_t stream{nullptr}; + + EXPECT_EQ(RMM_ALLOC(&src_d, sizeof(uint32_t) * num_verts, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_ALLOC(&dst_d, sizeof(uint32_t) * num_verts, stream), RMM_SUCCESS); // // Init the random number generate @@ -373,7 +385,7 @@ TEST_F(RenumberingTest, Random100MVertexSet) const int num_threads{64}; curandState *state; - EXPECT_EQ(cudaMalloc(&state, sizeof(curandState) * num_threads), cudaSuccess); + EXPECT_EQ(RMM_ALLOC(&state, sizeof(curandState) * num_threads, stream), RMM_SUCCESS); setup_generator<<>>(state); generate_sources<<>>(state, num_verts, src_d); generate_destinations<<>>(state, num_verts, src_d, dst_d); @@ -393,7 +405,7 @@ TEST_F(RenumberingTest, Random100MVertexSet) std::cout << " unique verts = " << unique_verts << std::endl; std::cout << " hash size = " << hash_size << std::endl; - EXPECT_EQ(cudaFree(src_d), cudaSuccess); - EXPECT_EQ(cudaFree(dst_d), cudaSuccess); + EXPECT_EQ(RMM_FREE(src_d, stream), RMM_SUCCESS); + EXPECT_EQ(RMM_FREE(dst_d, stream), RMM_SUCCESS); EXPECT_EQ(test_free(number_map_d), cudaSuccess); } diff --git a/cpp/src/tests/snmg_degree/snmg_degree_test.cu b/cpp/src/tests/snmg_degree/snmg_degree_test.cu new file mode 100644 index 00000000000..8612f242be3 --- /dev/null +++ b/cpp/src/tests/snmg_degree/snmg_degree_test.cu @@ -0,0 +1,513 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "gtest/gtest.h" +#include "high_res_clock.h" +#include "cuda_profiler_api.h" +#include +#include +#include "test_utils.h" +#include "snmg_test_utils.h" + +//#define SNMG_VERBOSE + +// ref Degree on the host +template +void ref_degree_h(int x, + std::vector & off_h, + std::vector & ind_h, + std::vector & degree) { + for (auto i = 0; i < degree.size(); i++) + degree[i] = 0; + if (x == 0 || x == 2) { + for (auto i = 0; i < degree.size(); ++i) { + degree[i] += off_h[i + 1] - off_h[i]; + } + } + if (x == 0 || x == 1) { + for (auto i = 0; i < ind_h.size(); i++) + degree[ind_h[i]] += 1; + } +} + +struct MGDegree_Usecase { + std::string matrix_file; + int x; + MGDegree_Usecase(const std::string& a, int _x) { + x = _x; + // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR + // if RAPIDS_DATASET_ROOT_DIR not set, default to "/datasets" + const std::string& rapidsDatasetRootDir = get_rapids_dataset_root_dir(); + if ((a != "") && (a[0] != '/')) { + matrix_file = rapidsDatasetRootDir + "/" + a; + } else { + matrix_file = a; + } + } + MGDegree_Usecase& operator=(const MGDegree_Usecase& rhs) { + matrix_file = rhs.matrix_file; + return *this; + } +}; + +class Tests_MGDegree: public ::testing::TestWithParam { +public: + Tests_MGDegree() { + } + static void SetupTestCase() { + } + static void TearDownTestCase() { + } + virtual void SetUp() { + } + virtual void TearDown() { + } + + static std::vector mgspmv_time; + + template + void run_current_test(const MGDegree_Usecase& param) { + const ::testing::TestInfo* const test_info = + ::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + + std::string(test_info->name()) + std::string("_") + getFileName(param.matrix_file) + + std::string("_") + ss.str().c_str(); + std::cout << test_id << "\n"; + int m, k, nnz, n_gpus; + MM_typecode mc; + gdf_error status; + + double t; + + FILE* fpin = fopen(param.matrix_file.c_str(), "r"); + + if (!fpin) { + std::cout << "Could not open file: " << param.matrix_file << "\n"; + FAIL(); + } + + ASSERT_EQ(mm_properties(fpin, 1, &mc, &m, &k, &nnz),0)<< "could not read Matrix Market file properties"<< "\n"; + ASSERT_TRUE(mm_is_matrix(mc)); + ASSERT_TRUE(mm_is_coordinate(mc)); + ASSERT_FALSE(mm_is_complex(mc)); + ASSERT_FALSE(mm_is_skew(mc)); + + // Allocate memory on host + std::vector cooRowInd(nnz), cooColInd(nnz), csrColInd(nnz), csrRowPtr(m + 1); + std::vector degree_h(m, 0.0), degree_ref(m, 0.0), csrVal(nnz); + + // Read + ASSERT_EQ( (mm_to_coo(fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], NULL, NULL)) , 0)<< "could not read matrix data"<< "\n"; + ASSERT_EQ(fclose(fpin), 0); + //ASSERT_EQ( (coo_to_csr (m, m, nnz, &cooRowInd[0], &cooColInd[0], NULL, NULL, &csrRowPtr[0], NULL, NULL, NULL)), 0) << "could not covert COO to CSR "<< "\n"; + coo2csr(cooRowInd, cooColInd, csrRowPtr, csrColInd); + + CUDA_RT_CALL(cudaGetDeviceCount(&n_gpus)); + std::vector v_loc(n_gpus), e_loc(n_gpus), part_offset(n_gpus + 1); + gdf_column *col_x[n_gpus]; + //reference result + t = omp_get_wtime(); + 
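// Editorial aside (not part of the diff): ref_degree_h above encodes the
// x convention exercised here -- x == 2 sums CSR row lengths, x == 1 counts
// occurrences in the column indices, and x == 0 adds both. With rows built
// from the COO sources, these correspond to out-degree, in-degree, and their
// sum. A small host check of that reading (values are illustrative):
//   std::vector<int> off{0, 2, 2, 3}, ind{1, 2, 1};  // edges 0->1, 0->2, 2->1
//   std::vector<int> deg(3);
//   ref_degree_h(2, off, ind, deg);  // out-degree -> {2, 0, 1}
//   ref_degree_h(1, off, ind, deg);  // in-degree  -> {0, 2, 1}
//   ref_degree_h(0, off, ind, deg);  // both       -> {2, 2, 2}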
ref_degree_h(param.x, csrRowPtr, csrColInd, degree_ref); + std::cout << "CPU time: " << omp_get_wtime() - t << "\n"; + if (nnz < 1200000000) + { +#pragma omp parallel num_threads(1) + { + //omp_set_num_threads(n_gpus); + auto i = omp_get_thread_num(); + auto p = omp_get_num_threads(); + CUDA_RT_CALL(cudaSetDevice(i)); + +#ifdef SNMG_VERBOSE +#pragma omp master + { + std::cout << "Number of GPUs : "<< n_gpus <dtype << "," << col_ind->dtype << "\n"; + } + EXPECT_EQ(status, 0); +#pragma omp master + { + std::cout << "GPU time: " << omp_get_wtime() - t << "\n"; + } + +#pragma omp master + { + //printv(m, (val_t *)col_x[0]->data, 0); + CUDA_RT_CALL(cudaMemcpy(°ree_h[0], + col_x[0]->data, + sizeof(idx_t) * m, + cudaMemcpyDeviceToHost)); + + for (auto j = 0; j < degree_h.size(); ++j) + EXPECT_EQ(degree_ref[j], degree_h[j]); + } + + gdf_col_delete(col_off); + gdf_col_delete(col_ind); + gdf_col_delete(col_val); + gdf_col_delete(col_x[i]); + } + } + if (n_gpus > 1) + { + // Only using the 4 fully connected GPUs on DGX1 + if (n_gpus == 8) + n_gpus = 4; + +#pragma omp parallel num_threads(n_gpus) + { + //omp_set_num_threads(n_gpus); + auto i = omp_get_thread_num(); + auto p = omp_get_num_threads(); + CUDA_RT_CALL(cudaSetDevice(i)); + +#ifdef SNMG_VERBOSE +#pragma omp master + { + std::cout << "Number of GPUs : "<< n_gpus <dtype << "," << col_ind->dtype << "\n"; + } + EXPECT_EQ(status, 0); +#pragma omp master + { + std::cout << "multi-GPU time: " << omp_get_wtime() - t << "\n"; + } + +#pragma omp master + { + //printv(m, (val_t *)col_x[0]->data, 0); + CUDA_RT_CALL(cudaMemcpy(°ree_h[0], + col_x[0]->data, + sizeof(idx_t) * m, + cudaMemcpyDeviceToHost)); + + for (auto j = 0; j < degree_h.size(); ++j) + EXPECT_EQ(degree_ref[j], degree_h[j]); + } + + gdf_col_delete(col_off); + gdf_col_delete(col_ind); + gdf_col_delete(col_val); + gdf_col_delete(col_x[i]); + } + } + std::cout << std::endl; + } +}; + +TEST_P(Tests_MGDegree, CheckInt32_mtx) { + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P(mtx_test, Tests_MGDegree, + ::testing::Values(MGDegree_Usecase("test/datasets/karate.mtx", 0) + , + MGDegree_Usecase("test/datasets/karate.mtx", 1) + , + MGDegree_Usecase("test/datasets/karate.mtx", 2) + , + MGDegree_Usecase("test/datasets/netscience.mtx", 0) + , + MGDegree_Usecase("test/datasets/netscience.mtx", 1) + , + MGDegree_Usecase("test/datasets/netscience.mtx", 2) + , + MGDegree_Usecase("test/datasets/cit-Patents.mtx", 0) + , + MGDegree_Usecase("test/datasets/cit-Patents.mtx", 1) + , + MGDegree_Usecase("test/datasets/cit-Patents.mtx", 2) + , + MGDegree_Usecase("test/datasets/webbase-1M.mtx", 0) + , + MGDegree_Usecase("test/datasets/webbase-1M.mtx", 1) + , + MGDegree_Usecase("test/datasets/webbase-1M.mtx", 2) + , + MGDegree_Usecase("test/datasets/web-Google.mtx", 0) + , + MGDegree_Usecase("test/datasets/web-Google.mtx", 1) + , + MGDegree_Usecase("test/datasets/web-Google.mtx", 2) + , + MGDegree_Usecase("test/datasets/wiki-Talk.mtx", 0) + , + MGDegree_Usecase("test/datasets/wiki-Talk.mtx", 1) + , + MGDegree_Usecase("test/datasets/wiki-Talk.mtx", 2) + ) + ); + +class Tests_MGDegree_hibench: public ::testing::TestWithParam { +public: + Tests_MGDegree_hibench() { + } + static void SetupTestCase() { + } + static void TearDownTestCase() { + } + virtual void SetUp() { + } + virtual void TearDown() { + } + + static std::vector mgspmv_time; + + template + void run_current_test(const MGDegree_Usecase& param) { + const ::testing::TestInfo* const test_info = + 
::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + + std::string(test_info->name()) + std::string("_") + getFileName(param.matrix_file) + + std::string("_") + ss.str().c_str(); + std::cout << "Filename: " << param.matrix_file << ", x=" << param.x << "\n"; + int m, nnz, n_gpus; + gdf_error status; + std::vector cooRowInd, cooColInd; + double t; + + ASSERT_EQ(read_single_file(param.matrix_file.c_str(), cooRowInd, cooColInd), 0); + nnz = cooRowInd.size(); + m = std::max(*(std::max_element(cooRowInd.begin(), cooRowInd.end())), + *(std::max_element(cooColInd.begin(), cooColInd.end()))); + m += 1; + + // Allocate memory on host + std::vector csrColInd(nnz), csrRowPtr(m + 1), degree_ref(m), degree_h(m), csrVal(nnz); + coo2csr(cooRowInd, cooColInd, csrRowPtr, csrColInd); + CUDA_RT_CALL(cudaGetDeviceCount(&n_gpus)); + std::vector v_loc(n_gpus), e_loc(n_gpus), part_offset(n_gpus + 1); + gdf_column *col_x[n_gpus]; + //reference result + t = omp_get_wtime(); + ref_degree_h(param.x, csrRowPtr, csrColInd, degree_ref); + std::cout << "CPU time: " << omp_get_wtime() - t << "\n"; + + if (nnz < 1200000000) { +#pragma omp parallel num_threads(1) + { + //omp_set_num_threads(n_gpus); + auto i = omp_get_thread_num(); + auto p = omp_get_num_threads(); + CUDA_RT_CALL(cudaSetDevice(i)); + +#ifdef SNMG_VERBOSE +#pragma omp master + { + std::cout << "Number of GPUs : "<< n_gpus <size,(float*)col_val->data,0); + t = omp_get_wtime(); + status = gdf_snmg_degree(param.x, &part_offset[0], col_off, col_ind, col_x); + if (status != 0){ + std::cout << "Call to gdf_snmg_degree failed: " << gdf_error_get_name(status) << "\n"; + std::cout << "Dtypes: " << col_off->dtype << "," << col_ind->dtype << "\n"; + } + EXPECT_EQ(status, 0); +#pragma omp master + { + std::cout << "GPU time: " << omp_get_wtime() - t << "\n"; + } + +#pragma omp master + { + //printv(m, (val_t *)col_x[0]->data, 0); + CUDA_RT_CALL(cudaMemcpy(°ree_h[0], + col_x[0]->data, + sizeof(idx_t) * m, + cudaMemcpyDeviceToHost)); + + for (auto j = 0; j < degree_ref.size(); ++j) + EXPECT_EQ(degree_ref[j], degree_h[j]); + } + + gdf_col_delete(col_off); + gdf_col_delete(col_ind); + gdf_col_delete(col_val); + gdf_col_delete(col_x[i]); + } + } + if (n_gpus > 1) { + // Only using the 4 fully connected GPUs on DGX1 + if (n_gpus == 8) + n_gpus = 4; + +#pragma omp parallel num_threads(n_gpus) + { + auto i = omp_get_thread_num(); + auto p = omp_get_num_threads(); + CUDA_RT_CALL(cudaSetDevice(i)); + +#ifdef SNMG_VERBOSE +#pragma omp master + { + std::cout << "Number of GPUs : "<< n_gpus <size,(float*)col_val->data,0); + t = omp_get_wtime(); + status = gdf_snmg_degree(param.x, &part_offset[0], col_off, col_ind, col_x); + if (status != 0){ + std::cout << "Call to gdf_snmg_degree failed: " << gdf_error_get_name(status) << "\n"; + std::cout << "Dtypes: " << col_off->dtype << "," << col_ind->dtype << "\n"; + } + EXPECT_EQ(status, 0); +#pragma omp master + { + std::cout << "multi-GPU time: " << omp_get_wtime() - t << "\n"; + } + +#pragma omp master + { + //printv(m, (val_t *)col_x[0]->data, 0); + CUDA_RT_CALL(cudaMemcpy(°ree_h[0], + col_x[0]->data, + sizeof(idx_t) * m, + cudaMemcpyDeviceToHost)); + + for (auto j = 0; j < degree_h.size(); ++j) + EXPECT_EQ(degree_ref[j], degree_h[j]); + } + + gdf_col_delete(col_off); + gdf_col_delete(col_ind); + gdf_col_delete(col_val); + gdf_col_delete(col_x[i]); + } + } + std::cout << std::endl; + } +}; + +TEST_P(Tests_MGDegree_hibench, 
CheckFP32_hibench) { + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P(hibench_test, + Tests_MGDegree_hibench, + ::testing::Values(MGDegree_Usecase("benchmark/hibench/1/Input-small/edges/part-00000", + 0) + , + MGDegree_Usecase("benchmark/hibench/1/Input-small/edges/part-00000", + 1) + , + MGDegree_Usecase("benchmark/hibench/1/Input-small/edges/part-00000", + 2) + , + MGDegree_Usecase("benchmark/hibench/1/Input-large/edges/part-00000", + 0) + , + MGDegree_Usecase("benchmark/hibench/1/Input-large/edges/part-00000", + 1) + , + MGDegree_Usecase("benchmark/hibench/1/Input-large/edges/part-00000", + 2) + , + MGDegree_Usecase("benchmark/hibench/1/Input-huge/edges/part-00000", + 0) + , + MGDegree_Usecase("benchmark/hibench/1/Input-huge/edges/part-00000", + 1) + , + MGDegree_Usecase("benchmark/hibench/1/Input-huge/edges/part-00000", + 2) + ) + ); + +int main(int argc, char **argv) { + srand(42); + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} + diff --git a/cpp/src/tests/snmg_pagerank/snmg_pagerank_test.cu b/cpp/src/tests/snmg_pagerank/snmg_pagerank_test.cu new file mode 100644 index 00000000000..e65e4267600 --- /dev/null +++ b/cpp/src/tests/snmg_pagerank/snmg_pagerank_test.cu @@ -0,0 +1,369 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "gtest/gtest.h" +#include "high_res_clock.h" +#include "cuda_profiler_api.h" +#include +#include +#include "test_utils.h" +#include "snmg_test_utils.h" +#include "snmg/link_analysis/pagerank.cuh" + +//#define SNMG_VERBOSE + +typedef struct MGPagerank_Usecase_t { + std::string matrix_file; + std::string result_file; + + MGPagerank_Usecase_t(const std::string& a, const std::string& b) { + // assume relative paths are relative to RAPIDS_DATASET_ROOT_DIR + // if RAPIDS_DATASET_ROOT_DIR not set, default to "/datasets" + const std::string& rapidsDatasetRootDir = get_rapids_dataset_root_dir(); + if ((a != "") && (a[0] != '/')) { + matrix_file = rapidsDatasetRootDir + "/" + a; + } else { + matrix_file = a; + } + if ((b != "") && (b[0] != '/')) { + result_file = rapidsDatasetRootDir + "/" + b; + } else { + result_file = b; + } + } + MGPagerank_Usecase_t& operator=(const MGPagerank_Usecase_t& rhs) { + matrix_file = rhs.matrix_file; + result_file = rhs.result_file; + return *this; + } +} MGPagerank_Usecase; + +template +void verify_pr(gdf_column* col_pagerank, const MGPagerank_Usecase& param){ + // Check vs golden data + if (param.result_file.length()>0) + { + int m = col_pagerank->size; + std::vector calculated_res(m); + CUDA_RT_CALL(cudaMemcpy(&calculated_res[0], col_pagerank->data, sizeof(val_t) * m, cudaMemcpyDeviceToHost)); + std::sort(calculated_res.begin(), calculated_res.end()); + FILE* fpin = fopen(param.result_file.c_str(),"rb"); + ASSERT_TRUE(fpin != NULL) << " Cannot read file with reference data: " << param.result_file << std::endl; + std::vector expected_res(m); + ASSERT_EQ(read_binary_vector(fpin, m, expected_res), 0); + fclose(fpin); + val_t err; + int n_err = 0; + for (int i = 0; i < m; i++) { + err = fabs(expected_res[i] - calculated_res[i]); + if (err> 1e-5) { + n_err++; // count the number of mismatches + } + } + if (n_err) { + EXPECT_LE(n_err, 0.001*m); // tolerate 0.1% of values with a litte difference + } + } +} + +class Tests_MGPagerank : public ::testing::TestWithParam { + public: + Tests_MGPagerank() { } + static void SetupTestCase() { } + static void TearDownTestCase() { } + virtual void SetUp() { } + virtual void TearDown() { } + + static std::vector mgpr_time; + + template + void run_current_test(const MGPagerank_Usecase& param) { + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + std::string("_") + getFileName(param.matrix_file)+ std::string("_") + ss.str().c_str(); + + int m, k, nnz, n_gpus, max_iter=50; + val_t alpha = 0.85; + MM_typecode mc; + + double t; + + FILE* fpin = fopen(param.matrix_file.c_str(),"r"); + + ASSERT_EQ(mm_properties(fpin, 1, &mc, &m, &k, &nnz),0) << "could not read Matrix Market file properties"<< "\n"; + ASSERT_TRUE(mm_is_matrix(mc)); + ASSERT_TRUE(mm_is_coordinate(mc)); + ASSERT_FALSE(mm_is_complex(mc)); + ASSERT_FALSE(mm_is_skew(mc)); + + // Allocate memory on host + std::vector cooRowInd(nnz), cooColInd(nnz), csrColInd(nnz), csrRowPtr(m+1); + std::vector cooVal(nnz), csrVal(nnz), pagerank_h(m, 1.0/m); + + // Read + ASSERT_EQ( (mm_to_coo(fpin, 1, nnz, &cooRowInd[0], &cooColInd[0], NULL, NULL)) , 0)<< "could not read matrix data"<< "\n"; + ASSERT_EQ(fclose(fpin),0); + + // WARNING transpose happening here + coo2csr(cooColInd, cooRowInd, csrRowPtr, csrColInd); + + CUDA_RT_CALL(cudaGetDeviceCount(&n_gpus)); + std::vector v_loc(n_gpus), 
e_loc(n_gpus), part_offset(n_gpus+1); + random_vals(csrVal); + gdf_column *col_pagerank[n_gpus]; + + if (nnz<1200000000) + { + #pragma omp parallel num_threads(1) + { + auto i = omp_get_thread_num(); + auto p = omp_get_num_threads(); + CUDA_RT_CALL(cudaSetDevice(i)); + + #ifdef SNMG_VERBOSE + #pragma omp master + { + std::cout << "Number of GPUs : "<< n_gpus < pr_solver(env, &part_offset[0], static_cast(col_off->data), static_cast(col_ind->data)); + pr_solver.setup(alpha); + + val_t* pagerank[p]; + for (auto i = 0; i < p; ++i) + pagerank[i]= static_cast(col_pagerank[i]->data); + + pr_solver.solve(max_iter, pagerank); + #pragma omp master + {std::cout << omp_get_wtime() - t << " ";} + + verify_pr(col_pagerank[i], param); + + gdf_col_delete(col_off); + gdf_col_delete(col_ind); + gdf_col_delete(col_val); + gdf_col_delete(col_pagerank[i]); + } + } +// TODO Enable when degree function is present +#if 0 + if (n_gpus > 1) + { + // Only using the 4 fully connected GPUs on DGX1 + if (n_gpus == 8) + n_gpus = 4; + + #pragma omp parallel num_threads(n_gpus) + { + auto i = omp_get_thread_num(); + auto p = omp_get_num_threads(); + CUDA_RT_CALL(cudaSetDevice(i)); + + #ifdef SNMG_VERBOSE + #pragma omp master + { + std::cout << "Number of GPUs : "<< n_gpus < pr_solver(env, &part_offset[0], static_cast(col_off->data), static_cast(col_ind->data)); + pr_solver.setup(alpha); + + val_t* pagerank[p]; + for (auto i = 0; i < p; ++i) + pagerank[i]= static_cast(col_pagerank[i]->data); + + pr_solver.solve(max_iter, pagerank); + #pragma omp master + {std::cout << omp_get_wtime() - t << " ";} + + verify_pr(col_pagerank[i], param); + + gdf_col_delete(col_off); + gdf_col_delete(col_ind); + gdf_col_delete(col_val); + gdf_col_delete(col_pagerank[i]); + + + } + } +#endif + std::cout << std::endl; + } + +}; + +class Tests_MGPR_hibench : public ::testing::TestWithParam { + public: + Tests_MGPR_hibench() { } + static void SetupTestCase() { } + static void TearDownTestCase() { } + virtual void SetUp() { } + virtual void TearDown() { } + + static std::vector mgspmv_time; + + template + void run_current_test(const MGPagerank_Usecase& param) { + const ::testing::TestInfo* const test_info =::testing::UnitTest::GetInstance()->current_test_info(); + std::stringstream ss; + std::string test_id = std::string(test_info->test_case_name()) + std::string(".") + std::string(test_info->name()) + std::string("_") + getFileName(param.matrix_file)+ std::string("_") + ss.str().c_str(); + + int m, nnz, n_gpus, max_iter=50; + val_t alpha = 0.85; + std::vector cooRowInd, cooColInd; + double t; + + ASSERT_EQ(read_single_file(param.matrix_file.c_str(),cooRowInd,cooColInd),0); + nnz = cooRowInd.size(); + m = std::max( *(std::max_element(cooRowInd.begin(), cooRowInd.end())), + *(std::max_element(cooColInd.begin(), cooColInd.end()))); + + // Allocate memory on host + std::vector csrColInd(nnz), csrRowPtr(m+1); + std::vector cooVal(nnz), csrVal(nnz), pagerank_h(m, 1.0/m); + coo2csr(cooRowInd, cooColInd, csrRowPtr, csrColInd); + CUDA_RT_CALL(cudaGetDeviceCount(&n_gpus)); + std::vector v_loc(n_gpus), e_loc(n_gpus), part_offset(n_gpus+1); + random_vals(csrVal); + gdf_column *col_pagerank[n_gpus]; + + if (n_gpus > 1) + { + // Only using the 4 fully connected GPUs on DGX1 + if (n_gpus == 8) + n_gpus = 4; + + #pragma omp parallel num_threads(n_gpus) + { + auto i = omp_get_thread_num(); + auto p = omp_get_num_threads(); + CUDA_RT_CALL(cudaSetDevice(i)); + + #ifdef SNMG_VERBOSE + #pragma omp master + { + std::cout << "Number of GPUs : "<< n_gpus < pr_solver(env, 
&part_offset[0], static_cast(col_off->data), static_cast(col_ind->data)); + pr_solver.setup(alpha); + + val_t* pagerank[p]; + for (auto i = 0; i < p; ++i) + pagerank[i]= static_cast(col_pagerank[i]->data); + + pr_solver.solve(max_iter, pagerank); + #pragma omp master + {std::cout << omp_get_wtime() - t << " ";} + + verify_pr(col_pagerank[i], param); + + gdf_col_delete(col_off); + gdf_col_delete(col_ind); + gdf_col_delete(col_val); + gdf_col_delete(col_pagerank[i]); + } + } + std::cout << std::endl; + } +}; + + +TEST_P(Tests_MGPagerank, CheckFP32_mtx) { + run_current_test(GetParam()); +} +TEST_P(Tests_MGPagerank, CheckFP64) { + run_current_test(GetParam()); +} +TEST_P(Tests_MGPR_hibench, CheckFP32_hibench) { + run_current_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P(mtx_test, Tests_MGPagerank, + ::testing::Values( MGPagerank_Usecase("test/datasets/karate.mtx", "") + ,MGPagerank_Usecase("test/datasets/web-BerkStan.mtx", "test/ref/pagerank/web-BerkStan.pagerank_val_0.85.bin") + ,MGPagerank_Usecase("test/datasets/web-Google.mtx", "test/ref/pagerank/web-Google.pagerank_val_0.85.bin") + ,MGPagerank_Usecase("test/datasets/wiki-Talk.mtx", "test/ref/pagerank/wiki-Talk.pagerank_val_0.85.bin") + ,MGPagerank_Usecase("test/datasets/cit-Patents.mtx", "test/ref/pagerank/cit-Patents.pagerank_val_0.85.bin") + ,MGPagerank_Usecase("test/datasets/ljournal-2008.mtx","test/ref/pagerank/ljournal-2008.pagerank_val_0.85.bin") + ,MGPagerank_Usecase("test/datasets/webbase-1M.mtx", "test/ref/pagerank/webbase-1M.pagerank_val_0.85.bin") + ) + ); + +INSTANTIATE_TEST_CASE_P(hibench_test, Tests_MGPR_hibench, + ::testing::Values( MGPagerank_Usecase("benchmark/hibench/1/Input-small/edges/part-00000", "") + ,MGPagerank_Usecase("benchmark/hibench/1/Input-large/edges/part-00000", "") + ,MGPagerank_Usecase("benchmark/hibench/1/Input-huge/edges/part-00000", "") + ) + ); + + + +int main(int argc, char **argv) { + srand(42); + ::testing::InitGoogleTest(&argc, argv); + + return RUN_ALL_TESTS(); +} + + diff --git a/cpp/src/tests/test_utils.h b/cpp/src/tests/test_utils.h index dc87d7403ea..0c82a4e1b43 100644 --- a/cpp/src/tests/test_utils.h +++ b/cpp/src/tests/test_utils.h @@ -222,8 +222,8 @@ void printCsrMatI(int m, int n, int nnz,std::vector & csrRowPtr, std::vecto */ template int mm_properties(FILE * f, int tg, MM_typecode * t, - IndexType_ * m, IndexType_ * n, - IndexType_ * nnz) { + IndexType_ * m, IndexType_ * n, + IndexType_ * nnz) { // Read matrix properties from file int mint, nint, nnzint; @@ -279,7 +279,7 @@ int mm_properties(FILE * f, int tg, MM_typecode * t, // Check if entry is diagonal if(row == col) - --(*nnz); + --(*nnz); } } @@ -310,8 +310,8 @@ int mm_properties(FILE * f, int tg, MM_typecode * t, */ template int mm_to_coo(FILE *f, int tg, IndexType_ nnz, - IndexType_ * cooRowInd, IndexType_ * cooColInd, - ValueType_ * cooRVal , ValueType_ * cooIVal) { + IndexType_ * cooRowInd, IndexType_ * cooColInd, + ValueType_ * cooRVal , ValueType_ * cooIVal) { // Read matrix properties from file MM_typecode t; @@ -381,20 +381,20 @@ int mm_to_coo(FILE *f, int tg, IndexType_ nnz, // Modify entry value if matrix is skew symmetric or Hermitian if(mm_is_skew(t)) { - rval = -rval; - ival = -ival; + rval = -rval; + ival = -ival; } else if(mm_is_hermitian(t)) { - ival = -ival; + ival = -ival; } // Record entry cooRowInd[j] = col; cooColInd[j] = row; if(cooRVal != NULL) - cooRVal[j] = rval; + cooRVal[j] = rval; if(cooIVal != NULL) - cooIVal[j] = ival; + cooIVal[j] = ival; ++j; } @@ -435,10 +435,10 @@ class lesser_tuple { */ template 
void coo_sort(IndexType_ nnz, int sort_by_row, - IndexType_ * cooRowInd, - IndexType_ * cooColInd, - ValueType_ * cooRVal, - ValueType_ * cooIVal) { + IndexType_ * cooRowInd, + IndexType_ * cooColInd, + ValueType_ * cooRVal, + ValueType_ * cooIVal) { // Determine whether to sort by row or by column int i; @@ -451,21 +451,21 @@ void coo_sort(IndexType_ nnz, int sort_by_row, using namespace thrust; if((cooRVal==NULL) && (cooIVal==NULL)) stable_sort(make_zip_iterator(make_tuple(cooRowInd,cooColInd)), - make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz)), - lesser_tuple(i)); + make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz)), + lesser_tuple(i)); else if((cooRVal==NULL) && (cooIVal!=NULL)) stable_sort(make_zip_iterator(make_tuple(cooRowInd,cooColInd,cooIVal)), - make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz,cooIVal+nnz)), - lesser_tuple(i)); + make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz,cooIVal+nnz)), + lesser_tuple(i)); else if((cooRVal!=NULL) && (cooIVal==NULL)) stable_sort(make_zip_iterator(make_tuple(cooRowInd,cooColInd,cooRVal)), - make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz,cooRVal+nnz)), - lesser_tuple(i)); + make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz,cooRVal+nnz)), + lesser_tuple(i)); else stable_sort(make_zip_iterator(make_tuple(cooRowInd,cooColInd,cooRVal,cooIVal)), - make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz, - cooRVal+nnz,cooIVal+nnz)), - lesser_tuple(i)); + make_zip_iterator(make_tuple(cooRowInd+nnz,cooColInd+nnz, + cooRVal+nnz,cooIVal+nnz)), + lesser_tuple(i)); } template @@ -632,7 +632,7 @@ gdf_column_ptr create_gdf_column(std::vector const & host_vector) // Allocate device storage for gdf_column and copy contents from host_vector const size_t input_size_bytes = host_vector.size() * sizeof(col_type); cudaStream_t stream{nullptr}; - ALLOC_MANAGED_TRY((void**)&(the_column->data), input_size_bytes, stream); + ALLOC_TRY((void**)&(the_column->data), input_size_bytes, stream); cudaMemcpy(the_column->data, host_vector.data(), input_size_bytes, cudaMemcpyHostToDevice); // Deduce the type and set the gdf_dtype accordingly @@ -666,7 +666,7 @@ void create_gdf_column(std::vector const & host_vector, gdf_column * t // Allocate device storage for gdf_column and copy contents from host_vector const size_t input_size_bytes = host_vector.size() * sizeof(col_type); cudaStream_t stream{nullptr}; - ALLOC_MANAGED_TRY((void**)&(the_column->data), input_size_bytes, stream); + ALLOC_TRY((void**)&(the_column->data), input_size_bytes, stream); cudaMemcpy(the_column->data, host_vector.data(), input_size_bytes, cudaMemcpyHostToDevice); // Deduce the type and set the gdf_dtype accordingly diff --git a/cpp/src/traversal/bfs.cu b/cpp/src/traversal/bfs.cu new file mode 100644 index 00000000000..c42be78943c --- /dev/null +++ b/cpp/src/traversal/bfs.cu @@ -0,0 +1,498 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. 
+ *
+ */
+
+#include
+#include
+#include
+#include "bfs.cuh"
+#include
+#include "rmm_utils.h"
+
+#include "utilities/graph_utils.cuh"
+#include "bfs_kernels.cuh"
+
+using namespace bfs_kernels;
+
+namespace cugraph {
+  enum BFS_ALGO_STATE {
+    TOPDOWN, BOTTOMUP
+  };
+
+  template <typename IndexType>
+  void Bfs<IndexType>::setup() {
+
+    // Determinism flag, false by default
+    deterministic = false;
+    //Working data
+    //Each vertex can be in the frontier at most once
+    ALLOC_TRY(&frontier, n * sizeof(IndexType), nullptr);
+
+    //We will update frontier during the execution
+    //We need the original pointer to reset frontier, or to ALLOC_FREE_TRY it
+    original_frontier = frontier;
+
+    //size of bitmaps for vertices
+    vertices_bmap_size = (n / (8 * sizeof(int)) + 1);
+    //ith bit of visited_bmap is set <=> ith vertex is visited
+    ALLOC_TRY(&visited_bmap, sizeof(int) * vertices_bmap_size, nullptr);
+
+    //ith bit of isolated_bmap is set <=> degree of ith vertex = 0
+    ALLOC_TRY(&isolated_bmap, sizeof(int) * vertices_bmap_size, nullptr);
+
+    //vertex_degree[i] = degree of vertex i
+    ALLOC_TRY(&vertex_degree, sizeof(IndexType) * n, nullptr);
+
+    //Cub working data
+    cub_exclusive_sum_alloc(n + 1, d_cub_exclusive_sum_storage, cub_exclusive_sum_storage_bytes);
+
+    //We will need (n+1)-int buffers for two different things (bottom up or top down) - sharing them since those uses are mutually exclusive
+    ALLOC_TRY(&buffer_np1_1, (n + 1) * sizeof(IndexType), nullptr);
+    ALLOC_TRY(&buffer_np1_2, (n + 1) * sizeof(IndexType), nullptr);
+
+    //Using buffers : top down
+
+    //frontier_vertex_degree[i] is the degree of vertex frontier[i]
+    frontier_vertex_degree = buffer_np1_1;
+    //exclusive sum of frontier_vertex_degree
+    exclusive_sum_frontier_vertex_degree = buffer_np1_2;
+
+    //Using buffers : bottom up
+    //contains the list of unvisited vertices
+    unvisited_queue = buffer_np1_1;
+    //size of the "last" unvisited queue : size_last_unvisited_queue
+    //refers to the size of unvisited_queue
+    //which may not be up to date (the queue may contain vertices that are now visited)
+
+    //We may leave vertices unvisited after the bottom up main kernels - storing them here
+    left_unvisited_queue = buffer_np1_2;
+
+    //We use buckets of edges (32 edges per bucket for now, see exact macro in bfs_kernels). 
frontier_vertex_degree_buckets_offsets[i] is the index k such as frontier[k] is the source of the first edge of the bucket + //See top down kernels for more details + ALLOC_TRY(&exclusive_sum_frontier_vertex_buckets_offsets, + ((nnz / TOP_DOWN_EXPAND_DIMX + 1) * NBUCKETS_PER_BLOCK + 2) * sizeof(IndexType), nullptr); + + //Init device-side counters + //Those counters must be/can be reset at each bfs iteration + //Keeping them adjacent in memory allow use call only one cudaMemset - launch latency is the current bottleneck + ALLOC_TRY(&d_counters_pad, 4 * sizeof(IndexType), nullptr); + + d_new_frontier_cnt = &d_counters_pad[0]; + d_mu = &d_counters_pad[1]; + d_unvisited_cnt = &d_counters_pad[2]; + d_left_unvisited_cnt = &d_counters_pad[3]; + + //Lets use this int* for the next 3 lines + //Its dereferenced value is not initialized - so we dont care about what we put in it + IndexType * d_nisolated = d_new_frontier_cnt; + cudaMemsetAsync(d_nisolated, 0, sizeof(IndexType), stream); + + //Computing isolated_bmap + //Only dependent on graph - not source vertex - done once + flag_isolated_vertices(n, isolated_bmap, row_offsets, vertex_degree, d_nisolated, stream); + cudaMemcpyAsync(&nisolated, d_nisolated, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); + + //We need nisolated to be ready to use + cudaStreamSynchronize(stream); + } + + template + void Bfs::configure(IndexType *_distances, + IndexType *_predecessors, + int *_edge_mask) + { + distances = _distances; + predecessors = _predecessors; + edge_mask = _edge_mask; + + useEdgeMask = (edge_mask != NULL); + computeDistances = (distances != NULL); + computePredecessors = (predecessors != NULL); + + //We need distances to use bottom up + if (directed && !computeDistances) + ALLOC_TRY(&distances, n * sizeof(IndexType), nullptr); + } + + template + void Bfs::traverse(IndexType source_vertex) { + + //Init visited_bmap + //If the graph is undirected, we not that + //we will never discover isolated vertices (in degree = out degree = 0) + //we avoid a lot of work by flagging them now + //in g500 graphs they represent ~25% of total vertices + //more than that for wiki and twitter graphs + + if (directed) { + cudaMemsetAsync(visited_bmap, 0, vertices_bmap_size * sizeof(int), stream); + } + else { + cudaMemcpyAsync(visited_bmap, + isolated_bmap, + vertices_bmap_size * sizeof(int), + cudaMemcpyDeviceToDevice, + stream); + } + + //If needed, setting all vertices as undiscovered (inf distance) + //We dont use computeDistances here + //if the graph is undirected, we may need distances even if + //computeDistances is false + if (distances) + fill_vec(distances, n, vec_t::max, stream); + + //If needed, setting all predecessors to non-existent (-1) + if (computePredecessors) { + cudaMemsetAsync(predecessors, -1, n * sizeof(IndexType), stream); + } + + // + //Initial frontier + // + + frontier = original_frontier; + + if (distances) { + cudaMemsetAsync(&distances[source_vertex], 0, sizeof(IndexType), stream); + } + + //Setting source_vertex as visited + //There may be bit already set on that bmap (isolated vertices) - if the graph is undirected + int current_visited_bmap_source_vert = 0; + + if (!directed) { + cudaMemcpyAsync(¤t_visited_bmap_source_vert, + &visited_bmap[source_vertex / INT_SIZE], + sizeof(int), + cudaMemcpyDeviceToHost); + //We need current_visited_bmap_source_vert + cudaStreamSynchronize(stream); + } + + int m = (1 << (source_vertex % INT_SIZE)); + + //In that case, source is isolated, done now + if (!directed && (m & 
current_visited_bmap_source_vert)) {
+      //Init of distances and predecessors is already done (cf. stream sync in the previous if)
+      return;
+    }
+
+    m |= current_visited_bmap_source_vert;
+
+    cudaMemcpyAsync(&visited_bmap[source_vertex / INT_SIZE],
+                    &m,
+                    sizeof(int),
+                    cudaMemcpyHostToDevice,
+                    stream);
+
+    //Adding source_vertex to the initial frontier
+    cudaMemcpyAsync(&frontier[0],
+                    &source_vertex,
+                    sizeof(IndexType),
+                    cudaMemcpyHostToDevice,
+                    stream);
+
+    //mf : edges in frontier
+    //nf : vertices in frontier
+    //mu : edges undiscovered
+    //nu : nodes undiscovered
+    //lvl : current frontier's depth
+    IndexType mf, nf, mu, nu;
+    bool growing;
+    IndexType lvl = 1;
+
+    //Frontier has one vertex
+    nf = 1;
+
+    //all edges are undiscovered (by definition, isolated vertices have 0 edges)
+    mu = nnz;
+
+    //all non isolated vertices are undiscovered (except the source vertex, which is in the frontier)
+    //That number is wrong if source_vertex is also isolated - but it's not important
+    nu = n - nisolated - nf;
+
+    //Last frontier was 0, now it is 1
+    growing = true;
+
+    IndexType size_last_left_unvisited_queue = n; //we just need value > 0
+    IndexType size_last_unvisited_queue = 0; //queue empty
+
+    //Typical pre-top down workflow. set_frontier_degree + exclusive-scan
+    set_frontier_degree(frontier_vertex_degree, frontier, vertex_degree, nf, stream);
+    exclusive_sum(d_cub_exclusive_sum_storage,
+                  cub_exclusive_sum_storage_bytes,
+                  frontier_vertex_degree,
+                  exclusive_sum_frontier_vertex_degree,
+                  nf + 1,
+                  stream);
+
+    cudaMemcpyAsync(&mf,
+                    &exclusive_sum_frontier_vertex_degree[nf],
+                    sizeof(IndexType),
+                    cudaMemcpyDeviceToHost,
+                    stream);
+
+    //We need mf
+    cudaStreamSynchronize(stream);
+
+    //At first we know we have to use top down
+    BFS_ALGO_STATE algo_state = TOPDOWN;
+
+    //useDistances : we check if a vertex is a parent using distances in bottom up - distances become working data
+    //undirected graph : need parents to be in children's neighbors
+    bool can_use_bottom_up = !directed && distances;
+
+    while (nf > 0) {
+      //Each vertex can appear only once in the frontier array - we know it will fit
+      new_frontier = frontier + nf;
+      IndexType old_nf = nf;
+      resetDevicePointers();
+
+      if (can_use_bottom_up) {
+        //Choosing algo
+        //Finite state machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf
+
+        switch (algo_state) {
+          case TOPDOWN:
+            if (mf > mu / alpha)
+              algo_state = BOTTOMUP;
+            break;
+          case BOTTOMUP:
+            if (!growing && nf < n / beta) {
+
+              //We need to prepare the switch back to top down
+              //We couldn't keep track of mu during bottom up - because we don't know what mf is. Computing mu here
+              count_unvisited_edges(unvisited_queue,
+                                    size_last_unvisited_queue,
+                                    visited_bmap,
+                                    vertex_degree,
+                                    d_mu,
+                                    stream);
+
+              //Typical pre-top down workflow. 
set_frontier_degree + exclusive-scan + set_frontier_degree(frontier_vertex_degree, + frontier, + vertex_degree, + nf, + stream); + exclusive_sum(d_cub_exclusive_sum_storage, + cub_exclusive_sum_storage_bytes, + frontier_vertex_degree, + exclusive_sum_frontier_vertex_degree, + nf + 1, + stream); + + cudaMemcpyAsync(&mf, + &exclusive_sum_frontier_vertex_degree[nf], + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + + cudaMemcpyAsync(&mu, d_mu, sizeof(IndexType), cudaMemcpyDeviceToHost, stream); + + //We will need mf and mu + cudaStreamSynchronize(stream); + algo_state = TOPDOWN; + } + break; + } + } + + //Executing algo + + switch (algo_state) { + case TOPDOWN: + compute_bucket_offsets(exclusive_sum_frontier_vertex_degree, + exclusive_sum_frontier_vertex_buckets_offsets, + nf, + mf, + stream); + frontier_expand(row_offsets, + col_indices, + frontier, + nf, + mf, + lvl, + new_frontier, + d_new_frontier_cnt, + exclusive_sum_frontier_vertex_degree, + exclusive_sum_frontier_vertex_buckets_offsets, + visited_bmap, + distances, + predecessors, + edge_mask, + isolated_bmap, + directed, + stream, + deterministic); + + mu -= mf; + + cudaMemcpyAsync(&nf, + d_new_frontier_cnt, + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError(); + + //We need nf + cudaStreamSynchronize(stream); + + if (nf) { + //Typical pre-top down workflow. set_frontier_degree + exclusive-scan + set_frontier_degree(frontier_vertex_degree, + new_frontier, + vertex_degree, + nf, + stream); + exclusive_sum(d_cub_exclusive_sum_storage, + cub_exclusive_sum_storage_bytes, + frontier_vertex_degree, + exclusive_sum_frontier_vertex_degree, + nf + 1, + stream); + cudaMemcpyAsync(&mf, + &exclusive_sum_frontier_vertex_degree[nf], + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + + //We need mf + cudaStreamSynchronize(stream); + } + break; + + case BOTTOMUP: + fill_unvisited_queue(visited_bmap, + vertices_bmap_size, + n, + unvisited_queue, + d_unvisited_cnt, + stream, + deterministic); + + size_last_unvisited_queue = nu; + + bottom_up_main(unvisited_queue, + size_last_unvisited_queue, + left_unvisited_queue, + d_left_unvisited_cnt, + visited_bmap, + row_offsets, + col_indices, + lvl, + new_frontier, + d_new_frontier_cnt, + distances, + predecessors, + edge_mask, + stream, + deterministic); + + //The number of vertices left unvisited decreases + //If it wasnt necessary last time, it wont be this time + if (size_last_left_unvisited_queue) { + cudaMemcpyAsync(&size_last_left_unvisited_queue, + d_left_unvisited_cnt, + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + //We need last_left_unvisited_size + cudaStreamSynchronize(stream); + bottom_up_large(left_unvisited_queue, + size_last_left_unvisited_queue, + visited_bmap, + row_offsets, + col_indices, + lvl, + new_frontier, + d_new_frontier_cnt, + distances, + predecessors, + edge_mask, + stream, + deterministic); + } + cudaMemcpyAsync(&nf, + d_new_frontier_cnt, + sizeof(IndexType), + cudaMemcpyDeviceToHost, + stream); + cudaCheckError() + + //We will need nf + cudaStreamSynchronize(stream); + break; + } + + //Updating undiscovered edges count + nu -= nf; + + //Using new frontier + frontier = new_frontier; + growing = (nf > old_nf); + + ++lvl; + } + } + + template + void Bfs::resetDevicePointers() { + cudaMemsetAsync(d_counters_pad, 0, 4 * sizeof(IndexType), stream); + } + + template + void Bfs::clean() { + //the vectors have a destructor that takes care of cleaning + ALLOC_FREE_TRY(original_frontier, nullptr); + 
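// Editorial aside (not part of the diff): every buffer released in clean()
// was obtained with ALLOC_TRY in setup()/configure(); note that buffer_np1_1
// and buffer_np1_2 are freed once each even though they back both the
// top-down arrays (frontier_vertex_degree, exclusive sum) and the bottom-up
// queues (unvisited_queue, left_unvisited_queue), since those aliases share
// the same storage.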
ALLOC_FREE_TRY(visited_bmap, nullptr); + ALLOC_FREE_TRY(isolated_bmap, nullptr); + ALLOC_FREE_TRY(vertex_degree, nullptr); + ALLOC_FREE_TRY(d_cub_exclusive_sum_storage, nullptr); + ALLOC_FREE_TRY(buffer_np1_1, nullptr); + ALLOC_FREE_TRY(buffer_np1_2, nullptr); + ALLOC_FREE_TRY(exclusive_sum_frontier_vertex_buckets_offsets, nullptr); + ALLOC_FREE_TRY(d_counters_pad, nullptr); + + //In that case, distances is a working data + if (directed && !computeDistances) + ALLOC_FREE_TRY(distances, nullptr); + } + + template class Bfs ; +} // end namespace cugraph + +gdf_error gdf_bfs(gdf_graph *graph, gdf_column *distances, gdf_column *predecessors, int start_vertex, bool directed) { + GDF_REQUIRE(graph->adjList != nullptr || graph->edgeList != nullptr, GDF_INVALID_API_CALL); + gdf_error err = gdf_add_adj_list(graph); + if (err != GDF_SUCCESS) + return err; + GDF_REQUIRE(graph->adjList->offsets->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); + GDF_REQUIRE(graph->adjList->indices->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); + GDF_REQUIRE(distances->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); + GDF_REQUIRE(predecessors->dtype == GDF_INT32, GDF_UNSUPPORTED_DTYPE); + + int n = graph->adjList->offsets->size - 1; + int e = graph->adjList->indices->size; + int* offsets_ptr = (int*)graph->adjList->offsets->data; + int* indices_ptr = (int*)graph->adjList->indices->data; + int* distances_ptr = (int*)distances->data; + int* predecessors_ptr = (int*)predecessors->data; + int alpha = 15; + int beta = 18; + + cugraph::Bfs bfs(n, e, offsets_ptr, indices_ptr, directed, alpha, beta); + bfs.configure(distances_ptr, predecessors_ptr, nullptr); + bfs.traverse(start_vertex); + return GDF_SUCCESS; +} + diff --git a/cpp/src/bfs.cuh b/cpp/src/traversal/bfs.cuh old mode 100755 new mode 100644 similarity index 98% rename from cpp/src/bfs.cuh rename to cpp/src/traversal/bfs.cuh index c665aabb6e3..a35b9b4bea4 --- a/cpp/src/bfs.cuh +++ b/cpp/src/traversal/bfs.cuh @@ -13,8 +13,6 @@ #include -//Used in nvgraph.h - #define TRAVERSAL_DEFAULT_ALPHA 15 #define TRAVERSAL_DEFAULT_BETA 18 @@ -97,5 +95,5 @@ namespace cugraph { void traverse(IndexType source_vertex); }; -} // end namespace nvgraph +} // end namespace cugraph diff --git a/cpp/src/traversal/bfs_kernels.cuh b/cpp/src/traversal/bfs_kernels.cuh new file mode 100644 index 00000000000..d4b31887b74 --- /dev/null +++ b/cpp/src/traversal/bfs_kernels.cuh @@ -0,0 +1,1566 @@ +/* + * Copyright (c) 2018 NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include + +#include +#include + +#define MAXBLOCKS 65535 +#define WARP_SIZE 32 +#define INT_SIZE 32 + +// +// Bottom up macros +// + +#define FILL_UNVISITED_QUEUE_DIMX 256 + +#define COUNT_UNVISITED_EDGES_DIMX 256 + +#define MAIN_BOTTOMUP_DIMX 256 +#define MAIN_BOTTOMUP_NWARPS (MAIN_BOTTOMUP_DIMX/WARP_SIZE) + +#define LARGE_BOTTOMUP_DIMX 256 + +//Number of edges processed in the main bottom up kernel +#define MAIN_BOTTOMUP_MAX_EDGES 6 + +//Power of 2 < 32 (strict <) +#define BOTTOM_UP_LOGICAL_WARP_SIZE 4 + +// +// Top down macros +// + +// We will precompute the results the binsearch_maxle every TOP_DOWN_BUCKET_SIZE edges +#define TOP_DOWN_BUCKET_SIZE 32 + +// DimX of the kernel +#define TOP_DOWN_EXPAND_DIMX 256 + +// TOP_DOWN_EXPAND_DIMX edges -> NBUCKETS_PER_BLOCK buckets +#define NBUCKETS_PER_BLOCK (TOP_DOWN_EXPAND_DIMX/TOP_DOWN_BUCKET_SIZE) + +// How many items_per_thread we can process with one bucket_offset loading +// the -1 is here because we need the +1 offset +#define MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD (TOP_DOWN_BUCKET_SIZE - 1) + +// instruction parallelism +// for how many edges will we create instruction parallelism +#define TOP_DOWN_BATCH_SIZE 2 + +#define COMPUTE_BUCKET_OFFSETS_DIMX 512 + +//Other macros + +#define FLAG_ISOLATED_VERTICES_DIMX 128 + +//Number of vertices handled by one thread +//Must be power of 2, lower than 32 +#define FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD 4 + +//Number of threads involved in the "construction" of one int in the bitset +#define FLAG_ISOLATED_VERTICES_THREADS_PER_INT (INT_SIZE/FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD) + +// +// Parameters of the heuristic to switch between bottomup/topdown +//Finite machine described in http://parlab.eecs.berkeley.edu/sites/all/parlab/files/main.pdf +// + +using namespace cugraph; + +namespace bfs_kernels { + // + // gives the equivalent vectors from a type + // for the max val, would be better to use numeric_limits<>::max() once + // cpp11 is allowed in nvgraph + // + + template + struct vec_t { + typedef int4 vec4; + typedef int2 vec2; + }; + + template<> + struct vec_t { + typedef int4 vec4; + typedef int2 vec2; + static const int max = INT_MAX; + }; + + template<> + struct vec_t { + typedef longlong4 vec4; + typedef longlong2 vec2; + static const long long int max = LLONG_MAX; + }; + + // + // ------------------------- Helper device functions ------------------- + // + + __forceinline__ __device__ int getMaskNRightmostBitSet(int n) { + if (n == INT_SIZE) + return (~0); + int mask = (1 << n) - 1; + return mask; + } + + __forceinline__ __device__ int getMaskNLeftmostBitSet(int n) { + if (n == 0) + return 0; + int mask = ~((1 << (INT_SIZE - n)) - 1); + return mask; + } + + __forceinline__ __device__ int getNextZeroBit(int& val) { + int ibit = __ffs(~val) - 1; + val |= (1 << ibit); + + return ibit; + } + + struct BitwiseAnd + { + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return (a & b); + } + }; + + struct BitwiseOr + { + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return (a | b); + } + }; + + template + __device__ IndexType binsearch_maxle(const IndexType *vec, + const IndexType val, + IndexType low, + IndexType high) { + while (true) { + if (low == high) + return low; //we know it exists + if ((low + 1) == high) + return (vec[high] <= val) ? 
high : low; + + IndexType mid = low + (high - low) / 2; + + if (vec[mid] > val) + high = mid - 1; + else + low = mid; + + } + } + + // + // ------------------------- Bottom up ------------------------- + // + + // + // fill_unvisited_queue_kernel + // + // Finding unvisited vertices in the visited_bmap, and putting them in the queue + // Vertices represented by the same int in the bitmap are adjacent in the queue, and sorted + // For instance, the queue can look like this : + // 34 38 45 58 61 4 18 24 29 71 84 85 90 + // Because they are represented by those ints in the bitmap : + // [34 38 45 58 61] [4 18 24 29] [71 84 85 90] + + //visited_bmap_nints = the visited_bmap is made of that number of ints + + template + __global__ void fill_unvisited_queue_kernel(int *visited_bmap, + IndexType visited_bmap_nints, + IndexType n, + IndexType *unvisited, + IndexType *unvisited_cnt) { + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage scan_temp_storage; + + //When filling the "unvisited" queue, we use "unvisited_cnt" to know where to write in the queue (equivalent of int off = atomicAddd(unvisited_cnt, 1) ) + //We will actually do only one atomicAdd per block - we first do a scan, then call one atomicAdd, and store the common offset for the block in + //unvisited_common_block_offset + __shared__ IndexType unvisited_common_block_offset; + + //We don't want threads divergence in the loop (we're going to call __syncthreads) + //Using a block-only dependent in the condition of the loop + for (IndexType block_v_idx = blockIdx.x * blockDim.x; + block_v_idx < visited_bmap_nints; + block_v_idx += blockDim.x * gridDim.x) { + + //Index of visited_bmap that this thread will compute + IndexType v_idx = block_v_idx + threadIdx.x; + + int thread_visited_int = (v_idx < visited_bmap_nints) + ? 
visited_bmap[v_idx] + : + (~0); //will be neutral in the next lines (virtual vertices all visited) + + //The last int can only be partially valid + //If we are indeed taking care of the last visited int in this thread, + //We need to first disable (ie set as "visited") the inactive bits (vertices >= n) + if (v_idx == (visited_bmap_nints - 1)) { + int active_bits = n - (INT_SIZE * v_idx); + int inactive_bits = INT_SIZE - active_bits; + int mask = getMaskNLeftmostBitSet(inactive_bits); + thread_visited_int |= mask; //Setting inactive bits as visited + } + + //Counting number of unvisited vertices represented by this int + int n_unvisited_in_int = __popc(~thread_visited_int); + int unvisited_thread_offset; + + //We will need to write n_unvisited_in_int unvisited vertices to the unvisited queue + //We ask for that space when computing the block scan, that will tell where to write those + //vertices in the queue, using the common offset of the block (see below) + BlockScan(scan_temp_storage).ExclusiveSum(n_unvisited_in_int, unvisited_thread_offset); + + //Last thread knows how many vertices will be written to the queue by this block + //Asking for that space in the queue using the global count, and saving the common offset + if (threadIdx.x == (FILL_UNVISITED_QUEUE_DIMX - 1)) { + IndexType total = unvisited_thread_offset + n_unvisited_in_int; + unvisited_common_block_offset = atomicAdd(unvisited_cnt, total); + } + + //syncthreads for two reasons : + // - we need to broadcast unvisited_common_block_offset + // - we will reuse scan_temp_storage (cf CUB doc) + __syncthreads(); + + IndexType current_unvisited_index = unvisited_common_block_offset + + unvisited_thread_offset; + int nvertices_to_write = n_unvisited_in_int; + + // getNextZeroBit uses __ffs, which gives least significant bit set + // which means that as long as n_unvisited_in_int is valid, + // we will use valid bits + + while (nvertices_to_write > 0) { + if (nvertices_to_write >= 4 && (current_unvisited_index % 4) == 0) { + typename vec_t::vec4 vec_v; + + vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.z = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.w = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + + typename vec_t::vec4 *unvisited_i4 = reinterpret_cast::vec4*>(&unvisited[current_unvisited_index]); + *unvisited_i4 = vec_v; + + current_unvisited_index += 4; + nvertices_to_write -= 4; + } + else if (nvertices_to_write >= 2 && (current_unvisited_index % 2) == 0) { + typename vec_t::vec2 vec_v; + + vec_v.x = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + vec_v.y = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + + typename vec_t::vec2 *unvisited_i2 = reinterpret_cast::vec2*>(&unvisited[current_unvisited_index]); + *unvisited_i2 = vec_v; + + current_unvisited_index += 2; + nvertices_to_write -= 2; + } else { + IndexType v = v_idx * INT_SIZE + getNextZeroBit(thread_visited_int); + + unvisited[current_unvisited_index] = v; + + current_unvisited_index += 1; + nvertices_to_write -= 1; + } + + } + } + } + + //Wrapper + template + void fill_unvisited_queue(int *visited_bmap, + IndexType visited_bmap_nints, + IndexType n, + IndexType *unvisited, + IndexType *unvisited_cnt, + cudaStream_t m_stream, + bool deterministic) { + dim3 grid, block; + block.x = FILL_UNVISITED_QUEUE_DIMX; + + grid.x = min((IndexType) MAXBLOCKS, (visited_bmap_nints + block.x - 1) / block.x); + + 
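// Note (editorial): grid.x is capped at MAXBLOCKS; fill_unvisited_queue_kernel grid-strides
// over visited_bmap_nints, so a single launch still covers every bitmap word when the cap is hit.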
fill_unvisited_queue_kernel<<>>(visited_bmap, + visited_bmap_nints, + n, + unvisited, + unvisited_cnt); + cudaCheckError(); + } + + // + // count_unvisited_edges_kernel + // Couting the total number of unvisited edges in the graph - using an potentially unvisited queue + // We need the current unvisited vertices to be in the unvisited queue + // But visited vertices can be in the potentially_unvisited queue + // We first check if the vertex is still unvisited before using it + // Useful when switching from "Bottom up" to "Top down" + // + + template + __global__ void count_unvisited_edges_kernel(const IndexType *potentially_unvisited, + const IndexType potentially_unvisited_size, + const int *visited_bmap, + IndexType *degree_vertices, + IndexType *mu) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage reduce_temp_storage; + + //number of undiscovered edges counted by this thread + IndexType thread_unvisited_edges_count = 0; + + for (IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < potentially_unvisited_size; + idx += blockDim.x * gridDim.x) { + + IndexType u = potentially_unvisited[idx]; + int u_visited_bmap = visited_bmap[u / INT_SIZE]; + int is_visited = u_visited_bmap & (1 << (u % INT_SIZE)); + + if (!is_visited) + thread_unvisited_edges_count += degree_vertices[u]; + + } + + //We need all thread_unvisited_edges_count to be ready before reducing + __syncthreads(); + + IndexType block_unvisited_edges_count = + BlockReduce(reduce_temp_storage).Sum(thread_unvisited_edges_count); + + //block_unvisited_edges_count is only defined is th.x == 0 + if (threadIdx.x == 0) + atomicAdd(mu, block_unvisited_edges_count); + } + + //Wrapper + template + void count_unvisited_edges(const IndexType *potentially_unvisited, + const IndexType potentially_unvisited_size, + const int *visited_bmap, + IndexType *node_degree, + IndexType *mu, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = COUNT_UNVISITED_EDGES_DIMX; + grid.x = min((IndexType) MAXBLOCKS, (potentially_unvisited_size + block.x - 1) / block.x); + + count_unvisited_edges_kernel<<>>(potentially_unvisited, + potentially_unvisited_size, + visited_bmap, + node_degree, + mu); + cudaCheckError(); + } + + // + // Main Bottom Up kernel + // Here we will start to process unvisited vertices in the unvisited queue + // We will only consider the first MAIN_BOTTOMUP_MAX_EDGES edges + // If it's not possible to define a valid parent using only those edges, + // add it to the "left_unvisited_queue" + // + + // + // We will use the "vertices represented by the same int in the visited bmap are adjacents and sorted in the unvisited queue" property + // It is used to do a reduction locally and fully build the new visited_bmap + // + + template + __global__ void main_bottomup_kernel(const IndexType *unvisited, + const IndexType unvisited_size, + IndexType *left_unvisited, + IndexType *left_unvisited_cnt, + int *visited_bmap, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + IndexType *distances, + IndexType *predecessors, + int *edge_mask) { + typedef cub::BlockDiscontinuity BlockDiscontinuity; + typedef cub::WarpReduce WarpReduce; + typedef cub::BlockScan BlockScan; + + __shared__ typename BlockDiscontinuity::TempStorage discontinuity_temp_storage; + __shared__ typename WarpReduce::TempStorage reduce_temp_storage; + __shared__ typename BlockScan::TempStorage scan_temp_storage; + + //To write vertices in the frontier, + //We 
will use a block scan to locally compute the offsets + //frontier_common_block_offset contains the common offset for the block + __shared__ IndexType frontier_common_block_offset; + + // When building the new visited_bmap, we reduce (using a bitwise and) the visited_bmap ints + // from the vertices represented by the same int (for instance vertices 1, 5, 9, 13, 23) + // vertices represented by the same int will be designed as part of the same "group" + // To detect the deliminations between those groups, we use BlockDiscontinuity + // Then we need to create the new "visited_bmap" within those group. + // We use a warp reduction that takes into account limits between groups to do it + // But a group can be cut in two different warps : in that case, the second warp + // put the result of its local reduction in local_visited_bmap_warp_head + // the first warp will then read it and finish the reduction + + __shared__ int local_visited_bmap_warp_head[MAIN_BOTTOMUP_NWARPS]; + + const int warpid = threadIdx.x / WARP_SIZE; + const int laneid = threadIdx.x % WARP_SIZE; + + // we will call __syncthreads inside the loop + // we need to keep complete block active + for (IndexType block_off = blockIdx.x * blockDim.x; + block_off < unvisited_size; + block_off += blockDim.x * gridDim.x) + { + IndexType idx = block_off + threadIdx.x; + + // This thread will take care of unvisited_vertex + // in the visited_bmap, it is represented by the int at index + // visited_bmap_index = unvisited_vertex/INT_SIZE + // it will be used by BlockDiscontinuity + // to flag the separation between groups of vertices (vertices represented by different in in visited_bmap) + IndexType visited_bmap_index[1]; //this is an array of size 1 because CUB needs one + visited_bmap_index[0] = -1; + IndexType unvisited_vertex = -1; + + // local_visited_bmap gives info on the visited bit of unvisited_vertex + // + // By default, everything is visited + // This is because we only take care of unvisited vertices here, + // The other are by default unvisited + // If a vertex remain unvisited, we will notice it here + // That's why by default we consider everything visited ( ie ~0 ) + // If we fail to assign one parent to an unvisited vertex, we will + // explicitly unset the bit + int local_visited_bmap = (~0); + int found = 0; + int more_to_visit = 0; + IndexType valid_parent; + IndexType left_unvisited_off; + + if (idx < unvisited_size) + { + //Processing first STPV edges of unvisited v + //If bigger than that, push to left_unvisited queue + unvisited_vertex = unvisited[idx]; + + IndexType edge_begin = row_ptr[unvisited_vertex]; + IndexType edge_end = row_ptr[unvisited_vertex + 1]; + + visited_bmap_index[0] = unvisited_vertex / INT_SIZE; + + IndexType degree = edge_end - edge_begin; + + for (IndexType edge = edge_begin; + edge < min(edge_end, edge_begin + MAIN_BOTTOMUP_MAX_EDGES); ++edge) + { + if (edge_mask && !edge_mask[edge]) + continue; + + IndexType parent_candidate = col_ind[edge]; + + if (distances[parent_candidate] == (lvl - 1)) + { + found = 1; + valid_parent = parent_candidate; + break; + } + } + + // This vertex will remain unvisited at the end of this kernel + // Explicitly say it + if (!found) + local_visited_bmap &= ~(1 << (unvisited_vertex % INT_SIZE)); //let this one unvisited + else + { + if (distances) + distances[unvisited_vertex] = lvl; + if (predecessors) + predecessors[unvisited_vertex] = valid_parent; + } + + //If we haven't found a parent and there's more edge to check + if (!found && degree > 
MAIN_BOTTOMUP_MAX_EDGES) + { + left_unvisited_off = atomicAdd(left_unvisited_cnt, (IndexType) 1); + more_to_visit = 1; + } + + } + + // + // We will separate vertices in group + // Two vertices are in the same group if represented by same int in visited_bmap + // ie u and v in same group <=> u/32 == v/32 + // + // We will now flag the head of those group (first element of each group) + // + // 1) All vertices within the same group are adjacent in the queue (cf fill_unvisited_queue) + // 2) A group is of size <= 32, so a warp will contain at least one head, and a group will be contained + // at most by two warps + + int is_head_a[1]; //CUB need an array + BlockDiscontinuity(discontinuity_temp_storage).FlagHeads(is_head_a, + visited_bmap_index, + cub::Inequality()); + int is_head = is_head_a[0]; + + // Computing the warp reduce within group + // This primitive uses the is_head flags to know where the limits of the groups are + // We use bitwise and as operator, because of the fact that 1 is the default value + // If a vertex is unvisited, we have to explicitly ask for it + int local_bmap_agg = + WarpReduce(reduce_temp_storage).HeadSegmentedReduce(local_visited_bmap, + is_head, + BitwiseAnd()); + + // We need to take care of the groups cut in two in two different warps + // Saving second part of the reduce here, then applying it on the first part bellow + // Corner case : if the first thread of the warp is a head, then this group is not cut in two + // and then we have to be neutral (for an bitwise and, it's an ~0) + if (laneid == 0) + { + local_visited_bmap_warp_head[warpid] = (is_head) ? (~0) : local_bmap_agg; + } + + //broadcasting local_visited_bmap_warp_head + __syncthreads(); + + int head_ballot = cugraph::utils::ballot(is_head); + + //As long as idx < unvisited_size, we know there's at least one head per warp + int laneid_last_head_in_warp = INT_SIZE - 1 - __clz(head_ballot); + + int is_last_head_in_warp = (laneid == laneid_last_head_in_warp); + + // if laneid == 0 && is_last_head_in_warp, it's a special case where + // a group of size 32 starts exactly at lane 0 + // in that case, nothing to do (this group is not cut by a warp delimitation) + // we also have to make sure that a warp actually exists after this one (this corner case is handled after) + if (laneid != 0 && (is_last_head_in_warp & (warpid + 1) < MAIN_BOTTOMUP_NWARPS)) + { + local_bmap_agg &= local_visited_bmap_warp_head[warpid + 1]; + } + + //Three cases : + // -> This is the first group of the block - it may be cut in two (with previous block) + // -> This is the last group of the block - same thing + // -> This group is completely contained in this block + + if (warpid == 0 && laneid == 0) + { + //The first elt of this group considered in this block is unvisited_vertex + //We know that's the case because elts are sorted in a group, and we are at laneid == 0 + //We will do an atomicOr - we have to be neutral about elts < unvisited_vertex + int iv = unvisited_vertex % INT_SIZE; // we know that this unvisited_vertex is valid + int mask = getMaskNLeftmostBitSet(INT_SIZE - iv); + local_bmap_agg &= mask; //we have to be neutral for elts < unvisited_vertex + atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); + } + else if (warpid == (MAIN_BOTTOMUP_NWARPS - 1) && + laneid >= laneid_last_head_in_warp && // We need the other ones to go in else case + idx < unvisited_size //we could be out + ) + { + //Last head of the block + //We don't know if this group is complete + + //last_v is the last unvisited_vertex of 
the group IN THIS block + //we dont know about the rest - we have to be neutral about elts > last_v + + //the destination thread of the __shfl is active + int laneid_max = min((IndexType) (WARP_SIZE - 1), + (unvisited_size - (block_off + 32 * warpid))); + IndexType last_v = cugraph::utils::shfl(unvisited_vertex, + laneid_max, + WARP_SIZE, + __activemask()); + + if (is_last_head_in_warp) + { + int ilast_v = last_v % INT_SIZE + 1; + int mask = getMaskNRightmostBitSet(ilast_v); + local_bmap_agg &= mask; //we have to be neutral for elts > last_unvisited_vertex + atomicOr(&visited_bmap[unvisited_vertex / INT_SIZE], local_bmap_agg); + } + } + else + { + //group completely in block + if (is_head && idx < unvisited_size) { + visited_bmap[unvisited_vertex / INT_SIZE] = local_bmap_agg; //no atomics needed, we know everything about this int + } + } + + //Saving in frontier + + int thread_frontier_offset; + BlockScan(scan_temp_storage).ExclusiveSum(found, thread_frontier_offset); + IndexType inclusive_sum = thread_frontier_offset + found; + if (threadIdx.x == (MAIN_BOTTOMUP_DIMX - 1) && inclusive_sum) + { + frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); + } + + //1) Broadcasting frontier_common_block_offset + //2) we want to reuse the *_temp_storage + __syncthreads(); + + if (found) + new_frontier[frontier_common_block_offset + thread_frontier_offset] = unvisited_vertex; + if (more_to_visit) + left_unvisited[left_unvisited_off] = unvisited_vertex; + + } + } + + template + void bottom_up_main(IndexType *unvisited, + IndexType unvisited_size, + IndexType *left_unvisited, + IndexType *d_left_unvisited_idx, + int *visited, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_idx, + IndexType *distances, + IndexType *predecessors, + int *edge_mask, + cudaStream_t m_stream, + bool deterministic) { + dim3 grid, block; + block.x = MAIN_BOTTOMUP_DIMX; + + grid.x = min((IndexType) MAXBLOCKS, ((unvisited_size + block.x - 1)) / block.x); + + main_bottomup_kernel<<>>(unvisited, + unvisited_size, + left_unvisited, + d_left_unvisited_idx, + visited, + row_ptr, + col_ind, + lvl, + new_frontier, + new_frontier_idx, + distances, + predecessors, + edge_mask); + cudaCheckError(); + } + + // + // bottom_up_large_degree_kernel + // finishing the work started in main_bottomup_kernel for vertex with degree > MAIN_BOTTOMUP_MAX_EDGES && no parent found + // + template + __global__ void bottom_up_large_degree_kernel(IndexType *left_unvisited, + IndexType left_unvisited_size, + int *visited, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + IndexType *distances, + IndexType *predecessors, + int *edge_mask) { + + int logical_lane_id = threadIdx.x % BOTTOM_UP_LOGICAL_WARP_SIZE; + int logical_warp_id = threadIdx.x / BOTTOM_UP_LOGICAL_WARP_SIZE; + int logical_warps_per_block = blockDim.x / BOTTOM_UP_LOGICAL_WARP_SIZE; + + //Inactive threads are not a pb for __ballot (known behaviour) + for (IndexType idx = logical_warps_per_block * blockIdx.x + logical_warp_id; + idx < left_unvisited_size; + idx += gridDim.x * logical_warps_per_block) { + + //Unvisited vertices - potentially in the next frontier + IndexType v = left_unvisited[idx]; + + //Used only with symmetric graphs + //Parents are included in v's neighbors + IndexType first_i_edge = row_ptr[v] + MAIN_BOTTOMUP_MAX_EDGES; //we already have checked the first MAIN_BOTTOMUP_MAX_EDGES edges in 
find_unvisited + + IndexType end_i_edge = row_ptr[v + 1]; + + //We can have warp divergence in the next loop + //It's not a pb because the behaviour of __ballot + //is know with inactive threads + for (IndexType i_edge = first_i_edge + logical_lane_id; + i_edge < end_i_edge; + i_edge += BOTTOM_UP_LOGICAL_WARP_SIZE) { + + IndexType valid_parent = -1; + + if (!edge_mask || edge_mask[i_edge]) { + IndexType u = col_ind[i_edge]; + IndexType lvl_u = distances[u]; + + if (lvl_u == (lvl - 1)) { + valid_parent = u; + } + } + + unsigned int warp_valid_p_ballot = cugraph::utils::ballot((valid_parent != -1)); + + int logical_warp_id_in_warp = (threadIdx.x % WARP_SIZE) / BOTTOM_UP_LOGICAL_WARP_SIZE; + unsigned int mask = (1 << BOTTOM_UP_LOGICAL_WARP_SIZE) - 1; + unsigned int logical_warp_valid_p_ballot = warp_valid_p_ballot + >> (BOTTOM_UP_LOGICAL_WARP_SIZE * logical_warp_id_in_warp); + logical_warp_valid_p_ballot &= mask; + + int chosen_thread = __ffs(logical_warp_valid_p_ballot) - 1; + + if (chosen_thread == logical_lane_id) { + //Using only one valid parent (reduce bw) + IndexType off = atomicAdd(new_frontier_cnt, (IndexType) 1); + int m = 1 << (v % INT_SIZE); + atomicOr(&visited[v / INT_SIZE], m); + distances[v] = lvl; + + if (predecessors) + predecessors[v] = valid_parent; + + new_frontier[off] = v; + } + + if (logical_warp_valid_p_ballot) { + break; + } + } + + } + } + + template + void bottom_up_large(IndexType *left_unvisited, + IndexType left_unvisited_size, + int *visited, + const IndexType *row_ptr, + const IndexType *col_ind, + IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_idx, + IndexType *distances, + IndexType *predecessors, + int *edge_mask, + cudaStream_t m_stream, + bool deterministic) { + dim3 grid, block; + block.x = LARGE_BOTTOMUP_DIMX; + grid.x = min( (IndexType) MAXBLOCKS, + ((left_unvisited_size + block.x - 1) * BOTTOM_UP_LOGICAL_WARP_SIZE) / block.x); + + bottom_up_large_degree_kernel<<>>(left_unvisited, + left_unvisited_size, + visited, + row_ptr, + col_ind, + lvl, + new_frontier, + new_frontier_idx, + distances, + predecessors, + edge_mask); + cudaCheckError(); + } + + // + // + // ------------------------------ Top down ------------------------------ + // + // + + // + // compute_bucket_offsets_kernel + // simply compute the position in the frontier corresponding all valid edges with index=TOP_DOWN_BUCKET_SIZE * k, k integer + // + + template + __global__ void compute_bucket_offsets_kernel(const IndexType *frontier_degrees_exclusive_sum, + IndexType *bucket_offsets, + const IndexType frontier_size, + IndexType total_degree) { + IndexType end = ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX + * NBUCKETS_PER_BLOCK + 1); + + for (IndexType bid = blockIdx.x * blockDim.x + threadIdx.x; + bid <= end; + bid += gridDim.x * blockDim.x) { + + IndexType eid = min(bid * TOP_DOWN_BUCKET_SIZE, total_degree - 1); + + bucket_offsets[bid] = binsearch_maxle(frontier_degrees_exclusive_sum, + eid, + (IndexType) 0, + frontier_size - 1); + + } + } + + template + void compute_bucket_offsets(IndexType *cumul, + IndexType *bucket_offsets, + IndexType frontier_size, + IndexType total_degree, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = COMPUTE_BUCKET_OFFSETS_DIMX; + + grid.x = min( (IndexType) MAXBLOCKS, + ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX + * NBUCKETS_PER_BLOCK + 1 + block.x - 1) / block.x); + + compute_bucket_offsets_kernel<<>>(cumul, + bucket_offsets, + frontier_size, + total_degree); + cudaCheckError(); + } + + 
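  // ---------------------------------------------------------------------------------------
  // Editorial sketch (not part of the original patch): the bucket offsets computed above
  // cache, once every TOP_DOWN_BUCKET_SIZE edges, the answer to "largest k such that
  // frontier_degrees_exclusive_sum[k] <= edge_index", so topdown_expand_kernel only has to
  // binary-search inside one bucket per edge. The hypothetical host helper below documents
  // that invariant, assuming exclusive_sum[0] == 0; it is illustrative only.
  template<typename IndexType>
  IndexType host_bucket_offset(const IndexType *exclusive_sum, // nf + 1 entries, exclusive_sum[0] == 0
                               IndexType frontier_size,        // nf
                               IndexType total_degree,         // mf
                               IndexType bucket) {
    IndexType eid = bucket * (IndexType) TOP_DOWN_BUCKET_SIZE;
    if (eid > total_degree - 1)
      eid = total_degree - 1;                                  // same clamp as the kernel
    IndexType low = 0, high = frontier_size - 1;               // same search range as the kernel
    while (low < high) {                                       // max k with exclusive_sum[k] <= eid
      IndexType mid = low + (high - low + 1) / 2;
      if (exclusive_sum[mid] <= eid)
        low = mid;
      else
        high = mid - 1;
    }
    return low;                                                // mirrors bucket_offsets[bucket]
  }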
// + // topdown_expand_kernel + // Read current frontier and compute new one with top down paradigm + // One thread = One edge + // To know origin of edge, we have to find where is index_edge in the values of frontier_degrees_exclusive_sum (using a binary search, max less or equal than) + // This index k will give us the origin of this edge, which is frontier[k] + // This thread will then process the (linear_idx_thread - frontier_degrees_exclusive_sum[k])-ith edge of vertex frontier[k] + // + // To process blockDim.x = TOP_DOWN_EXPAND_DIMX edges, we need to first load NBUCKETS_PER_BLOCK bucket offsets - those will help us do the binary searches + // We can load up to TOP_DOWN_EXPAND_DIMX of those bucket offsets - that way we prepare for the next MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x edges + // + // Once we have those offsets, we may still need a few values from frontier_degrees_exclusive_sum to compute exact index k + // To be able to do it, we will load the values that we need from frontier_degrees_exclusive_sum in shared memory + // We know that it will fit because we never add node with degree == 0 in the frontier, so we have an upper bound on the number of value to load (see below) + // + // We will then look which vertices are not visited yet : + // 1) if the unvisited vertex is isolated (=> degree == 0), we mark it as visited, update distances and predecessors, and move on + // 2) if the unvisited vertex has degree > 0, we add it to the "frontier_candidates" queue + // + // We then treat the candidates queue using the threadIdx.x < ncandidates + // If we are indeed the first thread to discover that vertex (result of atomicOr(visited)) + // We add it to the new frontier + // + + template + __global__ void topdown_expand_kernel(const IndexType *row_ptr, + const IndexType *col_ind, + const IndexType *frontier, + const IndexType frontier_size, + const IndexType totaldegree, + const IndexType max_items_per_thread, + const IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + const IndexType *frontier_degrees_exclusive_sum, + const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, + int *bmap, + IndexType *distances, + IndexType *predecessors, + const int *edge_mask, + const int *isolated_bmap, + bool directed) { + //BlockScan + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage scan_storage; + + // We will do a scan to know where to write in frontier + // This will contain the common offset of the block + __shared__ IndexType frontier_common_block_offset; + + __shared__ IndexType shared_buckets_offsets[TOP_DOWN_EXPAND_DIMX - NBUCKETS_PER_BLOCK + 1]; + __shared__ IndexType shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX + 1]; + + // + // Frontier candidates local queue + // We process TOP_DOWN_BATCH_SIZE vertices in parallel, so we need to be able to store everything + // We also save the predecessors here, because we will not be able to retrieve it after + // + __shared__ IndexType shared_local_new_frontier_candidates[TOP_DOWN_BATCH_SIZE + * TOP_DOWN_EXPAND_DIMX]; + __shared__ IndexType shared_local_new_frontier_predecessors[TOP_DOWN_BATCH_SIZE + * TOP_DOWN_EXPAND_DIMX]; + __shared__ IndexType block_n_frontier_candidates; + + IndexType block_offset = (blockDim.x * blockIdx.x) * max_items_per_thread; + IndexType n_items_per_thread_left = (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1) + / TOP_DOWN_EXPAND_DIMX; + + n_items_per_thread_left = min(max_items_per_thread, n_items_per_thread_left); + + for (; 
+ (n_items_per_thread_left > 0) && (block_offset < totaldegree); + + block_offset += MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x, + n_items_per_thread_left -= MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD) { + + // In this loop, we will process batch_set_size batches + IndexType nitems_per_thread = min( n_items_per_thread_left, + (IndexType) MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD); + + // Loading buckets offset (see compute_bucket_offsets_kernel) + + if (threadIdx.x < (nitems_per_thread * NBUCKETS_PER_BLOCK + 1)) + shared_buckets_offsets[threadIdx.x] = + frontier_degrees_exclusive_sum_buckets_offsets[block_offset / TOP_DOWN_BUCKET_SIZE + + threadIdx.x]; + + // We will use shared_buckets_offsets + __syncthreads(); + + // + // shared_buckets_offsets gives us a range of the possible indexes + // for edge of linear_threadx, we are looking for the value k such as + // k is the max value such as frontier_degrees_exclusive_sum[k] <= linear_threadx + // + // we have 0 <= k < frontier_size + // but we also have : + // + // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE] + // <= k + // <= frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE + 1] + // + // To find the exact value in that range, we need a few values from frontier_degrees_exclusive_sum (see below) + // We will load them here + // We will load as much as we can - if it doesn't fit we will make multiple iteration of the next loop + // Because all vertices in frontier have degree > 0, we know it will fits if left + 1 = right (see below) + + //We're going to load values in frontier_degrees_exclusive_sum for batch [left; right[ + //If it doesn't fit, --right until it does, then loop + //It is excepted to fit on the first try, that's why we start right = nitems_per_thread + + IndexType left = 0; + IndexType right = nitems_per_thread; + + while (left < nitems_per_thread) { + // + // Values that are necessary to compute the local binary searches + // We only need those with indexes between extremes indexes of buckets_offsets + // We need the next val for the binary search, hence the +1 + // + + IndexType nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] + - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; + + //If left = right + 1 we are sure to have nvalues_to_load < TOP_DOWN_EXPAND_DIMX+1 + while (nvalues_to_load > (TOP_DOWN_EXPAND_DIMX + 1)) { + --right; + + nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK] + - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1; + } + + IndexType nitems_per_thread_for_this_load = right - left; + + IndexType frontier_degrees_exclusive_sum_block_offset = shared_buckets_offsets[left + * NBUCKETS_PER_BLOCK]; + + if (threadIdx.x < nvalues_to_load) { + shared_frontier_degrees_exclusive_sum[threadIdx.x] = + frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + + threadIdx.x]; + } + + if (nvalues_to_load == (TOP_DOWN_EXPAND_DIMX + 1) && threadIdx.x == 0) { + shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX] = + frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset + + TOP_DOWN_EXPAND_DIMX]; + } + + //shared_frontier_degrees_exclusive_sum is in shared mem, we will use it, sync + __syncthreads(); + + // Now we will process the edges + // Here each thread will process nitems_per_thread_for_this_load + for (IndexType item_index = 0; + item_index < nitems_per_thread_for_this_load; + item_index += TOP_DOWN_BATCH_SIZE) { + + // We process TOP_DOWN_BATCH_SIZE 
edge in parallel (instruction parallism) + // Reduces latency + + IndexType current_max_edge_index = min(block_offset + + (left + + nitems_per_thread_for_this_load) + * blockDim.x, + totaldegree); + + //We will need vec_u (source of the edge) until the end if we need to save the predecessors + //For others informations, we will reuse pointers on the go (nvcc does not color well the registers in that case) + + IndexType vec_u[TOP_DOWN_BATCH_SIZE]; + IndexType local_buf1[TOP_DOWN_BATCH_SIZE]; + IndexType local_buf2[TOP_DOWN_BATCH_SIZE]; + + IndexType *vec_frontier_degrees_exclusive_sum_index = &local_buf2[0]; + +#pragma unroll + for (IndexType iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + + IndexType ibatch = left + item_index + iv; + IndexType gid = block_offset + ibatch * blockDim.x + threadIdx.x; + + if (gid < current_max_edge_index) { + IndexType start_off_idx = (ibatch * blockDim.x + threadIdx.x) + / TOP_DOWN_BUCKET_SIZE; + IndexType bucket_start = shared_buckets_offsets[start_off_idx] + - frontier_degrees_exclusive_sum_block_offset; + IndexType bucket_end = shared_buckets_offsets[start_off_idx + 1] + - frontier_degrees_exclusive_sum_block_offset; + + IndexType k = binsearch_maxle(shared_frontier_degrees_exclusive_sum, + gid, + bucket_start, + bucket_end) + + frontier_degrees_exclusive_sum_block_offset; + vec_u[iv] = frontier[k]; // origin of this edge + vec_frontier_degrees_exclusive_sum_index[iv] = + frontier_degrees_exclusive_sum[k]; + } else { + vec_u[iv] = -1; + vec_frontier_degrees_exclusive_sum_index[iv] = -1; + } + + } + + IndexType *vec_row_ptr_u = &local_buf1[0]; +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType u = vec_u[iv]; + //row_ptr for this vertex origin u + vec_row_ptr_u[iv] = (u != -1) + ? row_ptr[u] + : + -1; + } + + //We won't need row_ptr after that, reusing pointer + IndexType *vec_dest_v = vec_row_ptr_u; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType thread_item_index = left + item_index + iv; + IndexType gid = block_offset + thread_item_index * blockDim.x + threadIdx.x; + + IndexType row_ptr_u = vec_row_ptr_u[iv]; + IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv]; + + if (edge_mask && !edge_mask[edge]) + row_ptr_u = -1; //disabling edge + + //Destination of this edge + vec_dest_v[iv] = (row_ptr_u != -1) + ? col_ind[edge] + : + -1; + } + + //We don't need vec_frontier_degrees_exclusive_sum_index anymore + IndexType *vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index; +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_dest_v[iv]; + vec_v_visited_bmap[iv] = (v != -1) + ? bmap[v / INT_SIZE] + : + (~0); //will look visited + } + + // From now on we will consider v as a frontier candidate + // If for some reason vec_candidate[iv] should be put in the new_frontier + // Then set vec_candidate[iv] = -1 + IndexType *vec_frontier_candidate = vec_dest_v; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + int m = 1 << (v % INT_SIZE); + + int is_visited = vec_v_visited_bmap[iv] & m; + + if (is_visited) + vec_frontier_candidate[iv] = -1; + } + + if (directed) { + //vec_v_visited_bmap is available + + IndexType *vec_is_isolated_bmap = vec_v_visited_bmap; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + vec_is_isolated_bmap[iv] = (v != -1) + ? 
isolated_bmap[v / INT_SIZE] + : + -1; + } + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + int m = 1 << (v % INT_SIZE); + int is_isolated = vec_is_isolated_bmap[iv] & m; + + //If v is isolated, we will not add it to the frontier (it's not a frontier candidate) + // 1st reason : it's useless + // 2nd reason : it will make top down algo fail + // we need each node in frontier to have a degree > 0 + // If it is isolated, we just need to mark it as visited, and save distance and predecessor here. Not need to check return value of atomicOr + + if (is_isolated && v != -1) { + int m = 1 << (v % INT_SIZE); + atomicOr(&bmap[v / INT_SIZE], m); + if (distances) + distances[v] = lvl; + + if (predecessors) + predecessors[v] = vec_u[iv]; + + //This is no longer a candidate, neutralize it + vec_frontier_candidate[iv] = -1; + } + + } + } + + //Number of successor candidate hold by this thread + IndexType thread_n_frontier_candidates = 0; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + IndexType v = vec_frontier_candidate[iv]; + if (v != -1) + ++thread_n_frontier_candidates; + } + + // We need to have all nfrontier_candidates to be ready before doing the scan + __syncthreads(); + + // We will put the frontier candidates in a local queue + // Computing offsets + IndexType thread_frontier_candidate_offset = 0; //offset inside block + BlockScan(scan_storage).ExclusiveSum(thread_n_frontier_candidates, + thread_frontier_candidate_offset); + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + //May have bank conflicts + IndexType frontier_candidate = vec_frontier_candidate[iv]; + + if (frontier_candidate != -1) { + shared_local_new_frontier_candidates[thread_frontier_candidate_offset] = + frontier_candidate; + shared_local_new_frontier_predecessors[thread_frontier_candidate_offset] = + vec_u[iv]; + ++thread_frontier_candidate_offset; + } + } + + if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { + //No need to add nsuccessor_candidate, even if its an + //exclusive sum + //We incremented the thread_frontier_candidate_offset + block_n_frontier_candidates = thread_frontier_candidate_offset; + } + + //broadcast block_n_frontier_candidates + __syncthreads(); + + IndexType naccepted_vertices = 0; + //We won't need vec_frontier_candidate after that + IndexType *vec_frontier_accepted_vertex = vec_frontier_candidate; + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + const int idx_shared = iv * blockDim.x + threadIdx.x; + vec_frontier_accepted_vertex[iv] = -1; + + if (idx_shared < block_n_frontier_candidates) { + IndexType v = shared_local_new_frontier_candidates[idx_shared]; //popping queue + int m = 1 << (v % INT_SIZE); + int q = atomicOr(&bmap[v / INT_SIZE], m); //atomicOr returns old + + if (!(m & q)) { //if this thread was the first to discover this node + if (distances) + distances[v] = lvl; + + if (predecessors) { + IndexType pred = shared_local_new_frontier_predecessors[idx_shared]; + predecessors[v] = pred; + } + + vec_frontier_accepted_vertex[iv] = v; + ++naccepted_vertices; + } + } + + } + + //We need naccepted_vertices to be ready + __syncthreads(); + + IndexType thread_new_frontier_offset; + + BlockScan(scan_storage).ExclusiveSum(naccepted_vertices, thread_new_frontier_offset); + + if (threadIdx.x == (TOP_DOWN_EXPAND_DIMX - 1)) { + + IndexType inclusive_sum = thread_new_frontier_offset + naccepted_vertices; + //for this thread, thread_new_frontier_offset + has_successor 
(exclusive sum) + if (inclusive_sum) + frontier_common_block_offset = atomicAdd(new_frontier_cnt, inclusive_sum); + } + + //Broadcasting frontier_common_block_offset + __syncthreads(); + +#pragma unroll + for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) { + const int idx_shared = iv * blockDim.x + threadIdx.x; + if (idx_shared < block_n_frontier_candidates) { + + IndexType new_frontier_vertex = vec_frontier_accepted_vertex[iv]; + + if (new_frontier_vertex != -1) { + IndexType off = frontier_common_block_offset + thread_new_frontier_offset++; + new_frontier[off] = new_frontier_vertex; + } + } + } + + } + + //We need to keep shared_frontier_degrees_exclusive_sum coherent + __syncthreads(); + + //Preparing for next load + left = right; + right = nitems_per_thread; + } + + //we need to keep shared_buckets_offsets coherent + __syncthreads(); + } + + } + + template + void frontier_expand(const IndexType *row_ptr, + const IndexType *col_ind, + const IndexType *frontier, + const IndexType frontier_size, + const IndexType totaldegree, + const IndexType lvl, + IndexType *new_frontier, + IndexType *new_frontier_cnt, + const IndexType *frontier_degrees_exclusive_sum, + const IndexType *frontier_degrees_exclusive_sum_buckets_offsets, + int *visited_bmap, + IndexType *distances, + IndexType *predecessors, + const int *edge_mask, + const int *isolated_bmap, + bool directed, + cudaStream_t m_stream, + bool deterministic) { + if (!totaldegree) + return; + + dim3 block; + block.x = TOP_DOWN_EXPAND_DIMX; + + IndexType max_items_per_thread = (totaldegree + MAXBLOCKS * block.x - 1) + / (MAXBLOCKS * block.x); + + dim3 grid; + grid.x = min( (totaldegree + max_items_per_thread * block.x - 1) + / (max_items_per_thread * block.x), + (IndexType) MAXBLOCKS); + + topdown_expand_kernel<<>>(row_ptr, + col_ind, + frontier, + frontier_size, + totaldegree, + max_items_per_thread, + lvl, + new_frontier, + new_frontier_cnt, + frontier_degrees_exclusive_sum, + frontier_degrees_exclusive_sum_buckets_offsets, + visited_bmap, + distances, + predecessors, + edge_mask, + isolated_bmap, + directed); + cudaCheckError(); + } + + template + __global__ void flag_isolated_vertices_kernel(IndexType n, + int *isolated_bmap, + const IndexType *row_ptr, + IndexType *degrees, + IndexType *nisolated) { + typedef cub::BlockLoad BlockLoad; + typedef cub::BlockStore BlockStore; + typedef cub::BlockReduce BlockReduce; + typedef cub::WarpReduce WarpReduce; + + __shared__ typename BlockLoad::TempStorage load_temp_storage; + __shared__ typename BlockStore::TempStorage store_temp_storage; + __shared__ typename BlockReduce::TempStorage block_reduce_temp_storage; + + __shared__ typename WarpReduce::TempStorage warp_reduce_temp_storage[FLAG_ISOLATED_VERTICES_DIMX + / FLAG_ISOLATED_VERTICES_THREADS_PER_INT]; + + __shared__ IndexType row_ptr_tail[FLAG_ISOLATED_VERTICES_DIMX]; + + for (IndexType block_off = FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + * (blockDim.x * blockIdx.x); + block_off < n; + block_off += FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * (blockDim.x * gridDim.x)) { + + IndexType thread_off = block_off + + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD * threadIdx.x; + IndexType last_node_thread = thread_off + FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1; + + IndexType thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; + IndexType block_valid_items = n - block_off + 1; //+1, we need row_ptr[last_node+1] + + BlockLoad(load_temp_storage).Load(row_ptr + block_off, + thread_row_ptr, + block_valid_items, + -1); + + //To compute 4 
degrees, we need 5 values of row_ptr + //Saving the "5th" value in shared memory for previous thread to use + if (threadIdx.x > 0) { + row_ptr_tail[threadIdx.x - 1] = thread_row_ptr[0]; + } + + //If this is the last thread, it needs to load its row ptr tail value + if (threadIdx.x == (FLAG_ISOLATED_VERTICES_DIMX - 1) && last_node_thread < n) { + row_ptr_tail[threadIdx.x] = row_ptr[last_node_thread + 1]; + + } + __syncthreads(); // we may reuse temp_storage + + int local_isolated_bmap = 0; + + IndexType imax = (n - thread_off); + + IndexType local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD]; + +#pragma unroll + for (int i = 0; i < (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1); ++i) { + IndexType degree = local_degree[i] = thread_row_ptr[i + 1] - thread_row_ptr[i]; + + if (i < imax) + local_isolated_bmap |= ((degree == 0) << i); + } + + if (last_node_thread < n) { + IndexType degree = local_degree[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1] = + row_ptr_tail[threadIdx.x] + - thread_row_ptr[FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1]; + + local_isolated_bmap |= ((degree == 0) + << (FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD - 1)); + + } + + local_isolated_bmap <<= (thread_off % INT_SIZE); + + IndexType local_nisolated = __popc(local_isolated_bmap); + + //We need local_nisolated and local_isolated_bmap to be ready for next steps + __syncthreads(); + + IndexType total_nisolated = BlockReduce(block_reduce_temp_storage).Sum(local_nisolated); + + if (threadIdx.x == 0 && total_nisolated) { + atomicAdd(nisolated, total_nisolated); + } + + int logicalwarpid = threadIdx.x / FLAG_ISOLATED_VERTICES_THREADS_PER_INT; + + //Building int for bmap + int int_aggregate_isolated_bmap = + WarpReduce(warp_reduce_temp_storage[logicalwarpid]).Reduce(local_isolated_bmap, + BitwiseOr()); + + int is_head_of_visited_int = + ((threadIdx.x % (FLAG_ISOLATED_VERTICES_THREADS_PER_INT)) == 0); + if (is_head_of_visited_int) { + isolated_bmap[thread_off / INT_SIZE] = int_aggregate_isolated_bmap; + } + + BlockStore(store_temp_storage).Store(degrees + block_off, local_degree, block_valid_items); + } + } + + template + void flag_isolated_vertices(IndexType n, + int *isolated_bmap, + const IndexType *row_ptr, + IndexType *degrees, + IndexType *nisolated, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = FLAG_ISOLATED_VERTICES_DIMX; + + grid.x = min( (IndexType) MAXBLOCKS, + (n / FLAG_ISOLATED_VERTICES_VERTICES_PER_THREAD + 1 + block.x - 1) / block.x); + + flag_isolated_vertices_kernel<<>>(n, + isolated_bmap, + row_ptr, + degrees, + nisolated); + cudaCheckError(); + } + + // + // + // + // Some utils functions + // + // + + //Creates CUB data for graph size n + template + void cub_exclusive_sum_alloc(IndexType n, void*& d_temp_storage, size_t &temp_storage_bytes) { + // Determine temporary device storage requirements for exclusive prefix scan + d_temp_storage = NULL; + temp_storage_bytes = 0; + IndexType *d_in = NULL, *d_out = NULL; + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, n); + // Allocate temporary storage for exclusive prefix scan + cudaStream_t stream{nullptr}; + ALLOC_TRY(&d_temp_storage, temp_storage_bytes, stream); + } + + template + __global__ void fill_kernel(IndexType *vec, IndexType n, IndexType val) { + for (IndexType u = blockDim.x * blockIdx.x + threadIdx.x; + u < n; + u += gridDim.x * blockDim.x) + vec[u] = val; + + } + + template + void fill(IndexType *vec, IndexType n, IndexType val, cudaStream_t m_stream) { + dim3 grid, block; + block.x = 256; + 
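// Note (editorial): 256 threads per block; grid.x is clamped to MAXBLOCKS below and
// fill_kernel grid-strides over n, so vectors larger than one grid are still fully covered.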
grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); + fill_kernel<<>>(vec, n, val); + cudaCheckError(); + } + + template + __global__ void set_frontier_degree_kernel(IndexType *frontier_degree, + IndexType *frontier, + const IndexType *degree, + IndexType n) { + for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x; + idx < n; + idx += gridDim.x * blockDim.x) { + IndexType u = frontier[idx]; + frontier_degree[idx] = degree[u]; + } + } + + template + void set_frontier_degree(IndexType *frontier_degree, + IndexType *frontier, + const IndexType *degree, + IndexType n, + cudaStream_t m_stream) { + dim3 grid, block; + block.x = 256; + grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS); + set_frontier_degree_kernel<<>>(frontier_degree, + frontier, + degree, + n); + cudaCheckError(); + } + + template + void exclusive_sum(void *d_temp_storage, + size_t temp_storage_bytes, + IndexType *d_in, + IndexType *d_out, + IndexType num_items, + cudaStream_t m_stream) { + if (num_items <= 1) + return; //DeviceScan fails if n==1 + cub::DeviceScan::ExclusiveSum(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + m_stream); + } + + template + __global__ void fill_vec_kernel(T *vec, T n, T val) { + for (T idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < n; + idx += blockDim.x * gridDim.x) + vec[idx] = val; + } + + template + void fill_vec(T *vec, T n, T val, cudaStream_t stream) { + dim3 grid, block; + block.x = 256; + grid.x = (n + block.x - 1) / block.x; + + fill_vec_kernel<<>>(vec, n, val); + cudaCheckError(); + } +} +// diff --git a/cpp/src/traversal/nvgraph_sssp.cu b/cpp/src/traversal/nvgraph_sssp.cu new file mode 100644 index 00000000000..fdccfa23c91 --- /dev/null +++ b/cpp/src/traversal/nvgraph_sssp.cu @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+/** ---------------------------------------------------------------------------*
+ * @brief Wrapper functions for Nvgraph sssp
+ *
+ * @file nvgraph_sssp.cu
+ * ---------------------------------------------------------------------------**/
+
+#include
+#include
+#include
+#include "utilities/error_utils.h"
+#include "converters/nvgraph.cuh"
+#include
+
+gdf_error gdf_sssp_nvgraph(gdf_graph *gdf_G,
+                           const int *source_vert,
+                           gdf_column *sssp_distances) {
+  GDF_REQUIRE(gdf_G != nullptr, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(source_vert != nullptr, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(sssp_distances != nullptr, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(sssp_distances->data != nullptr, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(!sssp_distances->valid, GDF_VALIDITY_UNSUPPORTED);
+  GDF_REQUIRE(sssp_distances->size > 0, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(*source_vert >= 0, GDF_INVALID_API_CALL);
+  GDF_REQUIRE(*source_vert < sssp_distances->size, GDF_INVALID_API_CALL);
+
+  // init nvgraph
+  // TODO : time this call
+  nvgraphHandle_t nvg_handle = 0;
+  nvgraphGraphDescr_t nvgraph_G = 0;
+  cudaDataType_t settype;
+
+  NVG_TRY(nvgraphCreate(&nvg_handle));
+  GDF_TRY(gdf_createGraph_nvgraph(nvg_handle, gdf_G, &nvgraph_G, true));
+
+  int sssp_index = 0;
+  int weight_index = 0;
+  rmm::device_vector<float> d_val;
+
+  cudaStream_t stream{nullptr};
+
+  if (gdf_G->transposedAdjList->edge_data == nullptr) {
+    // use a fp32 vector [1,...,1]
+    settype = CUDA_R_32F;
+    d_val.resize(gdf_G->transposedAdjList->indices->size);
+    thrust::fill(rmm::exec_policy(stream)->on(stream), d_val.begin(), d_val.end(), 1.0);
+    NVG_TRY(nvgraphAttachEdgeData(nvg_handle,
+                                  nvgraph_G,
+                                  weight_index,
+                                  settype,
+                                  (void *) thrust::raw_pointer_cast(d_val.data())));
+  }
+  else {
+    switch (gdf_G->transposedAdjList->edge_data->dtype) {
+      case GDF_FLOAT32:
+        settype = CUDA_R_32F;
+        break;
+      case GDF_FLOAT64:
+        settype = CUDA_R_64F;
+        break;
+      default:
+        return GDF_UNSUPPORTED_DTYPE;
+    }
+  }
+
+  NVG_TRY(nvgraphAttachVertexData(nvg_handle, nvgraph_G, 0, settype, sssp_distances->data));
+
+  NVG_TRY(nvgraphSssp(nvg_handle, nvgraph_G, weight_index, source_vert, sssp_index));
+
+  NVG_TRY(nvgraphDestroyGraphDescr(nvg_handle, nvgraph_G));
+  NVG_TRY(nvgraphDestroy(nvg_handle));
+
+  return GDF_SUCCESS;
+}
diff --git a/cpp/src/two_hop_neighbors.cu b/cpp/src/traversal/two_hop_neighbors.cu
similarity index 87%
rename from cpp/src/two_hop_neighbors.cu
rename to cpp/src/traversal/two_hop_neighbors.cu
index 6a38d46504b..de8bd9bfb0c 100644
--- a/cpp/src/two_hop_neighbors.cu
+++ b/cpp/src/traversal/two_hop_neighbors.cu
@@ -22,6 +22,7 @@
 #include "two_hop_neighbors.cuh"
 #include "utilities/error_utils.h"
 #include
+#include
 #include
 #include
 
@@ -38,27 +39,28 @@ gdf_error gdf_get_two_hop_neighbors_impl(IndexType num_verts,
     IndexType num_edges;
     cudaMemcpy(&num_edges, &offsets[num_verts], sizeof(IndexType), cudaMemcpyDefault);
 
+    cudaStream_t stream {nullptr};
+
     // Allocate memory for temporary stuff
     IndexType *exsum_degree = nullptr;
     IndexType *first_pair = nullptr;
     IndexType *second_pair = nullptr;
     IndexType *block_bucket_offsets = nullptr;
 
-    ALLOC_MANAGED_TRY(&exsum_degree, sizeof(IndexType) * (num_edges + 1), nullptr);
+    ALLOC_TRY(&exsum_degree, sizeof(IndexType) * (num_edges + 1), stream);
 
     // Find the degree of the out vertex of each edge
     degree_iterator<IndexType> deg_it(offsets);
     deref_functor<degree_iterator<IndexType>, IndexType> deref(deg_it);
-    rmm_temp_allocator allocator(nullptr);
-    thrust::fill(thrust::cuda::par(allocator).on(nullptr), exsum_degree, exsum_degree + 1, 0);
-    thrust::transform(thrust::cuda::par(allocator).on(nullptr),
+    thrust::fill(rmm::exec_policy(stream)->on(stream), exsum_degree, exsum_degree + 1, 0);
+    thrust::transform(rmm::exec_policy(stream)->on(stream),
                       indices,
                       indices + num_edges,
                       exsum_degree + 1,
                       deref);
 
     // Take the inclusive sum of the degrees
-    thrust::inclusive_scan(thrust::cuda::par(allocator).on(nullptr),
+    thrust::inclusive_scan(rmm::exec_policy(stream)->on(stream),
                            exsum_degree + 1,
                            exsum_degree + num_edges + 1,
                            exsum_degree + 1);
@@ -68,12 +70,12 @@ gdf_error gdf_get_two_hop_neighbors_impl(IndexType num_verts,
     cudaMemcpy(&output_size, &exsum_degree[num_edges], sizeof(IndexType), cudaMemcpyDefault);
 
     // Allocate memory for the scattered output
-    ALLOC_MANAGED_TRY(&second_pair, sizeof(IndexType) * output_size, nullptr);
-    ALLOC_MANAGED_TRY(&first_pair, sizeof(IndexType) * output_size, nullptr);
+    ALLOC_TRY(&second_pair, sizeof(IndexType) * output_size, stream);
+    ALLOC_TRY(&first_pair, sizeof(IndexType) * output_size, stream);
 
     // Figure out number of blocks and allocate memory for block bucket offsets
     IndexType num_blocks = (output_size + TWO_HOP_BLOCK_SIZE - 1) / TWO_HOP_BLOCK_SIZE;
-    ALLOC_MANAGED_TRY(&block_bucket_offsets, sizeof(IndexType) * (num_blocks + 1), nullptr);
+    ALLOC_TRY(&block_bucket_offsets, sizeof(IndexType) * (num_blocks + 1), stream);
 
     // Compute the block bucket offsets
     dim3 grid, block;
@@ -100,18 +102,18 @@ gdf_error gdf_get_two_hop_neighbors_impl(IndexType num_verts,
     // Remove duplicates and self pairings
     auto tuple_start = thrust::make_zip_iterator(thrust::make_tuple(first_pair, second_pair));
     auto tuple_end = tuple_start + output_size;
-    thrust::sort(thrust::cuda::par(allocator).on(nullptr), tuple_start, tuple_end);
-    tuple_end = thrust::copy_if(thrust::cuda::par(allocator).on(nullptr),
+    thrust::sort(rmm::exec_policy(stream)->on(stream), tuple_start, tuple_end);
+    tuple_end = thrust::copy_if(rmm::exec_policy(stream)->on(stream),
                                 tuple_start,
                                 tuple_end,
                                 tuple_start,
                                 self_loop_flagger<IndexType>());
-    tuple_end = thrust::unique(thrust::cuda::par(allocator).on(nullptr), tuple_start, tuple_end);
+    tuple_end = thrust::unique(rmm::exec_policy(stream)->on(stream), tuple_start, tuple_end);
 
     // Get things ready to return
     outputSize = tuple_end - tuple_start;
-    ALLOC_MANAGED_TRY(first, sizeof(IndexType) * outputSize, nullptr);
-    ALLOC_MANAGED_TRY(second, sizeof(IndexType) * outputSize, nullptr);
+    ALLOC_TRY(first, sizeof(IndexType) * outputSize, nullptr);
+    ALLOC_TRY(second, sizeof(IndexType) * outputSize, nullptr);
     cudaMemcpy(*first, first_pair, sizeof(IndexType) * outputSize, cudaMemcpyDefault);
     cudaMemcpy(*second, second_pair, sizeof(IndexType) * outputSize, cudaMemcpyDefault);
diff --git a/cpp/src/two_hop_neighbors.cuh b/cpp/src/traversal/two_hop_neighbors.cuh
similarity index 100%
rename from cpp/src/two_hop_neighbors.cuh
rename to cpp/src/traversal/two_hop_neighbors.cuh
diff --git a/cpp/src/utilities/degree.cu b/cpp/src/utilities/degree.cu
new file mode 100644
index 00000000000..5f84f68feab
--- /dev/null
+++ b/cpp/src/utilities/degree.cu
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include
+#include "utilities/error_utils.h"
+#include "utilities/graph_utils.cuh"
+
+gdf_error gdf_degree_impl(int n, int e, gdf_column* col_ptr, gdf_column* degree, bool offsets) {
+  if (offsets == true) {
+    dim3 nthreads, nblocks;
+    nthreads.x = min(n, CUDA_MAX_KERNEL_THREADS);
+    nthreads.y = 1;
+    nthreads.z = 1;
+    nblocks.x = min((n + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS);
+    nblocks.y = 1;
+    nblocks.z = 1;
+
+    switch (col_ptr->dtype) {
+      case GDF_INT32: cugraph::degree_offsets<int, int><<<nblocks, nthreads>>>(n, e, static_cast<int*>(col_ptr->data), static_cast<int*>(degree->data)); break;
+      default: return GDF_UNSUPPORTED_DTYPE;
+    }
+  }
+  else {
+    dim3 nthreads, nblocks;
+    nthreads.x = min(e, CUDA_MAX_KERNEL_THREADS);
+    nthreads.y = 1;
+    nthreads.z = 1;
+    nblocks.x = min((e + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS);
+    nblocks.y = 1;
+    nblocks.z = 1;
+
+    switch (col_ptr->dtype) {
+      case GDF_INT32: cugraph::degree_coo<int, int><<<nblocks, nthreads>>>(n, e, static_cast<int*>(col_ptr->data), static_cast<int*>(degree->data)); break;
+      default: return GDF_UNSUPPORTED_DTYPE;
+    }
+  }
+  return GDF_SUCCESS;
+}
+
+
+gdf_error gdf_degree(gdf_graph *graph, gdf_column *degree, int x) {
+  // Calculates the degree of all vertices of the graph
+  // x = 0: in+out degree
+  // x = 1: in-degree
+  // x = 2: out-degree
+  GDF_REQUIRE(graph->adjList != nullptr || graph->transposedAdjList != nullptr, GDF_INVALID_API_CALL);
+  int n;
+  int e;
+  if (graph->adjList != nullptr) {
+    n = graph->adjList->offsets->size - 1;
+    e = graph->adjList->indices->size;
+  }
+  else {
+    n = graph->transposedAdjList->offsets->size - 1;
+    e = graph->transposedAdjList->indices->size;
+  }
+
+  if (x != 1) {
+    // Computes out-degree for x=0 and x=2
+    if (graph->adjList)
+      gdf_degree_impl(n, e, graph->adjList->offsets, degree, true);
+    else
+      gdf_degree_impl(n, e, graph->transposedAdjList->indices, degree, false);
+  }
+
+  if (x != 2) {
+    // Computes in-degree for x=0 and x=1
+    if (graph->adjList)
+      gdf_degree_impl(n, e, graph->adjList->indices, degree, false);
+    else
+      gdf_degree_impl(n, e, graph->transposedAdjList->offsets, degree, true);
+  }
+  return GDF_SUCCESS;
+}
diff --git a/cpp/src/utilities/error_utils.h b/cpp/src/utilities/error_utils.h
index 6b8416da844..c50feca3a12 100644
--- a/cpp/src/utilities/error_utils.h
+++ b/cpp/src/utilities/error_utils.h
@@ -23,9 +23,20 @@ #define GDF_ERRORUTILS_H
 #include
+
 #include
 #include
+#include
+#include "nvgraph_error_utils.h"
+
+#define cudaCheckError() {                                                          \
+  cudaError_t e = cudaGetLastError();                                               \
+  if (e != cudaSuccess) {                                                           \
+    std::cerr << "Cuda failure: " << cudaGetErrorString(e) << " at: " << __FILE__ << ':' << __LINE__ << std::endl; \
+  }                                                                                 \
+}
+
 #define CUDA_TRY( call )                                                \
 {                                                                       \
   cudaError_t cudaStatus = call;                                        \
diff --git a/cpp/src/utilities/graph_utils.cuh b/cpp/src/utilities/graph_utils.cuh
new file mode 100644
index 00000000000..a5331ef6bb4
--- /dev/null
+++ b/cpp/src/utilities/graph_utils.cuh
@@ -0,0 +1,510 @@
+/*
+ * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto. Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ */
+
+// Internal helper functions
+// Author: Alex Fender afender@nvidia.com
+#pragma once
+
+#include
+#include
+//#include
+//#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include "utilities/error_utils.h"
+
+#define USE_CG 1
+//#define DEBUG 1
+
+namespace cugraph
+{
+
+#define CUDA_MAX_BLOCKS 65535
+#define CUDA_MAX_KERNEL_THREADS 256 //kernel will launch at most 256 threads per block
+#define DEFAULT_MASK 0xffffffff
+#define US
+
+  template <typename T>
+  static __device__ __forceinline__ T shfl_up(T r, int offset, int bound = 32, int mask = DEFAULT_MASK) {
+#if __CUDA_ARCH__ >= 300
+#if USE_CG
+    return __shfl_up_sync(mask, r, offset, bound);
+#else
+    return __shfl_up(r, offset, bound);
+#endif
+#else
+    return 0.0f;
+#endif
+  }
+
+  template <typename T>
+  static __device__ __forceinline__ T shfl(T r, int lane, int bound = 32, int mask = DEFAULT_MASK) {
+#if __CUDA_ARCH__ >= 300
+#if USE_CG
+    return __shfl_sync(mask, r, lane, bound);
+#else
+    return __shfl(r, lane, bound);
+#endif
+#else
+    return 0.0f;
+#endif
+  }
+
+  template <typename IdxType, typename ValType>
+  __inline__ __device__
+  ValType parallel_prefix_sum(IdxType n, IdxType *ind, ValType *w) {
+    IdxType i, j, mn;
+    ValType v, last;
+    ValType sum = 0.0;
+    bool valid;
+
+    //Parallel prefix sum (using __shfl)
+    mn = (((n + blockDim.x - 1) / blockDim.x) * blockDim.x); //n in multiple of blockDim.x
+    for (i = threadIdx.x; i < mn; i += blockDim.x) {
+      //All threads (especially the last one) must always participate
+      //in the shfl instruction, otherwise their sum will be undefined.
+      //So, the loop stopping condition is based on multiple of n in loop increments,
+      //so that all threads enter into the loop and inside we make sure we do not
+      //read out of bounds memory checking for the actual size n.
+
+      //check if the thread is valid
+      valid = i < n;
+
+      //Notice that the last thread is used to propagate the prefix sum.
+      //For all the threads, in the first iteration the last is 0, in the following
+      //iterations it is the value at the last thread of the previous iterations.
+
+      //get the value of the last thread
+      last = shfl(sum, blockDim.x - 1, blockDim.x);
+
+      //if you are valid read the value from memory, otherwise set your value to 0
+      sum = (valid) ? w[ind[i]] : 0.0;
+
+      //do prefix sum (of size warpSize=blockDim.x =< 32)
+      for (j = 1; j < blockDim.x; j *= 2) {
+        v = shfl_up(sum, j, blockDim.x);
+        if (threadIdx.x >= j)
+          sum += v;
+      }
+      //shift by last
+      sum += last;
+      //notice that no __threadfence or __syncthreads are needed in this implementation
+    }
+    //get the value of the last thread (to all threads)
+    last = shfl(sum, blockDim.x - 1, blockDim.x);
+
+    return last;
+  }
+
+//dot
+  template <typename T>
+  T dot(size_t n, T* x, T* y) {
+    cudaStream_t stream {nullptr};
+    T result = thrust::inner_product(rmm::exec_policy(stream)->on(stream),
+                                     thrust::device_pointer_cast(x),
+                                     thrust::device_pointer_cast(x + n),
+                                     thrust::device_pointer_cast(y),
+                                     0.0f);
+    cudaCheckError();
+    return result;
+  }
+
+//axpy
+  template <typename T>
+  struct axpy_functor : public thrust::binary_function<T, T, T> {
+    const T a;
+    axpy_functor(T _a) :
+        a(_a) {
+    }
+    __host__ __device__
+    T operator()(const T& x, const T& y) const {
+      return a * x + y;
+    }
+  };
+
+  template <typename T>
+  void axpy(size_t n, T a, T* x, T* y) {
+    cudaStream_t stream {nullptr};
+    thrust::transform(rmm::exec_policy(stream)->on(stream),
+                      thrust::device_pointer_cast(x),
+                      thrust::device_pointer_cast(x + n),
+                      thrust::device_pointer_cast(y),
+                      thrust::device_pointer_cast(y),
+                      axpy_functor<T>(a));
+    cudaCheckError();
+  }
+
+//norm
+  template <typename T>
+  struct square {
+    __host__ __device__
+    T operator()(const T& x) const {
+      return x * x;
+    }
+  };
+
+  template <typename T>
+  T nrm2(size_t n, T* x) {
+    cudaStream_t stream {nullptr};
+    T init = 0;
+    T result = std::sqrt(thrust::transform_reduce(rmm::exec_policy(stream)->on(stream),
+                                                  thrust::device_pointer_cast(x),
+                                                  thrust::device_pointer_cast(x + n),
+                                                  square<T>(),
+                                                  init,
+                                                  thrust::plus<T>()));
+    cudaCheckError();
+    return result;
+  }
+
+  template <typename T>
+  T nrm1(size_t n, T* x) {
+    cudaStream_t stream {nullptr};
+    T result = thrust::reduce(rmm::exec_policy(stream)->on(stream),
+                              thrust::device_pointer_cast(x),
+                              thrust::device_pointer_cast(x + n));
+    cudaCheckError();
+    return result;
+  }
+
+  template <typename T>
+  void scal(size_t n, T val, T* x) {
+    cudaStream_t stream {nullptr};
+    thrust::transform(rmm::exec_policy(stream)->on(stream),
+                      thrust::device_pointer_cast(x),
+                      thrust::device_pointer_cast(x + n),
+                      thrust::make_constant_iterator(val),
+                      thrust::device_pointer_cast(x),
+                      thrust::multiplies<T>());
+    cudaCheckError();
+  }
+
+  template <typename T>
+  void addv(size_t n, T val, T* x) {
+    cudaStream_t stream {nullptr};
+    thrust::transform(rmm::exec_policy(stream)->on(stream),
+                      thrust::device_pointer_cast(x),
+                      thrust::device_pointer_cast(x + n),
+                      thrust::make_constant_iterator(val),
+                      thrust::device_pointer_cast(x),
+                      thrust::plus<T>());
+    cudaCheckError();
+  }
+
+  template <typename T>
+  void fill(size_t n, T* x, T value) {
+    cudaStream_t stream {nullptr};
+    thrust::fill(rmm::exec_policy(stream)->on(stream),
+                 thrust::device_pointer_cast(x),
+                 thrust::device_pointer_cast(x + n), value);
+    cudaCheckError();
+  }
+
+  template <typename T>
+  void printv(size_t n, T* vec, int offset) {
+    thrust::device_ptr<T> dev_ptr(vec);
+    std::cout.precision(15);
+    std::cout << "sample size = " << n << ", offset = " << offset << std::endl;
+    thrust::copy(dev_ptr + offset, dev_ptr + offset + n, std::ostream_iterator<T>(std::cout, " ")); //Assume no RMM dependency; TODO: check / test (potential BUG !!!!!)
+    cudaCheckError();
+    std::cout << std::endl;
+  }
+
+  template <typename T>
+  void copy(size_t n, T *x, T *res) {
+    thrust::device_ptr<T> dev_ptr(x);
+    thrust::device_ptr<T> res_ptr(res);
+    cudaStream_t stream {nullptr};
+    thrust::copy_n(rmm::exec_policy(stream)->on(stream), dev_ptr, n, res_ptr);
+    cudaCheckError();
+  }
+
+  template <typename T>
+  struct is_zero {
+    __host__ __device__
+    bool operator()(const T x) {
+      return x == 0;
+    }
+  };
+
+  template <typename T>
+  struct dangling_functor : public thrust::unary_function<T, T> {
+    const T val;
+    dangling_functor(T _val) :
+        val(_val) {
+    }
+    __host__ __device__
+    T operator()(const T& x) const {
+      return val + x;
+    }
+  };
+
+  template <typename T>
+  void update_dangling_nodes(size_t n, T* dangling_nodes, T damping_factor) {
+    cudaStream_t stream {nullptr};
+    thrust::transform_if(rmm::exec_policy(stream)->on(stream),
+                         thrust::device_pointer_cast(dangling_nodes),
+                         thrust::device_pointer_cast(dangling_nodes + n),
+                         thrust::device_pointer_cast(dangling_nodes),
+                         dangling_functor<T>(1.0 - damping_factor),
+                         is_zero<T>());
+    cudaCheckError();
+  }
+
+//google matrix kernels
+  template <typename IndexType, typename ValueType>
+  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
+  degree_coo(const IndexType n, const IndexType e, const IndexType *ind, ValueType *degree) {
+    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x)
+      atomicAdd(&degree[ind[i]], (ValueType)1.0);
+  }
+
+  template <typename IndexType, typename ValueType>
+  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
+  flag_leafs_kernel(const size_t n, const IndexType *degree, ValueType *bookmark) {
+    for (auto i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
+      if (degree[i] == 0)
+        bookmark[i] = 1.0;
+  }
+
+  template <typename IndexType, typename ValueType>
+  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
+  degree_offsets(const IndexType n, const IndexType e, const IndexType *ind, ValueType *degree) {
+    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x)
+      degree[i] += ind[i+1] - ind[i];
+  }
+
+  template <typename FromType, typename ToType>
+  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
+  type_convert(FromType* array, int n) {
+    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += gridDim.x * blockDim.x) {
+      ToType val = array[i];
+      ToType* vals = (ToType*)array;
+      vals[i] = val;
+    }
+  }
+
+  template <typename IndexType, typename ValueType>
+  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
+  equi_prob3(const IndexType n,
+             const IndexType e,
+             const IndexType *csrPtr,
+             const IndexType *csrInd,
+             ValueType *val,
+             IndexType *degree) {
+    int j, row, col;
+    for (row = threadIdx.z + blockIdx.z * blockDim.z; row < n; row += gridDim.z * blockDim.z) {
+      for (j = csrPtr[row] + threadIdx.y + blockIdx.y * blockDim.y; j < csrPtr[row + 1];
+          j += gridDim.y * blockDim.y) {
+        col = csrInd[j];
+        val[j] = 1.0 / degree[col];
+        //val[j] = 999;
+      }
+    }
+  }
+
+  template <typename IndexType, typename ValueType>
+  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
+  equi_prob2(const IndexType n,
+             const IndexType e,
+             const IndexType *csrPtr,
+             const IndexType *csrInd,
+             ValueType *val,
+             IndexType *degree) {
+    int row = blockIdx.x * blockDim.x + threadIdx.x;
+    if (row < n) {
+      int row_begin = csrPtr[row];
+      int row_end = csrPtr[row + 1];
+      int col;
+      for (int i = row_begin; i < row_end; i++) {
+        col = csrInd[i];
+        val[i] = 1.0 / degree[col];
+      }
+    }
+  }
+
+// compute the H^T values for an already transposed adjacency matrix, leveraging coo info
+  template <typename IndexType, typename ValueType>
+  void HT_matrix_csc_coo(const IndexType n,
+                         const IndexType e,
+                         const IndexType *csrPtr,
+                         const IndexType *csrInd,
+                         ValueType *val,
+                         ValueType *bookmark) {
+    IndexType *degree;
+    cudaStream_t stream { nullptr };
+    ALLOC_TRY((void**)&degree, sizeof(IndexType) * n, stream);
+    cudaMemset(degree, 0, sizeof(IndexType) * n);
+
+    dim3 nthreads, nblocks;
+    nthreads.x = min(e, CUDA_MAX_KERNEL_THREADS);
+    nthreads.y = 1;
+    nthreads.z = 1;
+    nblocks.x = min((e + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS);
+    nblocks.y = 1;
+    nblocks.z = 1;
+    degree_coo<IndexType, IndexType><<<nblocks, nthreads>>>(n, e, csrInd, degree);
+    cudaCheckError();
+
+    int y = 4;
+    nthreads.x = 32 / y;
+    nthreads.y = y;
+    nthreads.z = 8;
+    nblocks.x = 1;
+    nblocks.y = 1;
+    nblocks.z = min((n + nthreads.z - 1) / nthreads.z, CUDA_MAX_BLOCKS); //1;
+    equi_prob3<IndexType, ValueType><<<nblocks, nthreads>>>(n, e, csrPtr, csrInd, val, degree);
+    cudaCheckError();
+
+    ValueType a = 0.0;
+    fill(n, bookmark, a);
+    cudaCheckError();
+
+    nthreads.x = min(n, CUDA_MAX_KERNEL_THREADS);
+    nthreads.y = 1;
+    nthreads.z = 1;
+    nblocks.x = min((n + nthreads.x - 1) / nthreads.x, CUDA_MAX_BLOCKS);
+    nblocks.y = 1;
+    nblocks.z = 1;
+    flag_leafs_kernel<IndexType, ValueType><<<nblocks, nthreads>>>(n, degree, bookmark);
+    cudaCheckError();
+    ALLOC_FREE_TRY(degree, stream);
+  }
+
+  template <typename IndexType, typename ValueType>
+  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS)
+  permute_vals_kernel(const IndexType e, IndexType *perm, ValueType *in, ValueType *out) {
+    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < e; i += gridDim.x * blockDim.x)
+      out[i] = in[perm[i]];
+  }
+
+  template <typename IndexType, typename ValueType>
+  void permute_vals(const IndexType e, IndexType *perm, ValueType *in, ValueType *out) {
+    int nthreads = min(e, CUDA_MAX_KERNEL_THREADS);
+    int nblocks = min((e + nthreads - 1) / nthreads, CUDA_MAX_BLOCKS);
+    permute_vals_kernel<<<nblocks, nthreads>>>(e, perm, in, out);
+  }
+
+// This will remove duplicate along with sorting
+// This will sort the COO Matrix, row will be sorted and each column of same row will be sorted.
+  template <typename IndexType, typename ValueType, typename SizeT>
+  void remove_duplicate(IndexType* src, IndexType* dest, ValueType* val, SizeT &nnz) {
+    cudaStream_t stream {nullptr};
+    if (val != NULL) {
+      thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream),
+                                 thrust::raw_pointer_cast(val),
+                                 thrust::raw_pointer_cast(val) + nnz,
+                                 thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(src),
+                                                                              thrust::raw_pointer_cast(dest))));
+      thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream),
+                                 thrust::raw_pointer_cast(dest),
+                                 thrust::raw_pointer_cast(dest + nnz),
+                                 thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(src),
+                                                                              thrust::raw_pointer_cast(val))));
+      thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream),
+                                 thrust::raw_pointer_cast(src),
+                                 thrust::raw_pointer_cast(src + nnz),
+                                 thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(dest),
+                                                                              thrust::raw_pointer_cast(val))));
+
+      typedef thrust::tuple<IndexType*, ValueType*> IteratorTuple;
+      typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+      typedef thrust::tuple<IndexType*, ZipIterator> ZipIteratorTuple;
+      typedef thrust::zip_iterator<ZipIteratorTuple> ZipZipIterator;
+
+      ZipZipIterator newEnd =
+          thrust::unique(rmm::exec_policy(stream)->on(stream),
+                         thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(src),
+                                                                      thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(dest),
+                                                                                                                   thrust::raw_pointer_cast(val))))),
+                         thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(src + nnz),
+                                                                      thrust::make_zip_iterator(thrust::make_tuple(dest + nnz,
+                                                                                                                   val + nnz)))));
+
+      ZipIteratorTuple endTuple = newEnd.get_iterator_tuple();
+      IndexType* row_end = thrust::get<0>(endTuple);
+
+      nnz = ((size_t) row_end - (size_t) src) / sizeof(IndexType);
+    }
+    else
+    {
+      thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream),
+                                 thrust::raw_pointer_cast(dest),
+                                 thrust::raw_pointer_cast(dest + nnz),
+                                 thrust::raw_pointer_cast(src));
+      thrust::stable_sort_by_key(rmm::exec_policy(stream)->on(stream),
+                                 thrust::raw_pointer_cast(src),
+                                 thrust::raw_pointer_cast(src + nnz),
+                                 thrust::raw_pointer_cast(dest));
+
+      typedef thrust::tuple<IndexType*, IndexType*> IteratorTuple;
+      typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+
+      ZipIterator newEnd =
+          thrust::unique(rmm::exec_policy(stream)->on(stream),
+                         thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(src),
+                                                                      thrust::raw_pointer_cast(dest))),
+                         thrust::make_zip_iterator(thrust::make_tuple(thrust::raw_pointer_cast(src + nnz),
+                                                                      thrust::raw_pointer_cast(dest + nnz))));
+
+      IteratorTuple endTuple = newEnd.get_iterator_tuple();
+      IndexType* row_end = thrust::get<0>(endTuple);
+
+      nnz = ((size_t) row_end - (size_t) src) / sizeof(IndexType);
+    }
+  }
+
+  template <typename IndexType>
+  __global__ void __launch_bounds__(CUDA_MAX_KERNEL_THREADS) offsets_to_indices_kernel(const IndexType *offsets,
+                                                                                        IndexType v,
+                                                                                        IndexType *indices) {
+    int tid, ctaStart;
+    tid = threadIdx.x;
+    ctaStart = blockIdx.x;
+
+    for (int j = ctaStart; j < v; j += gridDim.x) {
+      IndexType colStart = offsets[j];
+      IndexType colEnd = offsets[j + 1];
+      IndexType rowNnz = colEnd - colStart;
+
+      for (int i = 0; i < rowNnz; i += blockDim.x) {
+        if ((colStart + tid + i) < colEnd) {
+          indices[colStart + tid + i] = j;
+        }
+      }
+    }
+  }
+
+  template <typename IndexType>
+  void offsets_to_indices(const IndexType *offsets, IndexType v, IndexType *indices) {
+    int nthreads = min(v, CUDA_MAX_KERNEL_THREADS);
+    int nblocks = min((v + nthreads - 1) / nthreads, CUDA_MAX_BLOCKS);
+    offsets_to_indices_kernel<<<nblocks, nthreads>>>(offsets, v, indices);
+    cudaCheckError();
+  }
+
+  template <typename IndexType>
+  void sequence(IndexType n, IndexType *vec, IndexType init = 0) {
+    thrust::sequence(thrust::device,
+                     thrust::device_pointer_cast(vec),
+                     thrust::device_pointer_cast(vec + n),
+                     init);
+    cudaCheckError();
+  }
+
+} //namespace cugraph
diff --git a/cpp/src/grmat.cu b/cpp/src/utilities/grmat.cu
similarity index 96%
rename from cpp/src/grmat.cu
rename to cpp/src/utilities/grmat.cu
index f0b9b79b456..8b5a50aacd7 100644
--- a/cpp/src/grmat.cu
+++ b/cpp/src/utilities/grmat.cu
@@ -176,15 +176,13 @@ gdf_error main_(gdf_column *src, gdf_column *dest, gdf_column *val, CommandLine
     if (util::SetDevice(gpu_idx[0]))
         return GDF_CUDA_ERROR;
 
-    //RMM:
-    //
-    cudaStream_t stream{nullptr};
-    rmm_temp_allocator allocator(stream);
-    ALLOC_MANAGED_TRY((void**)&coo.row, sizeof(VertexId) * rmat_all_edges, stream);
-    ALLOC_MANAGED_TRY((void**)&coo.col, sizeof(VertexId) * rmat_all_edges, stream);
+
+    cudaStream_t stream {nullptr};
+    ALLOC_TRY((void**)&coo.row, sizeof(VertexId) * rmat_all_edges, stream);
+    ALLOC_TRY((void**)&coo.col, sizeof(VertexId) * rmat_all_edges, stream);
     if (val != nullptr)
     {
-        ALLOC_MANAGED_TRY((void**)&coo.val, sizeof(Value) * rmat_all_edges, stream);
+        ALLOC_TRY((void**)&coo.val, sizeof(Value) * rmat_all_edges, stream);
     }
     if ((coo.row == NULL) ||(coo.col == NULL))
     {
@@ -247,7 +245,7 @@
     cudaMemcpy((void*)&nodes_row, (void*)&(coo.row[rmat_all_edges-1]), sizeof(VertexId), cudaMemcpyDeviceToHost);
-    tmp = thrust::max_element(thrust::cuda::par(allocator).on(stream),
+    tmp = thrust::max_element(rmm::exec_policy(stream)->on(stream),
                               thrust::device_pointer_cast((VertexId*)(coo.col)),
                               thrust::device_pointer_cast((VertexId*)(coo.col + rmat_all_edges)));
     nodes_col = tmp[0];
@@ -348,7 +346,7 @@ gdf_error gdf_grmat_gen (const char* argv, size_t&
vertices, size_t& edges, gdf_ { status = main_ (src, dest, val, &args, vertices, edges); } - else + else { status = main_ (src, dest, val, &args, vertices, edges); } diff --git a/cpp/src/heap.cuh b/cpp/src/utilities/heap.cuh similarity index 100% rename from cpp/src/heap.cuh rename to cpp/src/utilities/heap.cuh diff --git a/cpp/src/utilities/nvgraph_error_utils.h b/cpp/src/utilities/nvgraph_error_utils.h new file mode 100644 index 00000000000..8ece5630d43 --- /dev/null +++ b/cpp/src/utilities/nvgraph_error_utils.h @@ -0,0 +1,71 @@ +#ifndef NVGRAPH_ERRORUTILS_H +#define NVGRAPH_ERRORUTILS_H + +#include + +#ifdef VERBOSE +#define NVG_TRY(call) \ +{ \ + nvgraphStatus_t err_code = (call); \ + if (err_code != NVGRAPH_STATUS_SUCCESS) { \ + switch (err_code) { \ + case NVGRAPH_STATUS_SUCCESS: \ + return GDF_SUCCESS; \ + case NVGRAPH_STATUS_NOT_INITIALIZED: \ + return GDF_INVALID_API_CALL; \ + case NVGRAPH_STATUS_INVALID_VALUE: \ + return GDF_INVALID_API_CALL; \ + case NVGRAPH_STATUS_TYPE_NOT_SUPPORTED: \ + return GDF_UNSUPPORTED_DTYPE; \ + case NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED: \ + return GDF_INVALID_API_CALL; \ + default: \ + return GDF_CUDA_ERROR; \ + } \ + } \ +} +#else +#define NVG_TRY(call) \ +{ \ + nvgraphStatus_t err_code = (call); \ + if (err_code != NVGRAPH_STATUS_SUCCESS) { \ + switch (err_code) { \ + case NVGRAPH_STATUS_NOT_INITIALIZED: \ + std::cerr << "nvGRAPH not initialized"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_ALLOC_FAILED: \ + std::cerr << "nvGRAPH alloc failed"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_INVALID_VALUE: \ + std::cerr << "nvGRAPH invalid value"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_ARCH_MISMATCH: \ + std::cerr << "nvGRAPH arch mismatch"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_MAPPING_ERROR: \ + std::cerr << "nvGRAPH mapping error"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_EXECUTION_FAILED: \ + std::cerr << "nvGRAPH execution failed"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_INTERNAL_ERROR: \ + std::cerr << "nvGRAPH internal error"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_TYPE_NOT_SUPPORTED: \ + std::cerr << "nvGRAPH type not supported"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_NOT_CONVERGED: \ + std::cerr << "nvGRAPH algorithm failed to converge"; \ + return GDF_CUDA_ERROR; \ + case NVGRAPH_STATUS_GRAPH_TYPE_NOT_SUPPORTED: \ + std::cerr << "nvGRAPH graph type not supported"; \ + return GDF_CUDA_ERROR; \ + default: \ + std::cerr << "Unknown nvGRAPH Status"; \ + return GDF_CUDA_ERROR; \ + } \ + } \ +} +#endif + +#endif diff --git a/datasets/get_test_data.sh b/datasets/get_test_data.sh index a416baf7256..aa59d5d4c20 100644 --- a/datasets/get_test_data.sh +++ b/datasets/get_test_data.sh @@ -6,15 +6,22 @@ cd tmp wget https://s3.us-east-2.amazonaws.com/rapidsai-data/cugraph/test/datasets.tgz wget https://s3.us-east-2.amazonaws.com/rapidsai-data/cugraph/test/ref/pagerank.tgz wget https://s3.us-east-2.amazonaws.com/rapidsai-data/cugraph/test/ref/sssp.tgz +wget https://s3.us-east-2.amazonaws.com/rapidsai-data/cugraph/benchmark/hibench/hibench_1_huge.tgz +wget https://s3.us-east-2.amazonaws.com/rapidsai-data/cugraph/benchmark/hibench/hibench_1_large.tgz +wget https://s3.us-east-2.amazonaws.com/rapidsai-data/cugraph/benchmark/hibench/hibench_1_small.tgz cd .. mkdir test mkdir test/ref +mkdir benchmark echo Decompressing ... 
tar xvzf tmp/datasets.tgz -C test tar xvzf tmp/pagerank.tgz -C test/ref tar xvzf tmp/sssp.tgz -C test/ref +tar xvzf tmp/hibench_1_huge.tgz -C benchmark +tar xvzf tmp/hibench_1_large.tgz -C benchmark +tar xvzf tmp/hibench_1_small.tgz -C benchmark rm -rf tmp diff --git a/docs/source/conf.py b/docs/source/conf.py index affc1c0ec6e..1551147de15 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -64,9 +64,9 @@ # built documents. # # The short X.Y version. -version = '0.7' +version = '0.8' # The full version, including alpha/beta/rc tags. -release = '0.7.0.dev0' +release = '0.8.0a' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages.
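
Note on the memory-management hunks above: every converted file follows the same shape, replacing the managed ALLOC_MANAGED_TRY / rmm_temp_allocator / thrust::cuda::par pattern with a stream-ordered ALLOC_TRY allocation, Thrust calls driven by rmm::exec_policy(stream)->on(stream), and ALLOC_FREE_TRY. The sketch below is not part of the patch; it is a minimal, hypothetical example of that pattern under stated assumptions. In particular, the header names (cugraph.h, rmm_utils.h, rmm/thrust_rmm_allocator.h, and the Thrust headers) and the example function name are assumptions for illustration only; the macros, types, and calls themselves are the ones used in the hunks above.

// Hypothetical sketch of the allocation + execution-policy pattern used in this patch.
// Assumed headers: cugraph.h (gdf_error), rmm_utils.h (ALLOC_TRY / ALLOC_FREE_TRY),
// rmm/thrust_rmm_allocator.h (rmm::exec_policy). These names are not taken from this diff.
#include <cugraph.h>
#include <rmm_utils.h>
#include <rmm/thrust_rmm_allocator.h>
#include <thrust/fill.h>
#include <thrust/reduce.h>

gdf_error sum_of_n_ones(size_t n, double *result) {
  // The converted files all run on the default (null) stream and thread the same
  // cudaStream_t through both the ALLOC_* macros and the Thrust execution policy.
  cudaStream_t stream{nullptr};
  double *buf = nullptr;

  // Stream-ordered allocation from the RMM pool (replaces ALLOC_MANAGED_TRY).
  ALLOC_TRY((void**)&buf, sizeof(double) * n, stream);

  // Running Thrust through rmm::exec_policy routes its temporary storage through RMM
  // as well (replaces thrust::cuda::par(allocator).on(stream)).
  thrust::fill(rmm::exec_policy(stream)->on(stream), buf, buf + n, 1.0);
  *result = thrust::reduce(rmm::exec_policy(stream)->on(stream), buf, buf + n);  // == n

  ALLOC_FREE_TRY(buf, stream);
  return GDF_SUCCESS;
}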