diff --git a/.gitmodules b/.gitmodules
index e69de29bb2d..605fac63cc4 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,12 @@
+[submodule "cpp/nvgraph/cpp/thirdparty/cnmem"]
+ path = cpp/nvgraph/cpp/thirdparty/cnmem
+ url = https://github.com/NVIDIA/cnmem.git
+ branch = master
+[submodule "cpp/nvgraph/cpp/thirdparty/cub"]
+ path = cpp/nvgraph/cpp/thirdparty/cub
+ url = https://github.com/NVlabs/cub.git
+ branch = 1.8.0
+[submodule "cpp/nvgraph/external/cusp"]
+ path = cpp/nvgraph/external/cusp
+ url = https://github.com/cusplibrary/cusplibrary.git
+ branch = cuda9
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5d16706725f..8cce3b51853 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,10 +7,10 @@
## Improvements
- PR #157 Removed cudatoolkit dependency in setup.py
- PR #185 Update docs version
+- PR #194 Open source nvgraph in cugraph repository
- PR #190 Added a copy option in graph creation
- PR #196 Fix typos in readme intro
-
## Bug Fixes
- PR #169 Disable terminal output in sssp
- PR #191 Fix double upload bug
diff --git a/README.md b/README.md
index db970371aae..10cb81027fd 100644
--- a/README.md
+++ b/README.md
@@ -192,7 +192,7 @@ conda activate cugraph_dev
3) Build and install `libcugraph`. CMake depends on the `nvcc` executable being on your path or defined in `$CUDACXX`.
- This project uses cmake for building the C/C++ library. To configure cmake, run:
+ This project uses cmake for building the C/C++ library. CMake will also automatically build and install the nvGraph library (`$CUGRAPH_HOME/cpp/nvgraph`), which may take a few minutes. To configure cmake, run:
```bash
# Set the location of cuGraph in an environment variable CUGRAPH_HOME
@@ -320,8 +320,9 @@ unset LD_LIBRARY_PATH
+## nvGraph
-
+The nvGraph library is now open source and part of cuGraph. It can be built as a standalone library by following nvgraph's [readme](cpp/nvgraph/).
------
diff --git a/ci/cpu/cugraph/upload-anaconda.sh b/ci/cpu/cugraph/upload-anaconda.sh
index 7ea8c7069ac..19461c77d4f 100755
--- a/ci/cpu/cugraph/upload-anaconda.sh
+++ b/ci/cpu/cugraph/upload-anaconda.sh
@@ -4,26 +4,18 @@
set -e
-if [ "$BUILD_CUGRAPH" == "1" ]; then
- if [ "$BUILD_ABI" == "1" ]; then
- export UPLOADFILE=`conda build conda/recipes/cugraph -c rapidsai -c nvidia -c numba -c conda-forge -c defaults --python=$PYTHON --output`
- else
- export UPLOADFILE=`conda build conda/recipes/cugraph -c rapidsai/label/cf201901 -c nvidia/label/cf201901 -c numba -c conda-forge/label/cf201901 -c defaults --python=$PYTHON --output`
- fi
+if [ "$UPLOAD_CUGRAPH" == "1" ]; then
+ export UPLOADFILE=`conda build conda/recipes/cugraph -c rapidsai -c nvidia -c numba -c conda-forge -c defaults --python=$PYTHON --output`
SOURCE_BRANCH=master
# Have to label all CUDA versions because the package is compatible with any CUDA version
- if [ "$LABEL_MAIN" == "1" -a "$BUILD_ABI" == "1" ]; then
+ if [ "$LABEL_MAIN" == "1" ]; then
LABEL_OPTION="--label main --label cuda9.2 --label cuda10.0"
- elif [ "$LABEL_MAIN" == "0" -a "$BUILD_ABI" == "1" ]; then
+ elif [ "$LABEL_MAIN" == "0" ]; then
LABEL_OPTION="--label dev --label cuda9.2 --label cuda10.0"
- elif [ "$LABEL_MAIN" == "1" -a "$BUILD_ABI" == "0" ]; then
- LABEL_OPTION="--label cf201901 --label cf201901-cuda9.2 --label cf201901-cuda10.0"
- elif [ "$LABEL_MAIN" == "0" -a "$BUILD_ABI" == "0" ]; then
- LABEL_OPTION="--label cf201901-dev --label cf201901-cuda9.2 --label cf201901-cuda10.0"
else
- echo "Unknown label configuration LABEL_MAIN='$LABEL_MAIN' BUILD_ABI='$BUILD_ABI'"
+ echo "Unknown label configuration LABEL_MAIN='$LABEL_MAIN'"
exit 1
fi
echo "LABEL_OPTION=${LABEL_OPTION}"
@@ -44,4 +36,7 @@ if [ "$BUILD_CUGRAPH" == "1" ]; then
echo "Upload"
echo ${UPLOADFILE}
anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --force ${UPLOADFILE}
+else
+ echo "Skipping cugraph upload"
+ return 0
fi
\ No newline at end of file
diff --git a/ci/cpu/libcugraph/upload-anaconda.sh b/ci/cpu/libcugraph/upload-anaconda.sh
index 6c66a3169f0..c66c8cae137 100755
--- a/ci/cpu/libcugraph/upload-anaconda.sh
+++ b/ci/cpu/libcugraph/upload-anaconda.sh
@@ -4,31 +4,23 @@
set -e
-if [ "$BUILD_LIBCUGRAPH" == "1" ]; then
+if [ "$UPLOAD_LIBCUGRAPH" == "1" ]; then
CUDA_REL=${CUDA:0:3}
if [ "${CUDA:0:2}" == '10' ]; then
# CUDA 10 release
CUDA_REL=${CUDA:0:4}
fi
- if [ "$BUILD_ABI" == "1" ]; then
- export UPLOADFILE=`conda build conda/recipes/libcugraph -c rapidsai -c nvidia -c numba -c conda-forge -c defaults --python=$PYTHON --output`
- else
- export UPLOADFILE=`conda build conda/recipes/libcugraph -c rapidsai/label/cf201901 -c nvidia/label/cf201901 -c numba -c conda-forge/label/cf201901 -c defaults --python=$PYTHON --output`
- fi
+ export UPLOADFILE=`conda build conda/recipes/libcugraph -c rapidsai -c nvidia -c numba -c conda-forge -c defaults --python=$PYTHON --output`
SOURCE_BRANCH=master
- if [ "$LABEL_MAIN" == "1" -a "$BUILD_ABI" == "1" ]; then
+ if [ "$LABEL_MAIN" == "1" ]; then
LABEL_OPTION="--label main --label cuda${CUDA_REL}"
- elif [ "$LABEL_MAIN" == "0" -a "$BUILD_ABI" == "1" ]; then
+ elif [ "$LABEL_MAIN" == "0" ]; then
LABEL_OPTION="--label dev --label cuda${CUDA_REL}"
- elif [ "$LABEL_MAIN" == "1" -a "$BUILD_ABI" == "0" ]; then
- LABEL_OPTION="--label cf201901 --label cf201901-cuda${CUDA_REL}"
- elif [ "$LABEL_MAIN" == "0" -a "$BUILD_ABI" == "0" ]; then
- LABEL_OPTION="--label cf201901-dev --label cf201901-cuda${CUDA_REL}"
else
- echo "Unknown label configuration LABEL_MAIN='$LABEL_MAIN' BUILD_ABI='$BUILD_ABI'"
+ echo "Unknown label configuration LABEL_MAIN='$LABEL_MAIN'"
exit 1
fi
echo "LABEL_OPTION=${LABEL_OPTION}"
@@ -49,4 +41,6 @@ if [ "$BUILD_LIBCUGRAPH" == "1" ]; then
echo "Upload"
echo ${UPLOADFILE}
anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --force ${UPLOADFILE}
+else
+ echo "Skipping libcugraph upload"
fi
diff --git a/ci/cpu/prebuild.sh b/ci/cpu/prebuild.sh
index c7303a1a298..1a1c2a69064 100644
--- a/ci/cpu/prebuild.sh
+++ b/ci/cpu/prebuild.sh
@@ -1,15 +1,17 @@
#!/usr/bin/env bash
export BUILD_ABI=1
+export BUILD_CUGRAPH=1
+export BUILD_LIBCUGRAPH=1
if [[ "$CUDA" == "9.2" ]]; then
- export BUILD_CUGRAPH=1
+ export UPLOAD_CUGRAPH=1
else
- export BUILD_CUGRAPH=0
+ export UPLOAD_CUGRAPH=0
fi
if [[ "$PYTHON" == "3.6" ]]; then
- export BUILD_LIBCUGRAPH=1
+ export UPLOAD_LIBCUGRAPH=1
else
- export BUILD_LIBCUGRAPH=0
+ export UPLOAD_LIBCUGRAPH=0
fi
diff --git a/conda/environments/cugraph_dev.yml b/conda/environments/cugraph_dev.yml
index efd1c15097c..3323a846901 100644
--- a/conda/environments/cugraph_dev.yml
+++ b/conda/environments/cugraph_dev.yml
@@ -7,7 +7,6 @@ channels:
- defaults
dependencies:
- cudf>=0.5.1
-- nvgraph
- scipy
- networkx
- python-louvain
diff --git a/conda/environments/cugraph_dev_cuda10.yml b/conda/environments/cugraph_dev_cuda10.yml
index 7168452a843..51a114ccc29 100644
--- a/conda/environments/cugraph_dev_cuda10.yml
+++ b/conda/environments/cugraph_dev_cuda10.yml
@@ -7,7 +7,6 @@ channels:
- defaults
dependencies:
- cudf>=0.5.1
-- nvgraph
- scipy
- networkx
- python-louvain
diff --git a/conda/environments/cugraph_nightly.yml b/conda/environments/cugraph_nightly.yml
index 65ba1a1fa84..e5ee17033c7 100644
--- a/conda/environments/cugraph_nightly.yml
+++ b/conda/environments/cugraph_nightly.yml
@@ -8,7 +8,6 @@ channels:
- defaults
dependencies:
- cudf=0.6
-- nvgraph
- scipy
- networkx
- python-louvain
diff --git a/conda/environments/cugraph_nightly_cuda10.yml b/conda/environments/cugraph_nightly_cuda10.yml
index 5097c044a54..d8070b78793 100644
--- a/conda/environments/cugraph_nightly_cuda10.yml
+++ b/conda/environments/cugraph_nightly_cuda10.yml
@@ -8,7 +8,6 @@ channels:
- defaults
dependencies:
- cudf=0.6
-- nvgraph
- scipy
- networkx
- python-louvain
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index af3122d0f19..15da0bc8396 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -129,6 +129,11 @@ message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
# - cmake custom modules --------------------------------------------------------------------------
include(ConfigureGoogleTest)
+# speed up build time by avoiding features that are not exposed
+set(NVGRAPH_LIGHT True)
+# build nvgraph
+include(ConfigureNvgraph)
+
###################################################################################################
# - Find and add different modules and supporting repos -------------------------------------------
find_package(Boost 1.45.0 COMPONENTS system)
@@ -186,12 +191,11 @@ endif (RMM_INCLUDE AND RMM_LIBRARY)
###################################################################################################
# - add nvgraph -----------------------------------------------------------------------------------
+
find_path(NVGRAPH_INCLUDE "nvgraph"
- HINTS "$ENV{NVGRAPH_ROOT}/include"
- "$ENV{CONDA_PREFIX}/include")
-find_library(NVGRAPH_LIBRARY "nvgraph_st"
- HINTS "$ENV{NVGRAPH_ROOT}/lib"
- "$ENV{CONDA_PREFIX}/lib")
+ HINTS "$ENV{CONDA_PREFIX}/include")
+find_library(NVGRAPH_LIBRARY "nvgraph_rapids"
+ HINTS "$ENV{CONDA_PREFIX}/lib")
add_library( nvgraph SHARED IMPORTED)
if (NVGRAPH_INCLUDE AND NVGRAPH_LIBRARY)
diff --git a/cpp/cmake/Modules/ConfigureNvgraph.cmake b/cpp/cmake/Modules/ConfigureNvgraph.cmake
new file mode 100644
index 00000000000..16d9a55dbe8
--- /dev/null
+++ b/cpp/cmake/Modules/ConfigureNvgraph.cmake
@@ -0,0 +1,65 @@
+set(NVGRAPH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/nvgraph")
+
+set(NVGRAPH_CMAKE_ARGS "")
+ #" -DNVGRAPH_build_samples=ON"
+ #" -DCMAKE_VERBOSE_MAKEFILE=ON")
+
+if(NOT CMAKE_CXX11_ABI)
+ message(STATUS "NVGRAPH: Disabling the GLIBCXX11 ABI")
+ list(APPEND NVGRAPH_CMAKE_ARGS " -DCMAKE_C_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=0")
+ list(APPEND NVGRAPH_CMAKE_ARGS " -DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=0")
+elseif(CMAKE_CXX11_ABI)
+ message(STATUS "NVGRAPH: Enabling the GLIBCXX11 ABI")
+ list(APPEND NVGRAPH_CMAKE_ARGS " -DCMAKE_C_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=1")
+ list(APPEND NVGRAPH_CMAKE_ARGS " -DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=1")
+endif(NOT CMAKE_CXX11_ABI)
+
+#configure_file("${CMAKE_SOURCE_DIR}/cmake/Templates/Nvgraph.CMakeLists.txt.cmake"
+# "${NVGRAPH_ROOT}/cpp/CMakeLists.txt")
+
+file(MAKE_DIRECTORY "${NVGRAPH_ROOT}/cpp/build")
+#file(MAKE_DIRECTORY "${NVGRAPH_ROOT}/install")
+
+execute_process(COMMAND ${CMAKE_COMMAND} -G ${CMAKE_GENERATOR} .. -DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX} -DNVGRAPH_LIGHT=${NVGRAPH_LIGHT}
+ RESULT_VARIABLE NVGRAPH_CONFIG
+ WORKING_DIRECTORY ${NVGRAPH_ROOT}/cpp/build)
+
+if(NVGRAPH_CONFIG)
+ message(FATAL_ERROR "Configuring nvgraph failed: " ${NVGRAPH_CONFIG})
+endif(NVGRAPH_CONFIG)
+
+set(PARALLEL_BUILD -j)
+if($ENV{PARALLEL_LEVEL})
+ set(NUM_JOBS $ENV{PARALLEL_LEVEL})
+ set(PARALLEL_BUILD "${PARALLEL_BUILD}${NUM_JOBS}")
+endif($ENV{PARALLEL_LEVEL})
+
+if(${NUM_JOBS})
+ if(${NUM_JOBS} EQUAL 1)
+ message(STATUS "NVGRAPH BUILD: Enabling Sequential CMake build")
+ elseif(${NUM_JOBS} GREATER 1)
+ message(STATUS "NVGRAPH BUILD: Enabling Parallel CMake build with ${NUM_JOBS} jobs")
+ endif(${NUM_JOBS} EQUAL 1)
+else()
+ message(STATUS "NVGRAPH BUILD: Enabling Parallel CMake build with all threads")
+endif(${NUM_JOBS})
+
+execute_process(COMMAND ${CMAKE_COMMAND} --build . -- ${PARALLEL_BUILD}
+ RESULT_VARIABLE NVGRAPH_BUILD
+ WORKING_DIRECTORY ${NVGRAPH_ROOT}/cpp/build)
+if(NVGRAPH_BUILD)
+ message(FATAL_ERROR "Building nvgraph failed: " ${NVGRAPH_BUILD})
+endif(NVGRAPH_BUILD)
+
+execute_process(COMMAND ${CMAKE_COMMAND} --build . --target install
+ RESULT_VARIABLE NVGRAPH_BUILD
+ WORKING_DIRECTORY ${NVGRAPH_ROOT}/cpp/build)
+
+if(NVGRAPH_BUILD)
+ message(FATAL_ERROR "Installing nvgraph failed: " ${NVGRAPH_BUILD})
+endif(NVGRAPH_BUILD)
+
+message(STATUS "nvgraph installed under: " ${CMAKE_INSTALL_PREFIX})
+set(NVGRAPH_INCLUDE "${CMAKE_INSTALL_PREFIX}/include/nvgraph.h ${CMAKE_INSTALL_PREFIX}/include/test_opt_utils.cuh")
+set(NVGRAPH_LIBRARY "${CMAKE_INSTALL_PREFIX}/lib/libnvgraph_rapids.so")
+set(NVGRAPH_FOUND TRUE)
diff --git a/cpp/nvgraph/README.md b/cpp/nvgraph/README.md
new file mode 100644
index 00000000000..173b18e4cbb
--- /dev/null
+++ b/cpp/nvgraph/README.md
@@ -0,0 +1,103 @@
+# ![](../../img/rapids_logo.png) nvgraph - NVIDIA graph library
+
+Data analytics is a growing application of high-performance computing. Many advanced data analytics problems can be couched as graph problems. In turn, many of the common graph problems today can be couched as sparse linear algebra. This is the motivation for nvGRAPH, which harnesses the power of GPUs for linear algebra to handle large graph analytics and big data analytics problems.
+
+## Development Setup
+
+### Conda{#conda}
+
+It is easy to install nvGraph using conda. You can get a minimal conda installation with [Miniconda](https://conda.io/miniconda.html) or get the full installation with [Anaconda](https://www.anaconda.com/download).
+
+Install and update nvGraph using the conda command:
+
+```bash
+# CUDA 9.2
+conda install -c nvidia nvgraph
+
+# CUDA 10.0
+conda install -c nvidia/label/cuda10.0 nvgraph
+
+```
+
+Note: This conda installation only applies to Linux and Python versions 3.6/3.7.
+
+### Build from Source {#source}
+
+The following instructions are for developers and contributors to nvGraph OSS development. These instructions are tested on Linux Ubuntu 16.04 & 18.04. Use these instructions to build nvGraph from source and contribute to its development. Other operating systems may be compatible, but are not currently tested.
+
+The nvGraph package is a C/C++ CUDA library. It needs to be built and installed in order for cuGraph to operate correctly.
+
+The following instructions are tested on Linux systems.
+
+
+#### Prerequisites
+
+Compiler requirement:
+
+* `gcc` version 5.4+
+* `nvcc` version 9.2
+* `cmake` version 3.12
+
+
+
+CUDA requirement:
+
+* CUDA 9.2+
+* NVIDIA driver 396.44+
+* Pascal architecture or better
+
+You can obtain CUDA from [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads).
+
+#### Build and Install the C/C++ CUDA components
+
+To build and install nvGraph from source, ensure the dependencies are met and follow the steps below:
+
+1) Clone the repository and submodules
+
+ ```bash
+ # Set the location of cuGraph in an environment variable CUGRAPH_HOME
+ export CUGRAPH_HOME=$(pwd)/cugraph
+
+ # Download the cuGraph repo
+ git clone https://github.com/rapidsai/cugraph.git $CUGRAPH_HOME
+
+ # Next load all the submodules
+ cd $CUGRAPH_HOME
+ git submodule update --init --recursive
+ ```
+
+2) Build and install `libnvgraph_rapids.so`. CMake depends on the `nvcc` executable being on your path or defined in `$CUDACXX`.
+
+ This project uses cmake for building the C/C++ library. To configure cmake, run:
+
+ ```bash
+ cd $CUGRAPH_HOME
+ cd cpp/nvgraph/cpp # enter nvgraph's cpp directory
+ mkdir build # create build directory
+ cd build # enter the build directory
+ cmake .. -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX
+
+ # now build the code
+ make -j # "-j" starts multiple threads
+ make install # install the libraries
+ ```
+
+By default, the library is installed to `$CMAKE_INSTALL_PREFIX/lib` and the headers to `$CMAKE_INSTALL_PREFIX/include/nvgraph`.
+
+#### C++ stand alone tests
+
+```bash
+# Run the nvgraph tests
+cd $CUGRAPH_HOME
+cd cpp/nvgraph/cpp/build
+gtests/NVGRAPH_TEST # this is an executable file
+```
+Other test executables require specific datasets and will fail if they are not present.
+
+## Documentation
+
+The C API documentation can be found in the [CUDA Toolkit Documentation](https://docs.nvidia.com/cuda/nvgraph/index.html).
+
+
+
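To make the C API concrete, here is a minimal, hypothetical single-source shortest paths (SSSP) example. It is an editorial sketch, not part of this PR: it assumes the open-sourced `libnvgraph_rapids` keeps the same C API documented for the CUDA Toolkit nvGRAPH, that `nvgraph.h` (installed under `$CMAKE_INSTALL_PREFIX/include/nvgraph`) is on the include path, and that the program is linked with `-lnvgraph_rapids`.

```cpp
// Hypothetical sketch: SSSP over a 4-vertex directed graph via the documented nvGRAPH C API.
#include <cstdio>
#include <nvgraph.h>

#define CHECK_NVGRAPH(call)                                                   \
  do {                                                                        \
    nvgraphStatus_t s = (call);                                               \
    if (s != NVGRAPH_STATUS_SUCCESS) {                                        \
      std::fprintf(stderr, "nvGRAPH error %d at line %d\n", s, __LINE__);     \
      return 1;                                                               \
    }                                                                         \
  } while (0)

int main() {
  // Edges in CSC layout, grouped by destination:
  // 0->1 (w=1), 0->2 (w=4), 1->2 (w=2), 1->3 (w=5), 2->3 (w=1)
  int destination_offsets[] = {0, 0, 1, 3, 5};
  int source_indices[]      = {0, 0, 1, 1, 2};
  float weights[]           = {1.0f, 4.0f, 2.0f, 5.0f, 1.0f};
  const int n = 4, nnz = 5;

  nvgraphHandle_t handle;
  nvgraphGraphDescr_t graph;
  CHECK_NVGRAPH(nvgraphCreate(&handle));
  CHECK_NVGRAPH(nvgraphCreateGraphDescr(handle, &graph));

  nvgraphCSCTopology32I_st topology = {n, nnz, destination_offsets, source_indices};
  CHECK_NVGRAPH(nvgraphSetGraphStructure(handle, graph, &topology, NVGRAPH_CSC_32));

  cudaDataType_t vertex_type = CUDA_R_32F, edge_type = CUDA_R_32F;
  CHECK_NVGRAPH(nvgraphAllocateVertexData(handle, graph, 1, &vertex_type));
  CHECK_NVGRAPH(nvgraphAllocateEdgeData(handle, graph, 1, &edge_type));
  CHECK_NVGRAPH(nvgraphSetEdgeData(handle, graph, weights, 0));

  int source_vertex = 0;
  CHECK_NVGRAPH(nvgraphSssp(handle, graph, 0, &source_vertex, 0));

  float distances[n];
  CHECK_NVGRAPH(nvgraphGetVertexData(handle, graph, distances, 0));
  for (int v = 0; v < n; ++v)
    std::printf("distance(0 -> %d) = %f\n", v, distances[v]);  // expect 0, 1, 3, 4

  CHECK_NVGRAPH(nvgraphDestroyGraphDescr(handle, graph));
  CHECK_NVGRAPH(nvgraphDestroy(handle));
  return 0;
}
```

Assuming a conda-prefix install, a build command along the lines of `nvcc sssp_example.cpp -I$CONDA_PREFIX/include/nvgraph -L$CONDA_PREFIX/lib -lnvgraph_rapids -o sssp_example` should work; the paths and file name are illustrative.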
diff --git a/cpp/nvgraph/conda-recipes/nvgraph/LICENSE b/cpp/nvgraph/conda-recipes/nvgraph/LICENSE
new file mode 100644
index 00000000000..d8708b3facc
--- /dev/null
+++ b/cpp/nvgraph/conda-recipes/nvgraph/LICENSE
@@ -0,0 +1,152 @@
+LICENSE AGREEMENT FOR NVIDIA SOFTWARE DEVELOPMENT KITS
+(July 26, 2018 version)
+
+This license agreement, including exhibits attached ("Agreement”) is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of a NVIDIA software development kit (“SDK”).
+
+Each SDK has its own set of software and materials, but here is a description of the types of items that may be included in a SDK: source code, header files, APIs, data sets and assets (examples include images, textures, models, scenes, videos, native API input/output files), binary software, sample code, libraries, utility programs, programming code and documentation.
+
+This Agreement can be accepted only by an adult of legal age of majority in the country in which the SDK is used.
+
+If you are entering into this Agreement on behalf of a company or other legal entity, you represent that you have the legal authority to bind the entity to this Agreement, in which case “you” will mean the entity you represent.
+
+If you don’t have the required age or authority to accept this Agreement, or if you don’t accept all the terms and conditions of this Agreement, do not download, install or use the SDK.
+
+You agree to use the SDK only for purposes that are permitted by (a) this Agreement, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions.
+
+1. License.
+
+1.1 Grant
+
+Subject to the terms of this Agreement, NVIDIA hereby grants you a non-exclusive, non-transferable license, without the right to sublicense (except as expressly provided in this Agreement) to:
+
+(i) Install and use the SDK,
+
+(ii) Modify and create derivative works of sample source code delivered in the SDK, and
+
+(iii) Distribute those portions of the SDK that are identified in this Agreement as distributable, as incorporated in object code format into a software application that meets the distribution requirements indicated in this Agreement.
+
+1.2 Distribution Requirements
+
+These are the distribution requirements for you to exercise the distribution grant:
+
+(i) Your application must have material additional functionality, beyond the included portions of the SDK.
+
+(ii) The distributable portions of the SDK shall only be accessed by your application.
+
+(iii) The following notice shall be included in modifications and derivative works of sample source code distributed: “This software contains source code provided by NVIDIA Corporation.”
+
+(iv) Unless a developer tool is identified in this Agreement as distributable, it is delivered for your internal use only.
+
+(v) The terms under which you distribute your application must be consistent with the terms of this Agreement, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. Additionally, you agree that you will protect the privacy, security and legal rights of your application users.
+
+(vi) You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SDK not in compliance with the requirements of this Agreement, and to enforce the terms of your agreements with respect to distributed SDK.
+
+1.3 Authorized Users
+
+You may allow employees and contractors of your entity or of your subsidiary(ies) to access and use the SDK from your secure network to perform work on your behalf.
+
+If you are an academic institution you may allow users enrolled or employed by the academic institution to access and use the SDK from your secure network.
+
+You are responsible for the compliance with the terms of this Agreement by your authorized users. If you become aware that your authorized users didn’t follow the terms of this Agreement, you agree to take reasonable steps to resolve the non-compliance and prevent new occurrences.
+
+1.4 Pre-Release SDK
+The SDK versions identified as alpha, beta, preview or otherwise as pre-release, may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, accessibility, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. Use of a pre-release SDK may result in unexpected results, loss of data, project delays or other unpredictable damage or loss.
+You may use a pre-release SDK at your own risk, understanding that pre-release SDKs are not intended for use in production or business-critical systems.
+NVIDIA may choose not to make available a commercial version of any pre-release SDK. NVIDIA may also choose to abandon development and terminate the availability of a pre-release SDK at any time without liability.
+1.5 Updates
+
+NVIDIA may, at its option, make available patches, workarounds or other updates to this SDK. Unless the updates are provided with their separate governing terms, they are deemed part of the SDK licensed to you as provided in this Agreement.
+
+You agree that the form and content of the SDK that NVIDIA provides may change without prior notice to you. While NVIDIA generally maintains compatibility between versions, NVIDIA may in some cases make changes that introduce incompatibilities in future versions of the SDK.
+
+1.6 Third Party Licenses
+
+The SDK may come bundled with, or otherwise include or be distributed with, third party software licensed by a NVIDIA supplier and/or open source software provided under an open source license. Use of third party software is subject to the third-party license terms, or in the absence of third party terms, the terms of this Agreement. Copyright to third party software is held by the copyright holders indicated in the third-party software or license.
+
+1.7 Reservation of Rights
+
+NVIDIA reserves all rights, title and interest in and to the SDK not expressly granted to you under this Agreement.
+
+2. Limitations.
+
+The following license limitations apply to your use of the SDK:
+
+2.1 You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SDK or copies of the SDK.
+
+2.2 Except as expressly provided in this Agreement, you may not copy, sell, rent, sublicense, transfer, distribute, modify, or create derivative works of any portion of the SDK. For clarity, you may not distribute or sublicense the SDK as a stand-alone product.
+
+2.3 Unless you have an agreement with NVIDIA for this purpose, you may not indicate that an application created with the SDK is sponsored or endorsed by NVIDIA.
+
+2.4 You may not bypass, disable, or circumvent any encryption, security, digital rights management or authentication mechanism in the SDK.
+
+2.5 You may not use the SDK in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SDK be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge.
+
+2.6 Unless you have an agreement with NVIDIA for this purpose, you may not use the SDK with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in nuclear, avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SDK for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses.
+
+2.7 You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney’s fees and costs incident to establishing the right of indemnification) arising out of or related to your use of the SDK outside of the scope of this Agreement, or not in compliance with its terms.
+
+3. Ownership.
+
+3.1 NVIDIA or its licensors hold all rights, title and interest in and to the SDK and its modifications and derivative works, including their respective intellectual property rights, subject to your rights under Section 3.2. This SDK may include software and materials from NVIDIA’s licensors, and these licensors are intended third party beneficiaries that may enforce this Agreement with respect to their intellectual property rights.
+
+3.2 You hold all rights, title and interest in and to your applications and your derivative works of the sample source code delivered in the SDK, including their respective intellectual property rights, subject to NVIDIA’s rights under section 3.1.
+
+3.3 You may, but don’t have to, provide to NVIDIA suggestions, feature requests or other feedback regarding the SDK, including possible enhancements or modifications to the SDK. For any feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) it without the payment of any royalties or fees to you. NVIDIA will use feedback at its choice. NVIDIA is constantly looking for ways to improve its products, so you may send feedback to NVIDIA through the developer portal at https://developer.nvidia.com.
+
+4. No Warranties.
+
+THE SDK IS PROVIDED BY NVIDIA “AS IS” AND “WITH ALL FAULTS.” TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES EXPRESSLY DISCLAIM ALL WARRANTIES OF ANY KIND OR NATURE, WHETHER EXPRESS, IMPLIED OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, NON-INFRINGEMENT, OR THE ABSENCE OF ANY DEFECTS THEREIN, WHETHER LATENT OR PATENT. NO WARRANTY IS MADE ON THE BASIS OF TRADE USAGE, COURSE OF DEALING OR COURSE OF TRADE.
+
+5. Limitations of Liability.
+
+TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS AGREEMENT OR THE USE OR PERFORMANCE OF THE SDK, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY. IN NO EVENT WILL NVIDIA’S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS AGREEMENT EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT.
+
+These exclusions and limitations of liability shall apply regardless if NVIDIA or its affiliates have been advised of the possibility of such damages, and regardless of whether a remedy fails its essential purpose. These exclusions and limitations of liability form an essential basis of the bargain between the parties, and, absent any of these exclusions or limitations of liability, the provisions of this Agreement, including, without limitation, the economic terms, would be substantially different.
+
+6. Termination.
+
+6.1 This Agreement will continue to apply until terminated by either you or NVIDIA as described below.
+
+6.2 If you want to terminate this Agreement, you may do so by stopping to use the SDK.
+
+6.3 NVIDIA may, at any time, terminate this Agreement if: (i) you fail to comply with any term of this Agreement and the non-compliance is not fixed within thirty (30) days following notice from NVIDIA (or immediately if you violate NVIDIA’s intellectual property rights); (ii) you commence or participate in any legal proceeding against NVIDIA with respect to the SDK; or (iii) NVIDIA decides to no longer provide the SDK in a country or, in NVIDIA’s sole discretion, the continued use of it is no longer commercially viable.
+
+6.4 Upon any termination of this Agreement, you agree to promptly discontinue use of the SDK and destroy all copies in your possession or control. Your prior distributions in accordance with this Agreement are not affected by the termination of this Agreement. Upon written request, you will certify in writing that you have complied with your commitments under this section. Upon any termination of this Agreement all provisions survive except for the license grant provisions.
+
+7. General.
+
+If you wish to assign this Agreement or your rights and obligations, including by merger, consolidation, dissolution or operation of law, contact NVIDIA to ask for permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. NVIDIA may assign, delegate or transfer this Agreement and its rights and obligations, and if to a non-affiliate you will be notified.
+
+You agree to cooperate with NVIDIA and provide reasonably requested information to verify your compliance with this Agreement.
+
+This Agreement will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language.
+
+The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this Agreement. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction.
+
+If any court of competent jurisdiction determines that any provision of this Agreement is illegal, invalid or unenforceable, such provision will be construed as limited to the extent necessary to be consistent with and fully enforceable under the law and the remaining provisions will remain in full force and effect. Unless otherwise specified, remedies are cumulative.
+
+Each party acknowledges and agrees that the other is an independent contractor in the performance of this Agreement.
+
+The SDK has been developed entirely at private expense and is “commercial items” consisting of “commercial computer software” and “commercial computer software documentation” provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this Agreement pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (c)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051.
+
+The SDK is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SDK into any country, or use the SDK in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury’s Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this Agreement, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SDK.
+
+Any notice delivered by NVIDIA to you under this Agreement will be delivered via mail, email or fax. You agree that any notices that NVIDIA sends you electronically will satisfy any legal communication requirements. Please direct your legal notices or other correspondence to NVIDIA Corporation, 2788 San Tomas Expressway, Santa Clara, California 95051, United States of America, Attention: Legal Department.
+
+This Agreement and any exhibits incorporated into this Agreement constitute the entire agreement of the parties with respect to the subject matter of this Agreement and supersede all prior negotiations or documentation exchanged between the parties relating to this SDK license. Any additional and/or conflicting terms on documents issued by you are null, void, and invalid. Any amendment or waiver under this Agreement shall be in writing and signed by representatives of both parties.
+
+
+CUDA STRING SUPPLEMENT TO SOFTWARE LICENSE AGREEMENT FOR NVIDIA SOFTWARE DEVELOPMENT KITS
+(September 18, 2018 version)
+
+The terms in this supplement govern your use of the NVIDIA CUDA String SDK under the terms of your license agreement (“Agreement”) as modified by this supplement. Capitalized terms used but not defined below have the meaning assigned to them in the Agreement.
+
+This supplement is an exhibit to the Agreement and is incorporated as an integral part of the Agreement. In the event of conflict between the terms in this supplement and the terms in the Agreement, the terms in this supplement govern.
+
+1. License Scope. The SDK is licensed for you to develop applications only for use in systems with NVIDIA GPUs.
+
+2. Distribution. The following portions of the SDK are distributable under the Agreement: cuString library.
+
+3. Licensing. If the distribution terms in this Agreement are not suitable for your organization, or for any questions regarding this Agreement, please contact NVIDIA at nvidia-compute-license-questions@nvidia.com.
+
+
diff --git a/cpp/nvgraph/conda-recipes/nvgraph/build.sh b/cpp/nvgraph/conda-recipes/nvgraph/build.sh
new file mode 100644
index 00000000000..ebb2f3177f8
--- /dev/null
+++ b/cpp/nvgraph/conda-recipes/nvgraph/build.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+CMAKE_COMMON_VARIABLES=" -DCMAKE_INSTALL_PREFIX=$PREFIX -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX11_ABI=$CMAKE_CXX11_ABI"
+
+
+if [ -n "$MACOSX_DEPLOYMENT_TARGET" ]; then
+ # C++11 requires 10.9
+ # but cudatoolkit 8 is build for 10.11
+ export MACOSX_DEPLOYMENT_TARGET=10.11
+fi
+
+# show environment
+printenv
+# Cleanup local git
+git clean -xdf
+# Change directory for build process
+cd cpp
+# Use CMake-based build procedure
+mkdir build
+cd build
+# configure
+cmake $CMAKE_COMMON_VARIABLES ..
+# build
+make -j VERBOSE=1 install
\ No newline at end of file
diff --git a/cpp/nvgraph/conda-recipes/nvgraph/meta.yaml b/cpp/nvgraph/conda-recipes/nvgraph/meta.yaml
new file mode 100644
index 00000000000..d13066591aa
--- /dev/null
+++ b/cpp/nvgraph/conda-recipes/nvgraph/meta.yaml
@@ -0,0 +1,27 @@
+# Copyright (c) 2018, NVIDIA CORPORATION.
+
+# Usage:
+# conda build -c defaults -c conda-forge .
+{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %}
+{% set git_revision_count=environ.get('GIT_DESCRIBE_NUMBER', 0) %}
+{% set cuda_version='.'.join(environ.get('CUDA_VERSION', 'unknown').split('.')[:2]) %}
+package:
+ name: nvgraph
+ version: {{ version }}
+
+source:
+ path: ../..
+
+build:
+ number: {{ git_revision_count }}
+ string: cuda{{ cuda_version }}_{{ git_revision_count }}
+
+requirements:
+ build:
+ - cmake 3.12.4
+
+about:
+ home: http://nvidia.com/
+ license: LICENSE AGREEMENT FOR NVIDIA SOFTWARE DEVELOPMENT KITS
+ license_file: LICENSE
+ summary: nvgraph Library
diff --git a/cpp/nvgraph/conda_build.sh b/cpp/nvgraph/conda_build.sh
new file mode 100755
index 00000000000..4432989676c
--- /dev/null
+++ b/cpp/nvgraph/conda_build.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+set -xe
+
+conda install conda-build anaconda-client conda-verify -y
+conda build -c nvidia -c rapidsai -c conda-forge -c defaults conda-recipes/nvgraph
+
+if [ "$UPLOAD_PACKAGE" == '1' ]; then
+ export UPLOADFILE=`conda build -c nvidia -c rapidsai -c conda-forge -c defaults conda-recipes/nvgraph --output`
+ SOURCE_BRANCH=master
+
+ test -e ${UPLOADFILE}
+ CUDA_REL=${CUDA:0:3}
+ if [ "${CUDA:0:2}" == '10' ]; then
+ # CUDA 10 release
+ CUDA_REL=${CUDA:0:4}
+ fi
+
+ LABEL_OPTION="--label dev --label cuda${CUDA_REL}"
+ if [ "${LABEL_MAIN}" == '1' ]; then
+ LABEL_OPTION="--label main --label cuda${CUDA_REL}"
+ fi
+ echo "LABEL_OPTION=${LABEL_OPTION}"
+
+ if [ -z "$MY_UPLOAD_KEY" ]; then
+ echo "No upload key"
+ return 0
+ fi
+
+ echo "Upload"
+ echo ${UPLOADFILE}
+ anaconda -t ${MY_UPLOAD_KEY} upload -u nvidia ${LABEL_OPTION} --force ${UPLOADFILE}
+else
+ echo "Skipping upload"
+fi
\ No newline at end of file
diff --git a/cpp/nvgraph/cpp/CMakeLists.txt b/cpp/nvgraph/cpp/CMakeLists.txt
new file mode 100644
index 00000000000..42d365400e6
--- /dev/null
+++ b/cpp/nvgraph/cpp/CMakeLists.txt
@@ -0,0 +1,219 @@
+#=============================================================================
+# Copyright (c) 2019, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
+
+project(NV_GRAPH VERSION 0.4.0 LANGUAGES C CXX CUDA)
+
+###################################################################################################
+# - compiler options ------------------------------------------------------------------------------
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_C_COMPILER $ENV{CC})
+set(CMAKE_CXX_COMPILER $ENV{CXX})
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+set(CMAKE_CUDA_STANDARD 11)
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
+endif(CMAKE_COMPILER_IS_GNUCXX)
+
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70")
+
+# set warnings as errors
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Werror cross-execution-space-call -Xcompiler -Wall,-Werror")
+
+# set default build type
+set(CMAKE_BUILD_TYPE "Release")
+
+option(BUILD_TESTS "Configure CMake to build tests"
+ ON)
+
+if(CMAKE_COMPILER_IS_GNUCXX)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
+
+ option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" OFF)
+ if(CMAKE_CXX11_ABI)
+ message(STATUS "nvGraph: Enabling the GLIBCXX11 ABI")
+ else()
+ message(STATUS "nvGraph: Disabling the GLIBCXX11 ABI")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
+ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -D_GLIBCXX_USE_CXX11_ABI=0")
+ endif(CMAKE_CXX11_ABI)
+endif(CMAKE_COMPILER_IS_GNUCXX)
+
+###################################################################################################
+# - cmake modules ---------------------------------------------------------------------------------
+
+set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/" ${CMAKE_MODULE_PATH})
+
+include(FeatureSummary)
+include(CheckIncludeFiles)
+include(CheckLibraryExists)
+
+###################################################################################################
+# - add gtest -------------------------------------------------------------------------------------
+
+if(BUILD_TESTS)
+ include(CTest)
+ include(ConfigureGoogleTest)
+
+ if(GTEST_FOUND)
+ message(STATUS "Google C++ Testing Framework (Google Test) found in ${GTEST_ROOT}")
+ include_directories(${GTEST_INCLUDE_DIR})
+ add_subdirectory(${CMAKE_SOURCE_DIR}/tests)
+ else()
+ message(AUTHOR_WARNING "Google C++ Testing Framework (Google Test) not found: automated tests are disabled.")
+ endif(GTEST_FOUND)
+endif(BUILD_TESTS)
+
+###################################################################################################
+# - include paths ---------------------------------------------------------------------------------
+
+include_directories(
+ "${CMAKE_BINARY_DIR}/include"
+ "${CMAKE_SOURCE_DIR}/include"
+ "${CMAKE_SOURCE_DIR}/thirdparty/cub"
+ "${CMAKE_SOURCE_DIR}/thirdparty/cnmem/include"
+ "${CMAKE_SOURCE_DIR}/../external"
+ "${CMAKE_SOURCE_DIR}/../external/cusp"
+ "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}"
+ )
+
+###################################################################################################
+# - library paths ---------------------------------------------------------------------------------
+
+link_directories("${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}" # CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES is an undocumented/unsupported variable containing the link directories for nvcc
+ "${CMAKE_BINARY_DIR}/lib"
+ "${GTEST_LIBRARY_DIR}")
+
+###################################################################################################
+# - library targets -------------------------------------------------------------------------------
+if(NVGRAPH_LIGHT MATCHES True)
+ add_library(nvgraph_rapids SHARED
+ thirdparty/cnmem/src/cnmem.cpp
+ src/arnoldi.cu
+ src/bfs.cu
+ src/bfs2d.cu
+ src/bfs_kernels.cu
+ src/convert.cu
+ src/csrmv.cu
+ src/csrmv_cub.cu
+ src/csr_graph.cpp
+ src/graph_extractor.cu
+ src/jaccard_gpu.cu
+ src/kmeans.cu
+ src/lanczos.cu
+ src/lobpcg.cu
+ src/matrix.cu
+ src/modularity_maximization.cu
+ src/nvgraph.cu
+ src/nvgraph_cusparse.cpp
+ src/nvgraph_cublas.cpp
+ src/nvgraph_error.cu
+ src/nvgraph_lapack.cu
+ src/nvgraph_vector_kernels.cu
+ src/pagerank.cu
+ src/pagerank_kernels.cu
+ src/partition.cu
+ src/size2_selector.cu
+ src/sssp.cu
+ src/triangles_counting.cpp
+ src/triangles_counting_kernels.cu
+ src/valued_csr_graph.cpp
+ src/widest_path.cu
+ )
+else(NVGRAPH_LIGHT MATCHES True)
+ add_library(nvgraph_rapids SHARED
+ thirdparty/cnmem/src/cnmem.cpp
+ src/arnoldi.cu
+ src/bfs.cu
+ src/bfs2d.cu
+ src/bfs_kernels.cu
+ src/convert.cu
+ src/csrmv.cu
+ src/csrmv_cub.cu
+ src/csr_graph.cpp
+ src/graph_extractor.cu
+ src/jaccard_gpu.cu
+ src/kmeans.cu
+ src/lanczos.cu
+ src/lobpcg.cu
+ src/matrix.cu
+ src/modularity_maximization.cu
+ src/nvgraph.cu
+ src/nvgraph_cusparse.cpp
+ src/nvgraph_cublas.cpp
+ src/nvgraph_error.cu
+ src/nvgraph_lapack.cu
+ src/nvgraph_vector_kernels.cu
+ src/pagerank.cu
+ src/pagerank_kernels.cu
+ src/partition.cu
+ src/size2_selector.cu
+ src/sssp.cu
+ src/triangles_counting.cpp
+ src/triangles_counting_kernels.cu
+ src/valued_csr_graph.cpp
+ src/widest_path.cu
+ src/graph_contraction/contraction_csr_max.cu
+ src/graph_contraction/contraction_csr_sum.cu
+ src/graph_contraction/contraction_mv_double_mul.cu
+ src/graph_contraction/contraction_mv_float_min.cu
+ src/graph_contraction/contraction_csr_min.cu
+ src/graph_contraction/contraction_mv_double_max.cu
+ src/graph_contraction/contraction_mv_double_sum.cu
+ src/graph_contraction/contraction_mv_float_mul.cu
+ src/graph_contraction/contraction_csr_mul.cu
+ src/graph_contraction/contraction_mv_double_min.cu
+ src/graph_contraction/contraction_mv_float_max.cu
+ src/graph_contraction/contraction_mv_float_sum.cu
+ )
+endif(NVGRAPH_LIGHT MATCHES True)
+
+###################################################################################################
+# - build options ---------------------------------------------------------------------------------
+
+if(CMAKE_BUILD_TYPE MATCHES Debug)
+ message(STATUS "Building with debugging flags")
+ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")
+endif(CMAKE_BUILD_TYPE MATCHES Debug)
+
+if(NVGRAPH_LIGHT MATCHES True)
+ add_definitions( -DNVGRAPH_LIGHT=${NVGRAPH_LIGHT} )
+endif(NVGRAPH_LIGHT MATCHES True)
+
+
+###################################################################################################
+# - link libraries --------------------------------------------------------------------------------
+
+target_link_libraries(nvgraph_rapids cublas cusparse curand cusolver cudart )
+
+###################################################################################################
+# - install targets -------------------------------------------------------------------------------
+
+install(TARGETS nvgraph_rapids
+ DESTINATION lib)
+
+install(FILES
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/nvgraph.h
+ ${CMAKE_CURRENT_SOURCE_DIR}/include/test_opt_utils.cuh
+ DESTINATION include/nvgraph)
+
+
diff --git a/cpp/nvgraph/cpp/cmake/Modules/ConfigureGoogleTest.cmake b/cpp/nvgraph/cpp/cmake/Modules/ConfigureGoogleTest.cmake
new file mode 100644
index 00000000000..6120dc51aba
--- /dev/null
+++ b/cpp/nvgraph/cpp/cmake/Modules/ConfigureGoogleTest.cmake
@@ -0,0 +1,55 @@
+set(GTEST_ROOT "${CMAKE_BINARY_DIR}/googletest")
+
+set(GTEST_CMAKE_ARGS " -Dgtest_build_samples=ON"
+ " -DCMAKE_VERBOSE_MAKEFILE=ON")
+if(NOT CMAKE_CXX11_ABI)
+ message(STATUS "GTEST: Disabling the GLIBCXX11 ABI")
+ list(APPEND GTEST_CMAKE_ARGS " -DCMAKE_C_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=0")
+ list(APPEND GTEST_CMAKE_ARGS " -DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=0")
+elseif(CMAKE_CXX11_ABI)
+ message(STATUS "GTEST: Enabling the GLIBCXX11 ABI")
+ list(APPEND GTEST_CMAKE_ARGS " -DCMAKE_C_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=1")
+ list(APPEND GTEST_CMAKE_ARGS " -DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=1")
+endif(NOT CMAKE_CXX11_ABI)
+
+configure_file("${CMAKE_SOURCE_DIR}/cmake/Templates/GoogleTest.CMakeLists.txt.cmake"
+ "${GTEST_ROOT}/CMakeLists.txt")
+
+file(MAKE_DIRECTORY "${GTEST_ROOT}/build")
+file(MAKE_DIRECTORY "${GTEST_ROOT}/install")
+
+execute_process(COMMAND ${CMAKE_COMMAND} -G ${CMAKE_GENERATOR} .
+ RESULT_VARIABLE GTEST_CONFIG
+ WORKING_DIRECTORY ${GTEST_ROOT})
+
+if(GTEST_CONFIG)
+ message(FATAL_ERROR "Configuring GoogleTest failed: " ${GTEST_CONFIG})
+endif(GTEST_CONFIG)
+
+# Parallel builds cause Travis to run out of memory
+unset(PARALLEL_BUILD)
+if($ENV{TRAVIS})
+ if(NOT DEFINED ENV{CMAKE_BUILD_PARALLEL_LEVEL})
+ message(STATUS "Disabling Parallel CMake build on Travis")
+ else()
+ set(PARALLEL_BUILD --parallel)
+ message(STATUS "Using $ENV{CMAKE_BUILD_PARALLEL_LEVEL} build jobs on Travis")
+ endif(NOT DEFINED ENV{CMAKE_BUILD_PARALLEL_LEVEL})
+else()
+ set(PARALLEL_BUILD --parallel)
+ message(STATUS "Enabling Parallel CMake build")
+endif($ENV{TRAVIS})
+
+execute_process(COMMAND ${CMAKE_COMMAND} --build ${PARALLEL_BUILD} ..
+ RESULT_VARIABLE GTEST_BUILD
+ WORKING_DIRECTORY ${GTEST_ROOT}/build)
+
+if(GTEST_BUILD)
+ message(FATAL_ERROR "Building GoogleTest failed: " ${GTEST_BUILD})
+endif(GTEST_BUILD)
+
+message(STATUS "GoogleTest installed here: " ${GTEST_ROOT}/install)
+set(GTEST_INCLUDE_DIR "${GTEST_ROOT}/install/include")
+set(GTEST_LIBRARY_DIR "${GTEST_ROOT}/install/lib")
+set(GTEST_FOUND TRUE)
+
diff --git a/cpp/nvgraph/cpp/cmake/Templates/GoogleTest.CMakeLists.txt.cmake b/cpp/nvgraph/cpp/cmake/Templates/GoogleTest.CMakeLists.txt.cmake
new file mode 100644
index 00000000000..66e1dc85a50
--- /dev/null
+++ b/cpp/nvgraph/cpp/cmake/Templates/GoogleTest.CMakeLists.txt.cmake
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.12)
+
+include(ExternalProject)
+
+ExternalProject_Add(GoogleTest
+ GIT_REPOSITORY https://github.com/google/googletest.git
+ GIT_TAG release-1.8.0
+ SOURCE_DIR "${GTEST_ROOT}/googletest"
+ BINARY_DIR "${GTEST_ROOT}/build"
+ INSTALL_DIR "${GTEST_ROOT}/install"
+ CMAKE_ARGS ${GTEST_CMAKE_ARGS} -DCMAKE_INSTALL_PREFIX=${GTEST_ROOT}/install)
+
+
+
+
+
+
+
+
diff --git a/cpp/nvgraph/cpp/include/2d_partitioning.h b/cpp/nvgraph/cpp/include/2d_partitioning.h
new file mode 100644
index 00000000000..c344990db12
--- /dev/null
+++ b/cpp/nvgraph/cpp/include/2d_partitioning.h
@@ -0,0 +1,1376 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /*
+ * 2d_partitioning.h
+ *
+ * Created on: Apr 9, 2018
+ * Author: jwyles
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <cuda_runtime.h>
+#include <cub/cub.cuh>
+#include <thrust/execution_policy.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/scan.h>
+#include <thrust/sort.h>
+#include <thrust/tuple.h>
+
+#include "multi_valued_csr_graph.hxx"
+namespace nvgraph {
+
+ template <typename T, typename W>
+ struct CSR_Result_Weighted {
+ int64_t size;
+ int64_t nnz;
+ T* rowOffsets;
+ T* colIndices;
+ W* edgeWeights;
+
+ CSR_Result_Weighted() :
+ size(0), nnz(0), rowOffsets(NULL), colIndices(NULL), edgeWeights(NULL) {
+ }
+
+ void Destroy() {
+ if (rowOffsets)
+ cudaFree(rowOffsets);
+ if (colIndices)
+ cudaFree(colIndices);
+ if (edgeWeights)
+ cudaFree(edgeWeights);
+ }
+ };
+
+ // Define kernel for copying run length encoded values into offset slots.
+ template <typename T>
+ __global__ void offsetsKernel(T runCounts, T* unique, T* counts, T* offsets) {
+ for (int32_t idx = blockDim.x * blockIdx.x + threadIdx.x;
+ idx < runCounts;
+ idx += gridDim.x * blockDim.x) {
+ offsets[unique[idx]] = counts[idx];
+ }
+ }
+
+ /**
+ * Method for converting COO to CSR format
+ * @param sources The array of source indices
+ * @param destinations The array of destination indices
+ * @param edgeWeights The array of edge weights
+ * @param nnz The number of non zero values
+ * @param maxId The largest id contained in the matrix
+ * @param result The result is stored here.
+ */
+ template <typename T, typename W>
+ void ConvertCOOtoCSR_weighted(T* sources,
+ T* destinations,
+ W* edgeWeights,
+ int64_t nnz,
+ T maxId,
+ CSR_Result_Weighted<T, W>& result) {
+ // Sort source and destination columns by source
+ // Allocate local memory for operating on
+ T* srcs, *dests;
+ W* weights = NULL;
+ cudaMalloc(&srcs, sizeof(T) * nnz);
+ cudaMalloc(&dests, sizeof(T) * nnz);
+ if (edgeWeights)
+ cudaMalloc(&weights, sizeof(W) * nnz);
+ cudaMemcpy(srcs, sources, sizeof(T) * nnz, cudaMemcpyDefault);
+ cudaMemcpy(dests, destinations, sizeof(T) * nnz, cudaMemcpyDefault);
+ if (edgeWeights)
+ cudaMemcpy(weights, edgeWeights, sizeof(W) * nnz, cudaMemcpyDefault);
+
+ // Call Thrust::sort_by_key to sort the arrays with srcs as keys:
+ if (edgeWeights)
+ thrust::sort_by_key(thrust::device,
+ srcs,
+ srcs + nnz,
+ thrust::make_zip_iterator(thrust::make_tuple(dests, weights)));
+ else
+ thrust::sort_by_key(thrust::device, srcs, srcs + nnz, dests);
+
+ result.size = maxId + 1;
+
+ // Allocate offsets array
+ cudaMalloc(&result.rowOffsets, (maxId + 2) * sizeof(T));
+
+ // Set all values in offsets array to zeros
+ cudaMemset(result.rowOffsets, 0, (maxId + 2) * sizeof(T));
+
+ // Allocate temporary arrays same size as sources array, and single value to get run counts
+ T* unique, *counts, *runCount;
+ cudaMalloc(&unique, (maxId + 1) * sizeof(T));
+ cudaMalloc(&counts, (maxId + 1) * sizeof(T));
+ cudaMalloc(&runCount, sizeof(T));
+
+ // Use CUB run length encoding to get unique values and run lengths
+ void *tmpStorage = NULL;
+ size_t tmpBytes = 0;
+ cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz);
+ cudaMalloc(&tmpStorage, tmpBytes);
+ cub::DeviceRunLengthEncode::Encode(tmpStorage, tmpBytes, srcs, unique, counts, runCount, nnz);
+ cudaFree(tmpStorage);
+
+ // Set offsets to run sizes for each index
+ T runCount_h;
+ cudaMemcpy(&runCount_h, runCount, sizeof(T), cudaMemcpyDefault);
+ int threadsPerBlock = 1024;
+ int numBlocks = min(65535, (runCount_h + threadsPerBlock - 1) / threadsPerBlock);
+ offsetsKernel<<<numBlocks, threadsPerBlock>>>(runCount_h, unique, counts, result.rowOffsets);
+
+ // Scan offsets to get final offsets
+ thrust::exclusive_scan(thrust::device,
+ result.rowOffsets,
+ result.rowOffsets + maxId + 2,
+ result.rowOffsets);
+
+ // Clean up temporary allocations
+ result.nnz = nnz;
+ result.colIndices = dests;
+ result.edgeWeights = weights;
+ cudaFree(srcs);
+ cudaFree(unique);
+ cudaFree(counts);
+ cudaFree(runCount);
+ }
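As a usage illustration of the conversion routine above, here is a hypothetical host-side snippet. It is an editorial sketch, not part of this header or the PR; it assumes the translation unit is compiled as CUDA with `nvcc` using the repository's include paths (the file name `2d_partitioning.h` is the header being added here).

```cpp
// Hypothetical sketch: convert a 4-edge weighted COO graph held on the GPU to CSR.
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include "2d_partitioning.h"

int main() {
  // Edges: 0->1, 0->2, 1->2, 2->0 with weights 1..4; vertex ids go up to maxId = 2.
  std::vector<int>   h_src = {0, 0, 1, 2};
  std::vector<int>   h_dst = {1, 2, 2, 0};
  std::vector<float> h_wgt = {1.f, 2.f, 3.f, 4.f};
  const int64_t nnz = 4;
  const int maxId = 2;

  int *d_src, *d_dst;
  float *d_wgt;
  cudaMalloc(&d_src, nnz * sizeof(int));
  cudaMalloc(&d_dst, nnz * sizeof(int));
  cudaMalloc(&d_wgt, nnz * sizeof(float));
  cudaMemcpy(d_src, h_src.data(), nnz * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_dst, h_dst.data(), nnz * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_wgt, h_wgt.data(), nnz * sizeof(float), cudaMemcpyHostToDevice);

  nvgraph::CSR_Result_Weighted<int, float> csr;
  nvgraph::ConvertCOOtoCSR_weighted(d_src, d_dst, d_wgt, nnz, maxId, csr);

  // Row offsets live on the device; copy them back. For this graph, expect [0, 2, 3, 4].
  std::vector<int> h_offsets(maxId + 2);
  cudaMemcpy(h_offsets.data(), csr.rowOffsets, h_offsets.size() * sizeof(int),
             cudaMemcpyDeviceToHost);
  for (int o : h_offsets) std::printf("%d ", o);
  std::printf("\n");

  csr.Destroy();  // frees rowOffsets/colIndices/edgeWeights allocated by the conversion
  cudaFree(d_src);
  cudaFree(d_dst);
  cudaFree(d_wgt);
  return 0;
}
```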
+
+ /**
+ * Describes the 2D decomposition of a partitioned matrix.
+ */
+ template <typename GlobalType, typename LocalType>
+ class MatrixDecompositionDescription {
+ protected:
+ GlobalType numRows; // Global number of rows in matrix
+ GlobalType numCols; // Global number of columns in matrix
+ GlobalType nnz; // Global number of non-zeroes in matrix
+ GlobalType blockRows; // Number of rows of blocks in the decomposition
+ GlobalType blockCols; // Number of columns of blocks in the decomposition
+ LocalType offset;
+ // Offsets-like arrays for rows and columns defining the start/end of the
+ // sections of the global id space belonging to each row and column.
+ std::vector<GlobalType> rowOffsets;
+ std::vector<GlobalType> colOffsets;
+ // Array of integers, one for each block, defining the device it is assigned to
+ std::vector<int32_t> deviceAssignments;
+ std::vector<cudaStream_t> blockStreams;
+ public:
+
+ MatrixDecompositionDescription() :
+ numRows(0), numCols(0), nnz(0), blockRows(0), blockCols(0) {
+ rowOffsets.push_back(0);
+ colOffsets.push_back(0);
+ deviceAssignments.push_back(0);
+ }
+
+ // Basic constructor, just takes in the values of its members.
+ MatrixDecompositionDescription(GlobalType numRows,
+ GlobalType numCols,
+ GlobalType nnz,
+ GlobalType blockRows,
+ GlobalType blockCols,
+ std::vector<GlobalType> rowOffsets,
+ std::vector<GlobalType> colOffsets,
+ std::vector<int32_t> deviceAssignments) :
+ numRows(numRows), numCols(numCols), nnz(nnz), blockRows(blockRows),
+ blockCols(blockCols), rowOffsets(rowOffsets), colOffsets(colOffsets),
+ deviceAssignments(deviceAssignments) {
+ }
+
+ // Constructs a MatrixDecompositionDescription for a square matrix given the
+ // number of rows in the matrix and number of rows of blocks.
+ MatrixDecompositionDescription(GlobalType numRows,
+ GlobalType numBlockRows,
+ GlobalType nnz,
+ std::vector<int32_t> devices) :
+ numRows(numRows),
+ numCols(numRows),
+ blockRows(numBlockRows),
+ blockCols(numBlockRows),
+ nnz(nnz) {
+ // Tracking the current set device to change back
+ int currentDevice;
+ cudaGetDevice(&currentDevice);
+
+ // Setting up the row and col offsets into equally sized chunks
+ GlobalType remainder = numRows % blockRows;
+ if (remainder != 0)
+ offset = (numRows + blockRows - remainder) / blockRows;
+ else
+ offset = numRows / blockRows;
+
+ rowOffsets.resize(blockRows + 1);
+ colOffsets.resize(blockRows + 1);
+ for (int i = 0; i < blockRows; i++) {
+ rowOffsets[i] = i * offset;
+ colOffsets[i] = i * offset;
+ }
+ rowOffsets.back() = blockRows * offset;
+ colOffsets.back() = blockCols * offset;
+
+ // Setting up the device assignments using the given device ids and also
+ // setting up the stream associated with each block.
+ deviceAssignments.resize(getNumBlocks());
+ blockStreams.resize(getNumBlocks());
+ for (int i = 0; i < getNumBlocks(); i++) {
+ int device = devices[i % devices.size()];
+ deviceAssignments[i] = device;
+ cudaSetDevice(device);
+ cudaStream_t stream;
+ cudaStreamCreate(&stream);
+ blockStreams[i] = stream;
+ }
+
+ // Restoring to current device when called
+ cudaSetDevice(currentDevice);
+ }
+
+ // Gets the row id for the block containing the given global row id
+ int32_t getRowId(GlobalType val) const {
+ return std::upper_bound(rowOffsets.begin(), rowOffsets.end(), val) - rowOffsets.begin() - 1;
+ }
+
+ // Gets the column id for the block containing the given global column id
+ int32_t getColId(GlobalType val) const {
+ return std::upper_bound(colOffsets.begin(), colOffsets.end(), val) - colOffsets.begin() - 1;
+ }
+
+ // Gets the number of blocks in the decomposition:
+ int32_t getNumBlocks() const {
+ return blockRows * blockCols;
+ }
+
+ // Getter for offset
+ LocalType getOffset() const {
+ return offset;
+ }
+
+ // Getter for deviceAssignments
+ const std::vector<int32_t>& getDeviceAssignments() const {
+ return deviceAssignments;
+ }
+
+ /**
+ * Getter for vector of streams for each block.
+ * @return Reference to vector of streams for each block
+ */
+ const std::vector<cudaStream_t>& getBlockStreams() const {
+ return blockStreams;
+ }
+
+ /**
+ * Getter for nnz
+ * @return The global number of non-zero elements
+ */
+ GlobalType getNnz() const {
+ return nnz;
+ }
+
+ /**
+ * Getter method for numRows
+ * @return The number of global rows in the matrix
+ */
+ GlobalType getNumRows() const {
+ return numRows;
+ }
+
+ /**
+ * Getter for BlockRows
+ * @return The number of blocks in a row in the decomposition.
+ */
+ GlobalType getBlockRows() const {
+ return blockRows;
+ }
+
+ /**
+ * Getter for BlockCols
+ * @return The number of blocks in a column in the decomposition.
+ */
+ GlobalType getBlockCols() const {
+ return blockCols;
+ }
+
+ /**
+ * Given a block id, returns the row which that block is in.
+ * @param bId The block ID
+ * @return The row number
+ */
+ int32_t getBlockRow(int32_t bId) const {
+ return bId / blockCols;
+ }
+
+ /**
+ * Given a block id, returns the column which that block is in.
+ * @param bId The block ID
+ * @return The column number
+ */
+ int32_t getBlockCol(int32_t bId) const {
+ return bId % blockCols;
+ }
+
+ /**
+ * Takes a COO global row and produces the COO local row and the block to which it belongs.
+ * @param globalRow The global row ID
+ * @param globalCol The global column ID
+ * @param localRow The block local row ID (return)
+ * @param localCol The block local column ID (return)
+ * @param blockId The block ID (return)
+ */
+ void convertGlobaltoLocalRow(GlobalType globalRow,
+ GlobalType globalCol,
+ LocalType& localRow,
+ LocalType& localCol,
+ int32_t& blockId) const {
+ int32_t rowId = getRowId(globalRow);
+ int32_t colId = getColId(globalCol);
+ blockId = rowId * blockCols + colId;
+ localRow = globalRow - rowOffsets[rowId];
+ localCol = globalCol - colOffsets[colId];
+ }
+
+ /**
+ * Takes in a row ID and column ID and returns the corresponding block ID
+ * @param rowId The row ID
+ * @param colId The column ID
+ * @return The ID of the corresponding block
+ */
+ int32_t getBlockId(int32_t rowId, int32_t colId) const {
+ return rowId * blockCols + colId;
+ }
+
+ /**
+ * Helper method to synchronize all streams after operations are issued.
+ */
+ void syncAllStreams() const {
+ int32_t numBlocks = getNumBlocks();
+ int32_t current_device;
+ cudaGetDevice(&current_device);
+ for (int32_t i = 0; i < numBlocks; i++) {
+ cudaSetDevice(deviceAssignments[i]);
+ cudaStreamSynchronize(blockStreams[i]);
+ }
+ cudaSetDevice(current_device);
+ }
+
+ /**
+ * This method is only for testing and debugging use.
+ * @return A human readable string representation of the object
+ */
+ std::string toString() const {
+ std::stringstream ss;
+ ss << "Global Info:\n\tnumRows: " << numRows << ", numCols: " << numCols << ", nnz: "
+ << nnz;
+ ss << "\n";
+ ss << "Block Info:\n\tblockRows: " << blockRows << ", blockCols: " << blockCols;
+ ss << "\n";
+ ss << "rowOffsets: [";
+ for (int i = 0; i < (int) rowOffsets.size(); i++)
+ ss << rowOffsets[i] << (i == (int) rowOffsets.size() - 1 ? "]\n" : ", ");
+ ss << "colOffsets: [";
+ for (int i = 0; i < (int) colOffsets.size(); i++)
+ ss << colOffsets[i] << (i == (int) colOffsets.size() - 1 ? "]\n" : ", ");
+ ss << "deviceAssignments: [";
+ for (int i = 0; i < (int) deviceAssignments.size(); i++)
+ ss << deviceAssignments[i] << (i == (int) deviceAssignments.size() - 1 ? "]\n" : ", ");
+ return ss.str();
+ }
+ };
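For readers skimming the patch, here is a minimal usage sketch (not part of the diff) of the decomposition description above. It assumes the template parameters are ordered `<GlobalType, LocalType, ValueType>` and that device ids 0 and 1 are valid on the machine; all names and values are illustrative only.

```cpp
// Sketch: a 2x2 block decomposition of an 8x8 matrix spread over two GPUs.
std::vector<int32_t> devices = {0, 1};
MatrixDecompositionDescription<int64_t, int32_t, float>
    descr(8 /*numRows*/, 2 /*numBlockRows*/, 24 /*nnz*/, devices);

// offset = 8 / 2 = 4, so rowOffsets = colOffsets = [0, 4, 8].
int32_t localRow, localCol, blockId;
descr.convertGlobaltoLocalRow(5, 2, localRow, localCol, blockId);
// Global row 5 is in block-row 1, global column 2 is in block-column 0:
// blockId = 1 * 2 + 0 = 2, localRow = 5 - 4 = 1, localCol = 2 - 0 = 2.
```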
+
+ template <typename GlobalType, typename LocalType, typename ValueType>
+ class Matrix2d {
+ protected:
+ // Description of the matrix decomposition
+ MatrixDecompositionDescription description;
+
+ // Array of block matrices forming the decomposition
+ std::vector*> blocks;
+ public:
+ Matrix2d() {
+ }
+ Matrix2d(MatrixDecompositionDescription descr,
+ std::vector*> blocks) :
+ description(descr), blocks(blocks) {
+ }
+
+ const MatrixDecompositionDescription& getMatrixDecompositionDescription() {
+ return description;
+ }
+
+ MultiValuedCsrGraph* getBlockMatrix(int32_t bId) {
+ return blocks[bId];
+ }
+
+ std::string toString() {
+ std::stringstream ss;
+ ss << "MatrixDecompositionDescription:\n" << description.toString();
+ for (int i = 0; i < (int) blocks.size(); i++) {
+ ss << "Block " << i << ":\n";
+ size_t numVerts = blocks[i]->get_num_vertices();
+ size_t numEdges = blocks[i]->get_num_edges();
+ size_t numValues = blocks[i]->getNumValues();
+ ss << "numVerts: " << numVerts << ", numEdges: " << numEdges << "\n";
+ LocalType* rowOffsets = (LocalType*) malloc((numVerts + 1) * sizeof(LocalType));
+ LocalType* colIndices = (LocalType*) malloc(numEdges * sizeof(LocalType));
+ ValueType* values = NULL;
+ if (numValues > 0)
+ values = (ValueType*) malloc(numEdges * sizeof(ValueType));
+ cudaMemcpy(rowOffsets,
+ blocks[i]->get_raw_row_offsets(),
+ (numVerts + 1) * sizeof(LocalType),
+ cudaMemcpyDefault);
+ cudaMemcpy(colIndices,
+ blocks[i]->get_raw_column_indices(),
+ numEdges * sizeof(LocalType),
+ cudaMemcpyDefault);
+ if (values)
+ cudaMemcpy(values,
+ blocks[i]->get_raw_edge_dim(0),
+ numEdges * sizeof(ValueType),
+ cudaMemcpyDefault);
+ int idxCount = numEdges >= (numVerts + 1) ? numEdges : (numVerts + 1);
+ ss << "Idx\tOffset\tColInd\tValue\n";
+ for (int j = 0; j < idxCount; j++) {
+ if (j < (int) numVerts + 1 && j < (int) numEdges)
+ ss << j << ":\t" << rowOffsets[j] << "\t" << colIndices[j] << "\t"
+ << (values ? values[j] : 0)
+ << "\n";
+ else if (j < (int) numVerts + 1 && j >= (int) numEdges)
+ ss << j << ":\t" << rowOffsets[j] << "\n";
+ else if (j >= (int) numVerts + 1 && j < (int) numEdges)
+ ss << j << ":\t" << "\t" << colIndices[j] << "\t" << (values ? values[j] : 0)
+ << "\n";
+ }
+ free(rowOffsets);
+ free(colIndices);
+ free(values);
+ }
+ return ss.str();
+ }
+ };
+
+ template <typename GlobalType, typename LocalType, typename ValueType>
+ class VertexData2D {
+ const MatrixDecompositionDescription* description;
+ int32_t n;
+ std::vector<cub::DoubleBuffer<ValueType> > values;
+ public:
+ /**
+ * Creates a VertexData2D object given a pointer to a MatrixDecompositionDescription
+ * object which describes the matrix the data is attached to. Data buffers are
+ * allocated for each block using the offset from the description to size the
+ * buffers, and to locate the buffers on the same GPU as the matrix block.
+ */
+ VertexData2D(const MatrixDecompositionDescription* descr) :
+ description(descr) {
+ // Resize the values array to be the same size as number of blocks
+ values.resize(descr->getNumBlocks());
+
+ // Grab the current device id to switch back after allocations are done
+ int current_device;
+ cudaGetDevice(&current_device);
+ LocalType allocSize = descr->getOffset();
+ n = allocSize;
+ // Allocate the data for each block
+ for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) {
+ int device = descr->getDeviceAssignments()[i];
+ cudaSetDevice(device);
+ ValueType* d_current, *d_alternate;
+ cudaMalloc(&d_current, sizeof(ValueType) * n);
+ cudaMalloc(&d_alternate, sizeof(ValueType) * n);
+ values[i].d_buffers[0] = d_current;
+ values[i].d_buffers[1] = d_alternate;
+ }
+
+ // Set the device back to what it was initially
+ cudaSetDevice(current_device);
+ }
+
+ /**
+ * Creates a VertexData2D object given a pointer to a MatrixDecompositionDescription
+ * object, which describes the matrix the data is attached to, and an integer which indicates
+ * how many data elements should be allocated for each block. Data buffers are allocated
+ * for each block using the offset from the description to size the buffers, and to locate
+ * the buffers on the same GPU as the matrix block.
+ */
+ VertexData2D(const MatrixDecompositionDescription* descr, size_t _n) :
+ description(descr) {
+ // Resize the values array to be the same size as number of blocks
+ values.resize(descr->getNumBlocks());
+
+ // Grab the current device id to switch back after allocations are done
+ int current_device;
+ cudaGetDevice(&current_device);
+ LocalType allocSize = _n;
+ n = allocSize;
+ // Allocate the data for each block
+ for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) {
+ int device = descr->getDeviceAssignments()[i];
+ cudaSetDevice(device);
+ ValueType* d_current, *d_alternate;
+ cudaMalloc(&d_current, sizeof(ValueType) * n);
+ cudaMalloc(&d_alternate, sizeof(ValueType) * n);
+ values[i].d_buffers[0] = d_current;
+ values[i].d_buffers[1] = d_alternate;
+ }
+
+ // Set the device back to what it was initially
+ cudaSetDevice(current_device);
+ }
+
+ ~VertexData2D() {
+ for (size_t i = 0; i < values.size(); i++) {
+ if (values[i].Current())
+ cudaFree(values[i].Current());
+ if (values[i].Alternate())
+ cudaFree(values[i].Alternate());
+ }
+ }
+
+ /**
+ * Getter for n, the size of each block's allocation in elements.
+ * @return The value of n
+ */
+ int32_t getN() {
+ return n;
+ }
+
+ /**
+ * Getter for the MatrixDecompositionDescription associated with this VertexData2D
+ * @return Pointer to the MatrixDecompositionDescription for this VertexData2D
+ */
+ const MatrixDecompositionDescription* getDescription() {
+ return description;
+ }
+
+ /**
+ * Gets the current buffer corresponding to the given block ID
+ */
+ ValueType* getCurrent(int bId) {
+ return values[bId].Current();
+ }
+
+ /**
+ * Gets the alternate buffer corresponding to the given block ID
+ */
+ ValueType* getAlternate(int bId) {
+ return values[bId].Alternate();
+ }
+
+ /**
+ * Swaps the current and alternate buffers for all block IDs
+ */
+ void swapBuffers() {
+ for (size_t i = 0; i < values.size(); i++)
+ values[i].selector ^= 1;
+ }
+
+ /**
+ * Sets an element in the global array, assuming that the data is currently
+ * valid and in the diagonal blocks. After calling this method either columnScatter
+ * or rowScatter should be called to propagate the change to all blocks.
+ */
+ void setElement(GlobalType globalIndex, ValueType val) {
+ LocalType blockId = globalIndex / n;
+ LocalType blockOffset = globalIndex % n;
+ int32_t bId = description->getBlockId(blockId, blockId);
+ ValueType* copyTo = values[bId].Current() + blockOffset;
+ cudaMemcpy(copyTo, &val, sizeof(ValueType), cudaMemcpyDefault);
+ }
+
+ /**
+ * Sets the elements of the global array, using the provided array of values. The values
+ * are set in the diagonal blocks; columnScatter or rowScatter should be called
+ * to propagate them to all blocks.
+ * @param vals Pointer to an array with the values to be set.
+ */
+ void setElements(ValueType* vals) {
+ LocalType offset = description->getOffset();
+ int32_t numRows = description->getBlockRows();
+ for (int i = 0; i < numRows; i++) {
+ int32_t id = description->getBlockId(i, i);
+ cudaStream_t stream = description->getBlockStreams()[id];
+ ValueType* copyFrom = vals + i * n;
+ ValueType* copyTo = values[id].Current();
+ cudaMemcpyAsync(copyTo, copyFrom, sizeof(ValueType) * n, cudaMemcpyDefault, stream);
+ }
+ description->syncAllStreams();
+ }
+
+ /**
+ * Fills the elements of the data array with the given value.
+ * The elements on the diagonal are filled with the given value. After filling,
+ * either rowScatter or columnScatter will copy the values across the blocks in
+ * either the rows or columns depending on the use.
+ * @param val The value to fill the array with
+ */
+ void fillElements(ValueType val) {
+ int current_device;
+ cudaGetDevice(&current_device);
+ int32_t numRows = description->getBlockRows();
+ for (int32_t i = 0; i < numRows; i++) {
+ int32_t blockId = description->getBlockId(i, i);
+ ValueType* vals = getCurrent(blockId);
+ int deviceId = description->getDeviceAssignments()[blockId];
+ cudaStream_t stream = description->getBlockStreams()[blockId];
+ cudaSetDevice(deviceId);
+ thrust::fill(thrust::cuda::par.on(stream), vals, vals + n, val);
+ }
+ description->syncAllStreams();
+ cudaSetDevice(current_device);
+ }
+
+ /**
+ * Copies the values of the diagonal blocks in this VertexData2D into the
+ * VertexData2D specified.
+ * @param other Pointer to the VertexData2D to copy into
+ */
+ void copyTo(VertexData2D* other) {
+ const MatrixDecompositionDescription* otherDescr =
+ other->getDescription();
+ // Do a quick check that the sizes of both block arrays are the same.
+ if (description->getBlockRows() == otherDescr->getBlockRows() && n == other->getN()) {
+ // Issue asynchronous copies for each block's data
+ for (int i = 0; i < description->getBlockRows(); i++) {
+ int32_t bId = description->getBlockId(i, i);
+ ValueType* copyFrom = getCurrent(bId);
+ ValueType* copyTo = other->getCurrent(bId);
+ cudaStream_t stream = description->getBlockStreams()[bId];
+ cudaMemcpyAsync(copyTo, copyFrom, n * sizeof(ValueType), cudaMemcpyDefault, stream);
+ }
+ // Synchronize the streams after the copies are done
+ for (int i = 0; i < description->getBlockRows(); i++) {
+ int32_t bId = description->getBlockId(i, i);
+ cudaStream_t stream = description->getBlockStreams()[bId];
+ cudaStreamSynchronize(stream);
+ }
+ }
+ }
+
+ /**
+ * This method implements a row-wise reduction of each block's data into a
+ * single array for each row. The block on the diagonal will have the result.
+ */
+ template <typename Operator>
+ void rowReduce() {
+ int current_device;
+ cudaGetDevice(&current_device);
+ Operator op;
+
+ // For each row in the decomposition:
+ int32_t numRows = description->getBlockRows();
+ std::vector<int32_t> blockIds;
+ for (int32_t i = 0; i < numRows; i++) {
+ // Put all the block ids for the row into a vector, with the ID of the diagonal block
+ // at index 0.
+ std::vector<int32_t> blockIds;
+ blockIds.push_back(-1);
+ for (int32_t j = 0; j < numRows; j++) {
+ if (i == j) {
+ blockIds[0] = description->getBlockId(i, j);
+ }
+ else {
+ blockIds.push_back(description->getBlockId(i, j));
+ }
+ }
+
+ // Do a binary tree reduction. At each step the primary buffer of the sender is
+ // copied into the secondary buffer of the receiver. After the copy is done
+ // each receiver performs the reduction operator and stores the result in its
+ // primary buffer.
+ for (int32_t j = 2; (j / 2) < numRows; j *= 2) {
+ for (int32_t id = 0; id < numRows; id++) {
+ if (id % j == 0 && id + j / 2 < numRows) {
+ // blockIds[id] is the receiver
+ int32_t receiverId = blockIds[id];
+
+ // blockIds[id + j/2] is the sender
+ int32_t senderId = blockIds[id + j / 2];
+
+ // Get the stream associated with the receiver's block id
+ cudaStream_t stream = description->getBlockStreams()[receiverId];
+
+ // Copy from the sender to the receiver (use stream associated with receiver)
+ cudaMemcpyAsync(values[receiverId].Alternate(),
+ values[senderId].Current(),
+ sizeof(ValueType) * n,
+ cudaMemcpyDefault,
+ stream);
+
+ // Invoke the reduction operator on the receiver's GPU and values arrays.
+ cudaSetDevice(description->getDeviceAssignments()[receiverId]);
+ ValueType* input1 = values[receiverId].Alternate();
+ ValueType* input2 = values[receiverId].Current();
+ thrust::transform(thrust::cuda::par.on(stream),
+ input1,
+ input1 + n,
+ input2,
+ input2,
+ op);
+ }
+ }
+ // Sync all active streams before next step
+ for (int32_t id = 0; id < numRows; id++) {
+ if (id % j == 0 && id + j / 2 < numRows) {
+ // blockIds[id] is the receiver
+ int32_t receiverId = blockIds[id];
+
+ // Set the device to the receiver and sync the stream
+ cudaSetDevice(description->getDeviceAssignments()[receiverId]);
+ cudaStreamSynchronize(description->getBlockStreams()[receiverId]);
+ }
+ }
+ }
+ }
+
+ cudaSetDevice(current_device);
+ }
+
+ /**
+ * This method implements a column-wise reduction of each block's data into a
+ * single array for each column. The block on the diagonal will have the result.
+ */
+ template <typename Operator>
+ void columnReduce() {
+ int current_device;
+ cudaGetDevice(&current_device);
+ Operator op;
+
+ // For each column in the decomposition:
+ int32_t numRows = description->getBlockRows();
+ std::vector<int32_t> blockIds;
+ for (int32_t i = 0; i < numRows; i++) {
+ // Put all the block ids for the row into a vector, with the ID of the diagonal block
+ // at index 0.
+ std::vector<int32_t> blockIds;
+ blockIds.push_back(-1);
+ for (int32_t j = 0; j < numRows; j++) {
+ if (i == j) {
+ blockIds[0] = description->getBlockId(j, i);
+ }
+ else {
+ blockIds.push_back(description->getBlockId(j, i));
+ }
+ }
+
+ // Do a binary tree reduction. At each step the primary buffer of the sender is
+ // copied into the secondary buffer of the receiver. After the copy is done
+ // each receiver performs the reduction operator and stores the result in its
+ // primary buffer.
+ for (int32_t j = 2; (j / 2) < numRows; j *= 2) {
+ for (int32_t id = 0; id < numRows; id++) {
+ if (id % j == 0 && id + j / 2 < numRows) {
+ // blockIds[id] is the receiver
+ int32_t receiverId = blockIds[id];
+
+ // blockIds[id + j/2] is the sender
+ int32_t senderId = blockIds[id + j / 2];
+
+ // Get the stream associated with the receiver's block id
+ cudaStream_t stream = description->getBlockStreams()[receiverId];
+
+ // Copy from the sender to the receiver (use stream associated with receiver)
+ cudaMemcpyAsync(values[receiverId].Alternate(),
+ values[senderId].Current(),
+ sizeof(ValueType) * n,
+ cudaMemcpyDefault,
+ stream);
+
+ // Invoke the reduction operator on the receiver's GPU and values arrays.
+ cudaSetDevice(description->getDeviceAssignments()[receiverId]);
+ ValueType* input1 = values[receiverId].Alternate();
+ ValueType* input2 = values[receiverId].Current();
+ thrust::transform(thrust::cuda::par.on(stream),
+ input1,
+ input1 + n,
+ input2,
+ input2,
+ op);
+ }
+ }
+ // Sync all active streams before next step
+ for (int32_t id = 0; id < numRows; id++) {
+ if (id % j == 0 && id + j / 2 < numRows) {
+ // blockIds[id] is the receiver
+ int32_t receiverId = blockIds[id];
+
+ // Set the device to the receiver and sync the stream
+ cudaSetDevice(description->getDeviceAssignments()[receiverId]);
+ cudaStreamSynchronize(description->getBlockStreams()[receiverId]);
+ }
+ }
+ }
+ }
+
+ cudaSetDevice(current_device);
+ }
+
+ /**
+ * This implements a column-wise scatter of the global data from the corresponding
+ * row. i.e. The data reduced from row 1 is broadcast to all blocks in
+ * column 1. It is assumed that the data to broadcast is located in the block on
+ * the diagonal.
+ */
+ void columnScatter() {
+ int current_device;
+ cudaGetDevice(&current_device);
+
+ // For each column in the decomposition:
+ int32_t numRows = description->getBlockRows();
+ std::vector<int32_t> blockIds;
+ for (int32_t i = 0; i < numRows; i++) {
+ // Put all the block ids for the column into a vector, with the ID of the diagonal block
+ // at index 0.
+ std::vector<int32_t> blockIds;
+ blockIds.push_back(-1);
+ for (int32_t j = 0; j < numRows; j++) {
+ if (i == j) {
+ blockIds[0] = description->getBlockId(j, i);
+ }
+ else {
+ blockIds.push_back(description->getBlockId(j, i));
+ }
+ }
+
+ // Do a binary tree scatter. At each step the primary buffer of the sender is
+ // copied into the primary buffer of the receiver.
+ int32_t max2pow = 2;
+ while (max2pow < numRows) {
+ max2pow *= 2;
+ }
+ for (int32_t j = max2pow; j >= 2; j /= 2) {
+ for (int32_t id = 0; id < numRows; id++) {
+ if (id % j == 0 && id + j / 2 < numRows) {
+ // blockIds[id] is the sender
+ int32_t senderId = blockIds[id];
+
+ // blockIds[id + j/2] is the receiver
+ int32_t receiverId = blockIds[id + j / 2];
+
+ // Get the stream associated with the receiver's block id
+ cudaStream_t stream = description->getBlockStreams()[receiverId];
+
+ // Copy from the sender to the receiver (use stream associated with receiver)
+ cudaMemcpyAsync(values[receiverId].Current(),
+ values[senderId].Current(),
+ sizeof(ValueType) * n,
+ cudaMemcpyDefault,
+ stream);
+ }
+ }
+ // Synchronize all the active streams before next step.
+ for (int32_t id = 0; id < numRows; id++) {
+ if (id % j == 0 && id + j / 2 < numRows) {
+ // blockIds[id + j/2] is the receiver
+ int32_t receiverId = blockIds[id + j / 2];
+
+ // Set device and sync receiver's stream
+ cudaSetDevice(description->getDeviceAssignments()[receiverId]);
+ cudaStreamSynchronize(description->getBlockStreams()[receiverId]);
+ }
+ }
+ }
+ }
+
+ cudaSetDevice(current_device);
+ }
+
+ /**
+ * This implements a row-wise scatter of the global data from the corresponding
+ * column. i.e. The data reduced from column 1 is broadcast to all blocks in
+ * row 1. It is assumed that the data to broadcast is located in the block on
+ * the diagonal.
+ */
+ void rowScatter() {
+ int current_device;
+ cudaGetDevice(&current_device);
+
+ // For each row in the decomposition:
+ int32_t numRows = description->getBlockRows();
+ std::vector<int32_t> blockIds;
+ for (int32_t i = 0; i < numRows; i++) {
+ // Put all the block ids for the column into a vector, with the ID of the diagonal block
+ // at index 0.
+ std::vector<int32_t> blockIds;
+ blockIds.push_back(-1);
+ for (int32_t j = 0; j < numRows; j++) {
+ if (i == j) {
+ blockIds[0] = description->getBlockId(i, j);
+ }
+ else {
+ blockIds.push_back(description->getBlockId(i, j));
+ }
+ }
+
+ // Do a binary tree scatter. At each step the primary buffer of the sender is
+ // copied into the primary buffer of the receiver.
+ int32_t max2pow = 2;
+ while (max2pow < numRows) {
+ max2pow *= 2;
+ }
+ for (int32_t j = max2pow; j >= 2; j /= 2) {
+ for (int32_t id = 0; id < numRows; id++) {
+ if (id % j == 0 && id + j / 2 < numRows) {
+ // blockIds[id] is the sender
+ int32_t senderId = blockIds[id];
+
+ // blockIds[id + j/2] is the receiver
+ int32_t receiverId = blockIds[id + j / 2];
+
+ // Get the stream associated with the receiver's block id
+ cudaStream_t stream = description->getBlockStreams()[receiverId];
+
+ // Copy from the sender to the receiver (use stream associated with receiver)
+ cudaMemcpyAsync(values[receiverId].Current(),
+ values[senderId].Current(),
+ sizeof(ValueType) * n,
+ cudaMemcpyDefault,
+ stream);
+ }
+ }
+ // Sync all the active streams before next step
+ for (int32_t id = 0; id < numRows; id++) {
+ if (id % j == 0 && id + j / 2 < numRows) {
+ // blockIds[id + j/2] is the receiver
+ int32_t receiverId = blockIds[id + j / 2];
+
+ // Set device and sync receiver's stream
+ cudaSetDevice(description->getDeviceAssignments()[receiverId]);
+ cudaStreamSynchronize(description->getBlockStreams()[receiverId]);
+ }
+ }
+ }
+ }
+
+ cudaSetDevice(current_device);
+ }
+
+ /**
+ * Outputs a human readable string representation of this VertexData2D object. This is only
+ * intended to be used for debugging.
+ * @return Human readable string representation
+ */
+ std::string toString() {
+ std::stringstream ss;
+ ValueType* c = (ValueType*) malloc(sizeof(ValueType) * n);
+ ValueType* a = (ValueType*) malloc(sizeof(ValueType) * n);
+
+ int32_t numBlocks = description->getNumBlocks();
+
+ ss << "Vertex2d:\n";
+ for (int32_t i = 0; i < numBlocks; i++) {
+ ss << "Block " << i << ":\n";
+ ss << "Idx\tCur\tAlt\n";
+ cudaMemcpy(c, values[i].Current(), sizeof(ValueType) * n, cudaMemcpyDefault);
+ cudaMemcpy(a, values[i].Alternate(), sizeof(ValueType) * n, cudaMemcpyDefault);
+ for (int32_t j = 0; j < n; j++) {
+ ss << j << ":\t" << c[j] << "\t" << a[j] << "\n";
+ }
+ }
+
+ free(c);
+ free(a);
+
+ return ss.str();
+ }
+ };
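As a hedged orientation for the reduce/scatter machinery above, the typical cycle might be driven as follows. The sketch reuses `descr` from the earlier example, keeps the assumed `<GlobalType, LocalType, ValueType>` ordering, and picks `thrust::plus` purely as an illustrative reduction operator.

```cpp
#include <thrust/functional.h>  // thrust::plus, illustrative choice of reduction operator

VertexData2D<int64_t, int32_t, float> ranks(&descr);  // one value per local vertex and block
ranks.fillElements(1.0f);  // seed the diagonal blocks
ranks.rowScatter();        // broadcast the seed across each block-row

// ... per-block work (e.g. an SpMV against each matrix block) writes partial
//     results into every block's current buffer ...

ranks.rowReduce<thrust::plus<float> >();  // combine partials; the diagonal block holds the sum
ranks.columnScatter();                    // broadcast the reduced values down each block-column
```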
+
+ template <typename GlobalType, typename LocalType, typename ValueType>
+ class VertexData2D_Unbuffered {
+ const MatrixDecompositionDescription* description;
+ int32_t n;
+ std::vector<ValueType*> values;
+
+ public:
+ /**
+ * Sets up a VertexData2D_Unbuffered object with an element allocated for each vertex
+ * in each block.
+ * @param descr Pointer to a MatrixDecompositionDescription object describing the layout
+ * of the 2D blocks.
+ */
+ VertexData2D_Unbuffered(const MatrixDecompositionDescription* descr) :
+ description(descr) {
+ // Resize the values array to be the same size as number of blocks
+ values.resize(descr->getNumBlocks());
+
+ // Grab the current device id to switch back after allocations are done
+ int current_device;
+ cudaGetDevice(&current_device);
+ LocalType allocSize = descr->getOffset();
+ n = allocSize;
+ // Allocate the data for each block
+ for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) {
+ int device = descr->getDeviceAssignments()[i];
+ cudaSetDevice(device);
+ cudaMalloc(&(values[i]), sizeof(ValueType) * n);
+ }
+
+ // Set the device back to what it was initially
+ cudaSetDevice(current_device);
+ }
+
+ /**
+ * Sets up a VertexData2D_Unbuffered object with _n elements allocated per block.
+ * @param descr Pointer to a MatrixDecompositionDescription object describing the layout
+ * of the 2D blocks.
+ * @param _n The number of elements to allocate per block.
+ */
+ VertexData2D_Unbuffered(const MatrixDecompositionDescription* descr,
+ size_t _n) :
+ description(descr), n(_n) {
+ // Resize the values array to be the same size as number of blocks
+ values.resize(descr->getNumBlocks());
+
+ // Grab the current device id to switch back after allocations are done
+ int current_device;
+ cudaGetDevice(&current_device);
+ // Allocate the data for each block
+ for (size_t i = 0; i < descr->getDeviceAssignments().size(); i++) {
+ int device = descr->getDeviceAssignments()[i];
+ cudaSetDevice(device);
+ cudaMalloc(&(values[i]), sizeof(ValueType) * n);
+ }
+
+ // Set the device back to what it was initially
+ cudaSetDevice(current_device);
+ }
+
+ /**
+ * Destructor. Frees all allocated memory.
+ */
+ ~VertexData2D_Unbuffered() {
+ for (size_t i = 0; i < values.size(); i++) {
+ if (values[i]) {
+ cudaFree(values[i]);
+ }
+ }
+ }
+
+ /**
+ * Fills the elements of the data array with the given value.
+ * The elements on the diagonal are filled with the given value. After filling,
+ * either rowScatter or columnScatter will copy the values across the blocks in
+ * either the rows or columns depending on the use.
+ * @param val The value to fill the array with
+ */
+ void fillElements(ValueType val) {
+ int current_device;
+ cudaGetDevice(&current_device);
+ int32_t numRows = description->getBlockRows();
+ for (int32_t i = 0; i < numRows; i++) {
+ int32_t blockId = description->getBlockId(i, i);
+ ValueType* vals = get(blockId);
+ int deviceId = description->getDeviceAssignments()[blockId];
+ cudaStream_t stream = description->getBlockStreams()[blockId];
+ cudaSetDevice(deviceId);
+ thrust::fill(thrust::cuda::par.on(stream), vals, vals + n, val);
+ }
+ description->syncAllStreams();
+ cudaSetDevice(current_device);
+ }
+
+ /**
+ * This implements a column-wise scatter of the global data from the corresponding
+ * row. i.e. The data reduced from row 1 is broadcast to all blocks in
+ * column 1. It is assumed that the data to broadcast is located in the block on
+ * the diagonal.
+ */
+ void columnScatter() {
+ int current_device;
+ cudaGetDevice(&current_device);
+
+ // For each column in the decomposition:
+ int32_t numRows = description->getBlockRows();
+ std::vector<int32_t> blockIds;
+ for (int32_t i = 0; i < numRows; i++) {
+ // Put all the block ids for the column into a vector, with the ID of the diagonal block
+ // at index 0.
+ std::vector<int32_t> blockIds;
+ blockIds.push_back(-1);
+ for (int32_t j = 0; j < numRows; j++) {
+ if (i == j) {
+ blockIds[0] = description->getBlockId(j, i);
+ }
+ else {
+ blockIds.push_back(description->getBlockId(j, i));
+ }
+ }
+
+ // Do a binary tree scatter. At each step the primary buffer of the sender is
+ // copied into the primary buffer of the receiver.
+ int32_t max2pow = 2;
+ while (max2pow < numRows) {
+ max2pow *= 2;
+ }
+ for (int32_t j = max2pow; j >= 2; j /= 2) {
+ for (int32_t id = 0; id < numRows; id++) {
+ if (id % j == 0 && id + j / 2 < numRows) {
+ // blockIds[id] is the sender
+ int32_t senderId = blockIds[id];
+
+ // blockIds[id + j/2] is the receiver
+ int32_t receiverId = blockIds[id + j / 2];
+
+ // Get the stream associated with the receiver's block id
+ cudaStream_t stream = description->getBlockStreams()[receiverId];
+
+ // Copy from the sender to the receiver (use stream associated with receiver)
+ cudaMemcpyAsync(values[receiverId],
+ values[senderId],
+ sizeof(ValueType) * n,
+ cudaMemcpyDefault,
+ stream);
+ }
+ }
+ // Synchronize all the active streams before next step.
+ for (int32_t id = 0; id < numRows; id++) {
+ if (id % j == 0 && id + j / 2 < numRows) {
+ // blockIds[id + j/2] is the receiver
+ int32_t receiverId = blockIds[id + j / 2];
+
+ // Set device and sync receiver's stream
+ cudaSetDevice(description->getDeviceAssignments()[receiverId]);
+ cudaStreamSynchronize(description->getBlockStreams()[receiverId]);
+ }
+ }
+ }
+ }
+
+ cudaSetDevice(current_device);
+ }
+
+ /**
+ * This implements a row-wise scatter of the global data from the corresponding
+ * column. i.e. The data reduced from column 1 is broadcast to all blocks in
+ * row 1. It is assumed that the data to broadcast is located in the block on
+ * the diagonal.
+ */
+ void rowScatter() {
+ int current_device;
+ cudaGetDevice(&current_device);
+
+ // For each row in the decomposition:
+ int32_t numRows = description->getBlockRows();
+ std::vector<int32_t> blockIds;
+ for (int32_t i = 0; i < numRows; i++) {
+ // Put all the block ids for the column into a vector, with the ID of the diagonal block
+ // at index 0.
+ std::vector<int32_t> blockIds;
+ blockIds.push_back(-1);
+ for (int32_t j = 0; j < numRows; j++) {
+ if (i == j) {
+ blockIds[0] = description->getBlockId(i, j);
+ }
+ else {
+ blockIds.push_back(description->getBlockId(i, j));
+ }
+ }
+
+ // Do a binary tree scatter. At each step the primary buffer of the sender is
+ // copied into the primary buffer of the receiver.
+ int32_t max2pow = 2;
+ while (max2pow < numRows) {
+ max2pow *= 2;
+ }
+ for (int32_t j = max2pow; j >= 2; j /= 2) {
+ for (int32_t id = 0; id < numRows; id++) {
+ if (id % j == 0 && id + j / 2 < numRows) {
+ // blockIds[id] is the sender
+ int32_t senderId = blockIds[id];
+
+ // blockIds[id + j/2] is the receiver
+ int32_t receiverId = blockIds[id + j / 2];
+
+ // Get the stream associated with the receiver's block id
+ cudaStream_t stream = description->getBlockStreams()[receiverId];
+
+ // Copy from the sender to the receiver (use stream associated with receiver)
+ cudaMemcpyAsync(values[receiverId],
+ values[senderId],
+ sizeof(ValueType) * n,
+ cudaMemcpyDefault,
+ stream);
+ }
+ }
+ // Sync all the active streams before next step
+ for (int32_t id = 0; id < numRows; id++) {
+ if (id % j == 0 && id + j / 2 < numRows) {
+ // blockIds[id + j/2] is the receiver
+ int32_t receiverId = blockIds[id + j / 2];
+
+ // Set device and sync receiver's stream
+ cudaSetDevice(description->getDeviceAssignments()[receiverId]);
+ cudaStreamSynchronize(description->getBlockStreams()[receiverId]);
+ }
+ }
+ }
+ }
+
+ cudaSetDevice(current_device);
+ }
+
+ /**
+ * Getter for n
+ * @return The value of n
+ */
+ int32_t getN() {
+ return n;
+ }
+
+ /**
+ * Gets the pointer to the allocated memory for a specified block.
+ * @param bId The block id to get the memory for.
+ * @return A pointer to the allocated memory for the given block.
+ */
+ ValueType* get(int32_t bId) {
+ return values[bId];
+ }
+ };
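A short sketch of the unbuffered variant under the same assumed instantiation; it is the natural choice for per-block scratch data that never needs the current/alternate swap.

```cpp
VertexData2D_Unbuffered<int64_t, int32_t, int> flags(&descr);  // reuses 'descr' from above
flags.fillElements(0);         // zero the diagonal blocks
flags.rowScatter();            // copy the zeros across each block-row
int* d_flags0 = flags.get(0);  // raw device pointer to block 0's allocation
```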
+
+ /**
+ * This method takes in COO format matrix data and a MatrixDecompositionDescription and
+ * returns a Matrix2d object containing the given data.
+ */
+ template <typename GlobalType, typename LocalType, typename ValueType>
+ Matrix2d COOto2d(MatrixDecompositionDescription descr,
+ GlobalType* rowIds,
+ GlobalType* colIds,
+ ValueType* values) {
+ // Grab the current device id to switch back after allocations are done
+ int current_device;
+ cudaGetDevice(&current_device);
+
+ int32_t blockCount = descr.getNumBlocks();
+
+ // Allocate array of size global nnz to hold the block labels
+ int32_t* blockLabels = (int32_t*) malloc(descr.getNnz() * sizeof(int32_t));
+
+ // Allocate array to contain row counts for each block and initialize to zero
+ // Allocate array to contain position offsets for writing each blocks data
+ LocalType* blockCounts = (LocalType*) malloc(blockCount * sizeof(LocalType));
+ LocalType* blockPos = (LocalType*) malloc(blockCount * sizeof(LocalType));
+ for (int i = 0; i < blockCount; i++) {
+ blockCounts[i] = 0;
+ blockPos[i] = 0;
+ }
+
+ // For each edge, mark in the array the id of the block to which it will belong
+ int32_t blockId;
+ LocalType localRow;
+ LocalType localCol;
+ for (int i = 0; i < descr.getNnz(); i++) {
+ descr.convertGlobaltoLocalRow(rowIds[i], colIds[i], localRow, localCol, blockId);
+ blockLabels[i] = blockId;
+ blockCounts[blockId]++;
+ }
+
+ // Allocate arrays for putting each block's data into
+ LocalType** blockRowIds = (LocalType**) malloc(blockCount * sizeof(LocalType*));
+ LocalType** blockColIds = (LocalType**) malloc(blockCount * sizeof(LocalType*));
+ ValueType** blockValues = NULL;
+ if (values)
+ blockValues = (ValueType**) malloc(blockCount * sizeof(ValueType*));
+ for (int i = 0; i < blockCount; i++) {
+ blockRowIds[i] = (LocalType*) malloc(blockCounts[i] * sizeof(LocalType));
+ blockColIds[i] = (LocalType*) malloc(blockCounts[i] * sizeof(LocalType));
+ if (values)
+ blockValues[i] = (ValueType*) malloc(blockCounts[i] * sizeof(ValueType));
+ }
+
+ // Convert each block's global rows to local ids and copy into block arrays
+ for (int i = 0; i < descr.getNnz(); i++) {
+ descr.convertGlobaltoLocalRow(rowIds[i], colIds[i], localRow, localCol, blockId);
+ blockRowIds[blockId][blockPos[blockId]] = localRow;
+ blockColIds[blockId][blockPos[blockId]] = localCol;
+ if (values)
+ blockValues[blockId][blockPos[blockId]] = values[i];
+ blockPos[blockId]++;
+ }
+
+ // Allocate the result blocks vector
+ std::vector*> blockVector(blockCount);
+
+ // Convert each block's COO rows into CSR and create its graph object.
+ for (int i = 0; i < blockCount; i++) {
+ // Set the device as indicated so the data ends up on the right GPU
+ cudaSetDevice(descr.getDeviceAssignments()[i]);
+ cudaStream_t stream = descr.getBlockStreams()[i];
+
+ if (blockCounts[i] > 0) {
+ CSR_Result_Weighted result;
+ ConvertCOOtoCSR_weighted(blockRowIds[i],
+ blockColIds[i],
+ values ? blockValues[i] : NULL,
+ (int64_t) blockCounts[i],
+ (descr.getOffset() - 1),
+ result);
+ MultiValuedCsrGraph* csrGraph = new MultiValuedCsrGraph((size_t) result.size, (size_t) result.nnz, stream);
+ if (values)
+ csrGraph->allocateEdgeData(1, NULL);
+ cudaMemcpy(csrGraph->get_raw_row_offsets(),
+ result.rowOffsets,
+ (result.size + 1) * sizeof(LocalType),
+ cudaMemcpyDefault);
+ cudaMemcpy(csrGraph->get_raw_column_indices(),
+ result.colIndices,
+ result.nnz * sizeof(LocalType),
+ cudaMemcpyDefault);
+ if (values)
+ cudaMemcpy(csrGraph->get_raw_edge_dim(0),
+ result.edgeWeights,
+ result.nnz * sizeof(ValueType),
+ cudaMemcpyDefault);
+ blockVector[i] = csrGraph;
+ result.Destroy();
+ }
+ else {
+ MultiValuedCsrGraph* csrGraph = new MultiValuedCsrGraph((size_t) descr.getOffset(), (size_t) 0, stream);
+ cudaMemset( csrGraph->get_raw_row_offsets(),
+ 0,
+ sizeof(LocalType) * (descr.getOffset() + 1));
+ blockVector[i] = csrGraph;
+ }
+ }
+
+ // Free temporary memory
+ for (int i = 0; i < blockCount; i++) {
+ free(blockRowIds[i]);
+ free(blockColIds[i]);
+ if (values)
+ free(blockValues[i]);
+ }
+ free(blockRowIds);
+ free(blockColIds);
+ if (values)
+ free(blockValues);
+
+ cudaSetDevice(current_device);
+
+ // Put it all together into a Matrix2d object for return
+ return Matrix2d(descr, blockVector);
+ }
+}
diff --git a/cpp/nvgraph/cpp/include/app/nvlouvain_app.cu b/cpp/nvgraph/cpp/include/app/nvlouvain_app.cu
new file mode 100644
index 00000000000..b29acf1961d
--- /dev/null
+++ b/cpp/nvgraph/cpp/include/app/nvlouvain_app.cu
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include
+#include
+#include
+#include
+#include "test_opt_utils.cuh"
+#include "graph_utils.cuh"
+
+//#define ENABLE_LOG TRUE
+#define ENALBE_LOUVAIN true
+
+#include "nvlouvain.cuh"
+#include "gtest/gtest.h"
+#include "high_res_clock.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+using T = float;
+
+int main(int argc, char* argv[]){
+
+ if(argc < 2)
+ {
+ std::cout<< "Help : ./louvain_test matrix_market_file.mtx"<(fin, 1, &mc, &m, &k, &nnz)) ,0);
+ EXPECT_EQ(m,k);
+
+ thrust::host_vector coo_ind_h(nnz);
+ thrust::host_vector csr_ptr_h(m+1);
+ thrust::host_vector csr_ind_h(nnz);
+ thrust::host_vector csr_val_h(nnz);
+
+ EXPECT_EQ( (mm_to_coo(fin, 1, nnz, &coo_ind_h[0], &csr_ind_h[0], &csr_val_h[0], NULL)), 0);
+ EXPECT_EQ( (coo_to_csr (m, k, nnz, &coo_ind_h[0], &csr_ind_h[0], &csr_val_h[0], NULL, &csr_ptr_h[0], NULL, NULL, NULL)), 0);
+
+ EXPECT_EQ(fclose(fin),0);
+
+ thrust::device_vector csr_ptr_d(csr_ptr_h);
+ thrust::device_vector csr_ind_d(csr_ind_h);
+ thrust::device_vector csr_val_d(csr_val_h);
+
+ thrust::device_vector tmp_1(nnz);
+ thrust::fill(thrust::cuda::par, tmp_1.begin(), tmp_1.end(), 1.0);
+ thrust::device_vector::iterator max_ele = thrust::max_element(thrust::cuda::par, csr_val_d.begin(), csr_val_d.end());
+
+ bool weighted = (*max_ele!=1.0);
+
+ //std::cout<<(weighted?"Weighted ":"Not Weigthed ")<<" n_vertex: "< cluster_d(m, 0);
+ int* csr_ptr_ptr = thrust::raw_pointer_cast(csr_ptr_d.data());
+ int* csr_ind_ptr = thrust::raw_pointer_cast(csr_ind_d.data());
+ T* csr_val_ptr = thrust::raw_pointer_cast(csr_val_d.data());
+ int* init_cluster_ptr = thrust::raw_pointer_cast(cluster_d.data());
+ int num_level;
+
+ cudaProfilerStart();
+ hr_clock.start();
+ nvlouvain::louvain(csr_ptr_ptr, csr_ind_ptr, csr_val_ptr,
+ m, nnz,
+ weighted, has_init_cluster,
+ init_cluster_ptr, final_modulartiy, clustering_h, num_level);
+
+ hr_clock.stop(&louvain_time);
+ cudaProfilerStop();
+
+ std::cout<<"Final modularity: "<
+#include
+#include
+#include
+#include "test_opt_utils.cuh"
+#include "graph_utils.cuh"
+
+//#define ENABLE_LOG true
+#define ENALBE_LOUVAIN true
+
+#include "nvlouvain.cuh"
+#include "gtest/gtest.h"
+#include "high_res_clock.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+using T = double;
+
+int main(int argc, char* argv[]){
+
+ if(argc < 2)
+ {
+ std::cout<< "Help : ./louvain_test matrix_market_file.mtx"<(fin, 1, &mc, &m, &k, &nnz)) ,0);
+ EXPECT_EQ(m,k);
+
+ thrust::host_vector coo_ind_h(nnz);
+ thrust::host_vector csr_ptr_h(m+1);
+ thrust::host_vector csr_ind_h(nnz);
+ thrust::host_vector csr_val_h(nnz);
+
+ EXPECT_EQ( (mm_to_coo(fin, 1, nnz, &coo_ind_h[0], &csr_ind_h[0], &csr_val_h[0], NULL)), 0);
+ EXPECT_EQ( (coo_to_csr (m, k, nnz, &coo_ind_h[0], &csr_ind_h[0], &csr_val_h[0], NULL, &csr_ptr_h[0], NULL, NULL, NULL)), 0);
+
+ EXPECT_EQ(fclose(fin),0);
+
+ thrust::device_vector csr_ptr_d(csr_ptr_h);
+ thrust::device_vector csr_ind_d(csr_ind_h);
+ thrust::device_vector csr_val_d(csr_val_h);
+
+ thrust::device_vector tmp_1(nnz);
+ thrust::fill(thrust::cuda::par, tmp_1.begin(), tmp_1.end(), 1.0);
+ thrust::device_vector::iterator max_ele = thrust::max_element(thrust::cuda::par, csr_val_d.begin(), csr_val_d.end());
+
+ bool weighted = (*max_ele!=1.0);
+
+ //std::cout<<(weighted?"Weighted ":"Not Weigthed ")<<" n_vertex: "< cluster_d(m, 0);
+ std::vector< std::vector > best_cluster_vec;
+ int* csr_ptr_ptr = thrust::raw_pointer_cast(csr_ptr_d.data());
+ int* csr_ind_ptr = thrust::raw_pointer_cast(csr_ind_d.data());
+ T* csr_val_ptr = thrust::raw_pointer_cast(csr_val_d.data());
+ int* init_cluster_ptr = thrust::raw_pointer_cast(cluster_d.data());
+ int num_level;
+
+ cudaProfilerStart();
+ hr_clock.start();
+
+ nvlouvain::louvain(csr_ptr_ptr, csr_ind_ptr, csr_val_ptr,
+ m, nnz,
+ weighted, has_init_cluster,
+ init_cluster_ptr, final_modulartiy, best_cluster_vec, num_level);
+
+ hr_clock.stop(&louvain_time);
+ cudaProfilerStop();
+
+ std::cout<<"Final modularity: "<::iterator it = best_cluster_vec[i].begin(); it != best_cluster_vec[i].end(); ++it)
+ // std::cout << *it <<' ';
+ // std::cout << std::endl;
+ //}
+ }
+ return 0;
+}
+
diff --git a/cpp/nvgraph/cpp/include/app/nvlouvain_sample.cu b/cpp/nvgraph/cpp/include/app/nvlouvain_sample.cu
new file mode 100644
index 00000000000..790a4788b6f
--- /dev/null
+++ b/cpp/nvgraph/cpp/include/app/nvlouvain_sample.cu
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include
+#include
+#include
+// Turn on to see stats for each level
+//#define ENABLE_LOG true
+#include "nvlouvain.cuh"
+
+
+
+/* Louvain Clustering Sample
+
+Social network example: Zachary Karate Club
+W. Zachary, “An information flow model for conflict and fission in small groups,” Journal of Anthropological Research, vol. 33, pp. 452–473, 1977
+https://en.wikipedia.org/wiki/Zachary's_karate_club
+--------------------------------------------------------------------
+V = 34
+E = 78 bidirectional, 156 directed edges
+
+Bidirectional edges list:
+[2 1] [3 1] [3 2] [4 1] [4 2] [4 3] [5 1] [6 1] [7 1] [7 5] [7 6] [8 1] [8 2] [8 3] [8 4] [9 1] [9 3] [10 3] [11 1] [11 5] [11 6] [12 1] [13 1] [13 4] [14 1] [14 2] [14 3] [14 4] [17 6] [17 7]
+[18 1] [18 2] [20 1] [20 2] [22 1] [22 2] [26 24] [26 25] [28 3] [28 24] [28 25] [29 3] [30 24] [30 27] [31 2] [31 9] [32 1] [32 25] [32 26] [32 29] [33 3] [33 9] [33 15] [33 16]
+[33 19] [33 21] [33 23] [33 24] [33 30] [33 31] [33 32] [34 9] [34 10] [34 14] [34 15] [34 16] [34 19] [34 20] [34 21] [34 23] [34 24] [34 27] [34 28] [34 29] [34 30] [34 31]
+[34 32] [34 33]
+
+CSR representation (directed):
+csrRowPtrA_h {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, 98, 101, 104, 106, 110, 113, 117, 121, 127, 139, 156}
+csrColIndA_h {1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0, 6, 10, 16, 0,
+4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33, 0, 4, 5, 0, 0, 3, 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1, 32, 33, 25, 27, 29, 32, 33, 25, 27, 31, 23,
+24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, 32, 33, 1, 8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, 33, 8, 9, 13, 14, 15, 18, 19, 20, 22, 23,
+26, 27, 28, 29, 30, 31, 32}
+csrValA_h {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}
+
+--------------------------------------------------------------------
+
+Operation: Louvain Clustering default parameters in modularity maximization
+
+--------------------------------------------------------------------
+
+Expected output:
+This sample prints the modularity score and compares it against the python reference (https://python-louvain.readthedocs.io/en/latest/api.html)
+
+
+*/
+
+using namespace nvlouvain;
+
+void check_status(nvlouvainStatus_t status)
+{
+ if ((int)status != 0)
+ {
+ printf("ERROR : %s\n",nvlouvainStatusGetString(status));
+ exit(0);
+ }
+}
+
+int main(int argc, char **argv)
+{
+ // Hard-coded Zachary Karate Club network input
+ int csrRowPtrA_input [] = {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, 98, 101, 104, 106, 110, 113, 117, 121, 127,
+ 139, 156};
+ int csrColIndA_input [] = {1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0,
+ 6, 10, 16, 0, 4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33, 0, 4, 5, 0, 0, 3, 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1, 32, 33, 25, 27, 29, 32, 33,
+ 25, 27, 31, 23, 24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, 32, 33, 1, 8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, 33, 8, 9, 13, 14, 15,
+ 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32};
+ float csrValA_input [] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+ int ref_clustering [] = {0, 0, 0, 0, 1, 1, 1, 0, 2, 0, 1, 0, 0, 0, 2, 2, 1, 0, 2, 0, 2, 0, 2, 3, 3, 3, 2, 3, 3, 2, 2, 3, 2, 2};
+ int *csrRowPtrA_h = &csrRowPtrA_input[0];
+ int *csrColIndA_h = &csrColIndA_input[0];
+ float *csrValA_h = &csrValA_input[0];
+
+ // Variables
+ const size_t n = 34, nnz = 156;
+ bool weighted = false;
+ bool has_init_cluster = false;
+ int *clustering_h, *init_cluster_ptr = nullptr;
+ int num_levels = 0, hits =0;
+ float final_modulartiy = 0;
+ // Allocate host data for the clustering output
+ clustering_h = (int*)malloc(n*sizeof(int));
+
+ //Solve clustering with modularity maximization algorithm
+ check_status(louvain(csrRowPtrA_h, csrColIndA_h, csrValA_h, n, nnz, weighted, has_init_cluster, init_cluster_ptr, final_modulartiy, clustering_h, num_levels));
+
+ //Print quality (modularity)
+ printf("Modularity_score: %f\n", final_modulartiy);
+ printf("num levels: %d\n", num_levels);
+ for (int i = 0; i < (int)n; i++)
+ if (clustering_h[i] == ref_clustering[i])
+ hits++;
+ printf("Hit rate : %f%% (%d hits)\n", (hits*100.0)/n, hits);
+ // Print the clustering vector in csv format
+ //for (int i = 0; i < (int)(n-1); i++)
+ // printf("%d,",clustering_h[i]);
+ //printf("%d,\n",clustering_h[n-1]);
+ free(clustering_h);
+ printf("Done!\n");
+
+ return EXIT_SUCCESS;
+}
+
diff --git a/cpp/nvgraph/cpp/include/app/nvlouvain_sample_hierarchy.cu b/cpp/nvgraph/cpp/include/app/nvlouvain_sample_hierarchy.cu
new file mode 100644
index 00000000000..d39551d768f
--- /dev/null
+++ b/cpp/nvgraph/cpp/include/app/nvlouvain_sample_hierarchy.cu
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+// Turn on to see stats for each level
+//#define ENABLE_LOG true
+#include "nvlouvain.cuh"
+
+
+
+/* Louvain Clustering Sample
+
+
+Social network example: Zachary Karate Club
+W. Zachary, “An information flow model for conflict and fission in small groups,” Journal of Anthropological Research, vol. 33, pp. 452–473, 1977
+https://en.wikipedia.org/wiki/Zachary's_karate_club
+--------------------------------------------------------------------
+V = 34
+E = 78 bidirectional, 156 directed edges
+
+Bidirectional edges list:
+[2 1] [3 1] [3 2] [4 1] [4 2] [4 3] [5 1] [6 1] [7 1] [7 5] [7 6] [8 1] [8 2] [8 3] [8 4] [9 1] [9 3] [10 3] [11 1] [11 5] [11 6] [12 1] [13 1] [13 4] [14 1] [14 2] [14 3] [14 4] [17 6] [17 7]
+[18 1] [18 2] [20 1] [20 2] [22 1] [22 2] [26 24] [26 25] [28 3] [28 24] [28 25] [29 3] [30 24] [30 27] [31 2] [31 9] [32 1] [32 25] [32 26] [32 29] [33 3] [33 9] [33 15] [33 16]
+[33 19] [33 21] [33 23] [33 24] [33 30] [33 31] [33 32] [34 9] [34 10] [34 14] [34 15] [34 16] [34 19] [34 20] [34 21] [34 23] [34 24] [34 27] [34 28] [34 29] [34 30] [34 31]
+[34 32] [34 33]
+
+CSR representation (directed):
+csrRowPtrA_h {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, 98, 101, 104, 106, 110, 113, 117, 121, 127, 139, 156}
+csrColIndA_h {1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0, 6, 10, 16, 0,
+4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33, 0, 4, 5, 0, 0, 3, 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1, 32, 33, 25, 27, 29, 32, 33, 25, 27, 31, 23,
+24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, 32, 33, 1, 8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, 33, 8, 9, 13, 14, 15, 18, 19, 20, 22, 23,
+26, 27, 28, 29, 30, 31, 32}
+csrValA_h {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}
+
+--------------------------------------------------------------------
+
+Operation: Louvain Clustering default parameters in modularity maximization
+
+--------------------------------------------------------------------
+
+Expected output:
+This sample prints the modularity score
+
+*/
+
+using namespace nvlouvain;
+
+void check_status(nvlouvainStatus_t status)
+{
+ if ((int)status != 0)
+ {
+ printf("ERROR : %s\n",nvlouvainStatusGetString(status));
+ exit(0);
+ }
+}
+
+int main(int argc, char **argv)
+{
+ // Hard-coded Zachary Karate Club network input
+ int csrRowPtrA_input [] = {0, 16, 25, 35, 41, 44, 48, 52, 56, 61, 63, 66, 67, 69, 74, 76, 78, 80, 82, 84, 87, 89, 91, 93, 98, 101, 104, 106, 110, 113, 117, 121, 127,
+ 139, 156};
+ int csrColIndA_input [] = {1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 17, 19, 21, 31, 0, 2, 3, 7, 13, 17, 19, 21, 30, 0, 1, 3, 7, 8, 9, 13, 27, 28, 32, 0, 1, 2, 7, 12, 13, 0, 6, 10, 0,
+ 6, 10, 16, 0, 4, 5, 16, 0, 1, 2, 3, 0, 2, 30, 32, 33, 2, 33, 0, 4, 5, 0, 0, 3, 0, 1, 2, 3, 33, 32, 33, 32, 33, 5, 6, 0, 1, 32, 33, 0, 1, 33, 32, 33, 0, 1, 32, 33, 25, 27, 29, 32, 33,
+ 25, 27, 31, 23, 24, 31, 29, 33, 2, 23, 24, 33, 2, 31, 33, 23, 26, 32, 33, 1, 8, 32, 33, 0, 24, 25, 28, 32, 33, 2, 8, 14, 15, 18, 20, 22, 23, 29, 30, 31, 33, 8, 9, 13, 14, 15,
+ 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32};
+ float csrValA_input [] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+// int ref_clustering [] = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ int *csrRowPtrA_h = &csrRowPtrA_input[0];
+ int *csrColIndA_h = &csrColIndA_input[0];
+ float *csrValA_h = &csrValA_input[0];
+
+ // Variables
+ const size_t n = 34, nnz = 156;
+ bool weighted = false;
+ bool has_init_cluster = false;
+ int num_levels = 0;
+ int *init_cluster_ptr = nullptr;
+ float final_modulartiy = 0;
+ std::vector< std::vector > best_cluster_vec;
+
+ //Solve clustering with modularity maximization algorithm
+ check_status(louvain(csrRowPtrA_h, csrColIndA_h, csrValA_h, n, nnz, weighted, has_init_cluster, init_cluster_ptr, final_modulartiy, best_cluster_vec, num_levels));
+
+ //Print quality (modularity)
+ printf("Modularity_score: %f\n", final_modulartiy);
+ printf("num levels: %d\n", num_levels);
+ printf("Done!\n");
+
+ //for (size_t i = 0; i < best_cluster_vec.size(); i++)
+ //{
+ // for(std::vector::iterator it = best_cluster_vec[i].begin(); it != best_cluster_vec[i].end(); ++it)
+ // std::cout << *it <<' ';
+ // std::cout << std::endl;
+ //}
+
+ return EXIT_SUCCESS;
+}
diff --git a/cpp/nvgraph/cpp/include/arnoldi.hxx b/cpp/nvgraph/cpp/include/arnoldi.hxx
new file mode 100644
index 00000000000..9b5163fc294
--- /dev/null
+++ b/cpp/nvgraph/cpp/include/arnoldi.hxx
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+namespace nvgraph
+{
+
+template <typename IndexType_, typename ValueType_>
+class ImplicitArnoldi
+{
+public:
+ typedef IndexType_ IndexType;
+ typedef ValueType_ ValueType;
+
+private:
+ //Arnoldi
+ ValuedCsrGraph<IndexType, ValueType> m_A; //device
+ std::vector<ValueType*> m_Vi; // Host vector of device addresses (effectively a 2D view)
+ Vector<ValueType> m_V; // Each column is a vector of size n, column major storage
+ Vector<ValueType> m_Q_d; // Device version of Q (Qt)
+ Vector<ValueType> m_V_tmp; // Output of V*Q <=> QtVt
+ Vector<ValueType> m_ritz_eigenvectors_d;
+ Vector<ValueType> m_eigenvectors;
+ std::vector<ValueType> m_H; //host
+ std::vector<ValueType> m_H_select; //host
+ std::vector<ValueType> m_H_tmp; //host (lapack likes to overwrite input)
+ std::vector<ValueType> m_ritz_eigenvalues; //host
+ std::vector<ValueType> m_ritz_eigenvalues_i; //host
+ std::vector<ValueType> m_shifts; //host
+ std::vector<ValueType> m_ritz_eigenvectors; //host
+ std::vector<ValueType> m_Q; //host
+ std::vector<ValueType> m_Q_tmp; //host (lapack likes to overwrite input)
+ std::vector<ValueType> m_mns_residuals; //host residuals of subspaces
+ std::vector<ValueType> m_mns_beta; //host residuals of subspaces
+
+ Vector<ValueType> m_a; // Markov
+ Vector<ValueType> m_b; // Markov
+ Vector<ValueType> m_D; // Laplacian
+
+ ValueType m_beta; // from arnoldi projection algorithm
+ ValueType m_residual; // is set by compute_residual()
+ ValueType m_damping; // for Markov and Pagerank
+
+ float m_tolerance;
+
+ int m_nr_eigenvalues; // the number of wanted eigenvals, also called k in the literature
+ int m_n_eigenvalues; // the number of eigenvals we keep in the solver, this is greater than or equal to k, this can be m_nr_eigenvalues or m_nr_eigenvalues+1
+ int m_krylov_size; // the maximum size of the Krylov subspace, also called m in the literature (m=k+p)
+ int m_iterations; // a counter of restarts, each restart costs m_krylov_size-m_n_eigenvalues Arnoldi iterations (~spmv)
+ int m_max_iter; // maximum number of iterations
+
+ int m_parts; // laplacian related
+
+ //miramns related ints
+ int m_nested_subspaces; // the number of subspaces to evaluate in MIRAMns
+ int m_nested_subspaces_freq; // the frequency at which we should evaluate subspaces in MIRAMns
+ int m_select; // best subspace size
+ int m_select_idx; // best subspace number (0 indexed)
+ int m_safety_lower_bound; // The smallest subspace to check is m_safety_lower_bound+m_nr_eigenvalues+1
+
+ bool m_converged;
+ bool m_is_setup;
+ bool m_has_guess;
+ bool m_markov;
+ bool m_miramns;
+ bool m_dirty_bit; // to know if H has changed, so if we need to call geev
+ bool m_laplacian;
+ bool has_init_guess;
+
+ // Warning : here an iteration is a restart
+ bool solve_it();
+
+ // Input: A V[0]
+ // Output: V, H, f(=V[m_krylov_size])
+ bool solve_arnoldi(int lower_bound, int upper_bound);
+
+ // Input: H - a real square upper Hessenberg matrix
+ // Output: w - eigenvalues of H sorted in
+ // most wanted to least wanted order
+ // Optionally compute the eigenvalues of H
+ void select_shifts(bool dirty_bit=false);
+
+ // reorder eigenpairs by largest real part
+ void LR(int subspace_sz);
+
+ // reorder eigenpairs by largest magnitude
+ void LM(int subspace_sz);
+
+ // reorder eigenpairs by smallest real part
+ void SR(int subspace_sz);
+
+ // Input: Q -- a real square orthogonal matrix
+ // H -- a real square upper Hessenberg matrix
+ // mu -- a real shift
+ // Output: Q+ -- a real orthogonal matrix
+ // H+ -- a real square upper Hessenberg matrix
+ // This step will "refine" the subspace by "pushing" the information
+ // into the top left corner
+ void qr_step();
+
+ // Update V and f using Q+ and H+
+ void refine_basis();
+
+ // Approximate residual of the largest Ritz pair of H
+ // Optionally compute the eigenvalues of H
+ void compute_residual(int subspace_size, bool dirty_bit=false);
+
+ void compute_eigenvectors();
+
+ void select_subspace();
+
+ // extract H_select from H
+ void extract_subspace(int m);
+
+    // clean everything outside of the new_sz*new_sz Hessenberg matrix (in column major)
+    void cleanup_subspace(std::vector<ValueType>& v, int ld, int new_sz);
+
+    // apply the shift mu to the m*m Hessenberg matrix H (column major, leading dimension ld)
+    void shift(std::vector<ValueType>& H, int ld, int m, ValueType mu);
+
+public:
+ // Simple constructor
+ ImplicitArnoldi(void) {};
+ // Simple destructor
+ ~ImplicitArnoldi(void) {};
+
+    // Create an ImplicitArnoldi Solver
+    ImplicitArnoldi(const ValuedCsrGraph <IndexType, ValueType>& A);
+
+    // Create an ImplicitArnoldi Solver with support of graph Laplacian generation
+    ImplicitArnoldi(const ValuedCsrGraph <IndexType, ValueType>& A, int parts);
+
+    // Create an ImplicitArnoldi Solver with support of damping factor and rank-one updates (PageRank, Markov ...)
+    ImplicitArnoldi(const ValuedCsrGraph <IndexType, ValueType>& A, Vector<ValueType>& dangling_nodes, const float tolerance, const int max_iter, ValueType alpha=0.95);
+
+    void setup( Vector<ValueType>& initial_guess, const int restart_it, const int nEigVals); // public because we want to use and test that directly and/or separately
+
+ // Starting from V, H, f :
+    // Call the QR step, project the update, launch the Arnoldi iteration with the new basis
+ // and check the quality of the new result
+ void implicit_restart(); // public because we want to use and test that directly and/or separately
+
+ // The total number of SPMV will be : m_krylov_size + (m_krylov_size-m_n_eigenvalues)*nb_restart
+    NVGRAPH_ERROR solve(const int restart_it, const int nEigVals,
+                        Vector<ValueType>& initial_guess,
+                        Vector<ValueType>& eigVals,
+                        Vector<ValueType>& eigVecs,
+                        const int n_sub_space=0);
+
+ inline ValueType get_residual() const {return m_residual;}
+ inline int get_iterations() const {return m_iterations;}
+
+ // we use that for tests, unoptimized copies/transfers inside
+    std::vector<ValueType> get_H_copy() {return m_H;}
+    std::vector<ValueType> get_Hs_copy() {return m_H_select;}
+    std::vector<ValueType> get_ritz_eval_copy(){return m_ritz_eigenvalues;} // should be called after select_shifts
+    std::vector<ValueType> get_V_copy();
+    std::vector<ValueType> get_f_copy();
+    std::vector<ValueType> get_fp_copy();
+};
+
+} // end namespace nvgraph
+
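The `select_shifts`/`qr_step`/`shift` interface above follows the usual implicitly restarted Arnoldi scheme: unwanted Ritz values are applied as shifts to the Hessenberg matrix before each QR step. Below is a minimal host-side sketch (not the nvgraph implementation), assuming `shift` subtracts `mu` from the diagonal of the leading `m*m` block of a column-major matrix with leading dimension `ld`; the 3x3 matrix and the driver are purely illustrative.

```cpp
#include <cstdio>
#include <vector>

// H <- H - mu*I on the leading m*m block of a column-major matrix with
// leading dimension ld (a sketch of the role of ImplicitArnoldi::shift above,
// not the nvgraph code itself).
template <typename ValueType>
void shift_hessenberg(std::vector<ValueType>& H, int ld, int m, ValueType mu)
{
    for (int i = 0; i < m; ++i)
        H[i * ld + i] -= mu;   // diagonal entry (column i, row i)
}

int main()
{
    // 3x3 upper Hessenberg matrix, column major, ld = 3 (toy values).
    std::vector<double> H = {4.0, 1.0, 0.0,    // column 0
                             2.0, 3.0, 0.5,    // column 1
                             1.0, 2.0, 5.0};   // column 2
    shift_hessenberg(H, 3, 3, 0.5);
    std::printf("diag after shift: %g %g %g\n", H[0], H[4], H[8]);
    return 0;
}
```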
diff --git a/cpp/nvgraph/cpp/include/async_event.cuh b/cpp/nvgraph/cpp/include/async_event.cuh
new file mode 100644
index 00000000000..1f4491645cc
--- /dev/null
+++ b/cpp/nvgraph/cpp/include/async_event.cuh
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+
+class AsyncEvent
+{
+ public:
+ AsyncEvent() : async_event(NULL) { }
+ AsyncEvent(int size) : async_event(NULL) { cudaEventCreate(&async_event); }
+ ~AsyncEvent() { if (async_event != NULL) cudaEventDestroy(async_event); }
+
+ void create() { cudaEventCreate(&async_event); }
+ void record(cudaStream_t s = 0)
+ {
+ if (async_event == NULL)
+ {
+ cudaEventCreate(&async_event); // check if we haven't created the event yet
+ }
+
+ cudaEventRecord(async_event, s);
+ }
+ void sync()
+ {
+ cudaEventSynchronize(async_event);
+ }
+ private:
+ cudaEvent_t async_event;
+};
+
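A short usage sketch for the `AsyncEvent` wrapper defined above: record the event on a stream right after queuing asynchronous work, then make the host wait only when the result is actually needed. The kernel, sizes, and launch configuration are illustrative, and error checking is omitted.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

#include "async_event.cuh"   // the header introduced above

__global__ void fill(float* x, int n, float v)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] = v;
}

int main()
{
    const int n = 1 << 20;
    float* d_x = NULL;
    cudaStream_t stream;
    cudaMalloc(&d_x, n * sizeof(float));
    cudaStreamCreate(&stream);

    AsyncEvent done;                                         // event is created lazily by record()
    fill<<<(n + 255) / 256, 256, 0, stream>>>(d_x, n, 1.0f);
    done.record(stream);                                     // mark the point after the kernel

    // ... the host could queue unrelated work here ...

    done.sync();                                             // wait only for the recorded point
    std::printf("kernel finished\n");

    cudaStreamDestroy(stream);
    cudaFree(d_x);
    return 0;
}
```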
diff --git a/cpp/nvgraph/cpp/include/async_event.hxx b/cpp/nvgraph/cpp/include/async_event.hxx
new file mode 100644
index 00000000000..a3ad6567734
--- /dev/null
+++ b/cpp/nvgraph/cpp/include/async_event.hxx
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace nvgraph {
+
+ class AsyncEvent {
+ public:
+ AsyncEvent() : async_event(NULL) { }
+ AsyncEvent(int size) : async_event(NULL) { cudaEventCreate(&async_event); }
+ ~AsyncEvent() { if (async_event != NULL) cudaEventDestroy(async_event); }
+
+ void create() { cudaEventCreate(&async_event); }
+ void record(cudaStream_t s=0) {
+ if (async_event == NULL)
+ cudaEventCreate(&async_event); // check if we haven't created the event yet
+ cudaEventRecord(async_event,s);
+ }
+ void sync() {
+ cudaEventSynchronize(async_event);
+ }
+ private:
+ cudaEvent_t async_event;
+ };
+
+}
+
diff --git a/cpp/nvgraph/cpp/include/atomics.hxx b/cpp/nvgraph/cpp/include/atomics.hxx
new file mode 100644
index 00000000000..4cd02764ed7
--- /dev/null
+++ b/cpp/nvgraph/cpp/include/atomics.hxx
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+namespace nvgraph {
+//This file contains the atomic operations for floats and doubles from cusparse/src/cusparse_atomics.h
+
+static __inline__ __device__ double atomicFPAdd(double *addr, double val)
+{
+// atomicAdd for double starts with sm_60
+#if __CUDA_ARCH__ >= 600
+ return atomicAdd( addr, val );
+#else
+ unsigned long long old = __double_as_longlong( addr[0] ), assumed;
+
+ do
+ {
+ assumed = old;
+ old = atomicCAS( (unsigned long long *) addr, assumed, __double_as_longlong( val + __longlong_as_double( assumed ) ) );
+ }
+ while ( assumed != old );
+
+ return old;
+#endif
+}
+
+// atomicAdd for float starts with sm_20
+static __inline__ __device__ float atomicFPAdd(float *addr, float val)
+{
+ return atomicAdd( addr, val );
+}
+
+static __inline__ __device__ double atomicFPMin(double *addr, double val)
+{
+ double old, assumed;
+ old=*addr;
+ do{
+ assumed = old;
+ old = __longlong_as_double(atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed),
+ __double_as_longlong(min(val,assumed))));
+ } while (__double_as_longlong(assumed) != __double_as_longlong(old));
+ return old;
+}
+
+/* atomic minimum: based on Nvidia Research atomic's tricks from cusparse */
+static __inline__ __device__ float atomicFPMin(float *addr, float val)
+{
+ float old, assumed;
+ old=*addr;
+ do{
+ assumed = old;
+    old = __int_as_float(atomicCAS((int *)addr, __float_as_int(assumed),__float_as_int(min(val,assumed))));
+  } while (__float_as_int(assumed) != __float_as_int(old));
+
+ return old;
+}
+
+static __inline__ __device__ double atomicFPMax(double *addr, double val)
+{
+ double old, assumed;
+ old=*addr;
+ do{
+ assumed = old;
+ old = __longlong_as_double(atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed),
+ __double_as_longlong(max(val,assumed))));
+ } while (__double_as_longlong(assumed) != __double_as_longlong(old));
+ return old;
+}
+
+/* atomic maximum: based on Nvidia Research atomic's tricks from cusparse */
+static __inline__ __device__ float atomicFPMax(float *addr, float val)
+{
+ float old, assumed;
+ old=*addr;
+ do{
+ assumed = old;
+    old = __int_as_float(atomicCAS((int *)addr, __float_as_int(assumed),__float_as_int(max(val,assumed))));
+  } while (__float_as_int(assumed) != __float_as_int(old));
+
+ return old;
+}
+
+static __inline__ __device__ double atomicFPOr(double *addr, double val)
+{
+ double old, assumed;
+ old=*addr;
+ do{
+ assumed = old;
+ old = __longlong_as_double(atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed),
+ __double_as_longlong((bool)val | (bool)assumed)));
+ } while (__double_as_longlong(assumed) != __double_as_longlong(old));
+ return old;
+}
+
+/* atomic or: based on Nvidia Research atomic's tricks from cusparse */
+static __inline__ __device__ float atomicFPOr(float *addr, float val)
+{
+ float old, assumed;
+ old=*addr;
+ do{
+ assumed = old;
+    old = __int_as_float(atomicCAS((int *)addr, __float_as_int(assumed),__float_as_int((bool)val | (bool)assumed)));
+  } while (__float_as_int(assumed) != __float_as_int(old));
+
+ return old;
+}
+
+static __inline__ __device__ double atomicFPLog(double *addr, double val)
+{
+ double old, assumed;
+ old=*addr;
+ do{
+ assumed = old;
+ old = __longlong_as_double(atomicCAS((unsigned long long int *)addr, __double_as_longlong(assumed),
+ __double_as_longlong(-log(exp(-val)+exp(-assumed)))));
+ } while (__double_as_longlong(assumed) != __double_as_longlong(old));
+ return old;
+}
+
+/* atomic log-sum: combines values as -log(exp(-val)+exp(-old)); based on Nvidia Research atomic's tricks from cusparse */
+static __inline__ __device__ float atomicFPLog(float *addr, float val)
+{
+ float old, assumed;
+ old=*addr;
+ do{
+ assumed = old;
+    old = __int_as_float(atomicCAS((int *)addr, __float_as_int(assumed),__float_as_int(-logf(expf(-val)+expf(-assumed)))));
+  } while (__float_as_int(assumed) != __float_as_int(old));
+
+ return old;
+}
+
+} // end namespace nvgraph
+
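These helpers all follow the standard CAS-loop pattern for emulating floating-point atomics: read the current value, combine it with the incoming value, and retry until `atomicCAS` confirms no concurrent update slipped in. Below is a small sketch of how a kernel might use `atomicFPMin` from this header to reduce an array to a single global minimum; the kernel, sizes, and launch configuration are illustrative, not nvgraph code.

```cuda
#include <cfloat>
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

#include "atomics.hxx"   // the header introduced above

// Every thread folds one element into a single global minimum through the
// CAS loop in atomicFPMin.
__global__ void global_min(const double* x, int n, double* result)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        nvgraph::atomicFPMin(result, x[i]);
}

int main()
{
    const int n = 1024;
    std::vector<double> h_x(n);
    for (int i = 0; i < n; ++i) h_x[i] = 100.0 - 0.05 * i;

    double *d_x = NULL, *d_min = NULL;
    double h_min = DBL_MAX;
    cudaMalloc(&d_x, n * sizeof(double));
    cudaMalloc(&d_min, sizeof(double));
    cudaMemcpy(d_x, h_x.data(), n * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_min, &h_min, sizeof(double), cudaMemcpyHostToDevice);

    global_min<<<(n + 255) / 256, 256>>>(d_x, n, d_min);
    cudaMemcpy(&h_min, d_min, sizeof(double), cudaMemcpyDeviceToHost);
    std::printf("min = %f\n", h_min);   // expect 100 - 0.05 * 1023 = 48.85

    cudaFree(d_x);
    cudaFree(d_min);
    return 0;
}
```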
diff --git a/cpp/nvgraph/cpp/include/bfs.hxx b/cpp/nvgraph/cpp/include/bfs.hxx
new file mode 100755
index 00000000000..8cd5f37a8c8
--- /dev/null
+++ b/cpp/nvgraph/cpp/include/bfs.hxx
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+#pragma once
+
+
+
+#include
+
+
+
+//Used in nvgraph.h
+
+#define TRAVERSAL_DEFAULT_ALPHA 15
+
+#define TRAVERSAL_DEFAULT_BETA 18
+
+
+
+#include "nvgraph_error.hxx"
+
+
+
+namespace nvgraph
+
+{
+
+  template <typename IndexType>
+
+ class Bfs
+
+ {
+
+ private:
+
+ IndexType n, nnz;
+
+ IndexType* row_offsets;
+
+ IndexType* col_indices;
+
+
+
+ bool directed;
+ bool deterministic;
+
+
+ // edgemask, distances, predecessors are set/read by users - using Vectors
+
+ bool useEdgeMask;
+
+ bool computeDistances;
+
+ bool computePredecessors;
+
+
+
+ IndexType *distances;
+
+ IndexType *predecessors;
+
+ int *edge_mask;
+
+
+
+ //Working data
+
+ //For complete description of each, go to bfs.cu
+
+
+
+ IndexType nisolated;
+
+ IndexType *frontier, *new_frontier;
+
+ IndexType * original_frontier;
+
+ IndexType vertices_bmap_size;
+
+ int *visited_bmap, *isolated_bmap;
+
+ IndexType *vertex_degree;
+
+ IndexType *buffer_np1_1, *buffer_np1_2;
+
+ IndexType *frontier_vertex_degree;
+
+ IndexType *exclusive_sum_frontier_vertex_degree;
+
+ IndexType *unvisited_queue;
+
+ IndexType *left_unvisited_queue;
+
+ IndexType *exclusive_sum_frontier_vertex_buckets_offsets;
+
+
+
+ IndexType *d_counters_pad;
+
+ IndexType *d_new_frontier_cnt;
+
+ IndexType *d_mu;
+
+ IndexType *d_unvisited_cnt;
+
+ IndexType *d_left_unvisited_cnt;
+
+
+
+ void *d_cub_exclusive_sum_storage;
+
+ size_t cub_exclusive_sum_storage_bytes;
+
+
+
+ //Parameters for direction optimizing
+
+ IndexType alpha, beta;
+
+
+
+ cudaStream_t stream;
+
+ //resets pointers defined by d_counters_pad (see implem)
+
+ void resetDevicePointers();
+
+ NVGRAPH_ERROR setup();
+
+ void clean();
+
+ public:
+
+ virtual ~Bfs(void) {
+
+ clean();
+
+ };
+
+
+
+ Bfs(IndexType _n, IndexType _nnz, IndexType *_row_offsets, IndexType *_col_indices, bool _directed, IndexType _alpha, IndexType _beta, cudaStream_t _stream = 0) : n(_n), nnz(_nnz), row_offsets(_row_offsets), col_indices(_col_indices), directed(_directed), alpha(_alpha), beta(_beta), stream(_stream) {
+
+ setup();
+
+ }
+
+
+
+ NVGRAPH_ERROR configure(IndexType *distances, IndexType *predecessors, int *edge_mask);
+
+ NVGRAPH_ERROR traverse(IndexType source_vertex);
+
+ //Used only for benchmarks
+
+ NVGRAPH_ERROR traverse(IndexType *source_vertices, IndexType nsources);
+
+ };
+
+
+
+} // end namespace nvgraph
+
+
+
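The class above only declares the interface; the implementation lives in bfs.cu. Below is a hedged sketch of how a caller inside the library might drive it on a CSR graph that is already resident on the device; the helper function and buffer names are illustrative, return codes are ignored, and this only compiles as part of the nvgraph build.

```cpp
#include <cuda_runtime.h>

#include "bfs.hxx"   // the header introduced above

// Run a single-source BFS on a CSR graph already resident on the device.
// d_row_offsets has n+1 entries, d_col_indices has nnz entries, and
// d_distances / d_predecessors hold n elements each (illustrative names).
void run_bfs(int n, int nnz,
             int* d_row_offsets, int* d_col_indices,
             int* d_distances, int* d_predecessors,
             cudaStream_t stream)
{
    nvgraph::Bfs<int> bfs(n, nnz, d_row_offsets, d_col_indices,
                          /*directed=*/true,
                          TRAVERSAL_DEFAULT_ALPHA, TRAVERSAL_DEFAULT_BETA,
                          stream);

    // Ask for both distances and predecessors; no edge mask.
    bfs.configure(d_distances, d_predecessors, /*edge_mask=*/NULL);

    // Breadth-first traversal from vertex 0.
    bfs.traverse(0);
}
```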
diff --git a/cpp/nvgraph/cpp/include/bfs2d.hxx b/cpp/nvgraph/cpp/include/bfs2d.hxx
new file mode 100644
index 00000000000..52cc9b2882d
--- /dev/null
+++ b/cpp/nvgraph/cpp/include/bfs2d.hxx
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+//Used in nvgraph.h
+#define TRAVERSAL_DEFAULT_ALPHA 15
+#define TRAVERSAL_DEFAULT_BETA 18
+
+#include "nvgraph_error.hxx"
+#include "2d_partitioning.h"
+
+namespace nvgraph {
+  template <typename GlobalType, typename LocalType, typename ValueType>
+ class Bfs2d {
+ private:
+ Matrix2d* M;
+
+ bool directed;
+ bool deterministic;
+ GlobalType alpha;
+ GlobalType beta;
+
+ // edgemask, distances, predecessors are set/read by users - using Vectors
+ bool useEdgeMask;
+ bool computeDistances;
+ bool computePredecessors;
+ int32_t vertices_bmap_size;
+ VertexData2D* distances;
+ VertexData2D* predecessors;
+
+ //Working data
+ VertexData2D* frontier_bmap;
+ VertexData2D* visited_bmap;
+ VertexData2D_Unbuffered* frontier;
+ VertexData2D_Unbuffered* trim_frontier;
+ VertexData2D_Unbuffered* frontierSize;
+ VertexData2D_Unbuffered* degreeFlags;
+ std::vector frontierSize_h;
+ VertexData2D_Unbuffered* exSumDegree;
+ VertexData2D_Unbuffered* exSumStorage;
+ VertexData2D_Unbuffered* bucketOffsets;
+ std::vector frontierDegree_h;
+
+ // Output locations
+ GlobalType* distances_out;
+ GlobalType* predecessors_out;
+
+ NVGRAPH_ERROR setup();
+
+ void clean();
+
+ public:
+ virtual ~Bfs2d(void) {
+ clean();
+ };
+
+ Bfs2d(Matrix2d* _M,
+ bool _directed,
+ GlobalType _alpha,
+ GlobalType _beta) :
+ M(_M),
+ directed(_directed),
+ alpha(_alpha),
+ beta(_beta){
+ distances = NULL;
+ predecessors = NULL;
+ frontier_bmap = NULL;
+ visited_bmap = NULL;
+ setup();
+ }
+
+ NVGRAPH_ERROR configure(GlobalType *distances, GlobalType *predecessors);
+
+ NVGRAPH_ERROR traverse(GlobalType source_vertex);
+
+ //Used only for benchmarks
+ NVGRAPH_ERROR traverse(GlobalType *source_vertices, int32_t nsources);
+ };
+} // end namespace nvgraph
+
diff --git a/cpp/nvgraph/cpp/include/bfs2d_kernels.cuh b/cpp/nvgraph/cpp/include/bfs2d_kernels.cuh
new file mode 100644
index 00000000000..792db1bd5e3
--- /dev/null
+++ b/cpp/nvgraph/cpp/include/bfs2d_kernels.cuh
@@ -0,0 +1,786 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include
+#include "nvgraph_error.hxx"
+
+#define MAXBLOCKS 65535
+#define WARP_SIZE 32
+#define INT_SIZE 32
+#define FILL_QUEUE_DIMX 256
+#define COMPUTE_BUCKET_OFFSETS_DIMX 512
+#define TOP_DOWN_EXPAND_DIMX 256
+#define TOP_DOWN_BUCKET_SIZE 32
+#define NBUCKETS_PER_BLOCK (TOP_DOWN_EXPAND_DIMX/TOP_DOWN_BUCKET_SIZE)
+#define TOP_DOWN_BATCH_SIZE 2
+#define MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD (TOP_DOWN_BUCKET_SIZE - 1)
+
+using namespace nvgraph;
+namespace bfs_kernels {
+
+  struct popCount : public thrust::unary_function<int, int> {
+ __device__
+ int operator()(int x) const
+ {
+ return __popc(x);
+ }
+ };
+
+  template <typename IndexType>
+ struct vec_t {
+ typedef int4 vec4;
+ typedef int2 vec2;
+ };
+
+ template<>
+  struct vec_t<int> {
+ typedef int4 vec4;
+ typedef int2 vec2;
+ static const int max = INT_MAX;
+ };
+
+ template<>
+  struct vec_t<long long int> {
+ typedef longlong4 vec4;
+ typedef longlong2 vec2;
+ static const long long int max = LLONG_MAX;
+ };
+
+ struct BitwiseOr {
+    template <typename T>
+ __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+ return (a | b);
+ }
+ };
+
+ struct predMerge {
+    template <typename T>
+ __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+ if (a != -1 && b != -1)
+ return min(a, b);
+ if (a != -1)
+ return a;
+ if (b != -1)
+ return b;
+ return -1;
+ }
+ };
+
+ __forceinline__ __device__ int getMaskNRightmostBitSet(int n) {
+ if (n == INT_SIZE)
+ return (~0);
+ int mask = (1 << n) - 1;
+ return mask;
+ }
+
+ __forceinline__ __device__ int getMaskNLeftmostBitSet(int n) {
+ if (n == 0)
+ return 0;
+ int mask = ~((1 << (INT_SIZE - n)) - 1);
+ return mask;
+ }
+
+ /**
+ * Finds the position of the next non-zero bit in the given value. The value is
+ * re-written with the found bit unset.
+ * @param val The integer to find the next non-zero bit in.
+ * @return The position of the next non-zero bit
+ */
+ __forceinline__ __device__ int getNextNonZeroBit(int32_t& val) {
+ int ibit = __ffs(val) - 1;
+ val &= ~(1 << ibit);
+
+ return ibit;
+ }
+
+  template <typename IndexType>
+ __device__ IndexType binsearch_maxle(const IndexType *vec,
+ const IndexType val,
+ IndexType low,
+ IndexType high) {
+ while (true) {
+ if (low == high)
+ return low; //we know it exists
+ if ((low + 1) == high)
+ return (vec[high] <= val) ? high : low;
+
+ IndexType mid = low + (high - low) / 2;
+
+ if (vec[mid] > val)
+ high = mid - 1;
+ else
+ low = mid;
+
+ }
+ }
+
+  template <typename IndexType>
+  class degreeIterator: public std::iterator<std::input_iterator_tag, IndexType> {
+ IndexType* offsets;
+ size_t pos;
+ public:
+ __host__ __device__ degreeIterator(IndexType* _offsets) :
+ offsets(_offsets), pos(0) {
+ }
+ __host__ __device__ degreeIterator(IndexType* _offsets, size_t _pos) :
+ offsets(_offsets), pos(_pos) {
+ }
+ __host__ __device__ IndexType operator[](int loc) {
+ return offsets[loc + 1] - offsets[loc];
+ }
+ __host__ __device__ IndexType operator*() {
+ return offsets[pos + 1] - offsets[pos];
+ }
+ __host__ __device__ degreeIterator operator+(int inc) {
+ degreeIterator it(offsets, pos + inc);
+ return it;
+ }
+ };
+
+  template <typename IndexType>
+ size_t getCubExclusiveSumStorageSize(IndexType n) {
+ void* d_temp_storage = NULL;
+ size_t temp_storage_bytes = 0;
+ IndexType *d_in = NULL, *d_out = NULL;
+ cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, n);
+ return temp_storage_bytes;
+ }
+
+  template <typename IndexType>
+ size_t getCubSelectFlaggedStorageSize(IndexType n) {
+ void* d_temp_storage = NULL;
+ size_t temp_storage_bytes = 0;
+ IndexType *d_in = NULL, *d_out = NULL, *size_out = NULL;
+    degreeIterator<IndexType> degreeIt(NULL);
+ cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, degreeIt, d_out, size_out, n);
+ return temp_storage_bytes;
+ }
+
+ /**
+ * Takes in the bitmap frontier and outputs the frontier as a queue of ids.
+ * @param bmap Pointer to the bitmap
+ * @param bmap_nints The number of ints used to store the bitmap
+ * @param n The number of bits in the bitmap
+ * @param outputQueue Pointer to the output queue
+ * @param output_cnt Pointer to counter for output size
+ */
+  template <typename IndexType>
+ __global__ void convert_bitmap_to_queue_kernel(int32_t *bmap,
+ IndexType bmap_nints,
+ IndexType n,
+ IndexType *outputQueue,
+ IndexType *output_cnt) {
+    typedef cub::BlockScan<IndexType, FILL_QUEUE_DIMX> BlockScan;
+ __shared__ typename BlockScan::TempStorage scan_temp_storage;
+
+ // When filling the output queue, we use output_cnt to know where to write in the queue
+ // (equivalent of int off = atomicAddd(unvisited_cnt, 1)) We will actually do only one
+ // atomicAdd per block - we first do a scan, then call one atomicAdd, and store the common
+ // offset for the block in common_block_offset
+ __shared__ IndexType common_block_offset;
+
+    // We don't want thread divergence in the loop (we're going to call __syncthreads),
+    // so the loop condition depends only on block-wide values
+ for (IndexType block_v_idx = blockIdx.x * blockDim.x;
+ block_v_idx < bmap_nints;
+ block_v_idx += blockDim.x * gridDim.x) {
+
+ // Index of bmap that this thread will compute
+ IndexType v_idx = block_v_idx + threadIdx.x;
+
+ int thread_int = (v_idx < bmap_nints) ? bmap[v_idx] : 0;
+
+ // The last int can be only partially valid
+ // If we are indeed taking care of the last int in this thread,
+ // We need to first disable the inactive bits (vertices >= n)
+ if (v_idx == (bmap_nints - 1)) {
+ int active_bits = n - (INT_SIZE * v_idx);
+ int inactive_bits = INT_SIZE - active_bits;
+ int mask = getMaskNLeftmostBitSet(inactive_bits);
+ thread_int &= (~mask);
+ }
+
+ //Counting number of set bits in this int
+ int n_in_int = __popc(thread_int);
+ int thread_offset;
+
+ // We will need to write n_unvisited_in_int unvisited vertices to the unvisited queue
+ // We ask for that space when computing the block scan, that will tell where to write those
+ // vertices in the queue, using the common offset of the block (see below)
+ BlockScan(scan_temp_storage).ExclusiveSum(n_in_int, thread_offset);
+
+ // Last thread knows how many vertices will be written to the queue by this block
+ // Asking for that space in the queue using the global count, and saving the common offset
+ if (threadIdx.x == (FILL_QUEUE_DIMX - 1)) {
+ IndexType total = thread_offset + n_in_int;
+ common_block_offset = atomicAdd(output_cnt, total);
+ }
+
+ // syncthreads for two reasons :
+ // - we need to broadcast common_block_offset
+ // - we will reuse scan_temp_storage (cf CUB doc)
+ __syncthreads();
+
+ IndexType current_index = common_block_offset + thread_offset;
+ int nvertices_to_write = n_in_int;
+
+ // getNextNonZeroBit uses __ffs, which gives least significant bit set
+ // which means that as long as n_unvisited_in_int is valid,
+ // we will use valid bits
+
+ while (nvertices_to_write > 0) {
+ if (nvertices_to_write >= 4 && (current_index % 4) == 0) {
+        typename vec_t<IndexType>::vec4 vec_v;
+
+ vec_v.x = v_idx * INT_SIZE + getNextNonZeroBit(thread_int);
+ vec_v.y = v_idx * INT_SIZE + getNextNonZeroBit(thread_int);
+ vec_v.z = v_idx * INT_SIZE + getNextNonZeroBit(thread_int);
+ vec_v.w = v_idx * INT_SIZE + getNextNonZeroBit(thread_int);
+
+        typename vec_t<IndexType>::vec4 *unvisited_i4 = reinterpret_cast<typename vec_t<IndexType>::vec4*>(&outputQueue[current_index]);
+ *unvisited_i4 = vec_v;
+
+ current_index += 4;
+ nvertices_to_write -= 4;
+ }
+ else if (nvertices_to_write >= 2 && (current_index % 2) == 0) {
+        typename vec_t<IndexType>::vec2 vec_v;
+
+ vec_v.x = v_idx * INT_SIZE + getNextNonZeroBit(thread_int);
+ vec_v.y = v_idx * INT_SIZE + getNextNonZeroBit(thread_int);
+
+        typename vec_t<IndexType>::vec2 *unvisited_i2 = reinterpret_cast<typename vec_t<IndexType>::vec2*>(&outputQueue[current_index]);
+ *unvisited_i2 = vec_v;
+
+ current_index += 2;
+ nvertices_to_write -= 2;
+ } else {
+ IndexType v = v_idx * INT_SIZE + getNextNonZeroBit(thread_int);
+
+ outputQueue[current_index] = v;
+
+ current_index += 1;
+ nvertices_to_write -= 1;
+ }
+
+ }
+ }
+ }
+
+  template <typename IndexType>
+ void convert_bitmap_to_queue(int32_t *bmap,
+ IndexType bmap_nints,
+ IndexType n,
+ IndexType *outputQueue,
+ IndexType *output_cnt,
+ cudaStream_t stream) {
+ dim3 grid, block;
+ block.x = FILL_QUEUE_DIMX;
+ grid.x = min((IndexType) MAXBLOCKS, (bmap_nints + block.x - 1) / block.x);
+    convert_bitmap_to_queue_kernel<<<grid, block, 0, stream>>>(bmap,
+ bmap_nints,
+ n,
+ outputQueue,
+ output_cnt);
+    cudaCheckError();
+ }
+
+ /**
+ * Kernel to compute bucket offsets for load balancing main top-down expand kernel
+ * @param frontier_degrees_exclusive_sum Exclusive sum of the local degrees of the frontier
+ * elements.
+ * @param bucket_offsets Output location for the bucket offsets.
+ * @param frontier_size Number of elements in the frontier.
+ * @param total_degree Total local degree of frontier elements.
+ */
+  template <typename IndexType>
+ __global__ void compute_bucket_offsets_kernel(const IndexType *frontier_degrees_exclusive_sum,
+ IndexType *bucket_offsets,
+ const IndexType frontier_size,
+ IndexType total_degree) {
+ IndexType end = ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX
+ * NBUCKETS_PER_BLOCK + 1);
+
+ for (IndexType bid = blockIdx.x * blockDim.x + threadIdx.x;
+ bid <= end;
+ bid += gridDim.x * blockDim.x) {
+
+ IndexType eid = min(bid * TOP_DOWN_BUCKET_SIZE, total_degree - 1);
+
+ bucket_offsets[bid] = binsearch_maxle(frontier_degrees_exclusive_sum,
+ eid,
+ (IndexType) 0,
+ frontier_size - 1);
+
+ }
+ }
+
+ /**
+ * Wrapper function around compute_bucket_offsets_kernel.
+ * @param cumul Exclusive sum of the local degrees of the frontier elements.
+ * @param bucket_offsets Output location for the bucket offsets.
+ * @param frontier_size Number of elements in the frontier.
+ * @param total_degree Total local degree of frontier elements.
+ * @param m_stream Stream to use for execution.
+ */
+  template <typename IndexType>
+ void compute_bucket_offsets(IndexType *cumul,
+ IndexType *bucket_offsets,
+ IndexType frontier_size,
+ IndexType total_degree,
+ cudaStream_t m_stream) {
+ dim3 grid, block;
+ block.x = COMPUTE_BUCKET_OFFSETS_DIMX;
+
+ grid.x = min((IndexType) MAXBLOCKS,
+ ((total_degree - 1 + TOP_DOWN_EXPAND_DIMX) / TOP_DOWN_EXPAND_DIMX
+ * NBUCKETS_PER_BLOCK + 1 + block.x - 1) / block.x);
+
+    compute_bucket_offsets_kernel<<<grid, block, 0, m_stream>>>(cumul,
+ bucket_offsets,
+ frontier_size,
+ total_degree);
+ cudaCheckError();
+ }
+
+ /**
+ * Kernel for setting the degree of each frontier element.
+ * @param frontier_degree Output to store frontier degrees.
+ * @param frontier The frontier elements.
+ * @param degreeIt Iterator providing the degree of a given vertex ID
+ * @param n The number of elements in the frontier.
+ */
+  template <typename IndexType, typename InputIterator>
+ __global__ void set_frontier_degree_kernel(IndexType *frontier_degree,
+ IndexType *frontier,
+ InputIterator degreeIt,
+ IndexType n) {
+ for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x;
+ idx < n;
+ idx += gridDim.x * blockDim.x) {
+ IndexType u = frontier[idx];
+ frontier_degree[idx] = degreeIt[u];
+ }
+ }
+
+ /**
+ * Wrapper function for calling set_frontier_degree_kernel
+ * @param frontier_degree Output to store frontier degrees.
+ * @param frontier The frontier elements.
+ * @param degreeIt Iterator providing the degree of a given vertex ID.
+ * @param n The number of elements in the frontier.
+ * @param m_stream The stream to use for the kernel call.
+ */
+  template <typename IndexType, typename InputIterator>
+ void set_frontier_degree(IndexType *frontier_degree,
+ IndexType *frontier,
+ InputIterator degreeIt,
+ IndexType n,
+ cudaStream_t m_stream) {
+ dim3 grid, block;
+ block.x = 256;
+ grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS);
+    set_frontier_degree_kernel<<<grid, block, 0, m_stream>>>(frontier_degree,
+ frontier,
+ degreeIt,
+ n);
+ cudaCheckError();
+ }
+
+ /**
+   * Kernel for flagging which frontier elements have a non-zero degree.
+   * @param degree_flags Output flags: 1 if the corresponding frontier element has non-zero degree, 0 otherwise.
+ * @param frontier The frontier elements.
+ * @param degreeIt Iterator providing the degree of a given vertex ID
+ * @param n The number of elements in the frontier.
+ */
+  template <typename IndexType, typename InputIterator>
+ __global__ void set_degree_flags_kernel(int8_t *degree_flags,
+ IndexType *frontier,
+ InputIterator degreeIt,
+ IndexType n) {
+ for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x;
+ idx < n;
+ idx += gridDim.x * blockDim.x) {
+ IndexType u = frontier[idx];
+ degree_flags[idx] = (degreeIt[u] == 0) ? 0 : 1;
+ }
+ }
+
+ /**
+   * Wrapper function for calling set_degree_flags_kernel
+   * @param degree_flags Output flags: 1 for frontier elements with non-zero degree, 0 otherwise.
+ * @param frontier The frontier elements.
+ * @param degreeIt Iterator providing the degree of a given vertex ID.
+ * @param n The number of elements in the frontier.
+ * @param m_stream The stream to use for the kernel call.
+ */
+  template <typename IndexType, typename InputIterator>
+ void set_degree_flags(int8_t *degree_flags,
+ IndexType *frontier,
+ InputIterator degreeIt,
+ IndexType n,
+ cudaStream_t m_stream) {
+ dim3 grid, block;
+ block.x = 256;
+ grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS);
+    set_degree_flags_kernel<<<grid, block, 0, m_stream>>>(degree_flags,
+ frontier,
+ degreeIt,
+ n);
+ cudaCheckError();
+ }
+
+ /**
+ * Kernel for globalizing an array of ids using a given offset. Values of -1 remain
+ * unchanged, other values are incremented by the offset.
+ * @param ids The array of ids to globalize (input and output)
+ * @param offset The offset to be applied to each id.
+ * @param n The number of ids in the array.
+ */
+  template <typename IndexType>
+ __global__ void globalize_ids_kernel(IndexType *ids,
+ IndexType offset,
+ IndexType n) {
+ for (IndexType idx = blockDim.x * blockIdx.x + threadIdx.x;
+ idx < n;
+ idx += gridDim.x * blockDim.x) {
+ IndexType id = ids[idx];
+ ids[idx] = (id == -1) ? -1 : id + offset;
+ }
+ }
+
+ /**
+ * Wrapper function for calling globalize_ids_kernel
+ * @param ids The array of ids to globalize (input and output)
+ * @param offset The offset to be applied to each id.
+ * @param n The number of ids in the array.
+ * @param m_stream The stream to use for the kernel call.
+ */
+  template <typename IndexType>
+ void globalize_ids(IndexType *ids,
+ IndexType offset,
+ IndexType n,
+ cudaStream_t m_stream) {
+ dim3 grid, block;
+ block.x = 256;
+ grid.x = min((n + block.x - 1) / block.x, (IndexType) MAXBLOCKS);
+    globalize_ids_kernel<<<grid, block, 0, m_stream>>>(ids, offset, n);
+ cudaCheckError();
+ }
+
+  template <typename IndexType, typename GlobalType>
+ __global__ void topdown_expand_kernel( const IndexType *row_ptr,
+ const IndexType *col_ind,
+ const IndexType *frontier,
+ const IndexType frontier_size,
+ const IndexType totaldegree,
+ const IndexType max_items_per_thread,
+ const IndexType lvl,
+ int *frontier_bmap,
+ const IndexType *frontier_degrees_exclusive_sum,
+ const IndexType *frontier_degrees_exclusive_sum_buckets_offsets,
+ int *visited_bmap,
+ IndexType *distances,
+ GlobalType *predecessors) {
+ __shared__ IndexType shared_buckets_offsets[TOP_DOWN_EXPAND_DIMX - NBUCKETS_PER_BLOCK + 1];
+ __shared__ IndexType shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX + 1];
+
+ IndexType block_offset = (blockDim.x * blockIdx.x) * max_items_per_thread;
+ IndexType n_items_per_thread_left = (totaldegree - block_offset + TOP_DOWN_EXPAND_DIMX - 1)
+ / TOP_DOWN_EXPAND_DIMX;
+
+// if (threadIdx.x == 0)
+// printf("n_items_per_thread_left=%d max_items_per_thread=%d\n", n_items_per_thread_left, max_items_per_thread);
+ n_items_per_thread_left = min(max_items_per_thread, n_items_per_thread_left);
+
+ for (;
+ (n_items_per_thread_left > 0) && (block_offset < totaldegree);
+ block_offset += MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD * blockDim.x,
+ n_items_per_thread_left -= MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD) {
+
+ // In this loop, we will process batch_set_size batches
+ IndexType nitems_per_thread = min(n_items_per_thread_left,
+ (IndexType) MAX_ITEMS_PER_THREAD_PER_OFFSETS_LOAD);
+
+ // Loading buckets offset (see compute_bucket_offsets_kernel)
+
+ if (threadIdx.x < (nitems_per_thread * NBUCKETS_PER_BLOCK + 1))
+ shared_buckets_offsets[threadIdx.x] =
+ frontier_degrees_exclusive_sum_buckets_offsets[block_offset / TOP_DOWN_BUCKET_SIZE
+ + threadIdx.x];
+
+ // We will use shared_buckets_offsets
+ __syncthreads();
+
+ //
+ // shared_buckets_offsets gives us a range of the possible indexes
+ // for edge of linear_threadx, we are looking for the value k such as
+ // k is the max value such as frontier_degrees_exclusive_sum[k] <= linear_threadx
+ //
+ // we have 0 <= k < frontier_size
+ // but we also have :
+ //
+ // frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE]
+ // <= k
+ // <= frontier_degrees_exclusive_sum_buckets_offsets[linear_threadx/TOP_DOWN_BUCKET_SIZE + 1]
+ //
+ // To find the exact value in that range, we need a few values from frontier_degrees_exclusive_sum (see below)
+ // We will load them here
+ // We will load as much as we can - if it doesn't fit we will make multiple iteration of the next loop
+    // Because all vertices in frontier have degree > 0, we know it will fit if left + 1 = right (see below)
+
+ //We're going to load values in frontier_degrees_exclusive_sum for batch [left; right[
+ //If it doesn't fit, --right until it does, then loop
+    //It is expected to fit on the first try, that's why we start with right = nitems_per_thread
+
+ IndexType left = 0;
+ IndexType right = nitems_per_thread;
+
+ while (left < nitems_per_thread) {
+ //
+ // Values that are necessary to compute the local binary searches
+ // We only need those with indexes between extremes indexes of buckets_offsets
+ // We need the next val for the binary search, hence the +1
+ //
+
+ IndexType nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK]
+ - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1;
+
+     //If right = left + 1 we are sure to have nvalues_to_load < TOP_DOWN_EXPAND_DIMX+1
+ while (nvalues_to_load > (TOP_DOWN_EXPAND_DIMX + 1)) {
+ --right;
+
+ nvalues_to_load = shared_buckets_offsets[right * NBUCKETS_PER_BLOCK]
+ - shared_buckets_offsets[left * NBUCKETS_PER_BLOCK] + 1;
+ }
+
+ IndexType nitems_per_thread_for_this_load = right - left;
+
+ IndexType frontier_degrees_exclusive_sum_block_offset = shared_buckets_offsets[left
+ * NBUCKETS_PER_BLOCK];
+
+ //TODO put again the nvalues_to_load == 1
+ if (threadIdx.x < nvalues_to_load) {
+ shared_frontier_degrees_exclusive_sum[threadIdx.x] =
+ frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset
+ + threadIdx.x];
+ }
+
+ if (nvalues_to_load == (TOP_DOWN_EXPAND_DIMX + 1) && threadIdx.x == 0) {
+ shared_frontier_degrees_exclusive_sum[TOP_DOWN_EXPAND_DIMX] =
+ frontier_degrees_exclusive_sum[frontier_degrees_exclusive_sum_block_offset
+ + TOP_DOWN_EXPAND_DIMX];
+ }
+
+ //shared_frontier_degrees_exclusive_sum is in shared mem, we will use it, sync
+ //TODO we don't use it if nvalues_to_load == 1
+ __syncthreads();
+
+ // Now we will process the edges
+ // Here each thread will process nitems_per_thread_for_this_load
+ for (IndexType item_index = 0;
+ item_index < nitems_per_thread_for_this_load;
+ item_index += TOP_DOWN_BATCH_SIZE) {
+
+      // We process TOP_DOWN_BATCH_SIZE edges in parallel (instruction-level parallelism)
+ // Reduces latency
+
+ IndexType current_max_edge_index = min(block_offset
+ + (left
+ + nitems_per_thread_for_this_load)
+ * blockDim.x,
+ totaldegree);
+
+ /**
+ * We will need vec_u (source of the edge) until the end if we need to save the
+      * predecessors. For other information, we will reuse the pointers on the go
+      * (nvcc does not color the registers well in that case)
+ */
+ IndexType vec_u[TOP_DOWN_BATCH_SIZE];
+ IndexType local_buf1[TOP_DOWN_BATCH_SIZE];
+ IndexType local_buf2[TOP_DOWN_BATCH_SIZE];
+
+ IndexType *vec_frontier_degrees_exclusive_sum_index = &local_buf2[0];
+
+#pragma unroll
+ for (IndexType iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+
+ IndexType ibatch = left + item_index + iv;
+ IndexType gid = block_offset + ibatch * blockDim.x + threadIdx.x;
+
+ if (gid < current_max_edge_index) {
+ IndexType start_off_idx = (ibatch * blockDim.x + threadIdx.x)
+ / TOP_DOWN_BUCKET_SIZE;
+ IndexType bucket_start = shared_buckets_offsets[start_off_idx]
+ - frontier_degrees_exclusive_sum_block_offset;
+ IndexType bucket_end = shared_buckets_offsets[start_off_idx + 1]
+ - frontier_degrees_exclusive_sum_block_offset;
+
+ IndexType k = binsearch_maxle(shared_frontier_degrees_exclusive_sum,
+ gid,
+ bucket_start,
+ bucket_end)
+ + frontier_degrees_exclusive_sum_block_offset;
+ vec_u[iv] = frontier[k]; // origin of this edge
+ vec_frontier_degrees_exclusive_sum_index[iv] =
+ frontier_degrees_exclusive_sum[k];
+ } else {
+ vec_u[iv] = -1;
+ vec_frontier_degrees_exclusive_sum_index[iv] = -1;
+ }
+
+ }
+
+ IndexType *vec_row_ptr_u = &local_buf1[0];
+
+#pragma unroll
+ for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+ IndexType u = vec_u[iv];
+ //row_ptr for this vertex origin u
+ vec_row_ptr_u[iv] = (u != -1) ? row_ptr[u] : -1;
+ }
+
+ //We won't need row_ptr after that, reusing pointer
+ IndexType *vec_dest_v = vec_row_ptr_u;
+
+#pragma unroll
+ for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+ IndexType thread_item_index = left + item_index + iv;
+ IndexType gid = block_offset + thread_item_index * blockDim.x + threadIdx.x;
+
+ IndexType row_ptr_u = vec_row_ptr_u[iv];
+ IndexType edge = row_ptr_u + gid - vec_frontier_degrees_exclusive_sum_index[iv];
+
+ //Destination of this edge
+ vec_dest_v[iv] = (row_ptr_u != -1) ? col_ind[edge] : -1;
+// if (vec_u[iv] != -1 && vec_dest_v[iv] != -1)
+// printf("Edge to examine: %d, %d\n", vec_u[iv],vec_dest_v[iv]);
+ }
+
+ //We don't need vec_frontier_degrees_exclusive_sum_index anymore
+ IndexType *vec_v_visited_bmap = vec_frontier_degrees_exclusive_sum_index;
+
+#pragma unroll
+ for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+ IndexType v = vec_dest_v[iv];
+ vec_v_visited_bmap[iv] = (v != -1) ? visited_bmap[v / INT_SIZE] : (~0); //will look visited
+ }
+
+ // From now on we will consider v as a frontier candidate
+     // If for some reason vec_frontier_candidate[iv] should not be put in the new frontier
+     // then set vec_frontier_candidate[iv] = -1
+ IndexType *vec_frontier_candidate = vec_dest_v;
+
+#pragma unroll
+
+ for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+ IndexType v = vec_frontier_candidate[iv];
+ int m = 1 << (v % INT_SIZE);
+
+ int is_visited = vec_v_visited_bmap[iv] & m;
+
+ if (is_visited)
+ vec_frontier_candidate[iv] = -1;
+ }
+
+#pragma unroll
+ /**
+ * Here is where the distances, predecessors, new bitmap frontier and visited bitmap
+ * get written out.
+ */
+ for (int iv = 0; iv < TOP_DOWN_BATCH_SIZE; ++iv) {
+ IndexType v = vec_frontier_candidate[iv];
+ if (v != -1) {
+ int m = 1 << (v % INT_SIZE);
+ int q = atomicOr(&visited_bmap[v / INT_SIZE], m); //atomicOr returns old
+ int f = atomicOr(&frontier_bmap[v / INT_SIZE], m);
+ if (!(m & q)) { //if this thread was the first to discover this node
+ if (distances)
+ distances[v] = lvl;
+
+ if (predecessors) {
+ IndexType pred = vec_u[iv];
+ predecessors[v] = pred;
+ }
+ }
+ }
+ }
+
+ //We need naccepted_vertices to be ready
+ __syncthreads();
+ }
+
+ //We need to keep shared_frontier_degrees_exclusive_sum coherent
+ __syncthreads();
+
+ //Preparing for next load
+ left = right;
+ right = nitems_per_thread;
+ }
+
+ //we need to keep shared_buckets_offsets coherent
+ __syncthreads();
+ }
+ }
+
+  template <typename IndexType, typename GlobalType>
+ void frontier_expand(const IndexType *row_ptr,
+ const IndexType *col_ind,
+ const IndexType *frontier,
+ const IndexType frontier_size,
+ const IndexType totaldegree,
+ const IndexType lvl,
+ IndexType *frontier_bmap,
+ const IndexType *frontier_degrees_exclusive_sum,
+ const IndexType *frontier_degrees_exclusive_sum_buckets_offsets,
+ int *visited_bmap,
+ IndexType *distances,
+ GlobalType *predecessors,
+ cudaStream_t m_stream) {
+ if (!totaldegree)
+ return;
+
+ dim3 block;
+ block.x = TOP_DOWN_EXPAND_DIMX;
+
+ IndexType max_items_per_thread = (totaldegree + MAXBLOCKS * block.x - 1)
+ / (MAXBLOCKS * block.x);
+
+ dim3 grid;
+ grid.x = min((totaldegree + max_items_per_thread * block.x - 1)
+ / (max_items_per_thread * block.x),
+ (IndexType) MAXBLOCKS);
+
+    topdown_expand_kernel<<<grid, block, 0, m_stream>>>( row_ptr,
+ col_ind,
+ frontier,
+ frontier_size,
+ totaldegree,
+ max_items_per_thread,
+ lvl,
+ frontier_bmap,
+ frontier_degrees_exclusive_sum,
+ frontier_degrees_exclusive_sum_buckets_offsets,
+ visited_bmap,
+ distances,
+ predecessors);
+ cudaCheckError();
+ }
+}
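The top-down expansion above load-balances edges rather than vertices: an exclusive sum of frontier degrees maps every global edge index to the frontier vertex that owns it through a max-`<=` binary search (`binsearch_maxle`), and the precomputed bucket offsets shrink that search to a window that fits in shared memory. Below is a host-side sketch of the core index-to-owner mapping, using the same search logic on illustrative data.

```cpp
#include <cstdio>
#include <vector>

// Largest index k in [low, high] with vec[k] <= val (same logic as
// binsearch_maxle above, written for host code).
int maxle(const std::vector<int>& vec, int val, int low, int high)
{
    while (true) {
        if (low == high) return low;
        if (low + 1 == high) return (vec[high] <= val) ? high : low;
        int mid = low + (high - low) / 2;
        if (vec[mid] > val) high = mid - 1; else low = mid;
    }
}

int main()
{
    // Frontier of 4 vertices with degrees 3, 1, 4, 2.
    std::vector<int> degrees = {3, 1, 4, 2};
    std::vector<int> exsum(degrees.size(), 0);          // exclusive sum: 0, 3, 4, 8
    for (size_t i = 1; i < degrees.size(); ++i)
        exsum[i] = exsum[i - 1] + degrees[i - 1];

    int total_degree = exsum.back() + degrees.back();   // 10 edges to expand
    for (int edge = 0; edge < total_degree; ++edge) {
        int owner = maxle(exsum, edge, 0, (int)exsum.size() - 1);
        std::printf("edge %d is expanded from frontier slot %d\n", edge, owner);
    }
    return 0;
}
```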
diff --git a/cpp/nvgraph/cpp/include/cnmem_shared_ptr.hxx b/cpp/nvgraph/cpp/include/cnmem_shared_ptr.hxx
new file mode 100644
index 00000000000..2143ec8e4ac
--- /dev/null
+++ b/cpp/nvgraph/cpp/include/cnmem_shared_ptr.hxx
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+
+
+//
+
+#if __cplusplus > 199711L
+#include <memory>
+#define SHARED_PREFIX std
+
+#else
+#include <boost/shared_ptr.hpp>
+#define SHARED_PREFIX boost
+
+#endif
+
+#include <cnmem.h>
+#include "nvgraph_error.hxx"
+
+namespace nvgraph
+{
+
+template< typename T >
+class DeviceDeleter
+{
+ cudaStream_t mStream;
+public:
+ DeviceDeleter(cudaStream_t stream) : mStream(stream) {}
+ void operator()(T *ptr)
+ {
+ cnmemStatus_t status = cnmemFree(ptr, mStream);
+ if( status != CNMEM_STATUS_SUCCESS )
+ {
+ FatalError("Memory manager internal error (free)", NVGRAPH_ERR_UNKNOWN);
+ }
+ }
+};
+
+
+template< typename T >
+inline SHARED_PREFIX::shared_ptr<T> allocateDevice(size_t n, cudaStream_t stream)
+{
+ T *ptr = NULL;
+ cnmemStatus_t status = cnmemMalloc((void**) &ptr, n*sizeof(T), stream);
+ if( status == CNMEM_STATUS_OUT_OF_MEMORY)
+ {
+ FatalError("Not enough memory", NVGRAPH_ERR_NO_MEMORY);
+ }
+ else if (status != CNMEM_STATUS_SUCCESS)
+ {
+ FatalError("Memory manager internal error (alloc)", NVGRAPH_ERR_UNKNOWN);
+ }
+    return SHARED_PREFIX::shared_ptr<T>(ptr, DeviceDeleter<T>(stream));
+}
+
+template< typename T >
+class DeviceReleaser
+{
+ cudaStream_t mStream;
+public:
+ DeviceReleaser(cudaStream_t stream) : mStream(stream) {}
+ void operator()(T *ptr)
+ {
+
+ }
+};
+
+template< typename T >
+inline SHARED_PREFIX::shared_ptr<T> attachDevicePtr(T * ptr_in, cudaStream_t stream)
+{
+ T *ptr = ptr_in;
+    return SHARED_PREFIX::shared_ptr<T>(ptr, DeviceReleaser<T>(stream));
+}
+
+
+} // end namespace nvgraph
+
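A short sketch of the intended usage of `allocateDevice`: the returned `shared_ptr` carries a `DeviceDeleter`, so the cnmem allocation is released automatically when the last reference goes away. This assumes the cnmem pool has already been initialized with `cnmemInit` elsewhere in the library; the function and buffer names here are illustrative.

```cpp
#include <cuda_runtime.h>

#include "cnmem_shared_ptr.hxx"   // the header introduced above

// Allocate a scratch buffer of n floats from the cnmem pool on `stream`.
// The buffer is released through DeviceDeleter when `buf` goes out of scope.
void use_scratch(size_t n, cudaStream_t stream)
{
    SHARED_PREFIX::shared_ptr<float> buf =
        nvgraph::allocateDevice<float>(n, stream);

    cudaMemsetAsync(buf.get(), 0, n * sizeof(float), stream);
    // ... launch kernels that read/write buf.get() on `stream` ...
}   // cnmemFree(buf.get(), stream) runs here via DeviceDeleter
```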
diff --git a/cpp/nvgraph/cpp/include/common_selector.cuh b/cpp/nvgraph/cpp/include/common_selector.cuh
new file mode 100644
index 00000000000..7a47d5f1300
--- /dev/null
+++ b/cpp/nvgraph/cpp/include/common_selector.cuh
@@ -0,0 +1,1015 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+//#pragma once
+
+namespace nvlouvain{
+
+template <typename T_ELEM> __inline__ __device__ T_ELEM __cachingLoad(const T_ELEM *addr) {
+#if __CUDA_ARCH__ < 350
+ return *addr;
+#else
+ return __ldg(addr);
+#endif
+}
+__device__
+inline float random_weight(int i, int j, int n)
+{
+#define RAND_MULTIPLIER 1145637293
+ int i_min = (min(i, j) * RAND_MULTIPLIER) % n;
+ int i_max = (max(i, j) * RAND_MULTIPLIER) % n;
+ return ((float)i_max / n) * i_min;
+}
+
+/* WARNING: because of the hexadecimal constant in the last line of the hash
+   function, the resulting floating point value is very likely
+   on the order of 0.5. */
+__host__ __device__ inline unsigned int hash_val(unsigned int a, unsigned int seed)
+{
+ a ^= seed;
+ a = (a + 0x7ed55d16) + (a << 12);
+ a = (a ^ 0xc761c23c) + (a >> 19);
+ a = (a + 0x165667b1) + (a << 5);
+ a = (a ^ 0xd3a2646c) + (a << 9);
+ a = (a + 0xfd7046c5) + (a << 3);
+ a = (a ^ 0xb55a4f09) + (a >> 16);
+ return a;
+}
+
+/* return 1e-5 for float [sizeof(float)=4] and 1e-12 for double [sizeof(double)=8] types */
+template <typename WeightType>
+__host__ __device__ WeightType scaling_factor(){
+ return (sizeof(WeightType) == 4) ? 1e-5f : 1e-12;
+}
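`hash_val` and `scaling_factor` are combined further down (see `computeEdgeWeightsBlockDiaCsr_V2`) into a tiny, deterministic perturbation that makes otherwise equal edge weights unique while staying identical for (i,j) and (j,i). Below is a host-side sketch of that tie-breaking term; the hash is duplicated so the snippet stands alone, and the weight value is illustrative.

```cpp
#include <climits>
#include <cstdio>

// Same mixing steps as hash_val above, duplicated so this snippet stands alone.
unsigned int hash_val_host(unsigned int a, unsigned int seed)
{
    a ^= seed;
    a = (a + 0x7ed55d16) + (a << 12);
    a = (a ^ 0xc761c23c) + (a >> 19);
    a = (a + 0x165667b1) + (a << 5);
    a = (a ^ 0xd3a2646c) + (a << 9);
    a = (a + 0xfd7046c5) + (a << 3);
    a = (a ^ 0xb55a4f09) + (a >> 16);
    return a;
}

int main()
{
    unsigned int i = 7, j = 42;          // endpoints of an undirected edge
    float weight = 0.25f;                // illustrative symmetric edge weight

    // 1e-5f plays the role of scaling_factor<float>(); the perturbation is the
    // same for (i,j) and (j,i) because of the min/max ordering of the endpoints.
    float small_fraction =
        1e-5f * hash_val_host(i < j ? i : j, i < j ? j : i) / UINT_MAX;
    weight += small_fraction * weight;

    std::printf("perturbed weight = %.9f\n", weight);
    return 0;
}
```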
+
+// Kernel to compute the weight of the edges
+// original version from AmgX.
+template <typename IndexType, typename ValueType, typename WeightType>
+__global__
+void computeEdgeWeightsBlockDiaCsr_V2( const IndexType* row_offsets, const IndexType *row_indices, const IndexType *column_indices,
+ const IndexType *dia_values, const ValueType* nonzero_values, const IndexType num_nonzero_blocks,
+ WeightType *str_edge_weights, WeightType *rand_edge_weights, int num_owned, int bsize, int component, int weight_formula)
+{
+ int tid= threadIdx.x + blockDim.x*blockIdx.x;
+
+ int i,j,kmin,kmax;
+ int bsize_sq = bsize*bsize;
+ WeightType den;
+
+ int matrix_weight_entry = component*bsize+component;
+
+ while (tid < num_nonzero_blocks)
+ {
+ i = row_indices[tid];
+ j = column_indices[tid];
+
+ if ((i != j) && (j < num_owned)) // skip diagonal and across-boundary edges
+ {
+ den = (WeightType) max(fabs(__cachingLoad(&nonzero_values[dia_values[i]*bsize_sq+matrix_weight_entry])),fabs(__cachingLoad(&nonzero_values[dia_values[j]*bsize_sq+matrix_weight_entry])));
+
+ kmin = __cachingLoad(&row_offsets[j]); //kmin = row_offsets[j];
+ kmax = __cachingLoad(&row_offsets[j+1]); //kmax = row_offsets[j+1];
+
+ WeightType kvalue = 0.0;
+ bool foundk = false;
+ for (int k=kmin;k()*hash_val(min(i,j),max(i,j))/UINT_MAX;
+ ed_weight += small_fraction*ed_weight;
+ str_edge_weights[tid] = ed_weight;
+
+ // fill up random unique weights
+ if( rand_edge_weights != NULL )
+ rand_edge_weights[tid] = random_weight(i, j, num_owned);
+ }
+ tid += gridDim.x*blockDim.x;
+ }
+}
+
+// Kernel to compute the weight of the edges
+// simple version modified for nvgraph
+template <typename IndexType, typename ValueType, typename WeightType>
+__global__
+void computeEdgeWeights_simple( const IndexType* row_offsets, const IndexType *row_indices, const IndexType *column_indices,
+ const ValueType *row_sum, const ValueType* nonzero_values, const IndexType num_nonzero_blocks,
+ WeightType *str_edge_weights, WeightType *rand_edge_weights, int n, int weight_formula)
+{
+ int tid= threadIdx.x + blockDim.x*blockIdx.x;
+
+ int i,j,kmin,kmax;
+ WeightType den;
+
+ while (tid < num_nonzero_blocks)
+ {
+ i = row_indices[tid];
+ j = column_indices[tid];
+
+ if ((i != j) && (j < n)) // skip diagonal and across-boundary edges
+ {
+ den = (WeightType) max(fabs(__cachingLoad(&row_sum[i])),fabs(__cachingLoad(&row_sum[j])));
+
+ kmin = __cachingLoad(&row_offsets[j]); //kmin = row_offsets[j];
+ kmax = __cachingLoad(&row_offsets[j+1]); //kmax = row_offsets[j+1];
+
+ WeightType kvalue = 0.0;
+ bool foundk = false;
+ for (int k=kmin;k()*hash_val(min(i,j),max(i,j))/UINT_MAX;
+ ed_weight += small_fraction*ed_weight;
+ str_edge_weights[tid] = ed_weight;
+
+ // fill up random unique weights
+ if( rand_edge_weights != NULL )
+ rand_edge_weights[tid] = random_weight(i, j, n);
+ }
+ tid += gridDim.x*blockDim.x;
+ }
+}
+
+// Kernel to compute the weight of the edges using geometry distance between edges
+template <typename IndexType, typename ValueType>
+__global__
+void computeEdgeWeightsDistance3d( const int* row_offsets, const IndexType *column_indices,
+ const ValueType* gx, const ValueType* gy, const ValueType* gz, float *str_edge_weights, int num_rows)
+{
+ int tid= threadIdx.x + blockDim.x*blockIdx.x;
+ float lx, ly, lz;
+ float px, py, pz;
+ int kmin, kmax;
+ int col_id;
+
+ while (tid < num_rows)
+ {
+ lx = gx[tid];
+ ly = gy[tid];
+ lz = gz[tid];
+ kmin = row_offsets[tid];
+ kmax = row_offsets[tid+1];
+
+ for (int k=kmin;k
+__global__
+void matchEdges(const IndexType num_rows, IndexType *partner_index, IndexType *aggregates, const IndexType *strongest_neighbour)
+{
+ int potential_match, potential_match_neighbour;
+
+ for (int tid= threadIdx.x + blockDim.x*blockIdx.x; tid < num_rows; tid += gridDim.x*blockDim.x)
+ {
+ if (partner_index[tid] == -1) // Unaggregated row
+ {
+ potential_match = strongest_neighbour[tid];
+ if (potential_match!=-1)
+ {
+ potential_match_neighbour = strongest_neighbour[potential_match];
+
+ if ( potential_match_neighbour == tid ) // we have a match
+ {
+ partner_index[tid] = potential_match;
+ aggregates[tid] = ( potential_match > tid) ? tid : potential_match;
+ }
+ }
+ }
+ }
+}
+
+template