diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000000..dad09a1bf9 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,255 @@ +language: none +sudo: false +dist: trusty + +cache: + apt: true + directories: + - $HOME/.cache/spack + - $HOME/.cache/cmake-3.11.4 + pip: true + +addons: + apt: + sources: + - ubuntu-toolchain-r-test + packages: + - g++-4.9 + - gfortran-4.9 # spack OpenMPI dependency + - environment-modules + - openmpi-bin + - libopenmpi-dev + # clang 5.0.0 is pre-installed + # - clang-tidy-3.9 + +env: + global: + - SPACK_ROOT: $HOME/.cache/spack + - PATH: $PATH:$HOME/.cache/spack/bin + - CXXFLAGS: "-std=c++11" + +install: + ############################################################################# + # PMacc CPU-only dependencies # + ############################################################################# +<<<<<<< HEAD + - SPACK_FOUND=$(which spack >/dev/null && { echo 0; } || { echo 1; }) + - if [ $SPACK_FOUND -ne 0 ]; then + mkdir -p $SPACK_ROOT && + git clone --depth 50 https://github.com/spack/spack.git $SPACK_ROOT && + echo -e "config:""\n build_jobs:"" 2" > $SPACK_ROOT/etc/spack/config.yaml && + echo -e "packages:""\n cmake:""\n version:"" [3.11.4]""\n paths:""\n cmake@3.11.4:"" /home/travis/.cache/cmake-3.11.4""\n buildable:"" False" > $SPACK_ROOT/etc/spack/packages.yaml; +======= + - export PATH=$CMAKE_ROOT/bin:$PATH + - CMAKE_3_11_4_FOUND=$(cmake --version | grep " 3\.11\.4" >/dev/null && { echo 0; } || { echo 1; }) + - if [ $CMAKE_3_11_4_FOUND -ne 0 ]; then + mkdir -p $CMAKE_ROOT && + cd $CMAKE_ROOT && + rm -rf $CMAKE_ROOT/* && + travis_retry wget --no-check-certificate http://cmake.org/files/v3.11/cmake-3.11.4-Linux-x86_64.tar.gz && + tar -xzf cmake-3.11.4-Linux-x86_64.tar.gz && + mv cmake-3.11.4-Linux-x86_64/* . && + rm -rf cmake-3.11.4-Linux-x86_64.tar.gz cmake-3.11.4-Linux-x86_64 && + cd -; +>>>>>>> Squashed 'thirdParty/cupla/' changes from 0594a68a0d..f60a0ac72c + fi + - spack compiler add + # required dependencies - CMake 3.11.4 + - if [ "$TRAVIS_OS_NAME" == "linux" ]; then + if [ ! -f $HOME/.cache/cmake-3.11.4/bin/cmake ]; then + wget -O cmake.sh https://cmake.org/files/v3.11/cmake-3.11.4-Linux-x86_64.sh && + sh cmake.sh --skip-license --exclude-subdir --prefix=$HOME/.cache/cmake-3.11.4 && + rm cmake.sh; + fi; + elif [ "$TRAVIS_OS_NAME" == "osx" ]; then + if [ ! -d /Applications/CMake.app/Contents/ ]; then + curl -L -s -o cmake.dmg https://cmake.org/files/v3.11/cmake-3.11.4-Darwin-x86_64.dmg && + yes | hdiutil mount cmake.dmg && + sudo cp -R "/Volumes/cmake-3.11.4-Darwin-x86_64/CMake.app" /Applications && + hdiutil detach /dev/disk1s1 && + rm cmake.dmg; + fi; + fi + - travis_wait spack install + cmake + $COMPILERSPEC + # required dependencies - Boost 1.65.1 + - travis_wait spack install + boost@1.65.1~date_time~graph~iostreams~locale~log~random~thread~timer~wave + $COMPILERSPEC + - spack clean -a + - source /etc/profile && + source $SPACK_ROOT/share/spack/setup-env.sh + - spack load cmake + - spack load boost $COMPILERSPEC + +<<<<<<< HEAD +jobs: + fast_finish: true + include: + - stage: 'Target Branch' + install: skip + script: + ############################################################################# + # Disallow PRs to `ComputationalRadiationPhysics/picongpu` branch `master` # + # if not an other mainline branch such as `dev` or `release-...` # + ############################################################################# + - . test/correctBranchPR + - &style-python + stage: 'Style' + language: python + python: "2.7" + install: pip install -U flake8 + script: + ############################################################################# + # Test Python Files for PEP8 conformance # + ############################################################################# + - flake8 --exclude=thirdParty . + - <<: *style-python + python: "3.6" + - install: skip + language: cpp + script: + ############################################################################# + # Conformance with Alpaka: Do not write __global__ CUDA kernels directly # + ############################################################################# + - test/hasCudaGlobalKeyword include/pmacc + - test/hasCudaGlobalKeyword share/pmacc/examples + - test/hasCudaGlobalKeyword include/picongpu + - test/hasCudaGlobalKeyword share/picongpu/examples + + ############################################################################# + # Disallow end-of-line (EOL) white spaces # + ############################################################################# + - test/hasEOLwhiteSpace + + ############################################################################# + # Disallow TABs, use white spaces # + ############################################################################# + - test/hasTabs + + ############################################################################# + # Disallow non-ASCII in source files and scripts # + ############################################################################# + - test/hasNonASCII + + ############################################################################# + # Disallow spaces before pre-compiler macros # + ############################################################################# + - test/hasSpaceBeforePrecompiler + + ############################################################################# + # Enforce angle brackets <...> for includes of external library files # + ############################################################################# + - test/hasExtLibIncludeBrackets include boost + - test/hasExtLibIncludeBrackets include alpaka + - test/hasExtLibIncludeBrackets include cupla + - test/hasExtLibIncludeBrackets include splash + - test/hasExtLibIncludeBrackets include mallocMC + - test/hasExtLibIncludeBrackets include/picongpu pmacc + - test/hasExtLibIncludeBrackets share/picongpu/examples pmacc + - test/hasExtLibIncludeBrackets share/picongpu/examples boost + - test/hasExtLibIncludeBrackets share/picongpu/examples alpaka + - test/hasExtLibIncludeBrackets share/picongpu/examples cupla + - test/hasExtLibIncludeBrackets share/picongpu/examples splash + - test/hasExtLibIncludeBrackets share/picongpu/examples mallocMC + - test/hasExtLibIncludeBrackets share/pmacc/examples pmacc + - &static-code-python + stage: 'Static Code Analysis' + language: python + python: "2.7" + install: pip install -U pyflakes + script: + ############################################################################# + # Warnings, unused code, etc. # + ############################################################################# + - pyflakes . + - <<: *static-code-python + python: "3.6" + - &test-cpp-unit + stage: 'C++ Unit Tests' + language: cpp + env: [ COMPILERSPEC='%gcc@4.9.4' ] + before_install: + - export CXX=g++-4.9 + - export CC=gcc-4.9 + - export FC=gfortran-4.9 + script: + - $CXX --version + - $CC --version + - $FC --version + ############################################################################# + # PMacc CPU-only tests # + ############################################################################# + - mkdir -p $HOME/build + - cd $HOME/build + - cmake $TRAVIS_BUILD_DIR/include/pmacc + -DALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLE=ON + - make -j 2 + # - make test # reduce memory and RT costs first + - <<: *test-cpp-unit + env: [ COMPILERSPEC='%clang@5.0.0' ] + before_install: + - export CXX=clang++ + - export CC=clang + - export FC=gfortran-4.9 +======= +script: + ############################################################################# + # Example: Matrix Multiplication (adapted original) # + ############################################################################# + - cd $HOME/matrixMul + - cmake $TRAVIS_BUILD_DIR/example/CUDASamples/matrixMul/ $CMAKE_FLAGS + - make + # can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original + # SDK example + # CPU_B_SEQ_T_OMP2/THREADS: too many threads necessary (256) + # - ./matrixMul -wA=64 -wB=64 -hA=64 -hB=64 + ############################################################################# + # Example: Async API (adapted original) # + ############################################################################# + - cd $HOME/asyncAPI + - cmake $TRAVIS_BUILD_DIR/example/CUDASamples/asyncAPI/ $CMAKE_FLAGS + - make + # can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original + # SDK example + # CPU_B_SEQ_T_OMP2/THREADS: too many threads necessary (512) + # - ./asyncAPI + ############################################################################# + # Example: Async API (added elements layer) # + ############################################################################# + - cd $HOME/asyncAPI_tuned + - cmake $TRAVIS_BUILD_DIR/example/CUDASamples/asyncAPI_tuned/ $CMAKE_FLAGS + - make + - if [ $STRATEGY == "CPU_B_OMP2_T_SEQ" ] || + [ $STRATEGY == "CPU_B_SEQ_T_SEQ" ]; then + ./asyncAPI_tuned; + fi + ############################################################################# + # Example: vectorAdd (added elements layer) # + ############################################################################# + - cd $HOME/vectorAdd + - cmake $TRAVIS_BUILD_DIR/example/CUDASamples/vectorAdd/ $CMAKE_FLAGS + - make + - if [ $STRATEGY == "CPU_B_OMP2_T_SEQ" ] || + [ $STRATEGY == "CPU_B_SEQ_T_SEQ" ]; then + ./vectorAdd 100000; + fi + ############################################################################# + # Example: BlackScholes (adapted original) # + ############################################################################# + - cd $HOME/blackScholes + - cmake $TRAVIS_BUILD_DIR/example/CUDASamples/blackScholes/ $CMAKE_FLAGS + - make + - if [ $STRATEGY == "CPU_B_OMP2_T_SEQ" ] || + [ $STRATEGY == "CPU_B_SEQ_T_SEQ" ]; then + ./blackScholes; + fi + ############################################################################# + # Test: additional tests # + ############################################################################# + - cd $HOME/test/config + - if [[ $CXX =~ "^g\+\+" ]] || [[ "$COMPILER" == "nvcc" ]] ; then + $TRAVIS_BUILD_DIR/test/system/config/test.sh $CXX; + fi +>>>>>>> Squashed 'thirdParty/cupla/' changes from 0594a68a0d..f60a0ac72c diff --git a/README.md b/README.md index 667cd9360e..0cb6bf66c6 100644 --- a/README.md +++ b/README.md @@ -235,4 +235,4 @@ way! ******************************************************************************** ![image of an lwfa](docs/images/lwfa_iso.png "LWFA") -![image of our strong scaling](docs/images/StrongScalingPIConGPU_log.png "Strong Scaling") +![image of our strong scaling](docs/images/StrongScalingPIConGPU_log.png "Strong Scaling") \ No newline at end of file diff --git a/alpaka/example/vectorAdd/src/vectorAdd.cpp b/alpaka/example/vectorAdd/src/vectorAdd.cpp new file mode 100644 index 0000000000..a7126c43e4 --- /dev/null +++ b/alpaka/example/vectorAdd/src/vectorAdd.cpp @@ -0,0 +1,213 @@ +/* Copyright 2019 Benjamin Worpitz, Matthias Werner + * + * This file exemplifies usage of Alpaka. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED “AS IS” AND ISC DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR + * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#include +#include +#include + +//############################################################################# +//! A vector addition kernel. +class VectorAddKernel +{ +public: + //----------------------------------------------------------------------------- + //! The kernel entry point. + //! + //! \tparam TAcc The accelerator environment to be executed on. + //! \tparam TElem The matrix element type. + //! \param acc The accelerator to be executed on. + //! \param A The first source vector. + //! \param B The second source vector. + //! \param C The destination vector. + //! \param numElements The number of elements. + ALPAKA_NO_HOST_ACC_WARNING + template< + typename TAcc, + typename TElem, + typename TIdx> + ALPAKA_FN_ACC auto operator()( + TAcc const & acc, + TElem const * const A, + TElem const * const B, + TElem * const C, + TIdx const & numElements) const + -> void + { + static_assert( + alpaka::dim::Dim::value == 1, + "The VectorAddKernel expects 1-dimensional indices!"); + + TIdx const gridThreadIdx(alpaka::idx::getIdx(acc)[0u]); + TIdx const threadElemExtent(alpaka::workdiv::getWorkDiv(acc)[0u]); + TIdx const threadFirstElemIdx(gridThreadIdx * threadElemExtent); + + if(threadFirstElemIdx < numElements) + { + // Calculate the number of elements to compute in this thread. + // The result is uniform for all but the last thread. + TIdx const threadLastElemIdx(threadFirstElemIdx+threadElemExtent); + TIdx const threadLastElemIdxClipped((numElements > threadLastElemIdx) ? threadLastElemIdx : numElements); + + for(TIdx i(threadFirstElemIdx); i int +{ +// Fallback for the CI with disabled sequential backend +#if defined(ALPAKA_CI) && !defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) + return EXIT_SUCCESS; +#else + // Define the index domain + using Dim = alpaka::dim::DimInt<1u>; + using Idx = std::size_t; + + // Define the accelerator + // + // It is possible to choose from a set of accelerators + // that are defined in the alpaka::acc namespace e.g.: + // - AccGpuCudaRt + // - AccCpuThreads + // - AccCpuFibers + // - AccCpuOmp2Threads + // - AccCpuOmp2Blocks + // - AccCpuOmp4 + // - AccCpuSerial + using Acc = alpaka::acc::AccCpuSerial; + using DevAcc = alpaka::dev::Dev; + using PltfAcc = alpaka::pltf::Pltf; + + // Defines the synchronization behavior of a queue + // + // choose between Blocking and NonBlocking + using QueueProperty = alpaka::queue::Blocking; + using QueueAcc = alpaka::queue::Queue; + + // Select a device + DevAcc const devAcc(alpaka::pltf::getDevByIdx(0u)); + + // Create a queue on the device + QueueAcc queue(devAcc); + + // Define the work division + Idx const numElements(123456); + Idx const elementsPerThread(3u); + alpaka::vec::Vec const extent(numElements); + + // Let alpaka calculate good block and grid sizes given our full problem extent + alpaka::workdiv::WorkDivMembers const workDiv( + alpaka::workdiv::getValidWorkDiv( + devAcc, + extent, + elementsPerThread, + false, + alpaka::workdiv::GridBlockExtentSubDivRestrictions::Unrestricted)); + + // Define the buffer element type + using Data = std::uint32_t; + + // Get the host device for allocating memory on the host. + using DevHost = alpaka::dev::DevCpu; + using PltfHost = alpaka::pltf::Pltf; + DevHost const devHost(alpaka::pltf::getDevByIdx(0u)); + + // Allocate 3 host memory buffers + using BufHost = alpaka::mem::buf::Buf; + BufHost bufHostA(alpaka::mem::buf::alloc(devHost, extent)); + BufHost bufHostB(alpaka::mem::buf::alloc(devHost, extent)); + BufHost bufHostC(alpaka::mem::buf::alloc(devHost, extent)); + + // Initialize the host input vectors A and B + Data * const pBufHostA(alpaka::mem::view::getPtrNative(bufHostA)); + Data * const pBufHostB(alpaka::mem::view::getPtrNative(bufHostB)); + Data * const pBufHostC(alpaka::mem::view::getPtrNative(bufHostC)); + + // C++11 random generator for uniformly distributed numbers in {1,..,42} + std::random_device rd{}; + std::default_random_engine eng{ rd() }; + std::uniform_int_distribution dist(1, 42); + + for (Idx i(0); i < numElements; ++i) + { + pBufHostA[i] = dist(eng); + pBufHostB[i] = dist(eng); + pBufHostC[i] = 0; + } + + // Allocate 3 buffers on the accelerator + using BufAcc = alpaka::mem::buf::Buf; + BufAcc bufAccA(alpaka::mem::buf::alloc(devAcc, extent)); + BufAcc bufAccB(alpaka::mem::buf::alloc(devAcc, extent)); + BufAcc bufAccC(alpaka::mem::buf::alloc(devAcc, extent)); + + // Copy Host -> Acc + alpaka::mem::view::copy(queue, bufAccA, bufHostA, extent); + alpaka::mem::view::copy(queue, bufAccB, bufHostB, extent); + alpaka::mem::view::copy(queue, bufAccC, bufHostC, extent); + + // Instantiate the kernel function object + VectorAddKernel kernel; + + // Create the kernel execution task. + auto const taskKernel(alpaka::kernel::createTaskKernel( + workDiv, + kernel, + alpaka::mem::view::getPtrNative(bufAccA), + alpaka::mem::view::getPtrNative(bufAccB), + alpaka::mem::view::getPtrNative(bufAccC), + numElements)); + + // Enqueue the kernel execution task + alpaka::queue::enqueue(queue, taskKernel); + + // Copy back the result + alpaka::mem::view::copy(queue, bufHostC, bufAccC, extent); + alpaka::wait::wait(queue); + + bool resultCorrect(true); + for(Idx i(0u); + i < numElements; + ++i) + { + Data const & val(pBufHostC[i]); + Data const correctResult(pBufHostA[i] + pBufHostB[i]); + if(val != correctResult) + { + std::cerr << "C[" << i << "] == " << val << " != " << correctResult << std::endl; + resultCorrect = false; + } + } + + if(resultCorrect) + { + std::cout << "Execution results correct!" << std::endl; + return EXIT_SUCCESS; + } + else + { + std::cout << "Execution results incorrect!" << std::endl; + return EXIT_FAILURE; + } +#endif +} diff --git a/alpaka/include/alpaka/core/Concepts.hpp b/alpaka/include/alpaka/core/Concepts.hpp new file mode 100644 index 0000000000..af3bc4251a --- /dev/null +++ b/alpaka/include/alpaka/core/Concepts.hpp @@ -0,0 +1,92 @@ +/* Copyright 2019 Benjamin Worpitz + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include + +namespace alpaka +{ + namespace concepts + { + //############################################################################# + //! Tag used in class inheritance hierarchies that describes that a specific concept (TConcept) + //! is implemented by the given base class (TBase). + template< + typename TConcept, + typename TBase> + struct Implements + { + }; + + //############################################################################# + //! Checks whether the concept is implemented by the given class + template< + typename TConcept, + typename TDerived> + struct ImplementsConcept { + template< + typename TBase> + static auto implements(Implements&) -> std::true_type; + static auto implements(...) -> std::false_type; + + static constexpr auto value = decltype(implements(std::declval()))::value; + }; + + namespace detail + { + //############################################################################# + //! Returns the type that implements the given concept in the inheritance hierarchy. + template< + typename TConcept, + typename TDerived, + typename Sfinae = void> + struct ImplementationBaseType; + + //############################################################################# + //! Base case for types that do not inherit from "Implements" is the type itself. + template< + typename TConcept, + typename TDerived> + struct ImplementationBaseType< + TConcept, + TDerived, + typename std::enable_if::value>::type> + { + using type = TDerived; + }; + + //############################################################################# + //! For types that inherit from "Implements" it finds the base class (TBase) which implements the concept. + template< + typename TConcept, + typename TDerived> + struct ImplementationBaseType< + TConcept, + TDerived, + typename std::enable_if::value>::type> + { + template< + typename TBase> + static auto implementer(Implements&) -> TBase; + + using type = decltype(implementer(std::declval())); + + static_assert(std::is_base_of::value, "The type implementing the concept has to be a publicly accessible base class!"); + }; + } + + //############################################################################# + //! Returns the type that implements the given concept in the inheritance hierarchy. + template< + typename TConcept, + typename TDerived> + using ImplementationBase = typename detail::ImplementationBaseType::type; + } +} diff --git a/alpaka/include/alpaka/meta/InheritFromList.hpp b/alpaka/include/alpaka/meta/InheritFromList.hpp new file mode 100644 index 0000000000..196c2fc510 --- /dev/null +++ b/alpaka/include/alpaka/meta/InheritFromList.hpp @@ -0,0 +1,32 @@ +/* Copyright 2019 Benjamin Worpitz + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +namespace alpaka +{ + namespace meta + { + template< + typename TBaseList + > + class InheritFromList; + + template< + template class TList, + typename... TBases + > + class InheritFromList< + TList + > + : public TBases... + { + }; + } +} diff --git a/alpaka/include/alpaka/meta/Unique.hpp b/alpaka/include/alpaka/meta/Unique.hpp new file mode 100644 index 0000000000..dbb6adc43a --- /dev/null +++ b/alpaka/include/alpaka/meta/Unique.hpp @@ -0,0 +1,60 @@ +/* Copyright 2019 Benjamin Worpitz + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +#include + +#include + +namespace alpaka +{ + namespace meta + { + namespace detail + { + template< + typename T, + typename... Ts> + struct UniqueHelper + { + using type = T; + }; + + template< + template class TList, + typename... Ts, + typename U, + typename... Us> + struct UniqueHelper, U, Us...> + : std::conditional<(Disjunction...>::value) + , UniqueHelper, Us...> + , UniqueHelper, Us...>>::type + {}; + + template< + typename T> + struct UniqueImpl; + + template< + template class TList, + typename... Ts> + struct UniqueImpl> + { + using type = typename UniqueHelper, Ts...>::type; + }; + } + + //############################################################################# + //! Trait that returns a list with only unique (no equal) types (a set). Duplicates will be filtered out. + template< + typename TList> + using Unique = typename detail::UniqueImpl::type; + } +} diff --git a/alpaka/include/alpaka/queue/Properties.hpp b/alpaka/include/alpaka/queue/Properties.hpp new file mode 100644 index 0000000000..67da1cac30 --- /dev/null +++ b/alpaka/include/alpaka/queue/Properties.hpp @@ -0,0 +1,31 @@ +/* Copyright 2019 Rene Widera + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#pragma once + +namespace alpaka +{ + namespace queue + { + //----------------------------------------------------------------------------- + //! Properties to define queue behavior + namespace property + { + //############################################################################# + //! The caller is waiting until the enqueued task is finished + struct Blocking{}; + + //############################################################################# + //! The caller is NOT waiting until the enqueued task is finished + struct NonBlocking{}; + } + + using namespace property; + } +} diff --git a/alpaka/test/unit/core/src/ConceptsTest.cpp b/alpaka/test/unit/core/src/ConceptsTest.cpp new file mode 100644 index 0000000000..4e656111a4 --- /dev/null +++ b/alpaka/test/unit/core/src/ConceptsTest.cpp @@ -0,0 +1,183 @@ +/* Copyright 2019 Axel Huebl, Benjamin Worpitz + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include + +#include + +#include + +struct ConceptExample; +struct ConceptNonMatchingExample; + +struct ImplementerNotTagged +{ +}; + +struct ImplementerNotTaggedButNonMatchingTagged + : public alpaka::concepts::Implements +{ +}; + +struct ImplementerTagged + : public alpaka::concepts::Implements +{ +}; + +struct ImplementerTaggedButAlsoNonMatchingTagged + : public alpaka::concepts::Implements + , public alpaka::concepts::Implements +{ +}; + +struct ImplementerWithTaggedBase + : public ImplementerTagged +{ +}; + +struct ImplementerWithTaggedBaseAlsoNonMatchingTagged + : public ImplementerTaggedButAlsoNonMatchingTagged +{ +}; + +struct ImplementerTaggedToBase + : public ImplementerNotTagged + , public alpaka::concepts::Implements +{ +}; + +struct ImplementerTaggedToBaseAlsoNonMatchingTagged + : public ImplementerNotTaggedButNonMatchingTagged + , public alpaka::concepts::Implements +{ +}; + +struct ImplementerNonMatchingTaggedTaggedToBase + : public ImplementerNotTagged + , public alpaka::concepts::Implements + , public alpaka::concepts::Implements +{ +}; + +//----------------------------------------------------------------------------- +TEST_CASE("ImplementerNotTagged", "[meta]") +{ + using ImplementationBase = alpaka::concepts::ImplementationBase; + + static_assert( + std::is_same< + ImplementerNotTagged, + ImplementationBase + >::value, + "alpaka::meta::ImplementationBase failed!"); +} + +//----------------------------------------------------------------------------- +TEST_CASE("ImplementerNotTaggedButNonMatchingTagged", "[meta]") +{ + using ImplementationBase = alpaka::concepts::ImplementationBase; + + static_assert( + std::is_same< + ImplementerNotTaggedButNonMatchingTagged, + ImplementationBase + >::value, + "alpaka::meta::ImplementationBase failed!"); +} + +//----------------------------------------------------------------------------- +TEST_CASE("ImplementerTagged", "[meta]") +{ + using ImplementationBase = alpaka::concepts::ImplementationBase; + + static_assert( + std::is_same< + ImplementerTagged, + ImplementationBase + >::value, + "alpaka::meta::ImplementationBase failed!"); +} + +//----------------------------------------------------------------------------- +TEST_CASE("ImplementerTaggedButAlsoNonMatchingTagged", "[meta]") +{ + using ImplementationBase = alpaka::concepts::ImplementationBase; + + static_assert( + std::is_same< + ImplementerTaggedButAlsoNonMatchingTagged, + ImplementationBase + >::value, + "alpaka::meta::ImplementationBase failed!"); +} + +//----------------------------------------------------------------------------- +TEST_CASE("ImplementerWithTaggedBaseAlsoNonMatchingTagged", "[meta]") +{ + using ImplementationBase = alpaka::concepts::ImplementationBase; + + static_assert( + std::is_same< + ImplementerTaggedButAlsoNonMatchingTagged, + ImplementationBase + >::value, + "alpaka::meta::ImplementationBase failed!"); +} + +//----------------------------------------------------------------------------- +TEST_CASE("ImplementerWithTaggedBase", "[meta]") +{ + using ImplementationBase = alpaka::concepts::ImplementationBase; + + static_assert( + std::is_same< + ImplementerTagged, + ImplementationBase + >::value, + "alpaka::meta::ImplementationBase failed!"); +} + +//----------------------------------------------------------------------------- +TEST_CASE("ImplementerTaggedToBase", "[meta]") +{ + using ImplementationBase = alpaka::concepts::ImplementationBase; + + static_assert( + std::is_same< + ImplementerNotTagged, + ImplementationBase + >::value, + "alpaka::meta::ImplementationBase failed!"); +} + +//----------------------------------------------------------------------------- +TEST_CASE("ImplementerTaggedToBaseAlsoNonMatchingTagged", "[meta]") +{ + using ImplementationBase = alpaka::concepts::ImplementationBase; + + static_assert( + std::is_same< + ImplementerNotTaggedButNonMatchingTagged, + ImplementationBase + >::value, + "alpaka::meta::ImplementationBase failed!"); +} + +//----------------------------------------------------------------------------- +TEST_CASE("ImplementerNonMatchingTaggedTaggedToBase", "[meta]") +{ + using ImplementationBase = alpaka::concepts::ImplementationBase; + + static_assert( + std::is_same< + ImplementerNotTagged, + ImplementationBase + >::value, + "alpaka::meta::ImplementationBase failed!"); +} diff --git a/alpaka/test/unit/kernel/src/KernelWithTemplateArgumentDeduction.cpp b/alpaka/test/unit/kernel/src/KernelWithTemplateArgumentDeduction.cpp new file mode 100644 index 0000000000..4873e5763d --- /dev/null +++ b/alpaka/test/unit/kernel/src/KernelWithTemplateArgumentDeduction.cpp @@ -0,0 +1,207 @@ +/* Copyright 2019 Axel Huebl, Benjamin Worpitz, René Widera, Sergei Bastrakov + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include + +#include +#include +#include + +#include + +#include + +//############################################################################# +template< typename TExpected > +class KernelInvocationTemplateDeductionValueSemantics +{ +public: + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + template< + typename Acc, + typename TByValue, + typename TByConstValue, + typename TByConstReference> + ALPAKA_FN_ACC auto operator()( + Acc const & acc, + bool * success, + TByValue, + TByConstValue const, + TByConstReference const &) const + -> void + { + ALPAKA_CHECK( + *success, + static_cast>(1) == (alpaka::workdiv::getWorkDiv(acc)).prod()); + + static_assert( + std::is_same::value, + "Incorrect first additional kernel template parameter type!"); + static_assert( + std::is_same::value, + "Incorrect second additional kernel template parameter type!"); + static_assert( + std::is_same::value, + "Incorrect third additional kernel template parameter type!"); + + } +}; + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromValue", "[kernel]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + using Dim = alpaka::dim::Dim; + using Idx = alpaka::idx::Idx; + + alpaka::test::KernelExecutionFixture fixture( + alpaka::vec::Vec::ones()); + + using Value = std::int32_t; + KernelInvocationTemplateDeductionValueSemantics< Value > kernel; + + Value value{ }; + REQUIRE(fixture(kernel, value, value, value)); +} + +TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromConstValue", "[kernel]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + using Dim = alpaka::dim::Dim; + using Idx = alpaka::idx::Idx; + + alpaka::test::KernelExecutionFixture fixture( + alpaka::vec::Vec::ones()); + + using Value = std::int32_t; + KernelInvocationTemplateDeductionValueSemantics< Value > kernel; + + Value const constValue{ }; + REQUIRE(fixture(kernel, constValue, constValue, constValue)); +} + +TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromConstReference", "[kernel]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + using Dim = alpaka::dim::Dim; + using Idx = alpaka::idx::Idx; + + alpaka::test::KernelExecutionFixture fixture( + alpaka::vec::Vec::ones()); + + using Value = std::int32_t; + KernelInvocationTemplateDeductionValueSemantics< Value > kernel; + + Value value{ }; + Value const & constReference = value; + REQUIRE(fixture(kernel, constReference, constReference, constReference)); +} + +//############################################################################# +template< + typename TExpectedFirst, + typename TExpectedSecond = TExpectedFirst +> +class KernelInvocationTemplateDeductionPointerSemantics +{ +public: + //----------------------------------------------------------------------------- + ALPAKA_NO_HOST_ACC_WARNING + template< + typename Acc, + typename TByPointer, + typename TByPointerToConst> + ALPAKA_FN_ACC auto operator()( + Acc const & acc, + bool * success, + TByPointer *, + TByPointerToConst const *) const + -> void + { + ALPAKA_CHECK( + *success, + static_cast>(1) == (alpaka::workdiv::getWorkDiv(acc)).prod()); + + static_assert( + std::is_same::value, + "Incorrect first additional kernel template parameter type!"); + static_assert( + std::is_same::value, + "Incorrect second additional kernel template parameter type!"); + + } +}; + +//----------------------------------------------------------------------------- +TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromPointer", "[kernel]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + using Dim = alpaka::dim::Dim; + using Idx = alpaka::idx::Idx; + + alpaka::test::KernelExecutionFixture fixture( + alpaka::vec::Vec::ones()); + + using Value = std::int32_t; + KernelInvocationTemplateDeductionPointerSemantics< Value > kernel; + + Value value{ }; + Value * pointer = &value; + REQUIRE(fixture(kernel, pointer, pointer)); +} + +TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromPointerToConst", "[kernel]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + using Dim = alpaka::dim::Dim; + using Idx = alpaka::idx::Idx; + + alpaka::test::KernelExecutionFixture fixture( + alpaka::vec::Vec::ones()); + + using Value = std::int32_t; + KernelInvocationTemplateDeductionPointerSemantics< Value const, Value > kernel; + + Value const constValue{ }; + Value const * pointerToConst = &constValue; + REQUIRE(fixture(kernel, pointerToConst, pointerToConst)); +} + +TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromStaticArray", "[kernel]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + using Dim = alpaka::dim::Dim; + using Idx = alpaka::idx::Idx; + + alpaka::test::KernelExecutionFixture fixture( + alpaka::vec::Vec::ones()); + + using Value = std::int32_t; + KernelInvocationTemplateDeductionPointerSemantics< Value > kernel; + + Value staticArray[4] = { }; + REQUIRE(fixture(kernel, staticArray, staticArray)); +} + +TEMPLATE_LIST_TEST_CASE( "kernelFuntionObjectTemplateDeductionFromConstStaticArray", "[kernel]", alpaka::test::acc::TestAccs) +{ + using Acc = TestType; + using Dim = alpaka::dim::Dim; + using Idx = alpaka::idx::Idx; + + alpaka::test::KernelExecutionFixture fixture( + alpaka::vec::Vec::ones()); + + using Value = std::int32_t; + KernelInvocationTemplateDeductionPointerSemantics< Value const, Value > kernel; + + Value const constStaticArray[4] = { }; + REQUIRE(fixture(kernel, constStaticArray, constStaticArray)); +} diff --git a/alpaka/test/unit/meta/src/UniqueTest.cpp b/alpaka/test/unit/meta/src/UniqueTest.cpp new file mode 100644 index 0000000000..36cb4665d4 --- /dev/null +++ b/alpaka/test/unit/meta/src/UniqueTest.cpp @@ -0,0 +1,70 @@ +/* Copyright 2019 Axel Huebl, Benjamin Worpitz + * + * This file is part of Alpaka. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +#include + +#include + +#include +#include + +//----------------------------------------------------------------------------- +TEST_CASE("uniqueWithDuplicate", "[meta]") +{ + using UniqueInput = + std::tuple< + int, + float, + int, + float, + float, + int>; + + using UniqueResult = + alpaka::meta::Unique< + UniqueInput + >; + + using UniqueReference = + std::tuple< + int, + float>; + + static_assert( + std::is_same< + UniqueReference, + UniqueResult + >::value, + "alpaka::meta::Unique failed!"); +} + +//----------------------------------------------------------------------------- +TEST_CASE("uniqueWithoutDuplicate", "[meta]") +{ + using UniqueInput = + std::tuple< + int, + float, + double>; + + using UniqueResult = + alpaka::meta::Unique< + UniqueInput + >; + + using UniqueReference = + UniqueInput; + + static_assert( + std::is_same< + UniqueReference, + UniqueResult + >::value, + "alpaka::meta::Unique failed!"); +} diff --git a/doc/logo/cupla_logo.svg b/doc/logo/cupla_logo.svg new file mode 100644 index 0000000000..6dbe87dc90 --- /dev/null +++ b/doc/logo/cupla_logo.svg @@ -0,0 +1,168 @@ + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + diff --git a/doc/logo/cupla_logo_320x210.png b/doc/logo/cupla_logo_320x210.png new file mode 100644 index 0000000000..4695c9791e Binary files /dev/null and b/doc/logo/cupla_logo_320x210.png differ diff --git a/doc/logo/cupla_logo_320x210_bw.png b/doc/logo/cupla_logo_320x210_bw.png new file mode 100644 index 0000000000..0148aa5e3f Binary files /dev/null and b/doc/logo/cupla_logo_320x210_bw.png differ diff --git a/doc/logo/cupla_logo_bw.svg b/doc/logo/cupla_logo_bw.svg new file mode 100644 index 0000000000..d8cda9fd4c --- /dev/null +++ b/doc/logo/cupla_logo_bw.svg @@ -0,0 +1,202 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + diff --git a/example/CUDASamples/blackScholes/CMakeLists.txt b/example/CUDASamples/blackScholes/CMakeLists.txt new file mode 100644 index 0000000000..7cd8d38e41 --- /dev/null +++ b/example/CUDASamples/blackScholes/CMakeLists.txt @@ -0,0 +1,69 @@ +# +# Copyright 2016 Rene Widera, Benjamin Worpitz, Vincent Ridder +# +# This file is part of cupla. +# +# cupla is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# cupla is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with cupla. +# If not, see . +# + + +################################################################################ +# Required CMake version. +################################################################################ + +cmake_minimum_required(VERSION 3.11.4) + +set_property(GLOBAL PROPERTY USE_FOLDERS ON) + +################################################################################ +# Project. +################################################################################ + +project("blackScholes") + +################################################################################ +# CMake policies +# +# Search in _ROOT: +# https://cmake.org/cmake/help/v3.12/policy/CMP0074.html# +################################################################################ + +if(POLICY CMP0074) + cmake_policy(SET CMP0074 NEW) +endif() + +################################################################################ +# Find cupla +################################################################################ + +set(cupla_ROOT "$ENV{CUPLA_ROOT}" CACHE STRING "The location of the cupla library") + +list(APPEND CMAKE_MODULE_PATH "${cupla_ROOT}") +find_package("cupla" REQUIRED) + + +################################################################################ +# Add library. +################################################################################ + +set(_SOURCE_DIR "src/") + +# Add all the source files in all recursive subdirectories and group them accordingly. +append_recursive_files_add_to_src_group("${_SOURCE_DIR}" "" "cpp" _FILES_SOURCE_CXX) + +# Always add all files to the target executable build call to add them to the build project. +cupla_add_executable(${PROJECT_NAME} ${_FILES_SOURCE_CXX}) + +target_include_directories(${PROJECT_NAME} PRIVATE "../common/") diff --git a/example/CUDASamples/blackScholes/doc/BlackScholes.pdf b/example/CUDASamples/blackScholes/doc/BlackScholes.pdf new file mode 100644 index 0000000000..d6549281d3 Binary files /dev/null and b/example/CUDASamples/blackScholes/doc/BlackScholes.pdf differ diff --git a/example/CUDASamples/blackScholes/readme.txt b/example/CUDASamples/blackScholes/readme.txt new file mode 100644 index 0000000000..0acb4c6c85 --- /dev/null +++ b/example/CUDASamples/blackScholes/readme.txt @@ -0,0 +1,7 @@ +Sample: BlackScholes +Minimum spec: SM 2.0 + +This sample evaluates fair call and put prices for a given set of European options by Black-Scholes formula. + +Key concepts: +Computational Finance diff --git a/example/CUDASamples/blackScholes/src/BlackScholes.cpp b/example/CUDASamples/blackScholes/src/BlackScholes.cpp new file mode 100644 index 0000000000..d09a3a59da --- /dev/null +++ b/example/CUDASamples/blackScholes/src/BlackScholes.cpp @@ -0,0 +1,259 @@ +/* + * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/* + * This sample evaluates fair call and put prices for a + * given set of European options by Black-Scholes formula. + * See supplied whitepaper for more explanations. + */ + +#include + +#include // helper functions for string parsing +#include // helper functions CUDA error checking and initialization + +//////////////////////////////////////////////////////////////////////////////// +// Process an array of optN options on CPU +//////////////////////////////////////////////////////////////////////////////// +extern "C" void BlackScholesCPU( + float *h_CallResult, + float *h_PutResult, + float *h_StockPrice, + float *h_OptionStrike, + float *h_OptionYears, + float Riskfree, + float Volatility, + int optN +); + +//////////////////////////////////////////////////////////////////////////////// +// Process an array of OptN options on GPU +//////////////////////////////////////////////////////////////////////////////// +#include "BlackScholes_kernel.cuh" + +//////////////////////////////////////////////////////////////////////////////// +// Helper function, returning uniformly distributed +// random float in [low, high] range +//////////////////////////////////////////////////////////////////////////////// +float RandFloat(float low, float high) +{ + float t = (float)rand() / (float)RAND_MAX; + return (1.0f - t) * low + t * high; +} + +//////////////////////////////////////////////////////////////////////////////// +// Data configuration +//////////////////////////////////////////////////////////////////////////////// +const int OPT_N = 4000000; +const int NUM_ITERATIONS = 500; + + +size_t OPT_SZ = OPT_N * sizeof(float); +const float RISKFREE = 0.02f; +const float VOLATILITY = 0.30f; + +#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) + +//////////////////////////////////////////////////////////////////////////////// +// Main program +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) +{ + // Start logs + printf("[%s] - Starting...\n", argv[0]); + + //'h_' prefix - CPU (host) memory space + float + //Results calculated by host for reference + *h_CallResultCPU, + *h_PutResultCPU, + //host copy of device results + *h_CallResultGPU, + *h_PutResultGPU, + //host instance of input data + *h_StockPrice, + *h_OptionStrike, + *h_OptionYears; + + //'d_' prefix - device memory space + float + //Results calculated by device + *d_CallResult, + *d_PutResult, + //device instance of input data + *d_StockPrice, + *d_OptionStrike, + *d_OptionYears; + + double + delta, ref, sum_delta, sum_ref, max_delta, L1norm, gpuTime; + + StopWatchInterface *hTimer = NULL; + int i; + + //findCudaDevice(argc, (const char **)argv); + + sdkCreateTimer(&hTimer); + + printf("Initializing data...\n"); + printf("...allocating CPU memory for options.\n"); + h_CallResultCPU = (float *)malloc(OPT_SZ); + h_PutResultCPU = (float *)malloc(OPT_SZ); + h_CallResultGPU = (float *)malloc(OPT_SZ); + h_PutResultGPU = (float *)malloc(OPT_SZ); + h_StockPrice = (float *)malloc(OPT_SZ); + h_OptionStrike = (float *)malloc(OPT_SZ); + h_OptionYears = (float *)malloc(OPT_SZ); + + printf("...allocating GPU memory for options.\n"); + checkCudaErrors(cudaMalloc((void **)&d_CallResult, OPT_SZ)); + checkCudaErrors(cudaMalloc((void **)&d_PutResult, OPT_SZ)); + checkCudaErrors(cudaMalloc((void **)&d_StockPrice, OPT_SZ)); + checkCudaErrors(cudaMalloc((void **)&d_OptionStrike, OPT_SZ)); + checkCudaErrors(cudaMalloc((void **)&d_OptionYears, OPT_SZ)); + + printf("...generating input data in CPU mem.\n"); + srand(5347); + + //Generate options set + for (i = 0; i < OPT_N; i++) + { + h_CallResultCPU[i] = 0.0f; + h_PutResultCPU[i] = -1.0f; + h_StockPrice[i] = RandFloat(5.0f, 30.0f); + h_OptionStrike[i] = RandFloat(1.0f, 100.0f); + h_OptionYears[i] = RandFloat(0.25f, 10.0f); + } + + printf("...copying input data to device mem.\n"); + //Copy options data to device memory for further processing + checkCudaErrors(cudaMemcpy(d_StockPrice, h_StockPrice, OPT_SZ, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_OptionStrike, h_OptionStrike, OPT_SZ, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_OptionYears, h_OptionYears, OPT_SZ, cudaMemcpyHostToDevice)); + printf("Data init done.\n\n"); + + + printf("Executing Black-Scholes kernel (%i iterations)...\n", NUM_ITERATIONS); + checkCudaErrors(cudaDeviceSynchronize()); + sdkResetTimer(&hTimer); + sdkStartTimer(&hTimer); + + for (i = 0; i < NUM_ITERATIONS; i++) + { + CUPLA_KERNEL_OPTI(BlackScholesGPU)(DIV_UP((OPT_N/2), 128), 128/*480, 128*/,0,0)( + (float2 *)d_CallResult, + (float2 *)d_PutResult, + (float2 *)d_StockPrice, + (float2 *)d_OptionStrike, + (float2 *)d_OptionYears, + RISKFREE, + VOLATILITY, + OPT_N + ); + //getLastCudaError("BlackScholesGPU() execution failed\n"); + } + + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&hTimer); + gpuTime = sdkGetTimerValue(&hTimer) / NUM_ITERATIONS; + + //Both call and put is calculated + printf("Options count : %i \n", 2 * OPT_N); + printf("BlackScholes device time : %f msec\n", gpuTime); + printf("Effective memory bandwidth: %f GB/s\n", ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (gpuTime * 1E-3)); + printf("Gigaoptions per second : %f \n\n", ((double)(2 * OPT_N) * 1E-9) / (gpuTime * 1E-3)); + + printf("BlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %u, Workgroup = %u\n", + (((double)(2.0 * OPT_N) * 1.0E-9) / (gpuTime * 1.0E-3)), gpuTime*1e-3, (2 * OPT_N), 1, 128); + + printf("\nReading back device results...\n"); + //Read back device results to compare them to host results + checkCudaErrors(cudaMemcpy(h_CallResultGPU, d_CallResult, OPT_SZ, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(h_PutResultGPU, d_PutResult, OPT_SZ, cudaMemcpyDeviceToHost)); + + + printf("Checking the results...\n"); + printf("...running host calculations.\n\n"); + //Calculate options values on host + BlackScholesCPU( + h_CallResultCPU, + h_PutResultCPU, + h_StockPrice, + h_OptionStrike, + h_OptionYears, + RISKFREE, + VOLATILITY, + OPT_N + ); + + printf("Comparing the results...\n"); + //Calculate max absolute difference and L1 distance + //between CPU and GPU results + sum_delta = 0; + sum_ref = 0; + max_delta = 0; + + for (i = 0; i < OPT_N; i++) + { + ref = h_CallResultCPU[i]; + delta = fabs(h_CallResultCPU[i] - h_CallResultGPU[i]); + + if (delta > max_delta) + { + max_delta = delta; + } + + sum_delta += delta; + sum_ref += fabs(ref); + } + + L1norm = sum_delta / sum_ref; + printf("L1 norm: %E\n", L1norm); + printf("Max absolute error: %E\n\n", max_delta); + + printf("Shutting down...\n"); + printf("...releasing device memory.\n"); + checkCudaErrors(cudaFree(d_OptionYears)); + checkCudaErrors(cudaFree(d_OptionStrike)); + checkCudaErrors(cudaFree(d_StockPrice)); + checkCudaErrors(cudaFree(d_PutResult)); + checkCudaErrors(cudaFree(d_CallResult)); + + printf("...releasing host memory.\n"); + free(h_OptionYears); + free(h_OptionStrike); + free(h_StockPrice); + free(h_PutResultGPU); + free(h_CallResultGPU); + free(h_PutResultCPU); + free(h_CallResultCPU); + sdkDeleteTimer(&hTimer); + printf("Shutdown done.\n"); + + printf("\n[BlackScholes] - Test Summary\n"); + + // cudaDeviceReset causes the driver to clean up all state. While + // not mandatory in normal operation, it is good practice. It is also + // needed to ensure correct operation when the application is being + // profiled. Calling cudaDeviceReset causes all profile data to be + // flushed before the application exits + cudaDeviceReset(); + + if (L1norm > 1e-6) + { + printf("Test failed!\n"); + exit(EXIT_FAILURE); + } + + printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n\n"); + printf("Test passed\n"); + exit(EXIT_SUCCESS); +} diff --git a/example/CUDASamples/blackScholes/src/BlackScholes_gold.cpp b/example/CUDASamples/blackScholes/src/BlackScholes_gold.cpp new file mode 100644 index 0000000000..f0efc5f664 --- /dev/null +++ b/example/CUDASamples/blackScholes/src/BlackScholes_gold.cpp @@ -0,0 +1,96 @@ +/* + * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + + + +#include + + + +/////////////////////////////////////////////////////////////////////////////// +// Polynomial approximation of cumulative normal distribution function +/////////////////////////////////////////////////////////////////////////////// +static double CND(double d) +{ + const double A1 = 0.31938153; + const double A2 = -0.356563782; + const double A3 = 1.781477937; + const double A4 = -1.821255978; + const double A5 = 1.330274429; + const double RSQRT2PI = 0.39894228040143267793994605993438; + + double + K = 1.0 / (1.0 + 0.2316419 * fabs(d)); + + double + cnd = RSQRT2PI * exp(- 0.5 * d * d) * + (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); + + if (d > 0) + cnd = 1.0 - cnd; + + return cnd; +} + + +/////////////////////////////////////////////////////////////////////////////// +// Black-Scholes formula for both call and put +/////////////////////////////////////////////////////////////////////////////// +static void BlackScholesBodyCPU( + float &callResult, + float &putResult, + float Sf, //Stock price + float Xf, //Option strike + float Tf, //Option years + float Rf, //Riskless rate + float Vf //Volatility rate +) +{ + double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf; + + double sqrtT = sqrt(T); + double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT); + double d2 = d1 - V * sqrtT; + double CNDD1 = CND(d1); + double CNDD2 = CND(d2); + + //Calculate Call and Put simultaneously + double expRT = exp(- R * T); + callResult = (float)(S * CNDD1 - X * expRT * CNDD2); + putResult = (float)(X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1)); +} + + +//////////////////////////////////////////////////////////////////////////////// +// Process an array of optN options +//////////////////////////////////////////////////////////////////////////////// +extern "C" void BlackScholesCPU( + float *h_CallResult, + float *h_PutResult, + float *h_StockPrice, + float *h_OptionStrike, + float *h_OptionYears, + float Riskfree, + float Volatility, + int optN +) +{ + for (int opt = 0; opt < optN; opt++) + BlackScholesBodyCPU( + h_CallResult[opt], + h_PutResult[opt], + h_StockPrice[opt], + h_OptionStrike[opt], + h_OptionYears[opt], + Riskfree, + Volatility + ); +} diff --git a/example/CUDASamples/blackScholes/src/BlackScholes_kernel.cuh b/example/CUDASamples/blackScholes/src/BlackScholes_kernel.cuh new file mode 100644 index 0000000000..050b4a15d3 --- /dev/null +++ b/example/CUDASamples/blackScholes/src/BlackScholes_kernel.cuh @@ -0,0 +1,141 @@ +/* + * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + + + +#include +#include +#ifndef __CUDACC__ +struct float2{ + float x; + float y; + float2(float x, float y) : y(y), x(x) { } +}; +float2 make_float2(float x, float y){ + return float2(x,y); +} +#endif + +/////////////////////////////////////////////////////////////////////////////// +// Polynomial approximation of cumulative normal distribution function +/////////////////////////////////////////////////////////////////////////////// +template +ALPAKA_FN_ACC +float cndGPU(T_Acc const & acc, float d) +{ + const float A1 = 0.31938153f; + const float A2 = -0.356563782f; + const float A3 = 1.781477937f; + const float A4 = -1.821255978f; + const float A5 = 1.330274429f; + const float RSQRT2PI = 0.39894228040143267793994605993438f; + + float + K = __fdividef(1.0f, (1.0f + 0.2316419f * fabsf(d))); + float cnd = RSQRT2PI * __expf(- 0.5f * d * d) * + (K * (A1 + K * (A2 + K * (A3 + K * (A4 + K * A5))))); + if (d > 0) + cnd = 1.0f - cnd; + + return cnd; +} + + +/////////////////////////////////////////////////////////////////////////////// +// Black-Scholes formula for both call and put +/////////////////////////////////////////////////////////////////////////////// +template +ALPAKA_FN_ACC void BlackScholesBodyGPU( + T_Acc const & acc, + float &CallResult, + float &PutResult, + float S, //Stock price + float X, //Option strike + float T, //Option years + float R, //Riskless rate + float V //Volatility rate +) +{ + float sqrtT, expRT; + float d1, d2, CNDD1, CNDD2; + sqrtT = sqrtf(T); /// __fdividef(1.0F, rsqrtf(T)); + d1 = __fdividef(__logf(S / X) + (R + 0.5f * V * V) * T, V * sqrtT); + + d2 = d1 - V * sqrtT; + + CNDD1 = cndGPU(acc, d1); + CNDD2 = cndGPU(acc, d2); + + //Calculate Call and Put simultaneously + expRT = __expf(- R * T); + CallResult = S * CNDD1 - X * expRT * CNDD2; + PutResult = X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1); +} + + +//////////////////////////////////////////////////////////////////////////////// +//Process an array of optN options on GPU +//////////////////////////////////////////////////////////////////////////////// +//__launch_bounds__(128) +struct BlackScholesGPU { + template< typename T_Acc> + ALPAKA_FN_HOST_ACC + void operator()( + T_Acc const & acc, + float2 *__restrict d_CallResult, + float2 *__restrict d_PutResult, + float2 *__restrict d_StockPrice, + float2 *__restrict d_OptionStrike, + float2 *__restrict d_OptionYears, + float Riskfree, + float Volatility, + int optN + ) const + { + ////Thread index + //const int tid = blockDim.x * blockIdx.x + threadIdx.x; + ////Total number of threads in execution grid + //const int THREAD_N = blockDim.x * gridDim.x; + + const int opt_begin = blockDim.x * blockIdx.x * elemDim.x + threadIdx.x * elemDim.x; + + // Calculating 2 options per thread to increase ILP (instruction level parallelism) + if (opt_begin < (optN / 2)) { + const int opt_end = (opt_begin + elemDim.x < optN / 2) ? opt_begin + elemDim.x : optN / 2; + for (int opt = opt_begin; opt < opt_end; opt++) { + float callResult1, callResult2; + float putResult1, putResult2; + BlackScholesBodyGPU( + acc, + callResult1, + putResult1, + d_StockPrice[opt].x, + d_OptionStrike[opt].x, + d_OptionYears[opt].x, + Riskfree, + Volatility + ); + BlackScholesBodyGPU( + acc, + callResult2, + putResult2, + d_StockPrice[opt].y, + d_OptionStrike[opt].y, + d_OptionYears[opt].y, + Riskfree, + Volatility + ); + d_CallResult[opt] = make_float2(callResult1, callResult2); + d_PutResult[opt] = make_float2(putResult1, putResult2); + } + } + } +}; diff --git a/thirdParty/cupla/.travis.yml b/thirdParty/cupla/.travis.yml deleted file mode 100644 index 0f1ae8a4ff..0000000000 --- a/thirdParty/cupla/.travis.yml +++ /dev/null @@ -1,186 +0,0 @@ -language: cpp -sudo: false -dist: xenial - -cache: - apt: true - directories: - - $HOME/.cache/cuda - - $HOME/.cache/cmake - - $HOME/.cache/boost - -addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - g++-5 - - clang-4.0 - -env: - global: - - CUDA_ROOT: $HOME/.cache/cuda - - CMAKE_ROOT: $HOME/.cache/cmake - - BOOST_ROOT: $HOME/.cache/boost - - BOOST_MIN: 106501 # careful: hard coded below - - CUPLA_ROOT: $TRAVIS_BUILD_DIR - - OMP_NUM_THREADS: 4 # ignored in thread layer - matrix: - # Acc Block Thread - - COMPILER=gcc STRATEGY="CPU_B_SEQ_T_OMP2" - - COMPILER=gcc STRATEGY="CPU_B_OMP2_T_SEQ" - - COMPILER=gcc STRATEGY="CPU_B_SEQ_T_SEQ" - - COMPILER=clang STRATEGY="CPU_B_SEQ_T_THREADS" - - COMPILER=clang STRATEGY="CPU_B_SEQ_T_SEQ" - - COMPILER=nvcc STRATEGY="GPU_CUDA" - -before_install: - - mkdir -p $HOME/matrixMul - - mkdir -p $HOME/asyncAPI - - mkdir -p $HOME/asyncAPI_tuned - - mkdir -p $HOME/vectorAdd - - mkdir -p $HOME/cuplaVectorAdd - - mkdir -p $HOME/blackScholes - - mkdir -p $HOME/test/config - - export CMAKE_FLAGS="-DALPAKA_ACC_"$STRATEGY"_ENABLE=ON" - - if [ "$COMPILER" == "gcc" ]; then - echo "Using g++-5 and sequential OpenMP2 threads ..."; - export CXX=g++-5; - export CC=gcc-5; - unset CUDA_ROOT; - elif [ "$COMPILER" == "clang" ]; then - echo "Using clang++-4.0 & sequential threads ..."; - export CXX=clang++-4.0; - export CC=clang-4.0; - unset CUDA_ROOT; - elif [ "$COMPILER" == "nvcc" ]; then - echo "Using CUDA 7.5 ..."; - export CXX=g++-5; - export CC=gcc-5; - export PATH=$CUDA_ROOT/bin:$PATH; - fi - - echo "$CMAKE_FLAGS" - - echo "$CXX $CC" - - $CXX --version - - $CC --version - -install: - ############################################################################# - # CMAKE # - ############################################################################# - - export PATH=$CMAKE_ROOT/bin:$PATH - - CMAKE_3_15_0_FOUND=$(cmake --version | grep " 3\.15\.0" >/dev/null && { echo 0; } || { echo 1; }) - - if [ $CMAKE_3_15_0_FOUND -ne 0 ]; then - mkdir -p $CMAKE_ROOT && - cd $CMAKE_ROOT && - rm -rf $CMAKE_ROOT/* && - travis_retry wget --no-check-certificate http://cmake.org/files/v3.15/cmake-3.15.0-Linux-x86_64.tar.gz && - tar -xzf cmake-3.15.0-Linux-x86_64.tar.gz && - mv cmake-3.15.0-Linux-x86_64/* . && - rm -rf cmake-3.15.0-Linux-x86_64.tar.gz cmake-3.15.0-Linux-x86_64 && - cd -; - fi - - cmake --version - ############################################################################# - # BOOST # - ############################################################################# - - export CMAKE_PREFIX_PATH=$BOOST_ROOT:$CMAKE_PREFIX_PATH - # cmake --find-package -DNAME=Boost -DCOMPILER_ID=GNU -DLANGUAGE=CXX -DMODE=EXIST - - echo -e "#include \n#include \nint main() { std::cout << BOOST_VERSION << std::endl; return 0; }" - | $CXX -I$BOOST_ROOT/include -x c++ - >/dev/null || { echo 0; } - - ls $BOOST_ROOT - - BOOST_FOUND=$([ $(./a.out) -ge $BOOST_MIN ] && { echo 0; } || { echo 1; }) - - if [ $BOOST_FOUND -ne 0 ]; then - mkdir -p $ BOOST_ROOT && - cd $BOOST_ROOT && - travis_retry wget --no-check-certificate -O boost.tar.bz2 http://sourceforge.net/projects/boost/files/boost/1.65.1/boost_1_65_1.tar.bz2/download && - tar -xjf boost.tar.bz2 && - cd boost_1_65_1 && - ./bootstrap.sh --with-libraries=atomic,chrono,context,date_time,system,thread --prefix=$BOOST_ROOT && - ./b2 -j2 && - ./b2 install && - rm -rf boost.tar.bz2 boost_1_65_1 && - cd $HOME; - fi - ############################################################################# - # CUDA # - ############################################################################# - - NVCC_FOUND=$(which nvcc >/dev/null && { echo 0; } || { echo 1; }) - - if [ $NVCC_FOUND -ne 0 ] && [ $COMPILER == "nvcc" ]; then - mkdir -p $CUDA_ROOT && - cd $CUDA_ROOT && - travis_retry wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda_9.0.176_384.81_linux-run && - chmod u+x *-run && - ./cuda_9.0.176_384.81_linux-run --override --silent --verbose --toolkit --toolkitpath=$CUDA_ROOT && - rm -rf ./cuda_9.0.176_384.81_linux-run $CUDA_ROOT/{samples,jre,doc,share} && - cd -; - fi - -script: - ############################################################################# - # Example: Matrix Multiplication (adapted original) # - ############################################################################# - - cd $HOME/matrixMul - - cmake $TRAVIS_BUILD_DIR/example/CUDASamples/matrixMul/ $CMAKE_FLAGS - - make - # can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original - # SDK example - # CPU_B_SEQ_T_OMP2/THREADS: too many threads necessary (256) - # - ./matrixMul -wA=64 -wB=64 -hA=64 -hB=64 - ############################################################################# - # Example: Async API (adapted original) # - ############################################################################# - - cd $HOME/asyncAPI - - cmake $TRAVIS_BUILD_DIR/example/CUDASamples/asyncAPI/ $CMAKE_FLAGS - - make - # can not run with CPU_B_SEQ_T_SEQ due to missing elements layer in original - # SDK example - # CPU_B_SEQ_T_OMP2/THREADS: too many threads necessary (512) - # - ./asyncAPI - ############################################################################# - # Example: Async API (added elements layer) # - ############################################################################# - - cd $HOME/asyncAPI_tuned - - cmake $TRAVIS_BUILD_DIR/example/CUDASamples/asyncAPI_tuned/ $CMAKE_FLAGS - - make - - if [ $STRATEGY == "CPU_B_OMP2_T_SEQ" ] || - [ $STRATEGY == "CPU_B_SEQ_T_SEQ" ]; then - ./asyncAPI_tuned; - fi - ############################################################################# - # Example: vectorAdd (added elements layer) # - ############################################################################# - - cd $HOME/vectorAdd - - cmake $TRAVIS_BUILD_DIR/example/CUDASamples/vectorAdd/ $CMAKE_FLAGS - - make - - if [ $STRATEGY == "CPU_B_OMP2_T_SEQ" ] || - [ $STRATEGY == "CPU_B_SEQ_T_SEQ" ]; then - ./vectorAdd 100000; - fi - ############################################################################# - # Example: cuplaVectorAdd (added elements layer) # - ############################################################################# - - cd $HOME/cuplaVectorAdd - - cmake $TRAVIS_BUILD_DIR/example/CUDASamples/cuplaVectorAdd/ $CMAKE_FLAGS - - make - - if [ $STRATEGY == "CPU_B_OMP2_T_SEQ" ] || - [ $STRATEGY == "CPU_B_SEQ_T_SEQ" ]; then - ./cuplaVectorAdd 100000; - fi - ############################################################################# - # Example: BlackScholes (adapted original) # - ############################################################################# - - cd $HOME/blackScholes - - cmake $TRAVIS_BUILD_DIR/example/CUDASamples/blackScholes/ $CMAKE_FLAGS - - make - - if [ $STRATEGY == "CPU_B_OMP2_T_SEQ" ] || - [ $STRATEGY == "CPU_B_SEQ_T_SEQ" ]; then - ./blackScholes; - fi - ############################################################################# - # Test: additional tests # - ############################################################################# - - cd $HOME/test/config - - if [[ $CXX =~ "^g\+\+" ]] || [[ "$COMPILER" == "nvcc" ]] ; then - $TRAVIS_BUILD_DIR/test/system/config/test.sh $CXX; - fi diff --git a/thirdParty/cupla/include/cupla/cudaToCupla/driverTypes.hpp b/thirdParty/cupla/include/cupla/cudaToCupla/driverTypes.hpp index 8a7985672f..0e41bfeba8 100644 --- a/thirdParty/cupla/include/cupla/cudaToCupla/driverTypes.hpp +++ b/thirdParty/cupla/include/cupla/cudaToCupla/driverTypes.hpp @@ -89,6 +89,7 @@ * * @{ */ +<<<<<<< HEAD #define atomicAdd(...) cupla::atomicAdd(acc, __VA_ARGS__) #define atomicSub(...) cupla::atomicSub(acc, __VA_ARGS__) #define atomicMin(...) cupla::atomicMin(acc, __VA_ARGS__) @@ -100,6 +101,19 @@ #define atomicAnd(...) cupla::atomicAnd(acc, __VA_ARGS__) #define atomicXor(...) cupla::atomicXor(acc, __VA_ARGS__) #define atomicOr(...) cupla::atomicOr(acc, __VA_ARGS__) +======= +#define atomicAdd(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Add>(acc, __VA_ARGS__) +#define atomicSub(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Sub>(acc, __VA_ARGS__) +#define atomicMin(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Min>(acc, __VA_ARGS__) +#define atomicMax(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Max>(acc, __VA_ARGS__) +#define atomicInc(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Inc>(acc, __VA_ARGS__) +#define atomicDec(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Dec>(acc, __VA_ARGS__) +#define atomicExch(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Exch>(acc, __VA_ARGS__) +#define atomicCAS(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Cas>(acc, __VA_ARGS__) +#define atomicAnd(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::And>(acc, __VA_ARGS__) +#define atomicXor(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Xor>(acc, __VA_ARGS__) +#define atomicOr(...) ::alpaka::atomic::atomicOp<::alpaka::atomic::op::Or>(acc, __VA_ARGS__) +>>>>>>> Squashed 'thirdParty/cupla/' changes from 0594a68a0d..f60a0ac72c /** @} */ #define uint3 ::cupla::uint3