diff --git a/.clang-format b/.clang-format index 5708a4fb10..35c4e2b679 100644 --- a/.clang-format +++ b/.clang-format @@ -1,77 +1,98 @@ --- +Language: Cpp +Standard: c++17 +DisableFormat: false AccessModifierOffset: -4 AlignAfterOpenBracket: AlwaysBreak AlignConsecutiveAssignments: false +#AlignConsecutiveBitFields: false AlignConsecutiveDeclarations: false -AlignEscapedNewlines: DontAlign -AlignOperands: false +AlignConsecutiveMacros: false +AlignEscapedNewlines: Right +AlignOperands: false #DontAlign AlignTrailingComments: false +AllowAllArgumentsOnNextLine: false +AllowAllConstructorInitializersOnNextLine: false AllowAllParametersOfDeclarationOnNextLine: false -AllowShortBlocksOnASingleLine: false +AllowShortBlocksOnASingleLine: Never AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: Empty -AllowShortIfStatementsOnASingleLine: false -AllowShortLoopsOnASingleLine: true +#AllowShortEnumsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: All +AllowShortLoopsOnASingleLine: false AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: Yes BinPackArguments: false BinPackParameters: false -BreakBeforeBraces: Custom -BraceWrapping: - AfterClass: true - AfterControlStatement: true - AfterEnum: true - AfterFunction: true - AfterNamespace: true - AfterStruct: true - AfterUnion: true - AfterExternBlock: true - BeforeCatch: true - BeforeElse: true - IndentBraces: false - SplitEmptyFunction: false - SplitEmptyRecord: false - SplitEmptyNamespace: false +#BitFieldColonSpacing: Both BreakBeforeBinaryOperators: All +BreakBeforeBraces: Allman BreakBeforeTernaryOperators: true -BreakConstructorInitializers: AfterColon -BreakInheritanceList: AfterColon +BreakConstructorInitializers: BeforeComma +BreakInheritanceList: BeforeComma BreakStringLiterals: true -ColumnLimit: 80 +ColumnLimit: 120 +CommentPragmas: '^ COMMENT pragma:' CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: true -ConstructorInitializerIndentWidth: 8 +ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true +DeriveLineEnding: false DerivePointerAlignment: false -FixNamespaceComments: false -IncludeBlocks: Regroup +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +IncludeBlocks: Regroup +IncludeCategories: +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +#IndentCaseBlocks: true IndentCaseLabels: false -IndentPPDirectives: None +#IndentExternBlock: AfterExternBlock +IndentGotoLabels: true +IndentPPDirectives: AfterHash IndentWidth: 4 IndentWrappedFunctionNames: false KeepEmptyLinesAtTheStartOfBlocks: false -Language: Cpp +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 2 NamespaceIndentation: All -PointerAlignment: Middle +#OperandAlignmentStyle: Align +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 1000 +PointerAlignment: Left ReflowComments: true SortIncludes: true SortUsingDeclarations: true -SpaceAfterCStyleCast: false -SpaceAfterTemplateKeyword: false +SpaceAfterCStyleCast: true +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true SpaceBeforeAssignmentOperators: true -SpaceBeforeCpp11BracedList: false +SpaceBeforeCpp11BracedList: true SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true -SpaceBeforeParens: Never +SpaceBeforeParens: ControlStatements SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false SpaceInEmptyParentheses: false -SpacesInAngles: false -SpacesInCStyleCastParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInConditionalStatement: false SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false -Standard: Cpp11 +SpaceBeforeSquareBrackets: false +TabWidth: 4 +UseCRLF: false UseTab: Never ... diff --git a/examples/alpaka/asyncblur/asyncblur.cpp b/examples/alpaka/asyncblur/asyncblur.cpp index 7feef248c0..54aacca63a 100644 --- a/examples/alpaka/asyncblur/asyncblur.cpp +++ b/examples/alpaka/asyncblur/asyncblur.cpp @@ -26,34 +26,25 @@ #include #include -constexpr auto ASYNC - = true; ///< defines whether the data shall be processed asynchronously +constexpr auto ASYNC = true; ///< defines whether the data shall be processed asynchronously constexpr auto SHARED = true; ///< defines whether shared memory shall be used -constexpr auto SAVE - = true; ///< defines whether the resultion image shall be saved +constexpr auto SAVE = true; ///< defines whether the resultion image shall be saved constexpr auto CHUNK_COUNT = 4; -constexpr auto DEFAULT_IMG_X - = 4096; /// width of the default image if no png is loaded -constexpr auto DEFAULT_IMG_Y - = 4096; /// height of the default image if no png is loaded -constexpr auto KERNEL_SIZE - = 8; /// radius of the blur kernel, the diameter is this times two plus one -constexpr auto CHUNK_SIZE - = 512; /// size of each chunk to be processed per alpaka kernel -constexpr auto ELEMS_PER_BLOCK - = 16; /// number of elements per direction(!) every block should process +constexpr auto DEFAULT_IMG_X = 4096; /// width of the default image if no png is loaded +constexpr auto DEFAULT_IMG_Y = 4096; /// height of the default image if no png is loaded +constexpr auto KERNEL_SIZE = 8; /// radius of the blur kernel, the diameter is this times two plus one +constexpr auto CHUNK_SIZE = 512; /// size of each chunk to be processed per alpaka kernel +constexpr auto ELEMS_PER_BLOCK = 16; /// number of elements per direction(!) every block should process using FP = float; -template +template auto viewAlpakaBuffer( - Mapping & mapping, - AlpakaBuffer & - buffer) // taking mapping by & on purpose, so Mapping can deduce const + Mapping& mapping, + AlpakaBuffer& buffer) // taking mapping by & on purpose, so Mapping can deduce const { - return llama::View{ - mapping, {alpaka::mem::view::getPtrNative(buffer)}}; + return llama::View {mapping, {alpaka::mem::view::getPtrNative(buffer)}}; } // clang-format off @@ -80,101 +71,79 @@ using PixelOnAcc = llama::DS< /** Alpaka kernel functor used to blur a small image living in the device memory * using the \ref PixelOnAcc datum domain */ -template +template struct BlurKernel { - template - LLAMA_FN_HOST_ACC_INLINE void - operator()(const Acc & acc, View oldImage, View newImage) const + template + LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View oldImage, View newImage) const { const auto ti = alpaka::idx::getIdx(acc); [[maybe_unused]] auto sharedView = [&] { - if constexpr(SHARED) + if constexpr (SHARED) { // Using SoA for the shared memory constexpr auto sharedChunkSize = ElemsPerBlock + 2 * KernelSize; const auto sharedMapping = llama::mapping::tree::Mapping( - typename View::UserDomain{sharedChunkSize, sharedChunkSize}, - llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()}, - typename View::DatumDomain{}); - constexpr auto sharedMemSize - = llama::sizeOf * sharedChunkSize - * sharedChunkSize; - auto & sharedMem = alpaka::block::shared::st:: - allocVar(acc); - return llama::View{sharedMapping, llama::Array{&sharedMem[0]}}; + typename View::UserDomain {sharedChunkSize, sharedChunkSize}, + llama::Tuple {llama::mapping::tree::functor::LeafOnlyRT()}, + typename View::DatumDomain {}); + constexpr auto sharedMemSize = llama::sizeOf * sharedChunkSize * sharedChunkSize; + auto& sharedMem = alpaka::block::shared::st::allocVar(acc); + return llama::View {sharedMapping, llama::Array {&sharedMem[0]}}; } else - return int{}; // dummy + return int {}; // dummy }(); - [[maybe_unused]] const auto bi - = alpaka::idx::getIdx(acc); - if constexpr(SHARED) + [[maybe_unused]] const auto bi = alpaka::idx::getIdx(acc); + if constexpr (SHARED) { constexpr auto threadsPerBlock = ElemsPerBlock / Elems; - const auto threadIdxInBlock - = alpaka::idx::getIdx(acc); + const auto threadIdxInBlock = alpaka::idx::getIdx(acc); const std::size_t bStart[2] - = {bi[0] * ElemsPerBlock + threadIdxInBlock[0], - bi[1] * ElemsPerBlock + threadIdxInBlock[1]}; + = {bi[0] * ElemsPerBlock + threadIdxInBlock[0], bi[1] * ElemsPerBlock + threadIdxInBlock[1]}; const std::size_t bEnd[2] = { - alpaka::math::min( - acc, - bStart[0] + ElemsPerBlock + 2 * KernelSize, - oldImage.mapping.userDomainSize[0]), - alpaka::math::min( - acc, - bStart[1] + ElemsPerBlock + 2 * KernelSize, - oldImage.mapping.userDomainSize[1]), + alpaka::math::min(acc, bStart[0] + ElemsPerBlock + 2 * KernelSize, oldImage.mapping.userDomainSize[0]), + alpaka::math::min(acc, bStart[1] + ElemsPerBlock + 2 * KernelSize, oldImage.mapping.userDomainSize[1]), }; LLAMA_INDEPENDENT_DATA - for(auto y = bStart[0]; y < bEnd[0]; y += threadsPerBlock) + for (auto y = bStart[0]; y < bEnd[0]; y += threadsPerBlock) LLAMA_INDEPENDENT_DATA - for(auto x = bStart[1]; x < bEnd[1]; x += threadsPerBlock) - sharedView(y - bi[0] * ElemsPerBlock, x - bi[1] * ElemsPerBlock) - = oldImage(y, x); + for (auto x = bStart[1]; x < bEnd[1]; x += threadsPerBlock) + sharedView(y - bi[0] * ElemsPerBlock, x - bi[1] * ElemsPerBlock) = oldImage(y, x); alpaka::block::sync::syncBlockThreads(acc); } const std::size_t start[2] = {ti[0] * Elems, ti[1] * Elems}; const std::size_t end[2] = { - alpaka::math::min( - acc, - start[0] + Elems, - oldImage.mapping.userDomainSize[0] - 2 * KernelSize), - alpaka::math::min( - acc, - start[1] + Elems, - oldImage.mapping.userDomainSize[1] - 2 * KernelSize), + alpaka::math::min(acc, start[0] + Elems, oldImage.mapping.userDomainSize[0] - 2 * KernelSize), + alpaka::math::min(acc, start[1] + Elems, oldImage.mapping.userDomainSize[1] - 2 * KernelSize), }; LLAMA_INDEPENDENT_DATA - for(auto y = start[0]; y < end[0]; ++y) LLAMA_INDEPENDENT_DATA - for(auto x = start[1]; x < end[1]; ++x) + for (auto y = start[0]; y < end[0]; ++y) + LLAMA_INDEPENDENT_DATA + for (auto x = start[1]; x < end[1]; ++x) { auto sum = llama::allocVirtualDatumStack(); sum = 0; using ItType = long int; - const ItType iBStart - = SHARED ? ItType(y) - ItType(bi[0] * ElemsPerBlock) : y; - const ItType iAStart - = SHARED ? ItType(x) - ItType(bi[1] * ElemsPerBlock) : x; - const ItType i_b_end = SHARED - ? ItType(y + 2 * KernelSize + 1) - ItType(bi[0] * ElemsPerBlock) - : y + 2 * KernelSize + 1; - const ItType i_a_end = SHARED - ? ItType(x + 2 * KernelSize + 1) - ItType(bi[1] * ElemsPerBlock) - : x + 2 * KernelSize + 1; + const ItType iBStart = SHARED ? ItType(y) - ItType(bi[0] * ElemsPerBlock) : y; + const ItType iAStart = SHARED ? ItType(x) - ItType(bi[1] * ElemsPerBlock) : x; + const ItType i_b_end + = SHARED ? ItType(y + 2 * KernelSize + 1) - ItType(bi[0] * ElemsPerBlock) : y + 2 * KernelSize + 1; + const ItType i_a_end + = SHARED ? ItType(x + 2 * KernelSize + 1) - ItType(bi[1] * ElemsPerBlock) : x + 2 * KernelSize + 1; LLAMA_INDEPENDENT_DATA - for(auto b = iBStart; b < i_b_end; ++b) LLAMA_INDEPENDENT_DATA - for(auto a = iAStart; a < i_a_end; ++a) + for (auto b = iBStart; b < i_b_end; ++b) + LLAMA_INDEPENDENT_DATA + for (auto a = iAStart; a < i_a_end; ++a) { - if constexpr(SHARED) + if constexpr (SHARED) sum += sharedView(std::size_t(b), std::size_t(a)); else sum += oldImage(std::size_t(b), std::size_t(a)); @@ -185,7 +154,7 @@ struct BlurKernel } }; -int main(int argc, char ** argv) +int main(int argc, char** argv) { // ALPAKA using Dim = alpaka::dim::DimInt<2>; @@ -194,12 +163,8 @@ int main(int argc, char ** argv) // using Acc = alpaka::acc::AccGpuCudaRt; // using Acc = alpaka::acc::AccCpuSerial; - using Queue = alpaka::queue::Queue< - Acc, - std::conditional_t< - ASYNC, - alpaka::queue::NonBlocking, - alpaka::queue::Blocking>>; + using Queue + = alpaka::queue::Queue>; using DevHost = alpaka::dev::DevCpu; using DevAcc = alpaka::dev::Dev; using PltfHost = alpaka::pltf::Pltf; @@ -207,7 +172,8 @@ int main(int argc, char ** argv) const DevAcc devAcc = alpaka::pltf::getDevByIdx(0); const DevHost devHost = alpaka::pltf::getDevByIdx(0); std::vector queue; - for(std::size_t i = 0; i < CHUNK_COUNT; ++i) queue.push_back(Queue(devAcc)); + for (std::size_t i = 0; i < CHUNK_COUNT; ++i) + queue.push_back(Queue(devAcc)); // ASYNCCOPY std::size_t img_x = DEFAULT_IMG_X; @@ -216,20 +182,19 @@ int main(int argc, char ** argv) std::size_t buffer_y = DEFAULT_IMG_Y + 2 * KERNEL_SIZE; constexpr std::size_t hardwareThreads = 2; // relevant for OpenMP2Threads - using Distribution = common:: - ThreadsElemsDistribution; + using Distribution = common::ThreadsElemsDistribution; constexpr std::size_t elemCount = Distribution::elemCount; constexpr std::size_t threadCount = Distribution::threadCount; std::vector image; std::string out_filename = "output.png"; - if(argc > 1) + if (argc > 1) { int x = 0; int y = 0; int n = 3; - unsigned char * data = stbi_load(argv[1], &x, &y, &n, 0); + unsigned char* data = stbi_load(argv[1], &x, &y, &n, 0); image.resize(x * y * 3); std::copy(data, data + image.size(), begin(image)); stbi_image_free(data); @@ -238,21 +203,20 @@ int main(int argc, char ** argv) buffer_x = x + 2 * KERNEL_SIZE; buffer_y = y + 2 * KERNEL_SIZE; - if(argc > 2) + if (argc > 2) out_filename = std::string(argv[2]); } // LLAMA using UserDomain = llama::UserDomain<2>; - auto treeOperationList - = llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()}; - const auto hostMapping = llama::mapping::tree::Mapping{ - UserDomain{buffer_y, buffer_x}, treeOperationList, Pixel{}}; - const auto devMapping = llama::mapping::tree::Mapping{ - UserDomain{CHUNK_SIZE + 2 * KERNEL_SIZE, CHUNK_SIZE + 2 * KERNEL_SIZE}, + auto treeOperationList = llama::Tuple {llama::mapping::tree::functor::LeafOnlyRT()}; + const auto hostMapping + = llama::mapping::tree::Mapping {UserDomain {buffer_y, buffer_x}, treeOperationList, Pixel {}}; + const auto devMapping = llama::mapping::tree::Mapping { + UserDomain {CHUNK_SIZE + 2 * KERNEL_SIZE, CHUNK_SIZE + 2 * KERNEL_SIZE}, treeOperationList, - PixelOnAcc{}}; + PixelOnAcc {}}; const auto hostBufferSize = hostMapping.getBlobSize(0); const auto devBufferSize = devMapping.getBlobSize(0); @@ -262,50 +226,39 @@ int main(int argc, char ** argv) Chrono chrono; - auto hostBuffer = alpaka::mem::buf::alloc( - devHost, hostBufferSize); + auto hostBuffer = alpaka::mem::buf::alloc(devHost, hostBufferSize); auto hostView = viewAlpakaBuffer(hostMapping, hostBuffer); - std::vector< - alpaka::mem::buf:: - Buf, std::size_t>> - hostChunkBuffer; - std::vector> hostChunkView; + std::vector, std::size_t>> hostChunkBuffer; + std::vector> hostChunkView; - std::vector, std::size_t>> - devOldBuffer, devNewBuffer; - std::vector> devOldView, - devNewView; + std::vector, std::size_t>> devOldBuffer, + devNewBuffer; + std::vector> devOldView, devNewView; - for(std::size_t i = 0; i < CHUNK_COUNT; ++i) + for (std::size_t i = 0; i < CHUNK_COUNT; ++i) { - hostChunkBuffer.push_back( - alpaka::mem::buf::alloc( - devHost, devBufferSize)); - hostChunkView.push_back( - viewAlpakaBuffer(devMapping, hostChunkBuffer.back())); - - devOldBuffer.push_back(alpaka::mem::buf::alloc( - devAcc, devBufferSize)); + hostChunkBuffer.push_back(alpaka::mem::buf::alloc(devHost, devBufferSize)); + hostChunkView.push_back(viewAlpakaBuffer(devMapping, hostChunkBuffer.back())); + + devOldBuffer.push_back(alpaka::mem::buf::alloc(devAcc, devBufferSize)); devOldView.push_back(viewAlpakaBuffer(devMapping, devOldBuffer.back())); - devNewBuffer.push_back(alpaka::mem::buf::alloc( - devAcc, devBufferSize)); + devNewBuffer.push_back(alpaka::mem::buf::alloc(devAcc, devBufferSize)); devNewView.push_back(viewAlpakaBuffer(devMapping, devNewBuffer.back())); } chrono.printAndReset("Alloc"); - if(image.empty()) + if (image.empty()) { image.resize(img_x * img_y * 3); std::default_random_engine generator; - std::normal_distribution distribution{FP(0), FP(0.5)}; - for(std::size_t y = 0; y < buffer_y; ++y) + std::normal_distribution distribution {FP(0), FP(0.5)}; + for (std::size_t y = 0; y < buffer_y; ++y) { LLAMA_INDEPENDENT_DATA - for(std::size_t x = 0; x < buffer_x; ++x) + for (std::size_t x = 0; x < buffer_x; ++x) { hostView(y, x)(tag::R()) = std::abs(distribution(generator)); hostView(y, x)(tag::G()) = std::abs(distribution(generator)); @@ -315,17 +268,14 @@ int main(int argc, char ** argv) } else { - for(std::size_t y = 0; y < buffer_y; ++y) + for (std::size_t y = 0; y < buffer_y; ++y) { LLAMA_INDEPENDENT_DATA - for(std::size_t x = 0; x < buffer_x; ++x) + for (std::size_t x = 0; x < buffer_x; ++x) { - const auto X = std::clamp( - x, KERNEL_SIZE, img_x + KERNEL_SIZE - 1); - const auto Y = std::clamp( - y, KERNEL_SIZE, img_y + KERNEL_SIZE - 1); - const auto * pixel - = &image[((Y - KERNEL_SIZE) * img_x + X - KERNEL_SIZE) * 3]; + const auto X = std::clamp(x, KERNEL_SIZE, img_x + KERNEL_SIZE - 1); + const auto Y = std::clamp(y, KERNEL_SIZE, img_y + KERNEL_SIZE - 1); + const auto* pixel = &image[((Y - KERNEL_SIZE) * img_x + X - KERNEL_SIZE) * 3]; hostView(y, x)(tag::R()) = FP(pixel[0]) / 255.; hostView(y, x)(tag::G()) = FP(pixel[1]) / 255.; hostView(y, x)(tag::B()) = FP(pixel[2]) / 255.; @@ -335,19 +285,15 @@ int main(int argc, char ** argv) chrono.printAndReset("Init"); const auto elems = alpaka::vec::Vec(elemCount, elemCount); - const auto threads - = alpaka::vec::Vec(threadCount, threadCount); + const auto threads = alpaka::vec::Vec(threadCount, threadCount); const auto blocks = alpaka::vec::Vec( - static_cast( - (CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK), - static_cast( - (CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK)); + static_cast((CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK), + static_cast((CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK)); const alpaka::vec::Vec chunks( static_cast((img_y + CHUNK_SIZE - 1) / CHUNK_SIZE), static_cast((img_x + CHUNK_SIZE - 1) / CHUNK_SIZE)); - const auto workdiv - = alpaka::workdiv::WorkDivMembers{blocks, threads, elems}; + const auto workdiv = alpaka::workdiv::WorkDivMembers {blocks, threads, elems}; struct VirtualHostElement { @@ -355,106 +301,79 @@ int main(int argc, char ** argv) const UserDomain validMiniSize; }; std::list virtualHostList; - for(std::size_t chunk_y = 0; chunk_y < chunks[0]; ++chunk_y) - for(std::size_t chunk_x = 0; chunk_x < chunks[1]; ++chunk_x) + for (std::size_t chunk_y = 0; chunk_y < chunks[0]; ++chunk_y) + for (std::size_t chunk_x = 0; chunk_x < chunks[1]; ++chunk_x) { // Create virtual view with size of mini view - const UserDomain validMiniSize{ - ((chunk_y < chunks[0] - 1) ? CHUNK_SIZE - : (img_y - 1) % CHUNK_SIZE + 1) - + 2 * KERNEL_SIZE, - ((chunk_x < chunks[1] - 1) ? CHUNK_SIZE - : (img_x - 1) % CHUNK_SIZE + 1) - + 2 * KERNEL_SIZE}; - llama::VirtualView virtualHost( - hostView, - {chunk_y * CHUNK_SIZE, chunk_x * CHUNK_SIZE}, - validMiniSize); + const UserDomain validMiniSize { + ((chunk_y < chunks[0] - 1) ? CHUNK_SIZE : (img_y - 1) % CHUNK_SIZE + 1) + 2 * KERNEL_SIZE, + ((chunk_x < chunks[1] - 1) ? CHUNK_SIZE : (img_x - 1) % CHUNK_SIZE + 1) + 2 * KERNEL_SIZE}; + llama::VirtualView virtualHost(hostView, {chunk_y * CHUNK_SIZE, chunk_x * CHUNK_SIZE}, validMiniSize); // Find free chunk stream std::size_t chunkNr = virtualHostList.size(); - if(virtualHostList.size() < CHUNK_COUNT) + if (virtualHostList.size() < CHUNK_COUNT) virtualHostList.push_back({virtualHost, validMiniSize}); else { bool notFound = true; - while(notFound) + while (notFound) { auto chunkIt = virtualHostList.begin(); - for(chunkNr = 0; chunkNr < CHUNK_COUNT; ++chunkNr) + for (chunkNr = 0; chunkNr < CHUNK_COUNT; ++chunkNr) { - if(alpaka::queue::empty(queue[chunkNr])) + if (alpaka::queue::empty(queue[chunkNr])) { // Copy data back LLAMA_INDEPENDENT_DATA - for(std::size_t y = 0; - y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; - ++y) + for (std::size_t y = 0; y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; ++y) { LLAMA_INDEPENDENT_DATA - for(std::size_t x = 0; - x < chunkIt->validMiniSize[1] - - 2 * KERNEL_SIZE; - ++x) - chunkIt->virtualHost( - y + KERNEL_SIZE, x + KERNEL_SIZE) - = hostChunkView[chunkNr]( - y + KERNEL_SIZE, x + KERNEL_SIZE); + for (std::size_t x = 0; x < chunkIt->validMiniSize[1] - 2 * KERNEL_SIZE; ++x) + chunkIt->virtualHost(y + KERNEL_SIZE, x + KERNEL_SIZE) + = hostChunkView[chunkNr](y + KERNEL_SIZE, x + KERNEL_SIZE); } chunkIt = virtualHostList.erase(chunkIt); - virtualHostList.insert( - chunkIt, {virtualHost, validMiniSize}); + virtualHostList.insert(chunkIt, {virtualHost, validMiniSize}); notFound = false; break; } chunkIt++; } - if(notFound) - std::this_thread::sleep_for( - std::chrono::microseconds{1}); + if (notFound) + std::this_thread::sleep_for(std::chrono::microseconds {1}); } } // Copy data from virtual view to mini view - for(std::size_t y = 0; y < validMiniSize[0]; ++y) + for (std::size_t y = 0; y < validMiniSize[0]; ++y) { LLAMA_INDEPENDENT_DATA - for(std::size_t x = 0; x < validMiniSize[1]; ++x) + for (std::size_t x = 0; x < validMiniSize[1]; ++x) hostChunkView[chunkNr](y, x) = virtualHost(y, x); } - alpaka::mem::view::copy( - queue[chunkNr], - devOldBuffer[chunkNr], - hostChunkBuffer[chunkNr], - devBufferSize); + alpaka::mem::view::copy(queue[chunkNr], devOldBuffer[chunkNr], hostChunkBuffer[chunkNr], devBufferSize); alpaka::kernel::exec( queue[chunkNr], workdiv, - BlurKernel{}, + BlurKernel {}, devOldView[chunkNr], devNewView[chunkNr]); - alpaka::mem::view::copy( - queue[chunkNr], - hostChunkBuffer[chunkNr], - devNewBuffer[chunkNr], - devBufferSize); + alpaka::mem::view::copy(queue[chunkNr], hostChunkBuffer[chunkNr], devNewBuffer[chunkNr], devBufferSize); } // Wait for not finished tasks on accelerator auto chunkIt = virtualHostList.begin(); - for(std::size_t chunkNr = 0; chunkNr < CHUNK_COUNT; ++chunkNr) + for (std::size_t chunkNr = 0; chunkNr < CHUNK_COUNT; ++chunkNr) { alpaka::wait::wait(queue[chunkNr]); // Copy data back - for(std::size_t y = 0; y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; - ++y) + for (std::size_t y = 0; y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; ++y) { LLAMA_INDEPENDENT_DATA - for(std::size_t x = 0; - x < chunkIt->validMiniSize[1] - 2 * KERNEL_SIZE; - ++x) + for (std::size_t x = 0; x < chunkIt->validMiniSize[1] - 2 * KERNEL_SIZE; ++x) chunkIt->virtualHost(y + KERNEL_SIZE, x + KERNEL_SIZE) = hostChunkView[chunkNr](y + KERNEL_SIZE, x + KERNEL_SIZE); } @@ -462,20 +381,17 @@ int main(int argc, char ** argv) } chrono.printAndReset("Blur kernel"); - if(SAVE) + if (SAVE) { - for(std::size_t y = 0; y < img_y; ++y) + for (std::size_t y = 0; y < img_y; ++y) { LLAMA_INDEPENDENT_DATA - for(std::size_t x = 0; x < img_x; ++x) + for (std::size_t x = 0; x < img_x; ++x) { - auto * pixel = &image[(y * img_x + x) * 3]; - pixel[0] = hostView(y + KERNEL_SIZE, x + KERNEL_SIZE)(tag::R()) - * 255.; - pixel[1] = hostView(y + KERNEL_SIZE, x + KERNEL_SIZE)(tag::G()) - * 255.; - pixel[2] = hostView(y + KERNEL_SIZE, x + KERNEL_SIZE)(tag::B()) - * 255.; + auto* pixel = &image[(y * img_x + x) * 3]; + pixel[0] = hostView(y + KERNEL_SIZE, x + KERNEL_SIZE)(tag::R()) * 255.; + pixel[1] = hostView(y + KERNEL_SIZE, x + KERNEL_SIZE)(tag::G()) * 255.; + pixel[2] = hostView(y + KERNEL_SIZE, x + KERNEL_SIZE)(tag::B()) * 255.; } } stbi_write_png(out_filename.c_str(), img_x, img_y, 3, image.data(), 0); diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp index 62599112e8..3e1f94b408 100644 --- a/examples/alpaka/nbody/nbody.cpp +++ b/examples/alpaka/nbody/nbody.cpp @@ -16,13 +16,10 @@ #include #include -constexpr auto MAPPING - = 0; /// 0 native AoS, 1 native SoA, 2 tree AoS, 3 tree SoA -constexpr auto USE_SHARED - = true; ///< defines whether shared memory shall be used -constexpr auto USE_SHARED_TREE - = true; ///< defines whether the shared memory shall use tree mapping or - ///< native mapping +constexpr auto MAPPING = 0; /// 0 native AoS, 1 native SoA, 2 tree AoS, 3 tree SoA +constexpr auto USE_SHARED = true; ///< defines whether shared memory shall be used +constexpr auto USE_SHARED_TREE = true; ///< defines whether the shared memory shall use tree mapping or + ///< native mapping constexpr auto PROBLEM_SIZE = 16 * 1024; ///< total number of particles constexpr auto BLOCK_SIZE = 256; ///< number of elements per block @@ -56,15 +53,13 @@ using Particle = llama::DS< /// Helper function for particle particle interaction. Gets two virtual /// datums like they are real particle objects -template -LLAMA_FN_HOST_ACC_INLINE void -pPInteraction(VirtualDatum1 p1, VirtualDatum2 p2, FP ts) +template +LLAMA_FN_HOST_ACC_INLINE void pPInteraction(VirtualDatum1 p1, VirtualDatum2 p2, FP ts) { // Creating tempory virtual datum object for distance on stack: auto distance = p1(tag::Pos()) + p2(tag::Pos()); distance *= distance; // square for each element - const FP distSqr - = EPS2 + distance(tag::X()) + distance(tag::Y()) + distance(tag::Z()); + const FP distSqr = EPS2 + distance(tag::X()) + distance(tag::Y()) + distance(tag::Z()); const FP distSixth = distSqr * distSqr * distSqr; const FP invDistCube = 1.0f / std::sqrt(distSixth); const FP s = p2(tag::Mass()) * invDistCube; @@ -74,81 +69,68 @@ pPInteraction(VirtualDatum1 p1, VirtualDatum2 p2, FP ts) /// Alpaka kernel for updating the speed of every particle based on the /// distance and mass to each other particle. Has complexity O(N²). -template +template struct UpdateKernel { - template - LLAMA_FN_HOST_ACC_INLINE void - operator()(const Acc & acc, View particles, FP ts) const + template + LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const { [[maybe_unused]] auto sharedView = [&] { - if constexpr(USE_SHARED) + if constexpr (USE_SHARED) { const auto sharedMapping = [&] { - if constexpr(USE_SHARED_TREE) - return llama::mapping::tree::Mapping{ - typename View::UserDomain{BlockSize}, - llama::Tuple{ - llama::mapping::tree::functor::LeafOnlyRT()}, - typename View::DatumDomain{}}; + if constexpr (USE_SHARED_TREE) + return llama::mapping::tree::Mapping { + typename View::UserDomain {BlockSize}, + llama::Tuple {llama::mapping::tree::functor::LeafOnlyRT()}, + typename View::DatumDomain {}}; else - return llama::mapping::SoA{ - typename View::UserDomain{BlockSize}, - typename View::DatumDomain{}}; + return llama::mapping::SoA { + typename View::UserDomain {BlockSize}, + typename View::DatumDomain {}}; }(); // if there is only 1 thread per block, avoid using shared // memory - if constexpr(BlockSize / Elems == 1) - return llama::allocViewStack< - View::UserDomain::rank, - typename View::DatumDomain>(); + if constexpr (BlockSize / Elems == 1) + return llama::allocViewStack(); else { - constexpr auto sharedMemSize - = llama::sizeOf * BlockSize; - auto & sharedMem = alpaka::block::shared::st:: - allocVar(acc); - return llama::View{ - sharedMapping, llama::Array{&sharedMem[0]}}; + constexpr auto sharedMemSize = llama::sizeOf * BlockSize; + auto& sharedMem = alpaka::block::shared::st::allocVar(acc); + return llama::View {sharedMapping, llama::Array {&sharedMem[0]}}; } } else - return int{}; // dummy + return int {}; // dummy }(); - const auto ti - = alpaka::idx::getIdx(acc)[0u]; - const auto tbi - = alpaka::idx::getIdx(acc)[0]; + const auto ti = alpaka::idx::getIdx(acc)[0u]; + const auto tbi = alpaka::idx::getIdx(acc)[0]; const auto start = ti * Elems; const auto end = alpaka::math::min(acc, start + Elems, ProblemSize); LLAMA_INDEPENDENT_DATA - for(std::size_t b = 0; b < (ProblemSize + BlockSize - 1u) / BlockSize; - ++b) + for (std::size_t b = 0; b < (ProblemSize + BlockSize - 1u) / BlockSize; ++b) { const auto start2 = b * BlockSize; - const auto end2 - = alpaka::math::min(acc, start2 + BlockSize, ProblemSize) - - start2; - if constexpr(USE_SHARED) + const auto end2 = alpaka::math::min(acc, start2 + BlockSize, ProblemSize) - start2; + if constexpr (USE_SHARED) { LLAMA_INDEPENDENT_DATA - for(auto pos2 = decltype(end2)(0); pos2 + ti < end2; - pos2 += BlockSize / Elems) + for (auto pos2 = decltype(end2)(0); pos2 + ti < end2; pos2 += BlockSize / Elems) sharedView(pos2 + tbi) = particles(start2 + pos2 + tbi); alpaka::block::sync::syncBlockThreads(acc); } LLAMA_INDEPENDENT_DATA - for(auto pos2 = decltype(end2)(0); pos2 < end2; ++pos2) + for (auto pos2 = decltype(end2)(0); pos2 < end2; ++pos2) LLAMA_INDEPENDENT_DATA - for(auto i = start; i < end; ++i) - if constexpr(USE_SHARED) + for (auto i = start; i < end; ++i) + if constexpr (USE_SHARED) pPInteraction(particles(i), sharedView(pos2), ts); else pPInteraction(particles(i), particles(start2 + pos2), ts); - if constexpr(USE_SHARED) + if constexpr (USE_SHARED) alpaka::block::sync::syncBlockThreads(acc); } } @@ -156,26 +138,24 @@ struct UpdateKernel /// Alpaka kernel for moving each particle with its speed. Has complexity /// O(N). -template +template struct MoveKernel { - template - LLAMA_FN_HOST_ACC_INLINE void - operator()(const Acc & acc, View particles, FP ts) const + template + LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const { - const auto ti - = alpaka::idx::getIdx(acc)[0]; + const auto ti = alpaka::idx::getIdx(acc)[0]; const auto start = ti * Elems; const auto end = alpaka::math::min(acc, start + Elems, ProblemSize); LLAMA_INDEPENDENT_DATA - for(auto i = start; i < end; ++i) + for (auto i = start; i < end; ++i) particles(i)(tag::Pos()) += particles(i)(tag::Vel()) * ts; } }; -int main(int argc, char ** argv) +int main(int argc, char** argv) { using Dim = alpaka::dim::DimInt<1>; using Size = std::size_t; @@ -195,49 +175,42 @@ int main(int argc, char ** argv) // NBODY constexpr std::size_t hardwareThreads = 2; // relevant for OpenMP2Threads - using Distribution - = common::ThreadsElemsDistribution; + using Distribution = common::ThreadsElemsDistribution; constexpr std::size_t elemCount = Distribution::elemCount; constexpr std::size_t threadCount = Distribution::threadCount; constexpr FP ts = 0.0001; // LLAMA - const auto userDomain = llama::UserDomain{PROBLEM_SIZE}; + const auto userDomain = llama::UserDomain {PROBLEM_SIZE}; const auto mapping = [&] { - if constexpr(MAPPING == 0) - return llama::mapping::AoS{userDomain, Particle{}}; - if constexpr(MAPPING == 1) - return llama::mapping::SoA{userDomain, Particle{}}; - if constexpr(MAPPING == 2) - return llama::mapping::tree::Mapping{ - userDomain, llama::Tuple{}, Particle{}}; - if constexpr(MAPPING == 3) - return llama::mapping::tree::Mapping{ + if constexpr (MAPPING == 0) + return llama::mapping::AoS {userDomain, Particle {}}; + if constexpr (MAPPING == 1) + return llama::mapping::SoA {userDomain, Particle {}}; + if constexpr (MAPPING == 2) + return llama::mapping::tree::Mapping {userDomain, llama::Tuple {}, Particle {}}; + if constexpr (MAPPING == 3) + return llama::mapping::tree::Mapping { userDomain, - llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()}, - Particle{}}; + llama::Tuple {llama::mapping::tree::functor::LeafOnlyRT()}, + Particle {}}; }(); std::cout << PROBLEM_SIZE / 1000 << " thousand particles\n" - << PROBLEM_SIZE * llama::sizeOf / 1000 / 1000 - << "MB \n"; + << PROBLEM_SIZE * llama::sizeOf / 1000 / 1000 << "MB \n"; Chrono chrono; const auto bufferSize = Size(mapping.getBlobSize(0)); - auto hostBuffer - = alpaka::mem::buf::alloc(devHost, bufferSize); - auto accBuffer - = alpaka::mem::buf::alloc(devAcc, bufferSize); + auto hostBuffer = alpaka::mem::buf::alloc(devHost, bufferSize); + auto accBuffer = alpaka::mem::buf::alloc(devAcc, bufferSize); chrono.printAndReset("Alloc"); - auto hostView = llama::View{ - mapping, llama::Array{alpaka::mem::view::getPtrNative(hostBuffer)}}; - auto accView = llama::View{ - mapping, llama::Array{alpaka::mem::view::getPtrNative(accBuffer)}}; + auto hostView = llama::View {mapping, llama::Array {alpaka::mem::view::getPtrNative(hostBuffer)}}; + auto accView = llama::View {mapping, llama::Array {alpaka::mem::view::getPtrNative(accBuffer)}}; chrono.printAndReset("Views"); @@ -245,7 +218,7 @@ int main(int argc, char ** argv) std::mt19937_64 generator; std::normal_distribution distribution(FP(0), FP(1)); LLAMA_INDEPENDENT_DATA - for(std::size_t i = 0; i < PROBLEM_SIZE; ++i) + for (std::size_t i = 0; i < PROBLEM_SIZE; ++i) { auto temp = llama::allocVirtualDatumStack(); temp(tag::Pos(), tag::X()) = distribution(generator); @@ -266,13 +239,11 @@ int main(int argc, char ** argv) const alpaka::vec::Vec Elems(static_cast(elemCount)); const alpaka::vec::Vec threads(static_cast(threadCount)); constexpr auto innerCount = elemCount * threadCount; - const alpaka::vec::Vec blocks( - static_cast((PROBLEM_SIZE + innerCount - 1u) / innerCount)); + const alpaka::vec::Vec blocks(static_cast((PROBLEM_SIZE + innerCount - 1u) / innerCount)); - const auto workdiv - = alpaka::workdiv::WorkDivMembers{blocks, threads, Elems}; + const auto workdiv = alpaka::workdiv::WorkDivMembers {blocks, threads, Elems}; - for(std::size_t s = 0; s < STEPS; ++s) + for (std::size_t s = 0; s < STEPS; ++s) { UpdateKernel updateKernel; alpaka::kernel::exec(queue, workdiv, updateKernel, accView, ts); diff --git a/examples/alpaka/vectoradd/vectoradd.cpp b/examples/alpaka/vectoradd/vectoradd.cpp index 621948e7c2..e2345422b1 100644 --- a/examples/alpaka/vectoradd/vectoradd.cpp +++ b/examples/alpaka/vectoradd/vectoradd.cpp @@ -16,8 +16,7 @@ #include #include -constexpr auto MAPPING - = 0; /// 0 native AoS, 1 native SoA, 2 tree AoS, 3 tree SoA +constexpr auto MAPPING = 0; /// 0 native AoS, 1 native SoA, 2 tree AoS, 3 tree SoA constexpr auto PROBLEM_SIZE = 64 * 1024 * 1024; constexpr auto BLOCK_SIZE = 256; constexpr auto STEPS = 10; @@ -38,30 +37,28 @@ using Vector = llama::DS< llama::DE>; // clang-format on -template +template struct AddKernel { - template - LLAMA_FN_HOST_ACC_INLINE void - operator()(const Acc & acc, View a, View b) const + template + LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View a, View b) const { - const auto ti - = alpaka::idx::getIdx(acc)[0]; + const auto ti = alpaka::idx::getIdx(acc)[0]; const auto start = ti * Elems; const auto end = alpaka::math::min(acc, start + Elems, ProblemSize); LLAMA_INDEPENDENT_DATA - for(auto i = start; i < end; ++i) + for (auto i = start; i < end; ++i) { - a(i)(tag::X{}) += b(i)(tag::X{}); - a(i)(tag::Y{}) -= b(i)(tag::Y{}); - a(i)(tag::Z{}) *= b(i)(tag::Z{}); + a(i)(tag::X {}) += b(i)(tag::X {}); + a(i)(tag::Y {}) -= b(i)(tag::Y {}); + a(i)(tag::Z {}) *= b(i)(tag::Z {}); } } }; -int main(int argc, char ** argv) +int main(int argc, char** argv) { // ALPAKA using Dim = alpaka::dim::DimInt<1>; @@ -81,52 +78,42 @@ int main(int argc, char ** argv) Queue queue(devAcc); // LLAMA - const auto userDomain = llama::UserDomain{PROBLEM_SIZE}; + const auto userDomain = llama::UserDomain {PROBLEM_SIZE}; const auto mapping = [&] { - if constexpr(MAPPING == 0) - return llama::mapping::AoS{userDomain, Vector{}}; - if constexpr(MAPPING == 1) - return llama::mapping::SoA{userDomain, Vector{}}; - if constexpr(MAPPING == 2) - return llama::mapping::tree::Mapping{ - userDomain, llama::Tuple{}, Vector{}}; - if constexpr(MAPPING == 3) - return llama::mapping::tree::Mapping{ + if constexpr (MAPPING == 0) + return llama::mapping::AoS {userDomain, Vector {}}; + if constexpr (MAPPING == 1) + return llama::mapping::SoA {userDomain, Vector {}}; + if constexpr (MAPPING == 2) + return llama::mapping::tree::Mapping {userDomain, llama::Tuple {}, Vector {}}; + if constexpr (MAPPING == 3) + return llama::mapping::tree::Mapping { userDomain, - llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()}, - Vector{}}; + llama::Tuple {llama::mapping::tree::functor::LeafOnlyRT()}, + Vector {}}; }(); std::cout << PROBLEM_SIZE / 1000 / 1000 << " million vectors\n" - << PROBLEM_SIZE * llama::sizeOf * 2 / 1000 / 1000 - << " MB on device\n"; + << PROBLEM_SIZE * llama::sizeOf * 2 / 1000 / 1000 << " MB on device\n"; Chrono chrono; const auto bufferSize = Size(mapping.getBlobSize(0)); // allocate buffers - auto hostBufferA - = alpaka::mem::buf::alloc(devHost, bufferSize); - auto hostBufferB - = alpaka::mem::buf::alloc(devHost, bufferSize); - auto devBufferA - = alpaka::mem::buf::alloc(devAcc, bufferSize); - auto devBufferB - = alpaka::mem::buf::alloc(devAcc, bufferSize); + auto hostBufferA = alpaka::mem::buf::alloc(devHost, bufferSize); + auto hostBufferB = alpaka::mem::buf::alloc(devHost, bufferSize); + auto devBufferA = alpaka::mem::buf::alloc(devAcc, bufferSize); + auto devBufferB = alpaka::mem::buf::alloc(devAcc, bufferSize); chrono.printAndReset("Alloc"); // create LLAMA views - auto hostA = llama::View{ - mapping, llama::Array{alpaka::mem::view::getPtrNative(hostBufferA)}}; - auto hostB = llama::View{ - mapping, llama::Array{alpaka::mem::view::getPtrNative(hostBufferB)}}; - auto devA = llama::View{ - mapping, llama::Array{alpaka::mem::view::getPtrNative(devBufferA)}}; - auto devB = llama::View{ - mapping, llama::Array{alpaka::mem::view::getPtrNative(devBufferB)}}; + auto hostA = llama::View {mapping, llama::Array {alpaka::mem::view::getPtrNative(hostBufferA)}}; + auto hostB = llama::View {mapping, llama::Array {alpaka::mem::view::getPtrNative(hostBufferB)}}; + auto devA = llama::View {mapping, llama::Array {alpaka::mem::view::getPtrNative(devBufferA)}}; + auto devB = llama::View {mapping, llama::Array {alpaka::mem::view::getPtrNative(devBufferB)}}; chrono.printAndReset("Views"); @@ -134,7 +121,7 @@ int main(int argc, char ** argv) std::normal_distribution distribution(FP(0), FP(1)); auto seed = distribution(generator); LLAMA_INDEPENDENT_DATA - for(std::size_t i = 0; i < PROBLEM_SIZE; ++i) + for (std::size_t i = 0; i < PROBLEM_SIZE; ++i) { hostA(i) = seed + i; hostB(i) = seed - i; @@ -147,23 +134,19 @@ int main(int argc, char ** argv) chrono.printAndReset("Copy H->D"); constexpr std::size_t hardwareThreads = 2; // relevant for OpenMP2Threads - using Distribution - = common::ThreadsElemsDistribution; + using Distribution = common::ThreadsElemsDistribution; constexpr std::size_t elemCount = Distribution::elemCount; constexpr std::size_t threadCount = Distribution::threadCount; const alpaka::vec::Vec elems(static_cast(elemCount)); const alpaka::vec::Vec threads(static_cast(threadCount)); constexpr auto innerCount = elemCount * threadCount; - const alpaka::vec::Vec blocks( - static_cast((PROBLEM_SIZE + innerCount - 1) / innerCount)); + const alpaka::vec::Vec blocks(static_cast((PROBLEM_SIZE + innerCount - 1) / innerCount)); - const auto workdiv - = alpaka::workdiv::WorkDivMembers{blocks, threads, elems}; + const auto workdiv = alpaka::workdiv::WorkDivMembers {blocks, threads, elems}; - for(std::size_t s = 0; s < STEPS; ++s) + for (std::size_t s = 0; s < STEPS; ++s) { - alpaka::kernel::exec( - queue, workdiv, AddKernel{}, devA, devB); + alpaka::kernel::exec(queue, workdiv, AddKernel {}, devA, devB); chrono.printAndReset("Add kernel"); } diff --git a/examples/common/Chrono.hpp b/examples/common/Chrono.hpp index 83e96820d1..67b31f1ec4 100644 --- a/examples/common/Chrono.hpp +++ b/examples/common/Chrono.hpp @@ -6,7 +6,9 @@ struct Chrono { - Chrono() : last(std::chrono::system_clock::now()) {} + Chrono() : last(std::chrono::system_clock::now()) + { + } void printAndReset(std::string eventName) { diff --git a/examples/common/alpakaHelpers.hpp b/examples/common/alpakaHelpers.hpp index b2d6ee2cda..215beea131 100644 --- a/examples/common/alpakaHelpers.hpp +++ b/examples/common/alpakaHelpers.hpp @@ -8,8 +8,8 @@ #pragma once -#include #include +#include namespace common { @@ -18,7 +18,7 @@ namespace common /** Returns a good guess for an optimal number of threads and elements in a * block based on the total number of elements in the block. */ - template + template struct ThreadsElemsDistribution { /// number of elements per thread @@ -28,37 +28,21 @@ namespace common }; #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED - template< - std::size_t blockSize, - std::size_t hardwareThreads, - typename T_Dim, - typename T_Size> - struct ThreadsElemsDistribution< - alpaka::acc::AccGpuCudaRt, - blockSize, - hardwareThreads> + template + struct ThreadsElemsDistribution, blockSize, hardwareThreads> { static constexpr std::size_t elemCount = THREADELEMDIST_MIN_ELEM; - static constexpr std::size_t threadCount - = blockSize / THREADELEMDIST_MIN_ELEM; + static constexpr std::size_t threadCount = blockSize / THREADELEMDIST_MIN_ELEM; }; #endif #ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED - template< - std::size_t blockSize, - std::size_t hardwareThreads, - typename T_Dim, - typename T_Size> - struct ThreadsElemsDistribution< - alpaka::acc::AccCpuOmp2Threads, - blockSize, - hardwareThreads> + template + struct ThreadsElemsDistribution, blockSize, hardwareThreads> { - static constexpr std::size_t elemCount - = (blockSize + hardwareThreads - 1u) / hardwareThreads; + static constexpr std::size_t elemCount = (blockSize + hardwareThreads - 1u) / hardwareThreads; static constexpr std::size_t threadCount = hardwareThreads; }; #endif -} // namspace common +} // namespace common diff --git a/examples/heatequation/heatequation.cpp b/examples/heatequation/heatequation.cpp index 56b086eefe..5f691432da 100644 --- a/examples/heatequation/heatequation.cpp +++ b/examples/heatequation/heatequation.cpp @@ -25,19 +25,12 @@ using DatumDomain = double; struct HeatEquationKernel { - template - void operator()( - uint32_t idx, - const View & uCurrBuf, - View & uNextBuf, - uint32_t extent, - double dx, - double dt) const + template + void operator()(uint32_t idx, const View& uCurrBuf, View& uNextBuf, uint32_t extent, double dx, double dt) const { const auto r = dt / (dx * dx); - if(idx > 0 && idx < extent - 1u) - uNextBuf[idx] = uCurrBuf[idx] * (1.0 - 2.0 * r) - + uCurrBuf[idx - 1] * r + uCurrBuf[idx + 1] * r; + if (idx > 0 && idx < extent - 1u) + uNextBuf[idx] = uCurrBuf[idx] * (1.0 - 2.0 * r) + uCurrBuf[idx - 1] * r + uCurrBuf[idx + 1] * r; } }; @@ -62,27 +55,25 @@ auto main() -> int const auto dt = tMax / static_cast(numTimeSteps - 1); const auto r = dt / (dx * dx); - if(r > 0.5) + if (r > 0.5) { - std::cerr << "Stability condition check failed: dt/dx^2 = " << r - << ", it is required to be <= 0.5\n"; + std::cerr << "Stability condition check failed: dt/dx^2 = " << r << ", it is required to be <= 0.5\n"; return 1; } - const auto mapping - = llama::mapping::AoS{llama::UserDomain{numNodesX}, DatumDomain{}}; + const auto mapping = llama::mapping::AoS {llama::UserDomain {numNodesX}, DatumDomain {}}; auto uNext = llama::allocView(mapping); auto uCurr = llama::allocView(mapping); // Apply initial conditions for the test problem - for(uint32_t i = 0; i < numNodesX; i++) + for (uint32_t i = 0; i < numNodesX; i++) uCurr[i] = exactSolution(i * dx, 0.0); const auto start = std::chrono::high_resolution_clock::now(); HeatEquationKernel kernel; - for(int step = 0; step < numTimeSteps; step++) + for (int step = 0; step < numTimeSteps; step++) { - for(auto i = 0; i < numNodesX; i++) + for (auto i = 0; i < numNodesX; i++) kernel(i, uCurr, uNext, numNodesX, dx, dt); // We assume the boundary conditions are constant and so these values @@ -90,29 +81,26 @@ auto main() -> int std::swap(uNext, uCurr); } const auto end = std::chrono::high_resolution_clock::now(); - std::cout << "Runtime: " - << std::chrono::duration(end - start).count() << "s\n"; + std::cout << "Runtime: " << std::chrono::duration(end - start).count() << "s\n"; // Calculate error double maxError = 0.0; - for(uint32_t i = 0; i < numNodesX; i++) + for (uint32_t i = 0; i < numNodesX; i++) { - const auto error - = std::abs(uNext[i] - exactSolution(i * dx, tMax)); + const auto error = std::abs(uNext[i] - exactSolution(i * dx, tMax)); maxError = std::max(maxError, error); } const auto errorThreshold = 1e-5; const auto resultCorrect = (maxError < errorThreshold); - if(resultCorrect) + if (resultCorrect) { std::cout << "Execution results correct!\n"; return 0; } else { - std::cout << "Execution results incorrect: error = " << maxError - << " (the grid resolution may be too low)\n"; + std::cout << "Execution results incorrect: error = " << maxError << " (the grid resolution may be too low)\n"; return 2; } } diff --git a/examples/nbody/nbody.cpp b/examples/nbody/nbody.cpp index 0db7af7fe6..804e7d6b85 100644 --- a/examples/nbody/nbody.cpp +++ b/examples/nbody/nbody.cpp @@ -7,8 +7,7 @@ // needs -fno-math-errno, so std::sqrt() can be vectorized -constexpr auto MAPPING - = 1; ///< 0 native AoS, 1 native SoA, 2 tree AoS, 3 tree SoA +constexpr auto MAPPING = 1; ///< 0 native AoS, 1 native SoA, 2 tree AoS, 3 tree SoA constexpr auto PROBLEM_SIZE = 16 * 1024; ///< total number of particles constexpr auto STEPS = 5; ///< number of steps to calculate constexpr auto TRACE = false; @@ -44,63 +43,60 @@ namespace usellama >; // clang-format on - template - LLAMA_FN_HOST_ACC_INLINE void - pPInteraction(VirtualParticle p1, VirtualParticle p2, FP ts) + template + LLAMA_FN_HOST_ACC_INLINE void pPInteraction(VirtualParticle p1, VirtualParticle p2, FP ts) { - auto dist = p1(tag::Pos{}) + p2(tag::Pos{}); + auto dist = p1(tag::Pos {}) + p2(tag::Pos {}); dist *= dist; - const FP distSqr - = EPS2 + dist(tag::X{}) + dist(tag::Y{}) + dist(tag::Z{}); + const FP distSqr = EPS2 + dist(tag::X {}) + dist(tag::Y {}) + dist(tag::Z {}); const FP distSixth = distSqr * distSqr * distSqr; const FP invDistCube = 1.0f / std::sqrt(distSixth); - const FP s = p2(tag::Mass{}) * invDistCube; + const FP s = p2(tag::Mass {}) * invDistCube; dist *= s * ts; - p1(tag::Vel{}) += dist; + p1(tag::Vel {}) += dist; } - template - void update(View & particles, FP ts) + template + void update(View& particles, FP ts) { - for(std::size_t i = 0; i < PROBLEM_SIZE; i++) + for (std::size_t i = 0; i < PROBLEM_SIZE; i++) { LLAMA_INDEPENDENT_DATA - for(std::size_t j = 0; j < PROBLEM_SIZE; j++) + for (std::size_t j = 0; j < PROBLEM_SIZE; j++) pPInteraction(particles(j), particles(i), ts); } } - template - void move(View & particles, FP ts) + template + void move(View& particles, FP ts) { LLAMA_INDEPENDENT_DATA - for(std::size_t i = 0; i < PROBLEM_SIZE; i++) - particles(i)(tag::Pos{}) += particles(i)(tag::Vel{}) * ts; + for (std::size_t i = 0; i < PROBLEM_SIZE; i++) + particles(i)(tag::Pos {}) += particles(i)(tag::Vel {}) * ts; } - int main(int argc, char ** argv) + int main(int argc, char** argv) { constexpr FP ts = 0.0001f; - const auto userDomain = llama::UserDomain{PROBLEM_SIZE}; + const auto userDomain = llama::UserDomain {PROBLEM_SIZE}; auto mapping = [&] { - if constexpr(MAPPING == 0) - return llama::mapping::AoS{userDomain, Particle{}}; - if constexpr(MAPPING == 1) - return llama::mapping::SoA{userDomain, Particle{}}; - if constexpr(MAPPING == 2) - return llama::mapping::tree::Mapping{ - userDomain, llama::Tuple{}, Particle{}}; - if constexpr(MAPPING == 3) - return llama::mapping::tree::Mapping{ + if constexpr (MAPPING == 0) + return llama::mapping::AoS {userDomain, Particle {}}; + if constexpr (MAPPING == 1) + return llama::mapping::SoA {userDomain, Particle {}}; + if constexpr (MAPPING == 2) + return llama::mapping::tree::Mapping {userDomain, llama::Tuple {}, Particle {}}; + if constexpr (MAPPING == 3) + return llama::mapping::tree::Mapping { userDomain, - llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()}, - Particle{}}; + llama::Tuple {llama::mapping::tree::functor::LeafOnlyRT()}, + Particle {}}; }(); auto tmapping = [&] { - if constexpr(TRACE) - return llama::mapping::Trace{std::move(mapping)}; + if constexpr (TRACE) + return llama::mapping::Trace {std::move(mapping)}; else return std::move(mapping); }(); @@ -111,56 +107,48 @@ namespace usellama const auto start = std::chrono::high_resolution_clock::now(); const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "alloc took " - << std::chrono::duration(stop - start).count() - << "s\n"; + std::cout << "alloc took " << std::chrono::duration(stop - start).count() << "s\n"; { const auto start = std::chrono::high_resolution_clock::now(); std::default_random_engine engine; std::normal_distribution dist(FP(0), FP(1)); - for(std::size_t i = 0; i < PROBLEM_SIZE; ++i) + for (std::size_t i = 0; i < PROBLEM_SIZE; ++i) { auto p = particles(i); - p(tag::Pos{}, tag::X{}) = dist(engine); - p(tag::Pos{}, tag::Y{}) = dist(engine); - p(tag::Pos{}, tag::Z{}) = dist(engine); - p(tag::Vel{}, tag::X{}) = dist(engine) / FP(10); - p(tag::Vel{}, tag::Y{}) = dist(engine) / FP(10); - p(tag::Vel{}, tag::Z{}) = dist(engine) / FP(10); - p(tag::Mass{}) = dist(engine) / FP(100); + p(tag::Pos {}, tag::X {}) = dist(engine); + p(tag::Pos {}, tag::Y {}) = dist(engine); + p(tag::Pos {}, tag::Z {}) = dist(engine); + p(tag::Vel {}, tag::X {}) = dist(engine) / FP(10); + p(tag::Vel {}, tag::Y {}) = dist(engine) / FP(10); + p(tag::Vel {}, tag::Z {}) = dist(engine) / FP(10); + p(tag::Mass {}) = dist(engine) / FP(100); } const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "init took " - << std::chrono::duration(stop - start).count() - << "s\n"; + std::cout << "init took " << std::chrono::duration(stop - start).count() << "s\n"; } - for(std::size_t s = 0; s < STEPS; ++s) + for (std::size_t s = 0; s < STEPS; ++s) { { const auto start = std::chrono::high_resolution_clock::now(); update(particles, ts); const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "update took " - << std::chrono::duration(stop - start).count() - << "s\t"; + std::cout << "update took " << std::chrono::duration(stop - start).count() << "s\t"; } { const auto start = std::chrono::high_resolution_clock::now(); move(particles, ts); const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "move took " - << std::chrono::duration(stop - start).count() - << "s\n"; + std::cout << "move took " << std::chrono::duration(stop - start).count() << "s\n"; } } return 0; } -} +} // namespace usellama namespace manualAoS { @@ -170,7 +158,7 @@ namespace manualAoS FP y; FP z; - auto operator*=(FP s) -> Vec & + auto operator*=(FP s) -> Vec& { x *= s; y *= s; @@ -178,7 +166,7 @@ namespace manualAoS return *this; } - auto operator*=(Vec v) -> Vec & + auto operator*=(Vec v) -> Vec& { x *= v.x; y *= v.y; @@ -186,7 +174,7 @@ namespace manualAoS return *this; } - auto operator+=(Vec v) -> Vec & + auto operator+=(Vec v) -> Vec& { x += v.x; y += v.y; @@ -218,7 +206,7 @@ namespace manualAoS FP mass; }; - inline void pPInteraction(Particle & p1, const Particle & p2, FP ts) + inline void pPInteraction(Particle& p1, const Particle& p2, FP ts) { auto distance = p1.pos + p2.pos; distance *= distance; @@ -230,24 +218,24 @@ namespace manualAoS p1.vel += distance; } - void update(Particle * particles, FP ts) + void update(Particle* particles, FP ts) { - for(std::size_t i = 0; i < PROBLEM_SIZE; i++) + for (std::size_t i = 0; i < PROBLEM_SIZE; i++) { LLAMA_INDEPENDENT_DATA - for(std::size_t j = 0; j < PROBLEM_SIZE; j++) + for (std::size_t j = 0; j < PROBLEM_SIZE; j++) pPInteraction(particles[j], particles[i], ts); } } - void move(Particle * particles, FP ts) + void move(Particle* particles, FP ts) { LLAMA_INDEPENDENT_DATA - for(std::size_t i = 0; i < PROBLEM_SIZE; i++) + for (std::size_t i = 0; i < PROBLEM_SIZE; i++) particles[i].pos += particles[i].vel * ts; } - int main(int argc, char ** argv) + int main(int argc, char** argv) { constexpr FP ts = 0.0001f; @@ -256,16 +244,14 @@ namespace manualAoS const auto start = std::chrono::high_resolution_clock::now(); std::vector particles(PROBLEM_SIZE); const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "alloc took " - << std::chrono::duration(stop - start).count() - << "s\n"; + std::cout << "alloc took " << std::chrono::duration(stop - start).count() << "s\n"; { const auto start = std::chrono::high_resolution_clock::now(); std::default_random_engine engine; std::normal_distribution dist(FP(0), FP(1)); - for(auto & p : particles) + for (auto& p : particles) { p.pos.x = dist(engine); p.pos.y = dist(engine); @@ -277,34 +263,28 @@ namespace manualAoS } const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "init took " - << std::chrono::duration(stop - start).count() - << "s\n"; + std::cout << "init took " << std::chrono::duration(stop - start).count() << "s\n"; } - for(std::size_t s = 0; s < STEPS; ++s) + for (std::size_t s = 0; s < STEPS; ++s) { { const auto start = std::chrono::high_resolution_clock::now(); update(particles.data(), ts); const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "update took " - << std::chrono::duration(stop - start).count() - << "s\t"; + std::cout << "update took " << std::chrono::duration(stop - start).count() << "s\t"; } { const auto start = std::chrono::high_resolution_clock::now(); move(particles.data(), ts); const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "move took " - << std::chrono::duration(stop - start).count() - << "s\n"; + std::cout << "move took " << std::chrono::duration(stop - start).count() << "s\n"; } } return 0; } -} +} // namespace manualAoS namespace manualSoA { @@ -312,9 +292,9 @@ namespace manualSoA FP p1posx, FP p1posy, FP p1posz, - FP & p1velx, - FP & p1vely, - FP & p1velz, + FP& p1velx, + FP& p1vely, + FP& p1velz, FP p2posx, FP p2posy, FP p2posz, @@ -339,20 +319,12 @@ namespace manualSoA p1velz += zdistance; } - void update( - FP * posx, - FP * posy, - FP * posz, - FP * velx, - FP * vely, - FP * velz, - FP * mass, - FP ts) + void update(FP* posx, FP* posy, FP* posz, FP* velx, FP* vely, FP* velz, FP* mass, FP ts) { - for(std::size_t i = 0; i < PROBLEM_SIZE; i++) + for (std::size_t i = 0; i < PROBLEM_SIZE; i++) { LLAMA_INDEPENDENT_DATA - for(std::size_t j = 0; j < PROBLEM_SIZE; j++) + for (std::size_t j = 0; j < PROBLEM_SIZE; j++) pPInteraction( posx[j], posy[j], @@ -368,18 +340,10 @@ namespace manualSoA } } - void move( - FP * posx, - FP * posy, - FP * posz, - FP * velx, - FP * vely, - FP * velz, - FP * mass, - FP ts) + void move(FP* posx, FP* posy, FP* posz, FP* velx, FP* vely, FP* velz, FP* mass, FP ts) { LLAMA_INDEPENDENT_DATA - for(std::size_t i = 0; i < PROBLEM_SIZE; i++) + for (std::size_t i = 0; i < PROBLEM_SIZE; i++) { posx[i] += velx[i] * ts; posy[i] += vely[i] * ts; @@ -387,7 +351,7 @@ namespace manualSoA } } - int main(int argc, char ** argv) + int main(int argc, char** argv) { constexpr FP ts = 0.0001f; @@ -402,16 +366,14 @@ namespace manualSoA std::vector velz(PROBLEM_SIZE); std::vector mass(PROBLEM_SIZE); const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "alloc took " - << std::chrono::duration(stop - start).count() - << "s\n"; + std::cout << "alloc took " << std::chrono::duration(stop - start).count() << "s\n"; { const auto start = std::chrono::high_resolution_clock::now(); std::default_random_engine engine; std::normal_distribution dist(FP(0), FP(1)); - for(std::size_t i = 0; i < PROBLEM_SIZE; ++i) + for (std::size_t i = 0; i < PROBLEM_SIZE; ++i) { posx[i] = dist(engine); posy[i] = dist(engine); @@ -423,52 +385,30 @@ namespace manualSoA } const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "init took " - << std::chrono::duration(stop - start).count() - << "s\n"; + std::cout << "init took " << std::chrono::duration(stop - start).count() << "s\n"; } - for(std::size_t s = 0; s < STEPS; ++s) + for (std::size_t s = 0; s < STEPS; ++s) { { const auto start = std::chrono::high_resolution_clock::now(); - update( - posx.data(), - posy.data(), - posz.data(), - velx.data(), - vely.data(), - velz.data(), - mass.data(), - ts); + update(posx.data(), posy.data(), posz.data(), velx.data(), vely.data(), velz.data(), mass.data(), ts); const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "update took " - << std::chrono::duration(stop - start).count() - << "s\t"; + std::cout << "update took " << std::chrono::duration(stop - start).count() << "s\t"; } { const auto start = std::chrono::high_resolution_clock::now(); - move( - posx.data(), - posy.data(), - posz.data(), - velx.data(), - vely.data(), - velz.data(), - mass.data(), - ts); + move(posx.data(), posy.data(), posz.data(), velx.data(), vely.data(), velz.data(), mass.data(), ts); const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "move took " - << std::chrono::duration(stop - start).count() - << "s\n"; + std::cout << "move took " << std::chrono::duration(stop - start).count() << "s\n"; } } return 0; } -} +} // namespace manualSoA -int main(int argc, char ** argv) +int main(int argc, char** argv) { int r = 0; r += usellama::main(argc, argv); diff --git a/examples/simpletest/simpletest.cpp b/examples/simpletest/simpletest.cpp index 4d6445a657..1a0f244e63 100644 --- a/examples/simpletest/simpletest.cpp +++ b/examples/simpletest/simpletest.cpp @@ -57,32 +57,32 @@ using Name = llama::DS< namespace { - template - std::string type(const T & t) + template + std::string type(const T& t) { return boost::core::demangle(typeid(t).name()); } /// Prints the coordinates of a given \ref llama::DatumCoord for debugging /// and testing purposes - template + template void printCoords(llama::DatumCoord dc) { (std::cout << ... << T_coords); } - template - void split(const std::string & s, char delim, Out result) + template + void split(const std::string& s, char delim, Out result) { std::stringstream ss(s); std::string item; - while(std::getline(ss, item, delim)) + while (std::getline(ss, item, delim)) { *(result++) = item; } } - std::vector split(const std::string & s, char delim) + std::vector split(const std::string& s, char delim) { std::vector elems; split(s, delim, std::back_inserter(elems)); @@ -92,7 +92,8 @@ namespace std::string nSpaces(int n) { std::string result = ""; - for(int i = 0; i < n; ++i) result += " "; + for (int i = 0; i < n; ++i) + result += " "; return result; } @@ -105,24 +106,24 @@ namespace auto tokens = split(raw, '\n'); std::string result = ""; int indent = 0; - for(auto t : tokens) + for (auto t : tokens) { - if(t.back() == '>' || (t.length() > 1 && t[t.length() - 2] == '>')) + if (t.back() == '>' || (t.length() > 1 && t[t.length() - 2] == '>')) indent -= 4; result += nSpaces(indent) + t + "\n"; - if(t.back() == '<') + if (t.back() == '<') indent += 4; } return result; } -} +} // namespace /// Example functor for \ref llama::forEach which can also be used to print the /// coordinates inside of a datum domain when called. -template +template struct SetZeroFunctor { - template + template void operator()(Coord coord) { vd(coord) = 0; @@ -130,25 +131,19 @@ struct SetZeroFunctor VirtualDatum vd; }; -int main(int argc, char ** argv) +int main(int argc, char** argv) { // Defining a two-dimensional user domain using UD = llama::UserDomain<2>; // Setting the run time size of the user domain to 8192 * 8192 - UD udSize{8192, 8192}; + UD udSize {8192, 8192}; // Printing the domain informations at runtime std::cout << "Datum Domain is " << addLineBreaks(type(Name())) << '\n'; std::cout << "AoS address of (0,100) <0,1>: " - << llama::mapping::AoS(udSize) - .getBlobNrAndOffset<0, 1>({0, 100}) - .offset - << '\n'; + << llama::mapping::AoS(udSize).getBlobNrAndOffset<0, 1>({0, 100}).offset << '\n'; std::cout << "SoA address of (0,100) <0,1>: " - << llama::mapping::SoA(udSize) - .getBlobNrAndOffset<0, 1>({0, 100}) - .offset - << '\n'; + << llama::mapping::SoA(udSize).getBlobNrAndOffset<0, 1>({0, 100}).offset << '\n'; std::cout << "sizeOf DatumDomain: " << llama::sizeOf << '\n'; std::cout << type(llama::GetCoordFromTags()) << '\n'; @@ -162,61 +157,57 @@ int main(int argc, char ** argv) auto view = allocView(mapping); // defining a position in the user domain - const UD pos{0, 0}; + const UD pos {0, 0}; st::Options Options_; - const auto Weight_ = st::Weight{}; + const auto Weight_ = st::Weight {}; // using the position in the user domain and a tree coord or a uid in the // datum domain to get the reference to an element in the view - float & position_x = view(pos).access<0, 0>(); - double & momentum_z = view[pos].access(); - int & weight = view[{0, 0}](llama::DatumCoord<2>()); - int & weight_2 = view(pos)(Weight_); - bool & options_2 = view[0](st::Options())(llama::DatumCoord<2>()); - bool & options_3 = view(pos)(Options_)(llama::DatumCoord<2>()); + float& position_x = view(pos).access<0, 0>(); + double& momentum_z = view[pos].access(); + int& weight = view[{0, 0}](llama::DatumCoord<2>()); + int& weight_2 = view(pos)(Weight_); + bool& options_2 = view[0](st::Options())(llama::DatumCoord<2>()); + bool& options_3 = view(pos)(Options_)(llama::DatumCoord<2>()); // printing the address and distances of the element in the memory. This // will change based on the chosen mapping. When array of struct is chosen // instead the elements will be much closer than with struct of array. std::cout << &position_x << '\n'; - std::cout << &momentum_z << " " << (size_t)&momentum_z - (size_t)&position_x - << '\n'; - std::cout << &weight << " " << (size_t)&weight - (size_t)&momentum_z - << '\n'; - std::cout << &options_2 << " " << (size_t)&options_2 - (size_t)&weight - << '\n'; + std::cout << &momentum_z << " " << (size_t) &momentum_z - (size_t) &position_x << '\n'; + std::cout << &weight << " " << (size_t) &weight - (size_t) &momentum_z << '\n'; + std::cout << &options_2 << " " << (size_t) &options_2 - (size_t) &weight << '\n'; // iterating over the user domain at run time to do some stuff with the // allocated data - for(size_t x = 0; x < udSize[0]; ++x) + for (size_t x = 0; x < udSize[0]; ++x) // telling the compiler that all data in the following loop is // independent to each other and thus can be vectorized LLAMA_INDEPENDENT_DATA - for(size_t y = 0; y < udSize[1]; ++y) + for (size_t y = 0; y < udSize[1]; ++y) { // Defining a functor for a given virtual datum - SetZeroFunctor szf{view(x, y)}; + SetZeroFunctor szf {view(x, y)}; // Applying the functor for the sub tree 0,0 (pos.x), so basically // only for this element - llama::forEach(szf, llama::DatumCoord<0, 0>{}); + llama::forEach(szf, llama::DatumCoord<0, 0> {}); // Applying the functor for the sub tree momentum (0), so basically // for momentum.z, and momentum.x - llama::forEach(szf, st::Momentum{}); + llama::forEach(szf, st::Momentum {}); // the user domain address can be given as multiple comma separated // arguments or as one parameter of type user domain view({x, y}) = double(x + y) / double(udSize[0] + udSize[1]); } - for(size_t x = 0; x < udSize[0]; ++x) LLAMA_INDEPENDENT_DATA - for(size_t y = 0; y < udSize[1]; ++y) + for (size_t x = 0; x < udSize[0]; ++x) + LLAMA_INDEPENDENT_DATA + for (size_t y = 0; y < udSize[1]; ++y) { // Showing different options of access data with llama. Internally // all do the same data- and mappingwise auto datum = view(x, y); - datum.access() - += datum.access>(); - datum.access(st::Pos(), st::Y()) - += datum.access(llama::DatumCoord<1, 1>()); + datum.access() += datum.access>(); + datum.access(st::Pos(), st::Y()) += datum.access(llama::DatumCoord<1, 1>()); datum(st::Pos(), st::Z()) += datum(llama::DatumCoord<2>()); // It is also possible to work only on a part of data. @@ -224,8 +215,10 @@ int main(int argc, char ** argv) } double sum = 0.0; LLAMA_INDEPENDENT_DATA - for(size_t x = 0; x < udSize[0]; ++x) LLAMA_INDEPENDENT_DATA - for(size_t y = 0; y < udSize[1]; ++y) sum += view(x, y).access<1, 0>(); + for (size_t x = 0; x < udSize[0]; ++x) + LLAMA_INDEPENDENT_DATA + for (size_t y = 0; y < udSize[1]; ++y) + sum += view(x, y).access<1, 0>(); std::cout << "Sum: " << sum << '\n'; return 0; diff --git a/examples/vectoradd/vectoradd.cpp b/examples/vectoradd/vectoradd.cpp index 27b18850c8..6a9f0ec584 100644 --- a/examples/vectoradd/vectoradd.cpp +++ b/examples/vectoradd/vectoradd.cpp @@ -3,8 +3,7 @@ #include #include -constexpr auto MAPPING - = 3; /// 0 native AoS, 1 native SoA, 2 tree AoS, 3 tree SoA +constexpr auto MAPPING = 3; /// 0 native AoS, 1 native SoA, 2 tree AoS, 3 tree SoA constexpr auto PROBLEM_SIZE = 64 * 1024 * 1024; ///< problem size constexpr auto STEPS = 10; ///< number of vector adds to perform @@ -27,35 +26,34 @@ namespace usellama >; // clang-format on - template - void add(const View & a, const View & b, View & c) + template + void add(const View& a, const View& b, View& c) { LLAMA_INDEPENDENT_DATA - for(std::size_t i = 0; i < PROBLEM_SIZE; i++) + for (std::size_t i = 0; i < PROBLEM_SIZE; i++) { - c(i)(tag::X{}) = a(i)(tag::X{}) + b(i)(tag::X{}); - c(i)(tag::Y{}) = a(i)(tag::Y{}) - b(i)(tag::Y{}); - c(i)(tag::Z{}) = a(i)(tag::Z{}) * b(i)(tag::Z{}); + c(i)(tag::X {}) = a(i)(tag::X {}) + b(i)(tag::X {}); + c(i)(tag::Y {}) = a(i)(tag::Y {}) - b(i)(tag::Y {}); + c(i)(tag::Z {}) = a(i)(tag::Z {}) * b(i)(tag::Z {}); } } - int main(int argc, char ** argv) + int main(int argc, char** argv) { - const auto userDomain = llama::UserDomain{PROBLEM_SIZE}; + const auto userDomain = llama::UserDomain {PROBLEM_SIZE}; const auto mapping = [&] { - if constexpr(MAPPING == 0) - return llama::mapping::AoS{userDomain, Vector{}}; - if constexpr(MAPPING == 1) - return llama::mapping::SoA{userDomain, Vector{}}; - if constexpr(MAPPING == 2) - return llama::mapping::tree::Mapping{ - userDomain, llama::Tuple{}, Vector{}}; - if constexpr(MAPPING == 3) - return llama::mapping::tree::Mapping{ + if constexpr (MAPPING == 0) + return llama::mapping::AoS {userDomain, Vector {}}; + if constexpr (MAPPING == 1) + return llama::mapping::SoA {userDomain, Vector {}}; + if constexpr (MAPPING == 2) + return llama::mapping::tree::Mapping {userDomain, llama::Tuple {}, Vector {}}; + if constexpr (MAPPING == 3) + return llama::mapping::tree::Mapping { userDomain, - llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()}, - Vector{}}; + llama::Tuple {llama::mapping::tree::functor::LeafOnlyRT()}, + Vector {}}; }(); std::cout << PROBLEM_SIZE / 1000 / 1000 << " million vectors LLAMA\n"; @@ -67,32 +65,28 @@ namespace usellama const auto start = std::chrono::high_resolution_clock::now(); LLAMA_INDEPENDENT_DATA - for(std::size_t i = 0; i < PROBLEM_SIZE; ++i) + for (std::size_t i = 0; i < PROBLEM_SIZE; ++i) { - a[i](tag::X{}) = i; // X + a[i](tag::X {}) = i; // X a[i].access() = i; // Y a[i].access<2>() = i; // Z b(i) = i; // writes to all (X, Y, Z) } const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "init took " - << std::chrono::duration(stop - start).count() - << "s\n"; + std::cout << "init took " << std::chrono::duration(stop - start).count() << "s\n"; - for(std::size_t s = 0; s < STEPS; ++s) + for (std::size_t s = 0; s < STEPS; ++s) { const auto start = std::chrono::high_resolution_clock::now(); add(a, b, c); const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "add took " - << std::chrono::duration(stop - start).count() - << "s\n"; + std::cout << "add took " << std::chrono::duration(stop - start).count() << "s\n"; } - return (int)c.storageBlobs[0][0]; + return (int) c.storageBlobs[0][0]; } -} +} // namespace usellama namespace manualAoS { @@ -103,10 +97,10 @@ namespace manualAoS FP z; }; - inline void add(const Vector * a, const Vector * b, Vector * c) + inline void add(const Vector* a, const Vector* b, Vector* c) { LLAMA_INDEPENDENT_DATA - for(std::size_t i = 0; i < PROBLEM_SIZE; i++) + for (std::size_t i = 0; i < PROBLEM_SIZE; i++) { c[i].x = a[i].x + b[i].x; c[i].y = a[i].y - b[i].y; @@ -114,7 +108,7 @@ namespace manualAoS } } - int main(int argc, char ** argv) + int main(int argc, char** argv) { std::cout << PROBLEM_SIZE / 1000 / 1000 << " million vectors AoS\n"; @@ -125,7 +119,7 @@ namespace manualAoS const auto start = std::chrono::high_resolution_clock::now(); LLAMA_INDEPENDENT_DATA - for(std::size_t i = 0; i < PROBLEM_SIZE; ++i) + for (std::size_t i = 0; i < PROBLEM_SIZE; ++i) { a[i].x = i; a[i].y = i; @@ -136,39 +130,35 @@ namespace manualAoS } const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "init took " - << std::chrono::duration(stop - start).count() - << "s\n"; + std::cout << "init took " << std::chrono::duration(stop - start).count() << "s\n"; - for(std::size_t s = 0; s < STEPS; ++s) + for (std::size_t s = 0; s < STEPS; ++s) { const auto start = std::chrono::high_resolution_clock::now(); add(a.data(), b.data(), c.data()); const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "add took " - << std::chrono::duration(stop - start).count() - << "s\n"; + std::cout << "add took " << std::chrono::duration(stop - start).count() << "s\n"; } return c[0].x; } -} +} // namespace manualAoS namespace manualSoA { - inline void - add(const FP * ax, - const FP * ay, - const FP * az, - const FP * bx, - const FP * by, - const FP * bz, - FP * cx, - FP * cy, - FP * cz) + inline void add( + const FP* ax, + const FP* ay, + const FP* az, + const FP* bx, + const FP* by, + const FP* bz, + FP* cx, + FP* cy, + FP* cz) { LLAMA_INDEPENDENT_DATA - for(std::size_t i = 0; i < PROBLEM_SIZE; i++) + for (std::size_t i = 0; i < PROBLEM_SIZE; i++) { cx[i] = ax[i] + bx[i]; cy[i] = ay[i] - by[i]; @@ -176,7 +166,7 @@ namespace manualSoA } } - int main(int argc, char ** argv) + int main(int argc, char** argv) { std::cout << PROBLEM_SIZE / 1000 / 1000 << " million vectors SoA\n"; @@ -193,7 +183,7 @@ namespace manualSoA const auto start = std::chrono::high_resolution_clock::now(); LLAMA_INDEPENDENT_DATA - for(std::size_t i = 0; i < PROBLEM_SIZE; ++i) + for (std::size_t i = 0; i < PROBLEM_SIZE; ++i) { ax[i] = i; ay[i] = i; @@ -204,33 +194,21 @@ namespace manualSoA } const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "init took " - << std::chrono::duration(stop - start).count() - << "s\n"; + std::cout << "init took " << std::chrono::duration(stop - start).count() << "s\n"; - for(std::size_t s = 0; s < STEPS; ++s) + for (std::size_t s = 0; s < STEPS; ++s) { const auto start = std::chrono::high_resolution_clock::now(); - add(ax.data(), - ay.data(), - az.data(), - bx.data(), - by.data(), - bz.data(), - cx.data(), - cy.data(), - cz.data()); + add(ax.data(), ay.data(), az.data(), bx.data(), by.data(), bz.data(), cx.data(), cy.data(), cz.data()); const auto stop = std::chrono::high_resolution_clock::now(); - std::cout << "add took " - << std::chrono::duration(stop - start).count() - << "s\n"; + std::cout << "add took " << std::chrono::duration(stop - start).count() << "s\n"; } return cx[0]; } -} +} // namespace manualSoA -int main(int argc, char ** argv) +int main(int argc, char** argv) { int r = 0; r += usellama::main(argc, argv); diff --git a/include/llama/Allocators.hpp b/include/llama/Allocators.hpp index 9820d48073..a57b1f7c9e 100644 --- a/include/llama/Allocators.hpp +++ b/include/llama/Allocators.hpp @@ -10,7 +10,7 @@ #include #include #if BOOST_COMP_INTEL != 0 -#include +# include #endif namespace llama::allocator @@ -18,11 +18,10 @@ namespace llama::allocator /// Allocates stack memory for a \ref View, which is copied each time a \ref /// View is copied. /// \tparam BytesToReserve the amount of memory to reserve. - template + template struct Stack { - LLAMA_FN_HOST_ACC_INLINE auto operator()(std::size_t) const - -> Array + LLAMA_FN_HOST_ACC_INLINE auto operator()(std::size_t) const -> Array { return {}; } @@ -31,81 +30,71 @@ namespace llama::allocator /// Allocates heap memory managed by a `std::shared_ptr` for a \ref View. /// This memory is shared between all copies of a \ref View. /// \tparam Alignment aligment of the allocated block of memory. - template + template struct SharedPtr { - inline auto operator()(std::size_t count) const - -> std::shared_ptr + inline auto operator()(std::size_t count) const -> std::shared_ptr { - auto * ptr = static_cast(::operator new[]( - count * sizeof(std::byte), std::align_val_t{Alignment})); - auto deleter = [=](std::byte * ptr) { - ::operator delete[](ptr, std::align_val_t{Alignment}); - }; - return std::shared_ptr{ptr, deleter}; + auto* ptr + = static_cast(::operator new[](count * sizeof(std::byte), std::align_val_t {Alignment})); + auto deleter = [=](std::byte* ptr) { ::operator delete[](ptr, std::align_val_t {Alignment}); }; + return std::shared_ptr {ptr, deleter}; } }; namespace internal { - template + template struct AlignmentAllocator { using value_type = T; inline AlignmentAllocator() noexcept = default; - template - inline AlignmentAllocator( - AlignmentAllocator const &) noexcept - {} + template + inline AlignmentAllocator(AlignmentAllocator const&) noexcept + { + } inline ~AlignmentAllocator() noexcept = default; - inline auto allocate(std::size_t n) -> T * + inline auto allocate(std::size_t n) -> T* { - return static_cast(::operator new[]( - n * sizeof(T), std::align_val_t{Alignment})); + return static_cast(::operator new[](n * sizeof(T), std::align_val_t {Alignment})); } - inline void deallocate(T * p, std::size_t) + inline void deallocate(T* p, std::size_t) { - ::operator delete[](p, std::align_val_t{Alignment}); + ::operator delete[](p, std::align_val_t {Alignment}); } - template + template struct rebind { using other = AlignmentAllocator; }; - auto - operator!=(const AlignmentAllocator & other) const - -> bool + auto operator!=(const AlignmentAllocator& other) const -> bool { return !(*this == other); } - auto - operator==(const AlignmentAllocator & other) const - -> bool + auto operator==(const AlignmentAllocator& other) const -> bool { return true; } }; - } + } // namespace internal /// Allocates heap memory managed by a `std::vector` for a \ref View, which /// is copied each time a \ref View is copied. /// \tparam Alignment aligment of the allocated block of memory. - template + template struct Vector { inline auto operator()(std::size_t count) const { - return std::vector< - std::byte, - internal::AlignmentAllocator>(count); + return std::vector>(count); } }; -} +} // namespace llama::allocator diff --git a/include/llama/Array.hpp b/include/llama/Array.hpp index eb34bb2eec..5d57cb9529 100644 --- a/include/llama/Array.hpp +++ b/include/llama/Array.hpp @@ -13,88 +13,87 @@ namespace llama /// devices like GPUs. /// \tparam T type if array elements. /// \tparam N rank of the array. - template + template struct Array { static constexpr std::size_t rank = N; T element[N > 0 ? N : 1]; - LLAMA_FN_HOST_ACC_INLINE T * begin() + LLAMA_FN_HOST_ACC_INLINE T* begin() { return &element[0]; } - LLAMA_FN_HOST_ACC_INLINE const T * begin() const + LLAMA_FN_HOST_ACC_INLINE const T* begin() const { return &element[0]; } - LLAMA_FN_HOST_ACC_INLINE T * end() + LLAMA_FN_HOST_ACC_INLINE T* end() { return &element[N]; }; - LLAMA_FN_HOST_ACC_INLINE const T * end() const + LLAMA_FN_HOST_ACC_INLINE const T* end() const { return &element[N]; }; - template - LLAMA_FN_HOST_ACC_INLINE auto operator[](IndexType && idx) -> T & + template + LLAMA_FN_HOST_ACC_INLINE auto operator[](IndexType&& idx) -> T& { return element[idx]; } - template - LLAMA_FN_HOST_ACC_INLINE constexpr auto - operator[](IndexType && idx) const -> T const & + template + LLAMA_FN_HOST_ACC_INLINE constexpr auto operator[](IndexType&& idx) const -> T const& { return element[idx]; } - LLAMA_FN_HOST_ACC_INLINE friend auto - operator==(const Array & a, const Array & b) -> bool + LLAMA_FN_HOST_ACC_INLINE friend auto operator==(const Array& a, const Array& b) -> bool { - for(std::size_t i = 0; i < N; ++i) - if(a.element[i] != b.element[i]) + for (std::size_t i = 0; i < N; ++i) + if (a.element[i] != b.element[i]) return false; return true; } - LLAMA_FN_HOST_ACC_INLINE friend auto - operator+(const Array & a, const Array & b) -> Array + LLAMA_FN_HOST_ACC_INLINE friend auto operator+(const Array& a, const Array& b) -> Array { Array temp; - for(std::size_t i = 0; i < N; ++i) temp[i] = a[i] + b[i]; + for (std::size_t i = 0; i < N; ++i) + temp[i] = a[i] + b[i]; return temp; } - template - auto get() -> T & + template + auto get() -> T& { return element[I]; } - template - auto get() const -> const T & + template + auto get() const -> const T& { return element[I]; } }; - template + template Array(First, Args... args) -> Array; -} +} // namespace llama namespace std { - template + template struct tuple_size> : integral_constant - {}; + { + }; - template + template struct tuple_element> { using type = T; }; -} +} // namespace std diff --git a/include/llama/Concepts.hpp b/include/llama/Concepts.hpp index 89f00a244d..f8dafb1aed 100644 --- a/include/llama/Concepts.hpp +++ b/include/llama/Concepts.hpp @@ -2,9 +2,9 @@ #ifdef __cpp_concepts -#include "Types.hpp" +# include "Types.hpp" -#include +# include namespace llama { @@ -18,6 +18,6 @@ namespace llama { m.getBlobNrAndOffset(typename M::UserDomain{}) } -> std::same_as; }; // clang-format on -} +} // namespace llama #endif diff --git a/include/llama/DatumCoord.hpp b/include/llama/DatumCoord.hpp index 7cc4ec4b18..ca3eb6748a 100644 --- a/include/llama/DatumCoord.hpp +++ b/include/llama/DatumCoord.hpp @@ -11,7 +11,7 @@ namespace llama { /// Represents a coordinate for an element inside the datum domain tree. /// \tparam Coords... the compile time coordinate. - template + template struct DatumCoord { /// The list of integral coordinates as `boost::mp11::mp_list`. @@ -22,7 +22,7 @@ namespace llama static constexpr std::size_t size = sizeof...(Coords); }; - template<> + template <> struct DatumCoord<> { using List = boost::mp11::mp_list_c; @@ -32,101 +32,94 @@ namespace llama namespace internal { - template + template struct mp_unwrap_sizes_impl; - template class L, typename... T> + template