From aacf68e45153be4a6fc4537cf157ba3a25986714 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 9 Jun 2023 10:14:02 -0400 Subject: [PATCH 01/94] Adding exception for arrayOfStructure option for bGrid. --- libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h | 3 +++ libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp | 4 ++-- .../tests/domain-neighbour-globalIdx/src/runHelper.h | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h index 3c57d7d9..3d8fbfb7 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h @@ -28,6 +28,9 @@ bField(grid.getBackend()); mData->grid = std::make_shared(grid); + if(memoryOptions.getOrder() == Neon::MemoryLayout::arrayOfStructs){ + NEON_THROW_UNSUPPORTED_OPERATION("bField does not support MemoryLayout::arrayOfStructs"); + } // the allocation size is the number of blocks x block size x cardinality mData->memoryField = mData->grid->helpGetBlockViewGrid().template newField( "BitMask", diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp index bdf77a74..feba5a9b 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp @@ -22,9 +22,9 @@ TEST(domain_unit_test_globalIdx, eGrid) 1); } -TEST(domain_unit_test_globalIdx, bGridSingleGPU) +TEST(domain_unit_test_globalIdx, bGrid) { - int nGpus = 1; + int nGpus = 5; using Type = int64_t; runAllTestConfiguration(std::function(globalIdx::run), nGpus, diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h index e064a49a..0014594c 100644 --- 
a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h @@ -82,6 +82,9 @@ void runAllTestConfiguration( if (dim.z < 8 * ngpu * 3) { dim.z = ngpu * 3 * 8; } + if(memoryLayout == Neon::MemoryLayout::arrayOfStructs){ + continue ; + } } assert(card == 1); From 18f2d7223fc2de5a0e41c7bd1c906035ea2e79a4 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 15 Jun 2023 15:37:00 -0400 Subject: [PATCH 02/94] Some documentation to bGrid. --- .../Neon/domain/details/bGrid/bField_imp.h | 4 +- .../include/Neon/domain/details/bGrid/bGrid.h | 81 ++++++++++++++++--- .../Neon/domain/details/bGrid/bGrid_imp.h | 4 +- 3 files changed, 74 insertions(+), 15 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h index 3d8fbfb7..687b7a0d 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h @@ -32,7 +32,7 @@ bFieldmemoryField = mData->grid->helpGetBlockViewGrid().template newField( + mData->memoryField = mData->grid->getBlockViewGrid().template newField( "BitMask", [&] { int elPerBlock = dataBlockSize3D.rMul(); @@ -53,7 +53,7 @@ bFieldmemoryField.getPartition(execution, setIdx, Neon::DataView::STANDARD); auto& blockConnectivity = mData->grid->helpGetBlockConnectivity().getPartition(execution, setIdx, Neon::DataView::STANDARD); - auto& bitmask = mData->grid->helpGetActiveBitMask().getPartition(execution, setIdx, Neon::DataView::STANDARD); + auto& bitmask = mData->grid->getActiveBitMask().getPartition(execution, setIdx, Neon::DataView::STANDARD); auto& dataBlockOrigins = mData->grid->helpGetDataBlockOriginField().getPartition(execution, setIdx, Neon::DataView::STANDARD); partition = bPartition(setIdx, diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h 
index e19ef98a..c31831ff 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h @@ -66,25 +66,46 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate - bGrid(const Neon::Backend& backend, - const Neon::int32_3d& domainSize, - const ActiveCellLambda activeCellLambda, - const Neon::domain::Stencil& stencil, - const int voxelSpacing, - const double_3d& spacingData = double_3d(1, 1, 1), - const double_3d& origin = double_3d(0, 0, 0)); - + bGrid(const Neon::Backend& backend /**< Neon backend for the computation */, + const Neon::int32_3d& domainSize /**< Size of the bounded Cartesian */, + const ActiveCellLambda activeCellLambda /**< Function that identify the user domain inside the boxed Cartesian discretization */, + const Neon::domain::Stencil& stencil /**< union of tall the stencil that will be used in the computation */, + const int voxelSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing and (i+1)* voxelSpacing. + * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1*/ + , + const double_3d& spacingData = double_3d(1, 1, 1) /** Physical spacing between two consecutive data points in the Cartesian domain */, + const double_3d& origin = double_3d(0, 0, 0) /** Physical location in space of the origin of the Cartesian discretization */); + /** + * Returns some properties for a given cartesian in the Cartesian domain. 
+ * The provide index my be inside or outside the user defined bounded Cartesian domain + */ auto getProperties(const Neon::index_3d& idx) const -> typename GridBaseTemplate::CellProperties final; + /** + * Returns true if the query 3D point is inside the user domain + * @param idx + * @return + */ auto isInsideDomain(const Neon::index_3d& idx) const -> bool final; + /** + * Retrieves the device index that contains the query point + * @param idx + * @return + */ auto getSetIdx(const Neon::index_3d& idx) const -> int32_t final; + /** + * Allocates a new field on the grid + */ template auto newField(const std::string name, int cardinality, @@ -93,6 +114,9 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate Field; + /** + * Allocates a new field on the block view grid + */ template auto newBlockViewField(const std::string name, int cardinality, @@ -101,6 +125,9 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate BlockViewGrid::Field; + /* + * Allocates a new container to execute some computation in the grid + */ template auto newContainer(const std::string& name, @@ -108,26 +135,58 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate Neon::set::Container; + /* + * Allocates a new container to execute some computation in the grid + */ template auto newContainer(const std::string& name, LoadingLambda lambda) const -> Neon::set::Container; - + /** + * Defines a new set of parameter to launch a Container + */ auto getLaunchParameters(Neon::DataView dataView, const Neon::index_3d& blockSize, const size_t& sharedMem) const -> Neon::set::LaunchParameters; + /** + * Retrieve the span associated to the grid w.r.t. some user defined parameters. + */ auto getSpan(Neon::Execution execution, SetIdx setIdx, Neon::DataView dataView) -> const Span&; - auto helpGetBlockViewGrid() const -> BlockViewGrid&; - auto helpGetActiveBitMask() const -> BlockViewGrid::Field&; + /** + * Retrieve the block vew grid internally used. 
+ * This grid can be leverage to allocate data at the block level. + */ + auto getBlockViewGrid() const -> BlockViewGrid&; + + /** + * Retrieve the block vew grid internally used. + * This grid can be leverage to allocate data at the block level. + */ + auto getActiveBitMask() const -> BlockViewGrid::Field&; + + /** + * Help function to retrieve the block connectivity as a BlockViewGrid field + */ auto helpGetBlockConnectivity() const -> BlockViewGrid::Field&; + + /** + * Help function to retrieve the block origin as a BlockViewGrid field + */ auto helpGetDataBlockOriginField() const -> Neon::aGrid::Field&; + + /* + * Help function to retrieve the map that converts a stencil point id to 3d offset + */ auto helpGetStencilIdTo3dOffset() const -> Neon::set::MemSet&; + /* + * Help function retriev the device and the block index associated to a point in the BlockViewGrid grid + */ auto helpGetSetIdxAndGridIdx(Neon::index_3d idx) const -> std::tuple; struct Data diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h index fcd0f803..03c1bd59 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h @@ -313,7 +313,7 @@ auto bGrid auto bGrid:: - helpGetBlockViewGrid() + getBlockViewGrid() const -> BlockViewGrid& { return mData->blockViewGrid; @@ -321,7 +321,7 @@ auto bGrid auto bGrid:: - helpGetActiveBitMask() + getActiveBitMask() const -> BlockViewGrid::Field& { return mData->activeBitMask; From b81c423586558e2d20b4e9c3684d077999ea0fbf Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 15 Jun 2023 17:20:19 -0400 Subject: [PATCH 03/94] bGrid: API documentation and refactoring of the template API. 
--- libNeonDomain/include/Neon/domain/bGrid.h | 2 +- .../Neon/domain/details/bGrid/StaticBlock.h | 46 +++++ .../Neon/domain/details/bGrid/bField.h | 22 +-- .../Neon/domain/details/bGrid/bField_imp.h | 99 +++++----- .../include/Neon/domain/details/bGrid/bGrid.h | 29 +-- .../Neon/domain/details/bGrid/bGrid_imp.h | 174 +++++++++--------- .../Neon/domain/details/bGrid/bIndex.h | 38 ++-- .../Neon/domain/details/bGrid/bIndex_imp.h | 80 +++----- .../Neon/domain/details/bGrid/bPartition.h | 8 +- .../domain/details/bGrid/bPartition_imp.h | 70 +++---- .../include/Neon/domain/details/bGrid/bSpan.h | 6 +- .../Neon/domain/details/bGrid/bSpan_imp.h | 32 ++-- .../src/domain/details/bGrid/bGrid.cpp | 2 +- .../tests/domain-bGrid-tray/src/gtests.cpp | 42 ++--- 14 files changed, 322 insertions(+), 328 deletions(-) create mode 100644 libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h diff --git a/libNeonDomain/include/Neon/domain/bGrid.h b/libNeonDomain/include/Neon/domain/bGrid.h index 13c01cc3..39a4f366 100644 --- a/libNeonDomain/include/Neon/domain/bGrid.h +++ b/libNeonDomain/include/Neon/domain/bGrid.h @@ -2,5 +2,5 @@ #include "Neon/domain/details/bGrid/bGrid.h" namespace Neon { -using bGrid = Neon::domain::details::bGrid::bGrid<8,8,8>; +using bGrid = Neon::domain::details::bGrid::bGrid>; } \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h new file mode 100644 index 00000000..612c6b9a --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h @@ -0,0 +1,46 @@ +#include "Neon/domain/details/bGrid/bSpan.h" + +namespace Neon::domain::details::bGrid { + +template +struct StaticBlock +{ + public: + constexpr static uint32_t memBlockSizeX = memBlockSizeX_; + constexpr static uint32_t memBlockSizeY = memBlockSizeY_; + constexpr static uint32_t memBlockSizeZ = memBlockSizeZ_; + constexpr static Neon::uint32_3d memBlockSize3D = 
Neon::uint32_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ); + + constexpr static uint32_t userBlockSizeX = userBlockSizeX_; + constexpr static uint32_t userBlockSizeY = userBlockSizeY_; + constexpr static uint32_t userBlockSizeZ = userBlockSizeZ_; + constexpr static Neon::uint32_3d userBlockSize3D = Neon::uint32_3d(userBlockSizeX, userBlockSizeY, userBlockSizeZ); + + constexpr static uint32_t blockRatioX = memBlockSizeX / userBlockSizeX; + constexpr static uint32_t blockRatioY = memBlockSizeY / userBlockSizeY; + constexpr static uint32_t blockRatioZ = memBlockSizeZ / userBlockSizeZ; + + constexpr static uint32_t memBlockPitchX = 1; + constexpr static uint32_t memBlockPitchY = memBlockSizeX; + constexpr static uint32_t memBlockPitchZ = memBlockSizeX * memBlockSizeY; + + constexpr static bool isMultiResMode = isMultiResMode_; + + constexpr static uint32_t memBlockCountElements = memBlockSizeX * memBlockSizeY * memBlockSizeZ; + + static_assert(memBlockSizeX >= userBlockSizeX); + static_assert(memBlockSizeY >= userBlockSizeY); + static_assert(memBlockSizeZ >= userBlockSizeZ); + + static_assert(memBlockSizeX % userBlockSizeX == 0); + static_assert(memBlockSizeY % userBlockSizeY == 0); + static_assert(memBlockSizeZ % userBlockSizeZ == 0); +}; + +} // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h index f232d96b..95c1d6d5 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h @@ -18,31 +18,25 @@ namespace Neon::domain::details::bGrid { -template +template class bField : public Neon::domain::interface::FieldBaseTemplate, - bPartition, + bGrid, + bPartition, int> { - friend bGrid; + friend bGrid; public: using Type = T; - using Grid = bGrid; - using Field = bField; - using Partition = bPartition; - using Idx = bIndex; + using Grid = bGrid; + using 
Field = bField; + using Partition = bPartition; + using Idx = bIndex; using NghIdx = typename Partition::NghIdx; using NghData = typename Partition::NghData; - static constexpr Neon::index_3d dataBlockSize3D = Neon::index_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ); - - static constexpr Neon::int8_3d DataBlockSize = Neon::int8_3d(memBlockSizeX, - memBlockSizeY, - memBlockSizeZ); - bField(const std::string& fieldUserName, Neon::DataUse dataUse, diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h index 687b7a0d..a9c249ca 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h @@ -4,19 +4,19 @@ namespace Neon::domain::details::bGrid { -template -bField::bField() +template +bField::bField() { mData = std::make_shared(); } -template -bField::bField(const std::string& fieldUserName, - Neon::DataUse dataUse, - const Neon::MemoryOptions& memoryOptions, - const Grid& grid, - int cardinality, - T inactiveValue) +template +bField::bField(const std::string& fieldUserName, + Neon::DataUse dataUse, + const Neon::MemoryOptions& memoryOptions, + const Grid& grid, + int cardinality, + T inactiveValue) : Neon::domain::interface::FieldBaseTemplate(&grid, fieldUserName, "bField", @@ -28,20 +28,19 @@ bField(grid.getBackend()); mData->grid = std::make_shared(grid); - if(memoryOptions.getOrder() == Neon::MemoryLayout::arrayOfStructs){ + if (memoryOptions.getOrder() == Neon::MemoryLayout::arrayOfStructs) { NEON_THROW_UNSUPPORTED_OPERATION("bField does not support MemoryLayout::arrayOfStructs"); } // the allocation size is the number of blocks x block size x cardinality mData->memoryField = mData->grid->getBlockViewGrid().template newField( "BitMask", [&] { - int elPerBlock = dataBlockSize3D.rMul(); - elPerBlock = elPerBlock * cardinality; + int elPerBlock = SBlock::memBlockCountElements * cardinality; return 
elPerBlock; }(), 0, dataUse, - mData->grid->getBackend().getMemoryOptions(bSpan::activeMaskMemoryLayout)); + mData->grid->getBackend().getMemoryOptions(bSpan::activeMaskMemoryLayout)); { // Setting up partitionTable @@ -56,28 +55,28 @@ bFieldgrid->getActiveBitMask().getPartition(execution, setIdx, Neon::DataView::STANDARD); auto& dataBlockOrigins = mData->grid->helpGetDataBlockOriginField().getPartition(execution, setIdx, Neon::DataView::STANDARD); - partition = bPartition(setIdx, - cardinality, - memoryFieldPartition.mem(), - blockConnectivity.mem(), - bitmask.mem(), - dataBlockOrigins.mem(), - mData->grid->helpGetStencilIdTo3dOffset().rawMem(execution, setIdx)); + partition = bPartition(setIdx, + cardinality, + memoryFieldPartition.mem(), + blockConnectivity.mem(), + bitmask.mem(), + dataBlockOrigins.mem(), + mData->grid->helpGetStencilIdTo3dOffset().rawMem(execution, setIdx)); }); } initHaloUpdateTable(); } -template -auto bField::isInsideDomain(const Neon::index_3d& idx) const -> bool +template +auto bField::isInsideDomain(const Neon::index_3d& idx) const -> bool { return mData->grid->isInsideDomain(idx); } -template -auto bField::getReference(const Neon::index_3d& cartesianIdx, - const int& cardinality) -> T& +template +auto bField::getReference(const Neon::index_3d& cartesianIdx, + const int& cardinality) -> T& { auto& grid = this->getGrid(); auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx); @@ -86,9 +85,9 @@ auto bField -auto bField::operator()(const Neon::index_3d& cartesianIdx, - const int& cardinality) const -> T +template +auto bField::operator()(const Neon::index_3d& cartesianIdx, + const int& cardinality) const -> T { auto& grid = this->getGrid(); auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx); @@ -100,22 +99,22 @@ auto bField -auto bField::updateHostData(int streamId) -> void +template +auto bField::updateHostData(int streamId) -> void { mData->memoryField.updateHostData(streamId); } -template -auto 
bField::updateDeviceData(int streamId) -> void +template +auto bField::updateDeviceData(int streamId) -> void { mData->memoryField.updateDeviceData(streamId); } -template -auto bField::getPartition(Neon::Execution execution, - Neon::SetIdx setIdx, - const Neon::DataView& dataView) const -> const Partition& +template +auto bField::getPartition(Neon::Execution execution, + Neon::SetIdx setIdx, + const Neon::DataView& dataView) const -> const Partition& { const Neon::DataUse dataUse = this->getDataUse(); bool isOk = Neon::ExecutionUtils::checkCompatibility(dataUse, execution); @@ -128,10 +127,10 @@ auto bField -auto bField::getPartition(Neon::Execution execution, - Neon::SetIdx setIdx, - const Neon::DataView& dataView) -> Partition& +template +auto bField::getPartition(Neon::Execution execution, + Neon::SetIdx setIdx, + const Neon::DataView& dataView) -> Partition& { const Neon::DataUse dataUse = this->getDataUse(); bool isOk = Neon::ExecutionUtils::checkCompatibility(dataUse, execution); @@ -144,10 +143,10 @@ auto bField -auto bField::newHaloUpdate(Neon::set::StencilSemantic stencilSemantic, - Neon::set::TransferMode transferMode, - Neon::Execution execution) const -> Neon::set::Container +template +auto bField::newHaloUpdate(Neon::set::StencilSemantic stencilSemantic, + Neon::set::TransferMode transferMode, + Neon::Execution execution) const -> Neon::set::Container { @@ -220,8 +219,8 @@ auto bField -auto bField::initHaloUpdateTable() -> void +template +auto bField::initHaloUpdateTable() -> void { // NEON_THROW_UNSUPPORTED_OPERATION(""); auto& grid = this->getGrid(); @@ -269,10 +268,10 @@ auto bFieldgetCountAllocated()) * dataBlockSize3D.rMul()); + size_t(blockViewPartitions[endPoint]->getCountAllocated()) * SBlock::memBlockCountElements); } if (ByDirection::up == byDirection && bk.isLastDevice(setIdxSrc)) { @@ -299,10 +298,8 @@ auto bField + +template class bField; -template -class bGrid : public Neon::domain::interface::GridBaseTemplate, - bIndex > +template +class 
bGrid : public Neon::domain::interface::GridBaseTemplate, + bIndex > { public: - using Grid = bGrid; + using Grid = bGrid; - template - using Partition = bPartition; + template + using Partition = bPartition; - template - using Field = Neon::domain::details::bGrid::bField; + template + using Field = Neon::domain::details::bGrid::bField; - using Span = bSpan; + using Span = bSpan; using NghIdx = typename Partition::NghIdx; - using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate >; + using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate >; - using Idx = bIndex; + using Idx = bIndex; static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = Neon::set::details::ExecutionThreadSpan::d1b3; using ExecutionThreadSpanIndexType = uint32_t; - static constexpr Neon::index_3d dataBlockSize3D = Neon::index_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ); using BlockIdx = uint32_t; bGrid() = default; @@ -227,7 +228,7 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate mData; }; -extern template class bGrid<8, 8, 8>; +extern template class bGrid>; } // namespace Neon::domain::details::bGrid #include "bField_imp.h" diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h index 03c1bd59..1b40a8b7 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h @@ -2,42 +2,38 @@ namespace Neon::domain::details::bGrid { -template +template template -bGrid::bGrid(const Neon::Backend& backend, - const Neon::int32_3d& domainSize, - const ActiveCellLambda activeCellLambda, - const Neon::domain::Stencil& stencil, - const double_3d& spacingData, - const double_3d& origin) +bGrid::bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const double_3d& spacingData, + const double_3d& 
origin) : bGrid(backend, domainSize, activeCellLambda, stencil, 1, spacingData, origin) { } -template +template template -bGrid::bGrid(const Neon::Backend& backend, - const Neon::int32_3d& domainSize, - const ActiveCellLambda activeCellLambda, - const Neon::domain::Stencil& stencil, - const int voxelSpacing, - const double_3d& spacingData, - const double_3d& origin) +bGrid::bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const int voxelSpacing, + const double_3d& spacingData, + const double_3d& origin) { - static_assert(memBlockSizeX >= userBlockSizeX); - static_assert(memBlockSizeY >= userBlockSizeY); - static_assert(memBlockSizeZ >= userBlockSizeZ); - static_assert(memBlockSizeX % userBlockSizeX == 0); - static_assert(memBlockSizeY % userBlockSizeY == 0); - static_assert(memBlockSizeZ % userBlockSizeZ == 0); mData = std::make_shared(); mData->init(backend); mData->voxelSpacing = voxelSpacing; mData->stencil = stencil; - const index_3d defaultKernelBlockSize(memBlockSizeX, memBlockSizeY, memBlockSizeZ); + const index_3d defaultKernelBlockSize(SBlock::memBlockSizeX, + SBlock::memBlockSizeY, + SBlock::memBlockSizeZ); { auto nElementsPerPartition = backend.devSet().template newDataSet(0); @@ -59,7 +55,7 @@ bGrid(), domainSize, Neon::domain::Stencil::s27_t(false), 1); @@ -76,7 +72,7 @@ bGridpartitioner1D.getBlockSpan(), mData->partitioner1D, Neon::domain::Stencil::s27_t(false), - spacingData * dataBlockSize3D, + spacingData * SBlock::memBlockSize3D, origin); mData->blockViewGrid = BlockViewGrid(egrid); @@ -106,9 +102,9 @@ bGrid().z; k++) { + for (int j = 0; j < SBlock::memBlockSize3D.template newType().y; j++) { + for (int i = 0; i < SBlock::memBlockSize3D.template newType().x; i++) { Neon::int32_3d localPosition(i, j, k); typename Span::BitMaskWordType mask; @@ -166,7 +162,7 @@ bGrid(nghIdx.helpGet()); + blockNghIdx = static_cast(nghIdx.helpGet()); } 
blockConnectivity(idx, targetDirection) = blockNghIdx; } @@ -220,7 +216,7 @@ bGrid(); + Neon::int8_3d pShort = pLong.newType(); mData->stencilIdTo3dOffset.eRef(devIdx, i) = pShort; } } @@ -232,7 +228,7 @@ bGridmNumActiveVoxel, - dataBlockSize3D, + SBlock::memBlockSize3D.template newType(), spacingData, origin); { // setting launchParameters @@ -244,47 +240,47 @@ bGrid( eDomainGridSize.x); + int nBlocks = static_cast(eDomainGridSize.x); bLaunchParameters.get(setIdx).set(Neon::sys::GpuLaunchInfo::mode_e::cudaGridMode, - nBlocks, dataBlockSize3D, 0); + nBlocks, SBlock::memBlockSize3D.template newType(), 0); }); }); } } -template +template template -auto bGrid::newField(const std::string name, - int cardinality, - T inactiveValue, - Neon::DataUse dataUse, - Neon::MemoryOptions memoryOptions) const -> Field +auto bGrid::newField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions) const -> Field { memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions); Field field(name, dataUse, memoryOptions, *this, cardinality, inactiveValue); return field; } -template +template template -auto bGrid::newBlockViewField(const std::string name, - int cardinality, - T inactiveValue, - Neon::DataUse dataUse, - Neon::MemoryOptions memoryOptions) const -> BlockViewGrid::Field +auto bGrid::newBlockViewField(const std::string name, + int cardinality, + T inactiveValue, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions) const -> BlockViewGrid::Field { memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions); BlockViewGrid::Field blockViewField = mData->blockViewGrid.template newField(name, cardinality, inactiveValue, dataUse, memoryOptions); return blockViewField; } -template +template template -auto bGrid::newContainer(const std::string& name, - index_3d blockSize, - size_t sharedMem, - LoadingLambda lambda) const -> Neon::set::Container +auto bGrid::newContainer(const std::string& 
name, + index_3d blockSize, + size_t sharedMem, + LoadingLambda lambda) const -> Neon::set::Container { Neon::set::Container kContainer = Neon::set::Container::factory(name, Neon::set::internal::ContainerAPI::DataViewSupport::on, @@ -295,11 +291,11 @@ auto bGrid +template template -auto bGrid::newContainer(const std::string& name, - LoadingLambda lambda) const -> Neon::set::Container +auto bGrid::newContainer(const std::string& name, + LoadingLambda lambda) const -> Neon::set::Container { const Neon::index_3d& defaultBlockSize = this->getDefaultBlock(); Neon::set::Container kContainer = Neon::set::Container::factory(name, @@ -311,50 +307,50 @@ auto bGrid -auto bGrid:: +template +auto bGrid:: getBlockViewGrid() const -> BlockViewGrid& { return mData->blockViewGrid; } -template -auto bGrid:: +template +auto bGrid:: getActiveBitMask() const -> BlockViewGrid::Field& { return mData->activeBitMask; } -template -auto bGrid:: +template +auto bGrid:: helpGetBlockConnectivity() const -> BlockViewGrid::Field& { return mData->blockConnectivity; } -template -auto bGrid:: +template +auto bGrid:: helpGetDataBlockOriginField() const -> Neon::aGrid::Field& { return mData->mDataBlockOriginField; } -template -auto bGrid::getSpan(Neon::Execution execution, - SetIdx setIdx, - Neon::DataView dataView) -> const bGrid::Span& +template +auto bGrid::getSpan(Neon::Execution execution, + SetIdx setIdx, + Neon::DataView dataView) -> const bGrid::Span& { return mData->spanTable.getSpan(execution, setIdx, dataView); } -template -bGrid::~bGrid() +template +bGrid::~bGrid() { } -template -auto bGrid::getSetIdx(const index_3d& idx) const -> int32_t +template +auto bGrid::getSetIdx(const index_3d& idx) const -> int32_t { typename GridBaseTemplate::CellProperties cellProperties; @@ -365,10 +361,10 @@ auto bGrid -auto bGrid::getLaunchParameters(Neon::DataView dataView, - const index_3d&, - const size_t& sharedMem) const -> Neon::set::LaunchParameters +template +auto 
bGrid::getLaunchParameters(Neon::DataView dataView, + const index_3d&, + const size_t& sharedMem) const -> Neon::set::LaunchParameters { auto res = mData->launchParametersTable.get(dataView); res.forEachSeq([&](SetIdx const& /*setIdx*/, @@ -378,19 +374,19 @@ auto bGrid -auto bGrid:: +template +auto bGrid:: helpGetStencilIdTo3dOffset() const -> Neon::set::MemSet& { return mData->stencilIdTo3dOffset; } -template -auto bGrid::isInsideDomain(const index_3d& idx) const -> bool +template +auto bGrid::isInsideDomain(const index_3d& idx) const -> bool { // 1. check if the block is active - const index_3d blockIdx3d = idx / dataBlockSize3D; + const index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType(); auto blockProperties = mData->blockViewGrid.getProperties(blockIdx3d); if (!blockProperties.isInside()) { @@ -399,17 +395,17 @@ auto bGridactiveBitMask.getReference(blockIdx3d, int(wordCardinality)); return (activeBits & mask) != 0; } -template -auto bGrid::getProperties(const index_3d& idx) +template +auto bGrid::getProperties(const index_3d& idx) const -> typename GridBaseTemplate::CellProperties { typename GridBaseTemplate::CellProperties cellProperties; @@ -422,7 +418,7 @@ auto bGridgetDevSet().setCardinality() == 1) { cellProperties.init(0, DataView::INTERNAL); } else { - const index_3d blockIdx3d = idx / dataBlockSize3D; + const index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType(); auto blockViewProperty = mData->blockViewGrid.getProperties(blockIdx3d); cellProperties.init(blockViewProperty.getSetIdx(), @@ -431,17 +427,17 @@ auto bGrid -auto bGrid::helpGetSetIdxAndGridIdx(Neon::index_3d idx) +template +auto bGrid::helpGetSetIdxAndGridIdx(Neon::index_3d idx) const -> std::tuple { - const index_3d blockIdx3d = idx / dataBlockSize3D; + const index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType(); auto [setIdx, bvGridIdx] = mData->blockViewGrid.helpGetSetIdxAndGridIdx(blockIdx3d); Idx bIdx; bIdx.mDataBlockIdx = 
bvGridIdx.helpGet(); - bIdx.mInDataBlockIdx.x = static_cast(idx.x % dataBlockSize3D.x); - bIdx.mInDataBlockIdx.y = static_cast(idx.y % dataBlockSize3D.y); - bIdx.mInDataBlockIdx.z = static_cast(idx.z % dataBlockSize3D.z); + bIdx.mInDataBlockIdx.x = static_cast(idx.x % SBlock::memBlockSize3D.x); + bIdx.mInDataBlockIdx.y = static_cast(idx.y % SBlock::memBlockSize3D.y); + bIdx.mInDataBlockIdx.z = static_cast(idx.z % SBlock::memBlockSize3D.z); return {setIdx, bIdx}; } diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bIndex.h b/libNeonDomain/include/Neon/domain/details/bGrid/bIndex.h index 7b8d7bcf..bbf103d1 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bIndex.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bIndex.h @@ -6,11 +6,11 @@ namespace Neon::domain::details::bGrid { // Common forward declarations -template +template class bGrid; -template +template class bSpan; -template +template class bPartition; class MicroIndex @@ -59,26 +59,24 @@ class MicroIndex TrayIdx mTrayBlockIdx{}; }; -template +template class bIndex { public: - template + template friend class bSpan; - using OuterIdx = bIndex; - - static constexpr Neon::uint32_3d memBlock3DSize = Neon::uint32_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ); + using OuterIdx = bIndex; using NghIdx = int8_3d; - template + template friend class bPartition; - template + template friend class bField; - template + template friend class bSpan; - template + template friend class bGrid; @@ -109,25 +107,25 @@ class bIndex DataBlockIdx mDataBlockIdx{}; }; -template -NEON_CUDA_HOST_DEVICE auto bIndex::setDataBlockIdx(const bIndex::DataBlockIdx& dataBlockIdx) -> void +template +NEON_CUDA_HOST_DEVICE auto bIndex::setDataBlockIdx(const bIndex::DataBlockIdx& dataBlockIdx) -> void { mDataBlockIdx = dataBlockIdx; } -template -NEON_CUDA_HOST_DEVICE auto bIndex::setInDataBlockIdx(const bIndex::InDataBlockIdx& inDataBlockIdx) -> void +template +NEON_CUDA_HOST_DEVICE auto bIndex::setInDataBlockIdx(const 
bIndex::InDataBlockIdx& inDataBlockIdx) -> void { mInDataBlockIdx = inDataBlockIdx; } -template -NEON_CUDA_HOST_DEVICE auto bIndex::getDataBlockIdx() const -> const bIndex::DataBlockIdx& +template +NEON_CUDA_HOST_DEVICE auto bIndex::getDataBlockIdx() const -> const bIndex::DataBlockIdx& { return mDataBlockIdx; } -template -NEON_CUDA_HOST_DEVICE auto bIndex::getInDataBlockIdx() const -> const bIndex::InDataBlockIdx& +template +NEON_CUDA_HOST_DEVICE auto bIndex::getInDataBlockIdx() const -> const bIndex::InDataBlockIdx& { return mInDataBlockIdx; } diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bIndex_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bIndex_imp.h index a55fddbb..be45749d 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bIndex_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bIndex_imp.h @@ -3,8 +3,8 @@ namespace Neon::domain::details::bGrid { -template -NEON_CUDA_HOST_DEVICE inline bIndex:: +template +NEON_CUDA_HOST_DEVICE inline bIndex:: bIndex(const DataBlockIdx& blockIdx, const InDataBlockIdx::Integer& x, const InDataBlockIdx::Integer& y, @@ -16,86 +16,52 @@ NEON_CUDA_HOST_DEVICE inline bIndex -// NEON_CUDA_HOST_DEVICE inline auto bIndex::getTrayIdx() -> TrayIdx -//{ -// -// TrayIdx const exBlockOffset = mDataBlockIdx * (userBlockSizeX * userBlockSizeY * userBlockSizeZ); -// TrayIdx const exTrayOffset = [&]() { -// int const trayBlockIdxX = mInDataBlockIdx.x / userBlockSizeX; -// int const trayBlockIdxY = mInDataBlockIdx.y / userBlockSizeY; -// int const trayBlockIdxZ = mInDataBlockIdx.z / userBlockSizeZ; -// -// constexpr int countMicroBlocksInTrayX = (memBlockSizeX / userBlockSizeX); -// constexpr int countMicroBlocksInTrayY = (memBlockSizeY / userBlockSizeY); -// -// int const res = trayBlockIdxX + trayBlockIdxY * countMicroBlocksInTrayX + -// trayBlockIdxZ * (countMicroBlocksInTrayX * countMicroBlocksInTrayY); -// return res; -// }; -// return exBlockOffset + exTrayOffset; -//} -// -// -// template -// 
NEON_CUDA_HOST_DEVICE inline auto bIndex::getInTrayIdx() -> InTrayIdx -//{ -// InTrayIdx inTrayIdx; -// inTrayIdx.x = mInDataBlockIdx.x % userBlockSizeX; -// inTrayIdx.y = mInDataBlockIdx.y % userBlockSizeY; -// inTrayIdx.z = mInDataBlockIdx.z % userBlockSizeZ; -// -// return inTrayIdx; -//} -template -NEON_CUDA_HOST_DEVICE inline auto bIndex::getMicroIndex() -> MicroIndex +template +NEON_CUDA_HOST_DEVICE inline auto bIndex::getMicroIndex() -> MicroIndex { - constexpr uint32_t blockRatioX = memBlockSizeX / userBlockSizeX; - constexpr uint32_t blockRatioY = memBlockSizeY / userBlockSizeY; - constexpr uint32_t blockRatioZ = memBlockSizeZ / userBlockSizeZ; - TrayIdx const exBlockOffset = mDataBlockIdx * (blockRatioX * blockRatioY * blockRatioZ); + + TrayIdx const exBlockOffset = mDataBlockIdx * (SBlock::blockRatioX * SBlock::blockRatioY * SBlock::blockRatioZ); TrayIdx const exTrayOffset = [&] { - TrayIdx const trayBlockIdxX = mInDataBlockIdx.x / userBlockSizeX; - TrayIdx const trayBlockIdxY = mInDataBlockIdx.y / userBlockSizeY; - TrayIdx const trayBlockIdxZ = mInDataBlockIdx.z / userBlockSizeZ; + TrayIdx const trayBlockIdxX = mInDataBlockIdx.x / SBlock::userBlockSizeX; + TrayIdx const trayBlockIdxY = mInDataBlockIdx.y / SBlock::userBlockSizeY; + TrayIdx const trayBlockIdxZ = mInDataBlockIdx.z / SBlock::userBlockSizeZ; - TrayIdx const res = trayBlockIdxX + trayBlockIdxY * blockRatioX + - trayBlockIdxZ * (blockRatioX * blockRatioY); + TrayIdx const res = trayBlockIdxX + trayBlockIdxY * SBlock::blockRatioX + + trayBlockIdxZ * (SBlock::blockRatioX * SBlock::blockRatioY); return res; }(); MicroIndex res; res.setTrayBlockIdx(exBlockOffset + exTrayOffset); - res.setInTrayBlockIdx({static_cast(mInDataBlockIdx.x % userBlockSizeX), - static_cast(mInDataBlockIdx.y % userBlockSizeY), - static_cast(mInDataBlockIdx.z % userBlockSizeZ)}); + res.setInTrayBlockIdx({static_cast(mInDataBlockIdx.x % SBlock::userBlockSizeX), + static_cast(mInDataBlockIdx.y % SBlock::userBlockSizeY), + 
static_cast(mInDataBlockIdx.z % SBlock::userBlockSizeZ)}); return res; } -template -NEON_CUDA_HOST_DEVICE inline auto bIndex::init(MicroIndex const& microIndex) -> void +template +NEON_CUDA_HOST_DEVICE inline auto bIndex::init(MicroIndex const& microIndex) -> void { - constexpr uint32_t memBlockSize = memBlockSizeX * memBlockSizeY * memBlockSizeZ; - constexpr uint32_t userBlockSize = userBlockSizeX * userBlockSizeY * userBlockSizeZ; + constexpr uint32_t memBlockSize = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; + constexpr uint32_t userBlockSize = SBlock::userBlockSizeX * SBlock::userBlockSizeY * SBlock::userBlockSizeZ; constexpr uint32_t blockRatioSize = memBlockSize / userBlockSize; - constexpr uint32_t blockRatioX = memBlockSizeX / userBlockSizeX; - constexpr uint32_t blockRatioY = memBlockSizeY / userBlockSizeY; + constexpr uint32_t blockRatioX = SBlock::memBlockSizeX / SBlock::userBlockSizeX; + constexpr uint32_t blockRatioY = SBlock::memBlockSizeY / SBlock::userBlockSizeY; mDataBlockIdx = microIndex.getTrayBlockIdx() / (blockRatioSize); uint32_t reminder = microIndex.getTrayBlockIdx() % (blockRatioSize); const uint32_t reminderInZ = reminder / (blockRatioX * blockRatioY); - mInDataBlockIdx.z = static_cast < InDataBlockIdx::Integer>( microIndex.getInTrayBlockIdx().z + reminderInZ * userBlockSizeZ); + mInDataBlockIdx.z = static_cast(microIndex.getInTrayBlockIdx().z + reminderInZ * SBlock::userBlockSizeZ); reminder = reminder % (blockRatioX * blockRatioY); const uint32_t reminderInY = reminder / (blockRatioX); - mInDataBlockIdx.y = static_cast(microIndex.getInTrayBlockIdx().y + reminderInY * userBlockSizeY); + mInDataBlockIdx.y = static_cast(microIndex.getInTrayBlockIdx().y + reminderInY * SBlock::userBlockSizeY); const uint32_t reminderInX = reminder % blockRatioX; - mInDataBlockIdx.x = static_cast(microIndex.getInTrayBlockIdx().x + reminderInX * userBlockSizeX); + mInDataBlockIdx.x = static_cast(microIndex.getInTrayBlockIdx().x + 
reminderInX * SBlock::userBlockSizeX); } } // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index 09db40e4..f20a513d 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -9,15 +9,15 @@ namespace Neon::domain::details::bGrid { -template +template class bSpan; -template +template class bPartition { public: - using Span = bSpan; - using Idx = bIndex; + using Span = bSpan; + using Idx = bIndex; using NghIdx = typename Idx::NghIdx; using Type = T; using NghData = Neon::domain::NghData; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h index db057f47..8506476b 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h @@ -5,8 +5,8 @@ namespace Neon::domain::details::bGrid { -template -bPartition::bPartition() +template +bPartition::bPartition() : mCardinality(0), mMem(nullptr), mStencilNghIndex(), @@ -17,8 +17,8 @@ bPartition -bPartition:: +template +bPartition:: bPartition(int setIdx, int cardinality, T* mem, @@ -36,8 +36,8 @@ bPartition -NEON_CUDA_HOST_DEVICE inline auto bPartition:: +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: getGlobalIndex(const Idx& gidx) const -> Neon::index_3d { @@ -48,8 +48,8 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition -NEON_CUDA_HOST_DEVICE inline auto bPartition:: +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: getBlockViewGridIdx(const Idx& gidx) const -> BlockViewGridIdx { @@ -58,32 +58,32 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition -inline NEON_CUDA_HOST_DEVICE auto bPartition:: +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: cardinality() const -> int { return mCardinality; 
} -template -inline NEON_CUDA_HOST_DEVICE auto bPartition:: +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: operator()(const Idx& cell, int card) -> T& { return mMem[helpGetPitch(cell, card)]; } -template -inline NEON_CUDA_HOST_DEVICE auto bPartition:: +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: operator()(const Idx& cell, int card) const -> const T& { return mMem[helpGetPitch(cell, card)]; } -template -inline NEON_CUDA_HOST_DEVICE auto bPartition:: +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: helpGetPitch(const Idx& idx, int card) const -> uint32_t { @@ -92,22 +92,22 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition -inline NEON_CUDA_HOST_DEVICE auto bPartition:: +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: helpGetValidIdxPitchExplicit(const Idx& idx, int card) const -> uint32_t { - uint32_t const blockPitchByCard = memBlockSizeX * memBlockSizeY * memBlockSizeZ; + uint32_t const blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; uint32_t const inBlockInCardPitch = idx.mInDataBlockIdx.x + - memBlockSizeX * idx.mInDataBlockIdx.y + - (memBlockSizeX * memBlockSizeY) * idx.mInDataBlockIdx.z; + SBlock::memBlockSizeX * idx.mInDataBlockIdx.y + + (SBlock::memBlockSizeX * SBlock::memBlockSizeY) * idx.mInDataBlockIdx.z; uint32_t const blockAdnCardPitch = (idx.mDataBlockIdx * mCardinality + card) * blockPitchByCard; uint32_t const pitch = blockAdnCardPitch + inBlockInCardPitch; return pitch; } -template -inline NEON_CUDA_HOST_DEVICE auto bPartition:: +template +inline NEON_CUDA_HOST_DEVICE auto bPartition:: helpNghPitch(const Idx& nghIdx, int card) const -> std::tuple { @@ -126,8 +126,8 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition -NEON_CUDA_HOST_DEVICE inline auto bPartition:: +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: helpGetNghIdx(const Idx& idx, const NghIdx& offset) const -> Idx @@ -142,9 +142,9 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition= memBlockSizeX ? 
+1 : 0); - const int yFlag = ngh.y < 0 ? -1 : (ngh.y >= memBlockSizeX ? +1 : 0); - const int zFlag = ngh.z < 0 ? -1 : (ngh.z >= memBlockSizeX ? +1 : 0); + const int xFlag = ngh.x < 0 ? -1 : (ngh.x >= SBlock::memBlockSizeX ? +1 : 0); + const int yFlag = ngh.y < 0 ? -1 : (ngh.y >= SBlock::memBlockSizeX ? +1 : 0); + const int zFlag = ngh.z < 0 ? -1 : (ngh.z >= SBlock::memBlockSizeX ? +1 : 0); const bool isLocal = (xFlag | yFlag | zFlag) == 0; if (!(isLocal)) { @@ -177,9 +177,9 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition -NEON_CUDA_HOST_DEVICE inline auto bPartition:: +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: getNghData(const Idx& eId, uint8_t nghID, int card) @@ -207,8 +207,8 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition -NEON_CUDA_HOST_DEVICE inline auto bPartition:: +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: getNghData(const Idx& idx, const NghIdx& offset, const int card) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h index bf91dc16..80fb12ab 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h @@ -4,7 +4,7 @@ namespace Neon::domain::details::bGrid { -template +template class bSpan { public: @@ -15,8 +15,8 @@ class bSpan static constexpr Neon::MemoryLayout activeMaskMemoryLayout = Neon::MemoryLayout::arrayOfStructs; static constexpr uint32_t log2OfbitMaskWordSize = 6; - using Idx = bIndex; - friend class bGrid; + using Idx = bIndex; + friend class bGrid; static constexpr int SpaceDim = 3; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h index 50f441a0..57d7aeca 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h @@ -2,9 +2,9 @@ namespace Neon::domain::details::bGrid { -template +template 
NEON_CUDA_HOST_DEVICE inline auto -bSpan::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool +bSpan::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool { #ifdef NEON_PLACE_CUDA_DEVICE bidx.mDataBlockIdx = blockIdx.x + mFirstDataBlockOffset; @@ -22,9 +22,9 @@ bSpan +template NEON_CUDA_HOST_DEVICE inline auto -bSpan::setAndValidateCPUDevice(Idx& bidx, +bSpan::setAndValidateCPUDevice(Idx& bidx, uint32_t const& dataBlockIdx, uint32_t const& x, uint32_t const& y, @@ -41,8 +41,8 @@ bSpan -bSpan::bSpan(typename Idx::DataBlockCount firstDataBlockOffset, +template +bSpan::bSpan(typename Idx::DataBlockCount firstDataBlockOffset, BitMaskWordType* activeMask, Neon::DataView dataView) : mFirstDataBlockOffset(firstDataBlockOffset), @@ -51,16 +51,16 @@ bSpan -NEON_CUDA_HOST_DEVICE inline auto bSpan::getRequiredWordsForBlockBitMask() -> uint32_t +template +NEON_CUDA_HOST_DEVICE inline auto bSpan::getRequiredWordsForBlockBitMask() -> uint32_t { - uint32_t requiredBits = memBlockSizeX * memBlockSizeY * memBlockSizeZ; - uint32_t requiredWords = ((requiredBits - 1) >> bSpan::log2OfbitMaskWordSize) + 1; + uint32_t requiredBits = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; + uint32_t requiredWords = ((requiredBits - 1) >> bSpan::log2OfbitMaskWordSize) + 1; return requiredWords; } -template -inline auto bSpan::getMaskAndWordIdforBlockBitMask(int threadX, +template +inline auto bSpan::getMaskAndWordIdforBlockBitMask(int threadX, int threadY, int threadZ, NEON_OUT BitMaskWordType& mask, @@ -68,7 +68,7 @@ inline auto bSpan> log2OfbitMaskWordSize // the same as: threadPitch / 2^{log2OfbitMaskWordSize} wordIdx = threadPitch >> log2OfbitMaskWordSize; @@ -82,8 +82,8 @@ inline auto bSpan -NEON_CUDA_HOST_DEVICE inline auto bSpan::getActiveStatus( +template +NEON_CUDA_HOST_DEVICE inline auto bSpan::getActiveStatus( const typename Idx::DataBlockIdx& dataBlockIdx, int threadX, int threadY, @@ -92,7 +92,7 @@ NEON_CUDA_HOST_DEVICE inline auto 
bSpan> log2OfbitMaskWordSize // the same as: threadPitch / 2^{log2OfbitMaskWordSize} const uint32_t wordIdx = threadPitch >> log2OfbitMaskWordSize; diff --git a/libNeonDomain/src/domain/details/bGrid/bGrid.cpp b/libNeonDomain/src/domain/details/bGrid/bGrid.cpp index 0cc0dfef..78dad9bf 100644 --- a/libNeonDomain/src/domain/details/bGrid/bGrid.cpp +++ b/libNeonDomain/src/domain/details/bGrid/bGrid.cpp @@ -3,6 +3,6 @@ namespace Neon::domain::details::bGrid { -template class bGrid<8,8,8>; +template class bGrid>; } // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/tests/domain-bGrid-tray/src/gtests.cpp b/libNeonDomain/tests/domain-bGrid-tray/src/gtests.cpp index 9e0cd408..794dfde0 100644 --- a/libNeonDomain/tests/domain-bGrid-tray/src/gtests.cpp +++ b/libNeonDomain/tests/domain-bGrid-tray/src/gtests.cpp @@ -4,38 +4,34 @@ #include "gtest/gtest.h" - -template +template void test_backToBackConversion() { - using bGrid = Neon::domain::details::bGrid::bGrid; + using bGrid = Neon::domain::details::bGrid::bGrid; using MicroIndex = Neon::domain::details::bGrid::MicroIndex; typename bGrid::Idx bIdx; MicroIndex microIdx; - uint32_t ratioOnX = (memBlockSizeX) / (userBlockSizeX); - uint32_t ratioOnY = (memBlockSizeY) / (userBlockSizeY); - uint32_t ratioOnZ = (memBlockSizeZ) / (userBlockSizeZ); for (uint32_t memBlockIdx = 0; memBlockIdx < 10; memBlockIdx++) { - const uint32_t memBlockJump = (ratioOnX*ratioOnY*ratioOnZ)*memBlockIdx; - for (uint32_t rZ = 0; rZ < ratioOnZ; rZ++) { - for (uint32_t rY = 0; rY < ratioOnY; rY++) { - for (uint32_t rX = 0; rX < ratioOnX; rX++) { - for (int8_t k = 0; k < int8_t(userBlockSizeX); k++) { - for (int8_t j = 0; j < int8_t(userBlockSizeY); j++) { - for (int8_t i = 0; i < int8_t(userBlockSizeZ); i++) { // Set the micro idx to the first voxel + const uint32_t memBlockJump = (SBlock::blockRatioX * SBlock::blockRatioY * SBlock::blockRatioZ) * memBlockIdx; + for (uint32_t rZ = 0; rZ < SBlock::blockRatioZ; 
rZ++) { + for (uint32_t rY = 0; rY < SBlock::blockRatioY; rY++) { + for (uint32_t rX = 0; rX < SBlock::blockRatioX; rX++) { + for (int8_t k = 0; k < int8_t(SBlock::userBlockSizeX); k++) { + for (int8_t j = 0; j < int8_t(SBlock::userBlockSizeY); j++) { + for (int8_t i = 0; i < int8_t(SBlock::userBlockSizeZ); i++) { // Set the micro idx to the first voxel // Check that bIdx point to the first voxels too - microIdx.setTrayBlockIdx(memBlockJump + rX + rY * ratioOnX + rZ * ratioOnY * ratioOnX); + microIdx.setTrayBlockIdx(memBlockJump + rX + rY * SBlock::blockRatioX + rZ * SBlock::blockRatioY * SBlock::blockRatioX); microIdx.setInTrayBlockIdx({i, j, k}); bIdx.init(microIdx); auto res = bIdx.getMicroIndex(); ASSERT_EQ(bIdx.getDataBlockIdx(), memBlockIdx); - ASSERT_EQ(bIdx.getInDataBlockIdx(), Neon::int8_3d(static_cast(i + rX * userBlockSizeX), - static_cast(j + rY * userBlockSizeY), - static_cast( k + rZ * userBlockSizeZ))) - << bIdx.getInDataBlockIdx() << " instead of " << Neon::int8_3d(static_cast(i + rX * userBlockSizeX), static_cast(j + rY * userBlockSizeY),static_cast( k + rZ * userBlockSizeZ)) << " with rX,Ry,rZ " << rX << "," << rY << "," << rZ << " and i,j,k = " << i << "," << j << "," << k; + ASSERT_EQ(bIdx.getInDataBlockIdx(), Neon::int8_3d(static_cast(i + rX * SBlock::userBlockSizeX), + static_cast(j + rY * SBlock::userBlockSizeY), + static_cast(k + rZ * SBlock::userBlockSizeZ))) + << bIdx.getInDataBlockIdx() << " instead of " << Neon::int8_3d(static_cast(i + rX * SBlock::userBlockSizeX), static_cast(j + rY * SBlock::userBlockSizeY), static_cast(k + rZ * SBlock::userBlockSizeZ)) << " with rX,Ry,rZ " << rX << "," << rY << "," << rZ << " and i,j,k = " << i << "," << j << "," << k; ASSERT_EQ(res.getTrayBlockIdx(), microIdx.getTrayBlockIdx()); @@ -51,27 +47,27 @@ void test_backToBackConversion() TEST(bGrid_tray, init_4_4_4_2_2_2) { - test_backToBackConversion<4, 4, 4, 2, 2, 2>(); + test_backToBackConversion>(); } TEST(bGrid_tray, init_8_8_8_2_2_2) { - 
test_backToBackConversion<8, 8, 8, 2, 2, 2>(); + test_backToBackConversion>(); } TEST(bGrid_tray, init_8_8_8_1_1_1) { - test_backToBackConversion<8, 8, 8, 1, 1, 1>(); + test_backToBackConversion>(); } TEST(bGrid_tray, init_8_8_8_4_4_4) { - test_backToBackConversion<8, 8, 8, 4, 4, 4>(); + test_backToBackConversion>(); } TEST(bGrid_tray, init_4_4_4_2_1_2) { - test_backToBackConversion<4,4,4, 2, 1, 2>(); + test_backToBackConversion>(); } int main(int argc, char** argv) From d82e985c2c92863daf39ae3d595d2dafc9ea443f Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 15 Jun 2023 17:54:05 -0400 Subject: [PATCH 04/94] Cleaning up naming for the BlockViewGrid --- .../bGrid/BlockViewGrid/BlockViewGrid.h | 2 +- .../Neon/domain/details/bGrid/StaticBlock.h | 13 +++++++++++ .../Neon/domain/details/bGrid/bField.h | 6 +++-- .../include/Neon/domain/details/bGrid/bGrid.h | 23 +++++++++++-------- .../Neon/domain/details/bGrid/bPartition.h | 12 ++++++---- 5 files changed, 38 insertions(+), 18 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h index 3f2f3544..cc714802 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h @@ -90,8 +90,8 @@ struct GridTransformation }); } }; +using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; } // namespace details -using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; } // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h index 612c6b9a..14872577 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h @@ -41,6 +41,19 @@ struct StaticBlock 
static_assert(memBlockSizeX % userBlockSizeX == 0); static_assert(memBlockSizeY % userBlockSizeY == 0); static_assert(memBlockSizeZ % userBlockSizeZ == 0); + + struct BitMask + { + auto reset() + { + for (uint32_t i = 0; i < nWords; ++i) { + bits[i] = 0; + } + } + + constexpr static uint32_t nWords = (memBlockCountElements + 31) / 32; + uint32_t bits[nWords]; + }; }; } // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h index 95c1d6d5..8f1ac485 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h @@ -33,11 +33,13 @@ class bField : public Neon::domain::interface::FieldBaseTemplate; using Partition = bPartition; using Idx = bIndex; + using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; + template + using BlockViewField = BlockViewGrid::template Field; using NghIdx = typename Partition::NghIdx; using NghData = typename Partition::NghData; - bField(const std::string& fieldUserName, Neon::DataUse dataUse, const Neon::MemoryOptions& memoryOptions, @@ -109,7 +111,7 @@ class bField : public Neon::domain::interface::FieldBaseTemplate grid; - BlockViewGrid::Field memoryField; + BlockViewField memoryField; int mCardinality; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h index 9d91df5d..a40935bb 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h @@ -5,6 +5,7 @@ #include "BlockViewGrid/BlockViewGrid.h" #include "Neon/domain/aGrid.h" +#include "Neon/domain/details/bGrid/StaticBlock.h" #include "Neon/domain/details/bGrid/bField.h" #include "Neon/domain/details/bGrid/bIndex.h" #include "Neon/domain/details/bGrid/bPartition.h" @@ -16,8 +17,6 @@ #include "Neon/domain/tools/SpanTable.h" 
#include "Neon/set/Containter.h" #include "Neon/set/LaunchParametersTable.h" -#include "Neon/domain/details/bGrid/StaticBlock.h" - #include "bField.h" #include "bPartition.h" @@ -31,7 +30,7 @@ class bField; template class bGrid : public Neon::domain::interface::GridBaseTemplate, - bIndex > + bIndex> { public: using Grid = bGrid; @@ -42,9 +41,13 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, template using Field = Neon::domain::details::bGrid::bField; + using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; + template + using BlockViewField = BlockViewGrid::template Field; + using Span = bSpan; using NghIdx = typename Partition::NghIdx; - using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate >; + using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate>; using Idx = bIndex; static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = Neon::set::details::ExecutionThreadSpan::d1b3; @@ -124,9 +127,9 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, T inactiveValue, Neon::DataUse dataUse = Neon::DataUse::HOST_DEVICE, Neon::MemoryOptions memoryOptions = Neon::MemoryOptions()) const - -> BlockViewGrid::Field; + -> BlockViewField; - /* + /** * Allocates a new container to execute some computation in the grid */ template , size_t sharedMem, LoadingLambda lambda) const -> Neon::set::Container; - /* + /** * Allocates a new container to execute some computation in the grid */ template , * Retrieve the block vew grid internally used. * This grid can be leverage to allocate data at the block level. 
*/ - auto getActiveBitMask() const -> BlockViewGrid::Field&; + auto getActiveBitMask() const -> BlockViewField&; /** * Help function to retrieve the block connectivity as a BlockViewGrid field */ - auto helpGetBlockConnectivity() const -> BlockViewGrid::Field&; + auto helpGetBlockConnectivity() const -> BlockViewField&; /** * Help function to retrieve the block origin as a BlockViewGrid field @@ -228,7 +231,7 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, }; std::shared_ptr mData; }; -extern template class bGrid>; +extern template class bGrid>; } // namespace Neon::domain::details::bGrid #include "bField_imp.h" diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index f20a513d..48312b22 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -21,6 +21,8 @@ class bPartition using NghIdx = typename Idx::NghIdx; using Type = T; using NghData = Neon::domain::NghData; + + using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; using BlockViewGridIdx = BlockViewGrid::Idx; public: @@ -90,13 +92,13 @@ class bPartition const -> Idx; - int mCardinality; - T* mMem; - NghIdx* mStencilNghIndex; + int mCardinality; + T* mMem; + NghIdx* mStencilNghIndex; typename Idx::DataBlockIdx* mBlockConnectivity; typename Span::BitMaskWordType* mMask; - Neon::int32_3d* mOrigin; - int mSetIdx; + Neon::int32_3d* mOrigin; + int mSetIdx; }; } // namespace Neon::domain::details::bGrid From 9e29f8e78a673dda0f587184ee981f0793b75fa0 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 15 Jun 2023 19:35:02 -0400 Subject: [PATCH 05/94] bGrid - introducing the concept of BlockView and refactoring the bitmask field. 
--- .../Neon/domain/details/bGrid/BlockView.h | 29 +++++ .../BlockViewGrid.h | 0 .../BlockViewPartition.h | 0 .../BlockViewPartition_imp.h | 0 .../Neon/domain/details/bGrid/StaticBlock.h | 53 +++++++++- .../include/Neon/domain/details/bGrid/bGrid.h | 32 +++--- .../Neon/domain/details/bGrid/bGrid_imp.h | 100 ++++++++---------- .../Neon/domain/details/bGrid/bPartition.h | 28 ++--- .../domain/details/bGrid/bPartition_imp.h | 19 ++-- .../include/Neon/domain/details/bGrid/bSpan.h | 40 +++---- .../Neon/domain/details/bGrid/bSpan_imp.h | 89 ++-------------- .../Neon/domain/details/eGrid/eField_imp.h | 2 +- .../Neon/domain/interface/FieldBase_imp.h | 2 +- 13 files changed, 185 insertions(+), 209 deletions(-) create mode 100644 libNeonDomain/include/Neon/domain/details/bGrid/BlockView.h rename libNeonDomain/include/Neon/domain/details/bGrid/{BlockViewGrid => BlockView}/BlockViewGrid.h (100%) rename libNeonDomain/include/Neon/domain/details/bGrid/{BlockViewGrid => BlockView}/BlockViewPartition.h (100%) rename libNeonDomain/include/Neon/domain/details/bGrid/{BlockViewGrid => BlockView}/BlockViewPartition_imp.h (100%) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/BlockView.h b/libNeonDomain/include/Neon/domain/details/bGrid/BlockView.h new file mode 100644 index 00000000..42093147 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/bGrid/BlockView.h @@ -0,0 +1,29 @@ +#include "Neon/domain/details/bGrid/BlockView/BlockViewGrid.h" +#include "Neon/domain/tools/GridTransformer.h" + +namespace Neon::domain::details::bGrid { + +struct BlockView +{ + public: + using Grid = Neon::domain::tool::GridTransformer::Grid; + template + using Field = Grid::template Field; + using index_3d = Neon::index_3d; + + template + static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t + { + return mem[idx * card]; + } + + template + static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t + { + return mem[idx * C]; 
+ } + + static constexpr Neon::MemoryLayout layout = Neon::MemoryLayout::arrayOfStructs; +}; + +} // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewGrid.h similarity index 100% rename from libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewGrid.h rename to libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewGrid.h diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewPartition.h similarity index 100% rename from libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewPartition.h rename to libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewPartition.h diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewPartition_imp.h similarity index 100% rename from libNeonDomain/include/Neon/domain/details/bGrid/BlockViewGrid/BlockViewPartition_imp.h rename to libNeonDomain/include/Neon/domain/details/bGrid/BlockView/BlockViewPartition_imp.h diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h index 14872577..951f9fd3 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h @@ -44,15 +44,60 @@ struct StaticBlock struct BitMask { - auto reset() + using BitMaskWordType = uint32_t; + auto reset() -> void { - for (uint32_t i = 0; i < nWords; ++i) { + for (BitMaskWordType i = 0; i < nWords; ++i) { bits[i] = 0; } } - constexpr static uint32_t nWords = (memBlockCountElements + 31) / 32; - uint32_t bits[nWords]; + auto setActive(int threadX, + int 
threadY, + int threadZ) -> void + { + BitMaskWordType mask; + uint32_t wordIdx; + getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx); + auto& word = bits[wordIdx]; + word = word | mask; + } + + inline auto NEON_CUDA_HOST_DEVICE isActive(int threadX, + int threadY, + int threadZ) const -> bool + { + BitMaskWordType mask; + uint32_t wordIdx; + getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx); + auto& word = bits[wordIdx]; + return (word & mask) != 0; + } + + static inline auto NEON_CUDA_HOST_DEVICE getMaskAndWordI(int threadX, + int threadY, + int threadZ, + NEON_OUT BitMaskWordType& mask, + NEON_OUT uint32_t& wordIdx) -> void + { + const uint32_t threadPitch = threadX * memBlockPitchX + + threadY * memBlockPitchY + + threadZ * memBlockPitchZ; + + // threadPitch >> log2_of_bitPerWord + // the same as: threadPitch / 2^{log2_of_bitPerWord} + wordIdx = threadPitch >> log2_of_bitPerWord; + // threadPitch & ((bitMaskWordType(bitMaskStorageBitWidth)) - 1); + // same as threadPitch % 2^{log2OfbitMaskWordSize} + const uint32_t offsetInWord = threadPitch & ((BitMaskWordType(bitPerWord)) - 1); + mask = BitMaskWordType(1) << offsetInWord; + } + + constexpr static BitMaskWordType nWords = (memBlockCountElements + 31) / 32; + static constexpr uint32_t log2_of_bitPerWord = 5; + static constexpr uint32_t bitPerWord = 32; + + BitMaskWordType bits[nWords]; }; }; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h index a40935bb..8ed458c8 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h @@ -1,10 +1,8 @@ #pragma once #include "Neon/core/core.h" -#include "Neon/set/memory/memSet.h" - -#include "BlockViewGrid/BlockViewGrid.h" #include "Neon/domain/aGrid.h" +#include "Neon/domain/details/bGrid/BlockView.h" #include "Neon/domain/details/bGrid/StaticBlock.h" #include "Neon/domain/details/bGrid/bField.h" #include 
"Neon/domain/details/bGrid/bIndex.h" @@ -17,6 +15,7 @@ #include "Neon/domain/tools/SpanTable.h" #include "Neon/set/Containter.h" #include "Neon/set/LaunchParametersTable.h" +#include "Neon/set/memory/memSet.h" #include "bField.h" #include "bPartition.h" @@ -34,17 +33,11 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, { public: using Grid = bGrid; - template using Partition = bPartition; - template using Field = Neon::domain::details::bGrid::bField; - using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; - template - using BlockViewField = BlockViewGrid::template Field; - using Span = bSpan; using NghIdx = typename Partition::NghIdx; using GridBaseTemplate = Neon::domain::interface::GridBaseTemplate>; @@ -127,7 +120,7 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, T inactiveValue, Neon::DataUse dataUse = Neon::DataUse::HOST_DEVICE, Neon::MemoryOptions memoryOptions = Neon::MemoryOptions()) const - -> BlockViewField; + -> BlockView::Field; /** * Allocates a new container to execute some computation in the grid @@ -165,30 +158,30 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, * Retrieve the block vew grid internally used. * This grid can be leverage to allocate data at the block level. */ - auto getBlockViewGrid() const -> BlockViewGrid&; + auto getBlockViewGrid() const -> BlockView::Grid&; /** * Retrieve the block vew grid internally used. * This grid can be leverage to allocate data at the block level. 
*/ - auto getActiveBitMask() const -> BlockViewField&; + auto getActiveBitMask() const -> BlockView::Field&; /** * Help function to retrieve the block connectivity as a BlockViewGrid field */ - auto helpGetBlockConnectivity() const -> BlockViewField&; + auto helpGetBlockConnectivity() const -> BlockView::Field&; /** * Help function to retrieve the block origin as a BlockViewGrid field */ auto helpGetDataBlockOriginField() const -> Neon::aGrid::Field&; - /* + /** * Help function to retrieve the map that converts a stencil point id to 3d offset */ auto helpGetStencilIdTo3dOffset() const -> Neon::set::MemSet&; - /* + /** * Help function retriev the device and the block index associated to a point in the BlockViewGrid grid */ auto helpGetSetIdxAndGridIdx(Neon::index_3d idx) const -> std::tuple; @@ -212,11 +205,10 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, Neon::aGrid::Field mDataBlockOriginField; Neon::set::MemSet mStencil3dTo1dOffset; - BlockViewGrid blockViewGrid; - BlockViewGrid::Field activeBitMask; - BlockViewGrid::Field blockConnectivity; - - Neon::set::MemSet stencilIdTo3dOffset; + BlockView::Grid blockViewGrid; + BlockView::Field activeBitField; + BlockView::Field blockConnectivity; + Neon::set::MemSet stencilIdTo3dOffset; tool::Partitioner1D::DenseMeta denseMeta; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h index 1b40a8b7..7505a06b 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h @@ -75,66 +75,60 @@ bGrid::bGrid(const Neon::Backend& backend, spacingData * SBlock::memBlockSize3D, origin); - mData->blockViewGrid = BlockViewGrid(egrid); + mData->blockViewGrid = BlockView::Grid(egrid); } { // Active bitmask - int requiredWords = Span::getRequiredWordsForBlockBitMask(); - mData->activeBitMask = mData->blockViewGrid.template newField("BitMask", - requiredWords, - 0, 
- Neon::DataUse::HOST_DEVICE, backend.getMemoryOptions(Span::activeMaskMemoryLayout)); + mData->activeBitField = mData->blockViewGrid.template newField( + "BlockViewBitMask", + 1, + [] { + typename SBlock::BitMask outsideBitMask; + outsideBitMask.reset(); + return outsideBitMask; + }(), + Neon::DataUse::HOST_DEVICE, backend.getMemoryOptions(BlockView::layout)); mData->mNumActiveVoxel = backend.devSet().template newDataSet(); - mData->activeBitMask + mData->activeBitField .getGrid() .template newContainer( "activeBitMaskInit", [&](Neon::set::Loader& loader) { - auto bitMask = loader.load(mData->activeBitMask); - return [&, bitMask](const auto& bitMaskIdx) mutable { - auto prtIdx = bitMask.prtID(); - int coutActive = 0; - auto const blockOrigin = bitMask.getGlobalIndex(bitMaskIdx); - - for (int c = 0; c < bitMask.cardinality(); c++) { - bitMask(bitMaskIdx, c) = 0; - } + auto bitMaskPartition = loader.load(mData->activeBitField); + return [&, bitMaskPartition](const auto& bitMaskIdx) mutable { + auto prtIdx = bitMaskPartition.prtID(); + int countActive = 0; + auto const blockOrigin = bitMaskPartition.getGlobalIndex(bitMaskIdx); + typename SBlock::BitMask& bitMask = bitMaskPartition(bitMaskIdx, 0); + bitMask.reset(); for (int k = 0; k < SBlock::memBlockSize3D.template newType().z; k++) { for (int j = 0; j < SBlock::memBlockSize3D.template newType().y; j++) { for (int i = 0; i < SBlock::memBlockSize3D.template newType().x; i++) { - - Neon::int32_3d localPosition(i, j, k); - typename Span::BitMaskWordType mask; - uint32_t wordIdx; - - Span::getMaskAndWordIdforBlockBitMask(i, j, k, NEON_OUT mask, NEON_OUT wordIdx); - auto globalPosition = localPosition + blockOrigin; - bool isInDomain = globalPosition < domainSize; - bool isActive = activeCellLambda(globalPosition); + auto globalPosition = blockOrigin + Neon::int32_3d(i, j, k); + bool const isInDomain = globalPosition < domainSize; + bool const isActive = activeCellLambda(globalPosition); if (isActive && isInDomain) { - 
coutActive++; - auto value = bitMask(bitMaskIdx, wordIdx); - value = value | mask; - bitMask(bitMaskIdx, wordIdx) = value; + countActive++; + bitMask.setActive(i, j, k); } } } } #pragma omp critical { - mData->mNumActiveVoxel[prtIdx] += coutActive; + mData->mNumActiveVoxel[prtIdx] += countActive; } }; }) .run(Neon::Backend::mainStreamIdx); - mData->activeBitMask.updateDeviceData(Neon::Backend::mainStreamIdx); - mData->activeBitMask.newHaloUpdate(Neon::set::StencilSemantic::standard, - Neon::set::TransferMode::put, - Neon::Execution::device) + mData->activeBitField.updateDeviceData(Neon::Backend::mainStreamIdx); + mData->activeBitField.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::put, + Neon::Execution::device) .run(Neon::Backend::mainStreamIdx); } @@ -184,20 +178,20 @@ bGrid::bGrid(const Neon::Backend& backend, case Neon::DataView::STANDARD: { span.mFirstDataBlockOffset = 0; span.mDataView = dw; - span.mActiveMask = mData->activeBitMask.getPartition(execution, setIdx, dw).mem(); + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); break; } case Neon::DataView::BOUNDARY: { span.mFirstDataBlockOffset = mData->partitioner1D.getSpanClassifier().countInternal(setIdx); span.mDataView = dw; - span.mActiveMask = mData->activeBitMask.getPartition(execution, setIdx, dw).mem(); + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); break; } case Neon::DataView::INTERNAL: { span.mFirstDataBlockOffset = 0; span.mDataView = dw; - span.mActiveMask = mData->activeBitMask.getPartition(execution, setIdx, dw).mem(); + span.mActiveMask = mData->activeBitField.getPartition(execution, setIdx, dw).mem(); break; } default: { @@ -267,10 +261,10 @@ auto bGrid::newBlockViewField(const std::string name, int cardinality, T inactiveValue, Neon::DataUse dataUse, - Neon::MemoryOptions memoryOptions) const -> BlockViewGrid::Field + Neon::MemoryOptions memoryOptions) const -> BlockView::Field { 
memoryOptions = this->getDevSet().sanitizeMemoryOption(memoryOptions); - BlockViewGrid::Field blockViewField = mData->blockViewGrid.template newField(name, cardinality, inactiveValue, dataUse, memoryOptions); + BlockView::Field blockViewField = mData->blockViewGrid.template newField(name, cardinality, inactiveValue, dataUse, memoryOptions); return blockViewField; } @@ -310,7 +304,7 @@ auto bGrid::newContainer(const std::string& name, template auto bGrid:: getBlockViewGrid() - const -> BlockViewGrid& + const -> BlockView::Grid& { return mData->blockViewGrid; } @@ -318,15 +312,15 @@ auto bGrid:: template auto bGrid:: getActiveBitMask() - const -> BlockViewGrid::Field& + const -> BlockView::Field& { - return mData->activeBitMask; + return mData->activeBitField; } template auto bGrid:: helpGetBlockConnectivity() - const -> BlockViewGrid::Field& + const -> BlockView::Field& { return mData->blockConnectivity; } @@ -386,22 +380,18 @@ template auto bGrid::isInsideDomain(const index_3d& idx) const -> bool { // 1. check if the block is active - const index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType(); - auto blockProperties = mData->blockViewGrid.getProperties(blockIdx3d); + const BlockView::index_3d blockIdx3d = idx / SBlock::memBlockSize3D.template newType(); + auto blockProperties = mData->blockViewGrid.getProperties(blockIdx3d); if (!blockProperties.isInside()) { return false; } - // 2. The block is active, check the element on the block - uint32_t wordCardinality; - typename Span::BitMaskWordType mask; - Span::getMaskAndWordIdforBlockBitMask(idx.x % SBlock::memBlockSize3D.x, - idx.y % SBlock::memBlockSize3D.y, - idx.z % SBlock::memBlockSize3D.z, - NEON_OUT mask, - NEON_OUT wordCardinality); - auto activeBits = mData->activeBitMask.getReference(blockIdx3d, int(wordCardinality)); - return (activeBits & mask) != 0; + // 2. 
The block is active, check the element in the block + typename SBlock::BitMask const& bitMask = mData->activeBitField.getReference(blockIdx3d, 0); + bool isActive = bitMask.isActive(idx.x % SBlock::memBlockSize3D.x, + idx.y % SBlock::memBlockSize3D.y, + idx.z % SBlock::memBlockSize3D.z); + return isActive; } template diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index 48312b22..7f537ad5 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -30,13 +30,13 @@ class bPartition ~bPartition() = default; - explicit bPartition(int setIdx, - int mCardinality, - T* mMem, - typename Idx::DataBlockIdx* mBlockConnectivity, - typename Span::BitMaskWordType* mMask, - Neon::int32_3d* mOrigin, - NghIdx* mStencilNghIndex); + explicit bPartition(int setIdx, + int mCardinality, + T* mMem, + typename Idx::DataBlockIdx* mBlockConnectivity, + typename SBlock::BitMask const* NEON_RESTRICT mMask, + Neon::int32_3d* mOrigin, + NghIdx* mStencilNghIndex); inline NEON_CUDA_HOST_DEVICE auto cardinality() @@ -92,13 +92,13 @@ class bPartition const -> Idx; - int mCardinality; - T* mMem; - NghIdx* mStencilNghIndex; - typename Idx::DataBlockIdx* mBlockConnectivity; - typename Span::BitMaskWordType* mMask; - Neon::int32_3d* mOrigin; - int mSetIdx; + int mCardinality; + T* mMem; + NghIdx const* NEON_RESTRICT mStencilNghIndex; + typename Idx::DataBlockIdx const* NEON_RESTRICT mBlockConnectivity; + typename SBlock::BitMask const* NEON_RESTRICT mMask; + Neon::int32_3d const* NEON_RESTRICT mOrigin; + int mSetIdx; }; } // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h index 8506476b..6e3b728f 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h +++ 
b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h @@ -19,13 +19,13 @@ bPartition::bPartition() template bPartition:: - bPartition(int setIdx, - int cardinality, - T* mem, - typename Idx::DataBlockIdx* blockConnectivity, - typename Span::BitMaskWordType* mask, - Neon::int32_3d* origin, - NghIdx* stencilNghIndex) + bPartition(int setIdx, + int cardinality, + T* mem, + typename Idx::DataBlockIdx* blockConnectivity, + typename SBlock::BitMask const* NEON_RESTRICT mask, + Neon::int32_3d* origin, + NghIdx* stencilNghIndex) : mCardinality(cardinality), mMem(mem), mStencilNghIndex(stencilNghIndex), @@ -115,10 +115,7 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition:: return {false, 0}; } - bool isActive = Span::getActiveStatus(nghIdx.mDataBlockIdx, - nghIdx.mInDataBlockIdx.x, nghIdx.mInDataBlockIdx.y, nghIdx.mInDataBlockIdx.z, - mMask); - + const bool isActive = mMask[nghIdx.mDataBlockIdx].isActive(nghIdx.mInDataBlockIdx.x, nghIdx.mInDataBlockIdx.y, nghIdx.mInDataBlockIdx.z); if (!isActive) { return {false, 0}; } diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h index 80fb12ab..9c6ed821 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan.h @@ -23,42 +23,32 @@ class bSpan bSpan() = default; virtual ~bSpan() = default; - NEON_CUDA_HOST_DEVICE inline static auto getInvalidBlockId() -> typename Idx::DataBlockIdx + NEON_CUDA_HOST_DEVICE inline static auto getInvalidBlockId() + -> typename Idx::DataBlockIdx { return std::numeric_limits::max(); } - inline bSpan(typename Idx::DataBlockCount mFirstDataBlockOffset, - bSpan::BitMaskWordType* mActiveMask, - Neon::DataView mDataView); + inline bSpan( + typename Idx::DataBlockCount mFirstDataBlockOffset, + typename SBlock::BitMask const* NEON_RESTRICT mActiveMask, + Neon::DataView mDataView); - NEON_CUDA_HOST_DEVICE inline auto setAndValidateCPUDevice(Idx& bidx, - 
uint32_t const& threadIdx, - uint32_t const& x, - uint32_t const& y, - uint32_t const& z) const -> bool; + NEON_CUDA_HOST_DEVICE inline auto setAndValidateCPUDevice( + Idx& bidx, + uint32_t const& threadIdx, + uint32_t const& x, + uint32_t const& y, + uint32_t const& z) const -> bool; NEON_CUDA_HOST_DEVICE inline auto setAndValidateGPUDevice( Idx& bidx) const -> bool; - static NEON_CUDA_HOST_DEVICE inline auto getRequiredWordsForBlockBitMask() -> uint32_t; - static NEON_CUDA_HOST_DEVICE inline auto getActiveStatus( - const typename Idx::DataBlockIdx& dataBlockIdx, - int threadX, - int threadY, - int threadZ, - bSpan::BitMaskWordType* mActiveMask) -> bool; - - static inline auto getMaskAndWordIdforBlockBitMask(int threadX, - int threadY, - int threadZ, - BitMaskWordType& mask, - uint32_t& wordIdx) -> void; // We don't need to have a count on active blocks - typename Idx::DataBlockCount mFirstDataBlockOffset; - bSpan::BitMaskWordType* mActiveMask; - Neon::DataView mDataView; + typename Idx::DataBlockCount mFirstDataBlockOffset; + typename SBlock::BitMask const* NEON_RESTRICT mActiveMask; + Neon::DataView mDataView; }; } // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h index 57d7aeca..8a208110 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bSpan_imp.h @@ -12,10 +12,8 @@ bSpan::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool bidx.mInDataBlockIdx.y = threadIdx.y; bidx.mInDataBlockIdx.z = threadIdx.z; - bool const isActive = getActiveStatus(bidx.mDataBlockIdx, - bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z, - mActiveMask); - // printf("%d %d %d is active %d\n",bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z, (isActive?1:-1)); + const bool isActive = 
mActiveMask[bidx.mDataBlockIdx].isActive(bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z); + return isActive; #else NEON_THROW_UNSUPPORTED_OPERATION("Operation supported only on GPU"); @@ -25,94 +23,29 @@ bSpan::setAndValidateGPUDevice([[maybe_unused]] Idx& bidx) const -> bool template NEON_CUDA_HOST_DEVICE inline auto bSpan::setAndValidateCPUDevice(Idx& bidx, - uint32_t const& dataBlockIdx, - uint32_t const& x, - uint32_t const& y, - uint32_t const& z) const -> bool + uint32_t const& dataBlockIdx, + uint32_t const& x, + uint32_t const& y, + uint32_t const& z) const -> bool { bidx.mDataBlockIdx = dataBlockIdx; - bidx.mInDataBlockIdx.x = static_cast < typename Idx::InDataBlockIdx::Integer>(x); + bidx.mInDataBlockIdx.x = static_cast(x); bidx.mInDataBlockIdx.y = static_cast(y); bidx.mInDataBlockIdx.z = static_cast(z); - bool const isActive = getActiveStatus(bidx.mDataBlockIdx, - bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z, - mActiveMask); + const bool isActive = mActiveMask[dataBlockIdx].isActive(bidx.mInDataBlockIdx.x, bidx.mInDataBlockIdx.y, bidx.mInDataBlockIdx.z); return isActive; } template -bSpan::bSpan(typename Idx::DataBlockCount firstDataBlockOffset, - BitMaskWordType* activeMask, - Neon::DataView dataView) +bSpan::bSpan(typename Idx::DataBlockCount firstDataBlockOffset, + typename SBlock::BitMask const* NEON_RESTRICT activeMask, + Neon::DataView dataView) : mFirstDataBlockOffset(firstDataBlockOffset), mActiveMask(activeMask), mDataView(dataView) { } -template -NEON_CUDA_HOST_DEVICE inline auto bSpan::getRequiredWordsForBlockBitMask() -> uint32_t -{ - uint32_t requiredBits = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; - uint32_t requiredWords = ((requiredBits - 1) >> bSpan::log2OfbitMaskWordSize) + 1; - return requiredWords; -} - -template -inline auto bSpan::getMaskAndWordIdforBlockBitMask(int threadX, - int threadY, - int threadZ, - NEON_OUT BitMaskWordType& mask, - NEON_OUT 
uint32_t& wordIdx) -> void -{ - if constexpr (activeMaskMemoryLayout == Neon::MemoryLayout::arrayOfStructs) { - // 6 = log_2 64 - const uint32_t threadPitch = threadX + threadY * SBlock::memBlockSizeX + threadZ * SBlock::memBlockSizeX * SBlock::memBlockSizeY; - // threadPitch >> log2OfbitMaskWordSize - // the same as: threadPitch / 2^{log2OfbitMaskWordSize} - wordIdx = threadPitch >> log2OfbitMaskWordSize; - // threadPitch & ((bitMaskWordType(bitMaskStorageBitWidth)) - 1); - // same as threadPitch % 2^{log2OfbitMaskWordSize} - const uint32_t offsetInWord = threadPitch & ((BitMaskWordType(bitMaskStorageBitWidth)) - 1); - mask = BitMaskWordType(1) << offsetInWord; - } else { - assert(false); - } -} - - -template -NEON_CUDA_HOST_DEVICE inline auto bSpan::getActiveStatus( - const typename Idx::DataBlockIdx& dataBlockIdx, - int threadX, - int threadY, - int threadZ, - BitMaskWordType* mActiveMask) -> bool -{ - if constexpr (activeMaskMemoryLayout == Neon::MemoryLayout::arrayOfStructs) { - // 6 = log_2 64 - const uint32_t threadPitch = threadX + threadY * SBlock::memBlockSizeX + threadZ * SBlock::memBlockSizeX * SBlock::memBlockSizeY; - // threadPitch >> log2OfbitMaskWordSize - // the same as: threadPitch / 2^{log2OfbitMaskWordSize} - const uint32_t wordIdx = threadPitch >> log2OfbitMaskWordSize; - // threadPitch & ((bitMaskWordType(bitMaskStorageBitWidth)) - 1); - // same as threadPitch % 2^{log2OfbitMaskWordSize} - const uint32_t offsetInWord = threadPitch & ((BitMaskWordType(bitMaskStorageBitWidth)) - 1); - BitMaskWordType mask = BitMaskWordType(1) << offsetInWord; - - uint32_t const cardinality = getRequiredWordsForBlockBitMask(); - uint32_t const pitch = (cardinality * dataBlockIdx) + wordIdx; - BitMaskWordType targetWord = mActiveMask[pitch]; - auto masked = targetWord & mask; - if (masked != 0) { - return true; - } - return false; - } else { - assert(false); - } - // -} } // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git 
a/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h index 5cd93860..1843c4df 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h @@ -35,7 +35,7 @@ eField::eField(const std::string& fieldUserName, mData->memoryField = mData->grid->getMemoryGrid().template newField(fieldUserName + "-storage", cardinality, - T(0), + inactiveValue, dataUse); diff --git a/libNeonDomain/include/Neon/domain/interface/FieldBase_imp.h b/libNeonDomain/include/Neon/domain/interface/FieldBase_imp.h index ea10edf6..97d10dc1 100644 --- a/libNeonDomain/include/Neon/domain/interface/FieldBase_imp.h +++ b/libNeonDomain/include/Neon/domain/interface/FieldBase_imp.h @@ -359,7 +359,7 @@ template FieldBase::Storage::Storage() : dimension(0), cardinality(0), - outsideVal(static_cast(0.0)), + outsideVal(T()), dataUse(), memoryOptions(), haloStatus(), From cdcdc0df3dcf91aa7927f9085e30041d057a7336 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 15 Jun 2023 20:09:20 -0400 Subject: [PATCH 06/94] bGrid - fixing multi-GPU --- .../include/Neon/domain/details/bGrid/bField.h | 18 +++++++++--------- .../Neon/domain/details/bGrid/bField_imp.h | 15 ++++++++------- .../Neon/domain/details/bGrid/bGrid_imp.h | 2 ++ .../tests/domain-stencil/src/gtests.cpp | 4 ++-- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h index 8f1ac485..d0dd45c5 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h @@ -34,18 +34,18 @@ class bField : public Neon::domain::interface::FieldBaseTemplate; using Idx = bIndex; using BlockViewGrid = Neon::domain::tool::GridTransformer::Grid; - template + template using BlockViewField = BlockViewGrid::template Field; using 
NghIdx = typename Partition::NghIdx; using NghData = typename Partition::NghData; - bField(const std::string& fieldUserName, - Neon::DataUse dataUse, - const Neon::MemoryOptions& memoryOptions, - const Grid& grid, - int cardinality, - T inactiveValue); + bField(const std::string& fieldUserName, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions, + const Grid& grid, + int cardinality, + T inactiveValue); bField(); @@ -110,8 +110,8 @@ class bField : public Neon::domain::interface::FieldBaseTemplate grid; - BlockViewField memoryField; + std::shared_ptr grid; + BlockViewField memoryField; int mCardinality; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h index a9c249ca..a6127c43 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h @@ -11,12 +11,12 @@ bField::bField() } template -bField::bField(const std::string& fieldUserName, - Neon::DataUse dataUse, - const Neon::MemoryOptions& memoryOptions, - const Grid& grid, - int cardinality, - T inactiveValue) +bField::bField(const std::string& fieldUserName, + Neon::DataUse dataUse, + Neon::MemoryOptions memoryOptions, + const Grid& grid, + int cardinality, + T inactiveValue) : Neon::domain::interface::FieldBaseTemplate(&grid, fieldUserName, "bField", @@ -29,7 +29,8 @@ bField::bField(const std::string& fieldUserName, mData->grid = std::make_shared(grid); if (memoryOptions.getOrder() == Neon::MemoryLayout::arrayOfStructs) { - NEON_THROW_UNSUPPORTED_OPERATION("bField does not support MemoryLayout::arrayOfStructs"); + NEON_WARNING("bField does not support MemoryLayout::arrayOfStructs, enforcing MemoryLayout::structOfArrays"); + memoryOptions.setOrder(Neon::MemoryLayout::structOfArrays); } // the allocation size is the number of blocks x block size x cardinality mData->memoryField = mData->grid->getBlockViewGrid().template newField( diff --git 
a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h index 7505a06b..b921a3e1 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h @@ -125,7 +125,9 @@ bGrid::bGrid(const Neon::Backend& backend, }) .run(Neon::Backend::mainStreamIdx); + mData->activeBitField.updateDeviceData(Neon::Backend::mainStreamIdx); + this->getBackend().sync(Neon::Backend::mainStreamIdx); mData->activeBitField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device) diff --git a/libNeonDomain/tests/domain-stencil/src/gtests.cpp b/libNeonDomain/tests/domain-stencil/src/gtests.cpp index 70d9d650..ec6f892a 100644 --- a/libNeonDomain/tests/domain-stencil/src/gtests.cpp +++ b/libNeonDomain/tests/domain-stencil/src/gtests.cpp @@ -22,9 +22,9 @@ TEST(domain_stencil, eGrid) 1); } -TEST(domain_stencil, bGridSingleGPU) +TEST(domain_stencil, bGri ) { - int nGpus = 1; + int nGpus = 5; using Type = int64_t; runAllTestConfiguration(std::function(map::run), nGpus, From ea82dfc1c0553a64f5c0bd0d8b23b4be04436195 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 16 Jun 2023 10:14:27 -0400 Subject: [PATCH 07/94] Adding scripts --- .../lbm-lid-driven-cavity-flow.py | 48 +++++++++++++++++++ .../lbm-lid-driven-cavity-flow.sh | 30 ++++++------ 2 files changed, 64 insertions(+), 14 deletions(-) create mode 100644 benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py new file mode 100644 index 00000000..f4b48dd3 --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py @@ -0,0 +1,48 @@ +import subprocess + +DOMAIN_SIZE_LIST = "64 128 192 256 320 384 448 512".split() +DEVICE_ID_LIST = "0 1 2 3 4 5 6 7".split() 
+DEVICE_TYPE_LIST = 'cpu gpu'.split() +GRID_LIST = "dGrid bGrid eGrid".split() +STORAGE_FP_LIST = "double float".split() +COMPUTE_FP_LIST = "double float".split() +OCC_LIST = "nOCC".split() +WARM_UP_ITER = 10 +MAX_ITER = 100 +REPETITIONS = 5 + +for DEVICE_TYPE in DEVICE_TYPE_LIST: + + DEVICE_SET_LIST = [DEVICE_ID_LIST[0]] + if DEVICE_TYPE == 'gpu': + for DEVICE in DEVICE_ID_LIST[1:]: + DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) + for OCC in OCC_LIST: + for DOMAIN_SIZE in DOMAIN_SIZE_LIST: + for STORAGE_FP in STORAGE_FP_LIST: + for COMPUTE_FP in COMPUTE_FP_LIST: + for DEVICE_SET in DEVICE_SET_LIST: + + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + + command = 'lbm-lid-driven-cavity-flow' + parameters = [] + parameters.append('--deviceType ' + DEVICE_TYPE) + parameters.append('--deviceIds ' + DEVICE_SET) + parameters.append('--grid ' + DEVICE_TYPE) + parameters.append('--domain-size ' + DOMAIN_SIZE) + parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) + parameters.append('--repetitions ' + str(REPETITIONS)) + parameters.append('--max-iter ' + str(MAX_ITER)) + parameters.append( + '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + + DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' + + STORAGE_FP + '_' + COMPUTE_FP + '_' + + DEVICE_SET.replace(' ', '_') + '_' + OCC) + parameters.append('--computeFP ' + COMPUTE_FP) + parameters.append('--storageFP ' + STORAGE_FP) + parameters.append('--benchmark') + parameters.append('--' + OCC) + + subprocess.run(['echo' , ' '.join(parameters)]) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.sh b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.sh index ba5fe106..7cc5108c 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.sh +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.sh @@ -1,7 +1,7 @@ set -x -DOMAIN_SIZE_LIST="128 192 256 320 384 448 512" -GRID="dGrid" +DOMAIN_SIZE_LIST="64 128 192 256 320 384 448 
512" +GRID_LIST="dGrid bGrid eGrid" STORAGE_FP_LIST="double float" COMPUTE_FP_LIST="double float" OCC="nOCC" @@ -9,20 +9,22 @@ OCC="nOCC" for DOMAIN_SIZE in ${DOMAIN_SIZE_LIST}; do for STORAGE_FP in ${STORAGE_FP_LIST}; do for COMPUTE_FP in ${COMPUTE_FP_LIST}; do + for GRID in ${GRID_LIST}; do - if [ "${STORAGE_FP}_${COMPUTE_FP}" = "double_float" ]; then - continue - fi + if [ "${STORAGE_FP}_${COMPUTE_FP}" = "double_float" ]; then + continue + fi - echo ./lbm-lid-driven-cavity-flow \ - --deviceType gpu --deviceIds 0 \ - --grid "${GRID}" \ - --domain-size "${DOMAIN_SIZE}" \ - --warmup-iter 10 --max-iter 100 --repetitions 5 \ - --report-filename "lbm-lid-driven-cavity-flow_${DOMAIN_SIZE}_${GRID}_STORAGE_${STORAGE_FP}_COMPUTE_${COMPUTE_FP}" \ - --computeFP "${COMPUTE_FP}" \ - --storageFP "${STORAGE_FP}" \ - --${OCC} --benchmark + echo ./lbm-lid-driven-cavity-flow \ + --deviceType gpu --deviceIds 0 \ + --grid "${GRID}" \ + --domain-size "${DOMAIN_SIZE}" \ + --warmup-iter 10 --max-iter 100 --repetitions 5 \ + --report-filename "lbm-lid-driven-cavity-flow_${DOMAIN_SIZE}_${GRID}_STORAGE_${STORAGE_FP}_COMPUTE_${COMPUTE_FP}" \ + --computeFP "${COMPUTE_FP}" \ + --storageFP "${STORAGE_FP}" \ + --${OCC} --benchmark + done done done done From 55af7081427f22a352ecfe69cb03c2ad722c16df Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 19 Jun 2023 09:54:52 -0400 Subject: [PATCH 08/94] Benchmarks and scripts Adding scripts Adding scripts Adding scripts Adding scripts Adding back eGrid and bGrid to the LBM benchmark. Fixing warning issue Fixing script. 
--- .../lbm-lid-driven-cavity-flow/CMakeLists.txt | 9 +- .../lbm-lid-driven-cavity-flow.py | 123 ++++++++++++----- .../src/RunCavityTwoPop.cu | 8 +- .../Neon/domain/details/bGrid/bField.h | 2 +- .../Neon/domain/details/bGrid/bField_imp.h | 2 +- .../Neon/domain/details/bGrid/bPartition.h | 48 ++++++- .../domain/details/bGrid/bPartition_imp.h | 130 +++++++++++++++++- .../Neon/domain/details/eGrid/ePartition.h | 39 ++++-- .../domain/details/eGrid/ePartition_imp.h | 37 +++++ libNeonSet/include/Neon/set/Containter_imp.h | 1 + 10 files changed, 336 insertions(+), 63 deletions(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/CMakeLists.txt b/benchmarks/lbm-lid-driven-cavity-flow/CMakeLists.txt index dfb18a8c..ed03a750 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/CMakeLists.txt +++ b/benchmarks/lbm-lid-driven-cavity-flow/CMakeLists.txt @@ -23,4 +23,11 @@ add_custom_command( TARGET ${APP} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/${APP}.sh - ${CMAKE_BINARY_DIR}/bin/${APP}.sh) \ No newline at end of file + ${CMAKE_BINARY_DIR}/bin/${APP}.sh) + +add_custom_command( + TARGET ${APP} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_SOURCE_DIR}/${APP}.py + ${CMAKE_BINARY_DIR}/bin/${APP}.py +) \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py index f4b48dd3..5aebe104 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py @@ -1,5 +1,3 @@ -import subprocess - DOMAIN_SIZE_LIST = "64 128 192 256 320 384 448 512".split() DEVICE_ID_LIST = "0 1 2 3 4 5 6 7".split() DEVICE_TYPE_LIST = 'cpu gpu'.split() @@ -11,38 +9,89 @@ MAX_ITER = 100 REPETITIONS = 5 -for DEVICE_TYPE in DEVICE_TYPE_LIST: - - DEVICE_SET_LIST = [DEVICE_ID_LIST[0]] - if DEVICE_TYPE == 'gpu': - for DEVICE in DEVICE_ID_LIST[1:]: - 
DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) - for OCC in OCC_LIST: - for DOMAIN_SIZE in DOMAIN_SIZE_LIST: - for STORAGE_FP in STORAGE_FP_LIST: - for COMPUTE_FP in COMPUTE_FP_LIST: - for DEVICE_SET in DEVICE_SET_LIST: - - if STORAGE_FP == 'double' and COMPUTE_FP == 'float': - continue - - command = 'lbm-lid-driven-cavity-flow' - parameters = [] - parameters.append('--deviceType ' + DEVICE_TYPE) - parameters.append('--deviceIds ' + DEVICE_SET) - parameters.append('--grid ' + DEVICE_TYPE) - parameters.append('--domain-size ' + DOMAIN_SIZE) - parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) - parameters.append('--repetitions ' + str(REPETITIONS)) - parameters.append('--max-iter ' + str(MAX_ITER)) - parameters.append( - '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + - DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' + - STORAGE_FP + '_' + COMPUTE_FP + '_' + - DEVICE_SET.replace(' ', '_') + '_' + OCC) - parameters.append('--computeFP ' + COMPUTE_FP) - parameters.append('--storageFP ' + STORAGE_FP) - parameters.append('--benchmark') - parameters.append('--' + OCC) - - subprocess.run(['echo' , ' '.join(parameters)]) +import subprocess +import sys + + +def printProgressBar(value, label): + n_bar = 40 # size of progress bar + max = 100 + j = value / max + sys.stdout.write('\r') + bar = 'â–ˆ' * int(n_bar * j) + bar = bar + '-' * int(n_bar * (1 - j)) + + sys.stdout.write(f"{label.ljust(10)} | [{bar:{n_bar}s}] {int(100 * j)}% ") + sys.stdout.flush() + + +def countAll(): + counter = 0 + for DEVICE_TYPE in DEVICE_TYPE_LIST: + DEVICE_SET_LIST = [DEVICE_ID_LIST[0]] + if DEVICE_TYPE == 'gpu': + for DEVICE in DEVICE_ID_LIST[1:]: + DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) + for OCC in OCC_LIST: + for DOMAIN_SIZE in DOMAIN_SIZE_LIST: + for STORAGE_FP in STORAGE_FP_LIST: + for COMPUTE_FP in COMPUTE_FP_LIST: + for DEVICE_SET in DEVICE_SET_LIST: + for GRID in GRID_LIST: + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + + counter += 
1 + return counter + + +SAMPLES = countAll() +counter = 0 +command = './lbm-lid-driven-cavity-flow' +with open(command + '.log', 'w') as fp: + for DEVICE_TYPE in DEVICE_TYPE_LIST: + DEVICE_SET_LIST = [DEVICE_ID_LIST[0]] + if DEVICE_TYPE == 'gpu': + for DEVICE in DEVICE_ID_LIST[1:]: + DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) + for OCC in OCC_LIST: + for DOMAIN_SIZE in DOMAIN_SIZE_LIST: + for STORAGE_FP in STORAGE_FP_LIST: + for COMPUTE_FP in COMPUTE_FP_LIST: + for DEVICE_SET in DEVICE_SET_LIST: + for GRID in GRID_LIST: + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + + parameters = [] + parameters.append('--deviceType ' + DEVICE_TYPE) + parameters.append('--deviceIds ' + DEVICE_SET) + parameters.append('--grid ' + GRID) + parameters.append('--domain-size ' + DOMAIN_SIZE) + parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) + parameters.append('--repetitions ' + str(REPETITIONS)) + parameters.append('--max-iter ' + str(MAX_ITER)) + parameters.append( + '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + + DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' + + STORAGE_FP + '_' + COMPUTE_FP + '_' + + DEVICE_SET.replace(' ', '_') + '_' + OCC) + parameters.append('--computeFP ' + COMPUTE_FP) + parameters.append('--storageFP ' + STORAGE_FP) + parameters.append('--benchmark') + parameters.append('--' + OCC) + + commandList = [] + commandList.append(command) + for el in parameters: + for s in el.split(): + commandList.append(s) + + fp.write("\n-------------------------------------------\n") + fp.write(' '.join(commandList)) + fp.write("\n-------------------------------------------\n") + fp.flush() + subprocess.run(commandList, text=True, stdout=fp) + + counter += 1 + printProgressBar(counter * 100.0 / SAMPLES, 'Progress') diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index 2ca5e128..c603415c 100644 --- 
a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -128,7 +128,7 @@ auto run(Config& config, }, Neon::computeMode_t::seq); - //sort the position so the linear interpolation works + // sort the position so the linear interpolation works std::sort(xPosVal.begin(), xPosVal.end(), [=](std::pair& a, std::pair& b) { return a.first < b.first; }); @@ -308,12 +308,10 @@ auto run(Config& config, return details::runFilterStoreType(config, report); } if (config.gridType == "eGrid") { - NEON_DEV_UNDER_CONSTRUCTION(""); - // return details::runFilterStoreType(config, report); + return details::runFilterStoreType(config, report); } if (config.gridType == "bGrid") { - NEON_DEV_UNDER_CONSTRUCTION(""); - // return details::runFilterStoreType(config, report); + return details::runFilterStoreType(config, report); } } } // namespace CavityTwoPop diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h index d0dd45c5..d4d663fd 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h @@ -111,7 +111,7 @@ class bField : public Neon::domain::interface::FieldBaseTemplate grid; - BlockViewField memoryField; + BlockViewField memoryField; int mCardinality; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h index a6127c43..29a71248 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h @@ -250,7 +250,7 @@ auto bField::initHaloUpdateTable() -> void setIdxVec[Data::EndPoints::src] = setIdxSrc; std::array partitions; - std::array*, Data::EndPointsUtils::nConfigs> blockViewPartitions; + std::array*, Data::EndPointsUtils::nConfigs> blockViewPartitions; std::array, Data::EndPointsUtils::nConfigs> 
ghostZBeginIdx; std::array, Data::EndPointsUtils::nConfigs> boundaryZBeginIdx; std::array memPhyDim; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index 7f537ad5..35abdc50 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -38,42 +38,80 @@ class bPartition Neon::int32_3d* mOrigin, NghIdx* mStencilNghIndex); + /** + * Retrieve the cardinality of the field. + */ inline NEON_CUDA_HOST_DEVICE auto cardinality() const -> int; + /** + * Gets the field metadata at a cartesian point. + */ inline NEON_CUDA_HOST_DEVICE auto operator()(const Idx& cell, int card) -> T&; + /** + * Gets the field metadata at a cartesian point. + */ inline NEON_CUDA_HOST_DEVICE auto operator()(const Idx& cell, int card) const -> const T&; + /** + * Gets the field metadata at a neighbour cartesian point. + */ NEON_CUDA_HOST_DEVICE inline auto getNghData(const Idx& cell, const NghIdx& offset, const int card) const -> NghData; + /** + * Gets the field metadata at a neighbour cartesian point. + */ NEON_CUDA_HOST_DEVICE inline auto getNghData(const Idx& eId, uint8_t nghID, int card) const -> NghData; + /** + * Gets the field metadata at a neighbour cartesian point. + */ + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& eId, + int card) + const -> NghData; + + /** + * Gets the field metadata at a neighbour cartesian point. + */ + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& eId, + int card, + T defaultValue) + const -> NghData; + + /** + * Gets the global coordinates of the cartesian point. + */ NEON_CUDA_HOST_DEVICE inline auto getGlobalIndex(const Idx& cell) const -> Neon::index_3d; - + /** + * Gets the Idx for in the block view space. 
+ */ NEON_CUDA_HOST_DEVICE inline auto - getBlockViewGridIdx(const Idx& cell) + getBlockViewIdx(const Idx& cell) const -> BlockViewGridIdx; - protected: NEON_CUDA_HOST_DEVICE inline auto helpGetPitch(const Idx& cell, int card) @@ -91,6 +129,10 @@ class bPartition helpGetNghIdx(const Idx& idx, const NghIdx& offset) const -> Idx; + template + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& idx) + const -> Idx; int mCardinality; T* mMem; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h index 6e3b728f..d8bbef08 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h @@ -50,7 +50,7 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: template NEON_CUDA_HOST_DEVICE inline auto bPartition:: - getBlockViewGridIdx(const Idx& gidx) + getBlockViewIdx(const Idx& gidx) const -> BlockViewGridIdx { BlockViewGridIdx res; @@ -193,6 +193,96 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: } } +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + helpGetNghIdx(const Idx& idx) + const -> Idx +{ + + typename Idx::InDataBlockIdx ngh(idx.mInDataBlockIdx.x + xOff, + idx.mInDataBlockIdx.y + yOff, + idx.mInDataBlockIdx.z + zOff); + + /** + * 0 if no offset on the direction + * 1 positive offset + * -1 negative offset + */ + const int xFlag = [&] { + if constexpr (xOff == 0) { + return 0; + } else { + return ngh.x < 0 ? -1 : (ngh.x >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + + + const int yFlag = [&] { + if constexpr (yOff == 0) { + return 0; + } else { + return ngh.y < 0 ? -1 : (ngh.y >= SBlock::memBlockSizeX ? +1 : 0); + } + }(); + const int zFlag = [&] { + if constexpr (zOff == 0) { + return 0; + } else { + return ngh.z < 0 ? -1 : (ngh.z >= SBlock::memBlockSizeX ? 
+1 : 0); + } + }(); + + const bool isLocal = (xFlag | yFlag | zFlag) == 0; + if (!(isLocal)) { + typename Idx::InDataBlockIdx remoteInBlockOffset; + /** + * Example + * - 8 block (1D case) + * Case 1: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 starting point + * + * - idx.inBlock = 2 + * - offset = -1 + * - remote.x = (2-3) - ((-1) * 4) = -1 + 4 = 3 + * Case 2: + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * starting point +3 from 3 + * + * - idx.inBlock = 3 + * - offset = (+3,0) + * - remote.x = (7+3) - ((+1) * 8) = 10 - 8 = 2 + * + * |0,1,2,3|0,1,2,3|0,1,2,3| + * ^ ^ + * -3 from 0 +3 from 3 + * + * NOTE: if in one direction the neighbour offet is zero, xFalg is 0; + * */ + + Idx remoteNghIdx; + remoteNghIdx.mInDataBlockIdx.x = ngh.x - xFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.y = ngh.y - yFlag * SBlock::memBlockSizeX; + remoteNghIdx.mInDataBlockIdx.z = ngh.z - zFlag * SBlock::memBlockSizeX; + + int connectivityJump = idx.mDataBlockIdx * 27 + + (xFlag + 1) + + (yFlag + 1) * 3 + + (zFlag + 1) * 9; + remoteNghIdx.mDataBlockIdx = mBlockConnectivity[connectivityJump]; + + return remoteNghIdx; + } else { + Idx localNghIdx; + localNghIdx.mDataBlockIdx = idx.mDataBlockIdx; + localNghIdx.mInDataBlockIdx = ngh; + return localNghIdx; + } +} + template NEON_CUDA_HOST_DEVICE inline auto bPartition:: getNghData(const Idx& eId, @@ -223,4 +313,42 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: return result; } +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& idx, + int card) + const -> NghData +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(idx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + result.invalidate(); + return result; + } + auto const value = mMem[pitch]; + result.set(value, true); + return result; +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& idx, + int card, + T defaultValue) + const -> NghData +{ + NghData result; + bIndex 
nghIdx = helpGetNghIdx(idx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + result.set(defaultValue, false); + return result; + } + auto const value = mMem[pitch]; + result.set(value, true); + return result; +} } // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h index cacac275..012a3588 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h @@ -74,8 +74,8 @@ class ePartition public: //-- [PUBLIC TYPES] ---------------------------------------------------------------------------- - using Self = ePartition; //<- this type - using Idx = eIndex; //<- index type + using Self = ePartition; //<- this type + using Idx = eIndex; //<- index type using OuterIdx = typename Idx::OuterIdx; //<- index type for the subGrid static constexpr int Cardinality = C; @@ -147,15 +147,15 @@ class ePartition operator()(Idx eId, int cardinalityIdx) -> T&; -// template -// NEON_CUDA_HOST_DEVICE inline auto -// castRead(Idx eId, int cardinalityIdx) const -// -> ComputeType; -// -// template -// NEON_CUDA_HOST_DEVICE inline auto -// castWrite(Idx eId, int cardinalityIdx, const ComputeType& value) -// -> void; + // template + // NEON_CUDA_HOST_DEVICE inline auto + // castRead(Idx eId, int cardinalityIdx) const + // -> ComputeType; + // + // template + // NEON_CUDA_HOST_DEVICE inline auto + // castWrite(Idx eId, int cardinalityIdx, const ComputeType& value) + // -> void; /** * Retrieve value of a neighbour for a field with multiple cardinalities * @tparam dataView_ta @@ -165,9 +165,9 @@ class ePartition * @return */ NEON_CUDA_HOST_DEVICE inline auto - getNghData(Idx eId, - NghIdx nghIdx, - int card) + getNghData(Idx eId, + NghIdx nghIdx, + int card) const -> NghData; NEON_CUDA_HOST_DEVICE inline auto @@ -176,7 +176,18 @@ class 
ePartition int card) const -> NghData; + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(Idx eId, + int card) + const -> NghData; + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(Idx eId, + int card, + T defaultValue) + const -> NghData; /** * Check is the * @tparam dataView_ta diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h index c2ff1ae0..0063ee9e 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h @@ -87,6 +87,43 @@ ePartition::getNghData(eIndex eId, return res; } +template +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getNghData(eIndex eId, + int card) + const -> NghData +{ + int tablePithc = (xOff + mStencilRadius) + + (yOff + mStencilRadius) * mStencilTableYPitch + + (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; + NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; + NghData res = getNghData(eId, nghIdx, card); + + return res; +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getNghData(eIndex eId, + int card, + T defaultVal) + const -> NghData +{ + int tablePithc = (xOff + mStencilRadius) + + (yOff + mStencilRadius) * mStencilTableYPitch + + (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; + NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; + NghData res = getNghData(eId, nghIdx, card); + if (!res.isValid()) { + res.set(defaultVal, false); + } + return res; +} + template NEON_CUDA_HOST_DEVICE inline auto diff --git a/libNeonSet/include/Neon/set/Containter_imp.h b/libNeonSet/include/Neon/set/Containter_imp.h index 534d92ff..f7f421e6 100644 --- a/libNeonSet/include/Neon/set/Containter_imp.h +++ b/libNeonSet/include/Neon/set/Containter_imp.h @@ -48,6 +48,7 @@ auto Container::factory(const std::string& name, std::shared_ptr tmp(k); return {tmp}; } + 
NEON_THROW_UNSUPPORTED_OPERATION("Execution type not supported"); } template From 90a4ba9f628b93e86edf440b3df883cf83ee7ab2 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 19 Jun 2023 10:47:29 -0400 Subject: [PATCH 09/94] Code documentation --- .../include/Neon/domain/details/bGrid/bGrid.h | 7 +- .../Neon/domain/details/bGrid/bGrid_imp.h | 6 +- .../include/Neon/domain/tools/Partitioner1D.h | 74 +++++++++++++------ 3 files changed, 58 insertions(+), 29 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h index 8ed458c8..62ae8ad6 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h @@ -71,9 +71,8 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, const Neon::int32_3d& domainSize /**< Size of the bounded Cartesian */, const ActiveCellLambda activeCellLambda /**< Function that identify the user domain inside the boxed Cartesian discretization */, const Neon::domain::Stencil& stencil /**< union of tall the stencil that will be used in the computation */, - const int voxelSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing and (i+1)* voxelSpacing. - * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1*/ - , + const int multiResDiscreteIdxSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing and (i+1)* voxelSpacing. 
+ * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1*/, const double_3d& spacingData = double_3d(1, 1, 1) /** Physical spacing between two consecutive data points in the Cartesian domain */, const double_3d& origin = double_3d(0, 0, 0) /** Physical location in space of the origin of the Cartesian discretization */); @@ -212,7 +211,7 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, tool::Partitioner1D::DenseMeta denseMeta; - int voxelSpacing; + int mMultiResDiscreteIdxSpacing; // number of active voxels in each block Neon::set::DataSet mNumActiveVoxel; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h index b921a3e1..85da8a62 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h @@ -20,7 +20,7 @@ bGrid::bGrid(const Neon::Backend& backend, const Neon::int32_3d& domainSize, const ActiveCellLambda activeCellLambda, const Neon::domain::Stencil& stencil, - const int voxelSpacing, + const int multiResDiscreteIdxSpacing, const double_3d& spacingData, const double_3d& origin) { @@ -29,7 +29,7 @@ bGrid::bGrid(const Neon::Backend& backend, mData = std::make_shared(); mData->init(backend); - mData->voxelSpacing = voxelSpacing; + mData->mMultiResDiscreteIdxSpacing = multiResDiscreteIdxSpacing; mData->stencil = stencil; const index_3d defaultKernelBlockSize(SBlock::memBlockSizeX, SBlock::memBlockSizeY, @@ -45,7 +45,7 @@ bGrid::bGrid(const Neon::Backend& backend, stencil, nElementsPerPartition, defaultKernelBlockSize, - voxelSpacing, + multiResDiscreteIdxSpacing, origin); } diff --git a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h index e162512c..0204098c 100644 --- a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h +++ b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h 
@@ -8,6 +8,34 @@ namespace Neon::domain::tool { +/** + * Abstraction for a partitioner on a 1D domain. + * + * Partitioning is executed over the cartesian index space of the domain. + * The Partitioner works at the block granularity. The block size is defined by the user. + * + * The partitioning is done in thee steps: + * a. [DOMAIN DECOMPOSITION] - Projecting of the blocks into the Z-axis and then applying a uniform partitioning schema. + * Definition of the span of each partition is the final result of this step. + * + * b. [CLASSIFIER] - For each partition, the indexes in a partition span are classified twice: + * - First, the indexes are classified according to the data view configuration. + * - INTERNAL: The span is fully contained in the partition. + * - BOUNDARY: The span is partially contained in the partition. + * - GHOST: The span is not contained in the partition. + * - Second, the indexes are classified according to the boundary conditions. This is a user driven classification + * + * c. [LAYOUT] - The final step is to layout the indexes in memory, i.e. decide for each index its position in a 1D array. 
+ * + * The final layout of each partitioning will look like the following: + * + * -------------------------------------------------------------------- + * | Internal | Boundary | Ghost | + * | | UP | DW | UP | Dw | + * | Bulk | Bc | Bulk | Bc | Bulk | Bc | Bulk | Bc | Bulk | Bc | + * -------------------------------------------------------------------- + * + */ class Partitioner1D { public: @@ -75,59 +103,59 @@ class Partitioner1D Meta invalidMeta; }; - template Partitioner1D(const Neon::Backend& backend, - const ActiveCellLambda& activeCellLambda, + const ActiveIndexLambda& activeIndexLambda, const BcLambda& bcLambda, const Neon::index_3d& dataBlockSize, const Neon::int32_3d& domainSize, const Neon::domain::Stencil stencil, - const int& discreteVoxelSpacing = 1) + const int& multiResDiscreteIdxSpacing = 1) { mData = std::make_shared(); mData->mDataBlockSize = dataBlockSize; - mData->mDiscreteVoxelSpacing = discreteVoxelSpacing; + mData->mMultiResDiscreteIdxSpacing = multiResDiscreteIdxSpacing; mData->mStencil = stencil; mData->mDomainSize = domainSize; - Neon::int32_3d block3DSpan(NEON_DIVIDE_UP(domainSize.x, dataBlockSize.x), - NEON_DIVIDE_UP(domainSize.y, dataBlockSize.y), - NEON_DIVIDE_UP(domainSize.z, dataBlockSize.z)); + // Block space interval (i.e. 
indexing space at the block granularity) - mData->block3DSpan = block3DSpan; + mData->block3DSpan = Neon::int32_3d(NEON_DIVIDE_UP(domainSize.x, dataBlockSize.x), + NEON_DIVIDE_UP(domainSize.y, dataBlockSize.y), + NEON_DIVIDE_UP(domainSize.z, dataBlockSize.z)); std::vector nBlockProjectedToZ(block3DSpan.z); auto block3dIdxToBlockOrigin = [&](Neon::int32_3d const& block3dIdx) { - Neon::int32_3d blockOrigin(block3dIdx.x * dataBlockSize.x * discreteVoxelSpacing, - block3dIdx.y * dataBlockSize.y * discreteVoxelSpacing, - block3dIdx.z * dataBlockSize.z * discreteVoxelSpacing); + Neon::int32_3d blockOrigin(block3dIdx.x * dataBlockSize.x * multiResDiscreteIdxSpacing, + block3dIdx.y * dataBlockSize.y * multiResDiscreteIdxSpacing, + block3dIdx.z * dataBlockSize.z * multiResDiscreteIdxSpacing); return blockOrigin; }; auto getVoxelAbsolute3DIdx = [&](Neon::int32_3d const& blockOrigin, Neon::int32_3d const& voxelRelative3DIdx) { - const Neon::int32_3d id(blockOrigin.x + voxelRelative3DIdx.x * discreteVoxelSpacing, - blockOrigin.y + voxelRelative3DIdx.y * discreteVoxelSpacing, - blockOrigin.z + voxelRelative3DIdx.z * discreteVoxelSpacing); + const Neon::int32_3d id(blockOrigin.x + voxelRelative3DIdx.x * multiResDiscreteIdxSpacing, + blockOrigin.y + voxelRelative3DIdx.y * multiResDiscreteIdxSpacing, + blockOrigin.z + voxelRelative3DIdx.z * multiResDiscreteIdxSpacing); return id; }; mData->spanDecomposition = std::make_shared( backend, - activeCellLambda, + activeIndexLambda, block3dIdxToBlockOrigin, getVoxelAbsolute3DIdx, block3DSpan, dataBlockSize, domainSize, - discreteVoxelSpacing); + multiResDiscreteIdxSpacing); mData->mSpanClassifier = std::make_shared( backend, - activeCellLambda, + activeIndexLambda, bcLambda, block3dIdxToBlockOrigin, getVoxelAbsolute3DIdx, @@ -135,7 +163,7 @@ class Partitioner1D dataBlockSize, domainSize, stencil, - discreteVoxelSpacing, + multiResDiscreteIdxSpacing, mData->spanDecomposition); mData->mSpanLayout = std::make_shared( @@ -147,10 +175,12 @@ 
class Partitioner1D mData->mSpanLayout->getStandardAndGhostCount().typedClone(), {251, 1, 1}); } - auto getBlockSpan() -> Neon::int32_3d + auto getBlockSpan() const + -> Neon::int32_3d { return mData->block3DSpan; } + auto getMemoryGrid() -> Neon::aGrid& { return mData->mTopologyWithGhost; @@ -207,7 +237,7 @@ class Partitioner1D aGrid::Cell idx(count); Neon::int32_3d point3d = mapperVec[j]; - point3d = point3d * mData->mDiscreteVoxelSpacing * mData->mDataBlockSize; + point3d = point3d * mData->mMultiResDiscreteIdxSpacing * mData->mDataBlockSize; partition(idx, 0) = point3d; count++; } @@ -349,7 +379,7 @@ class Partitioner1D if (findings.first) { targetNgh = findings.second; } - aGrid::Cell aIdx(static_cast (start + blockIdx)); + aGrid::Cell aIdx(static_cast(start + blockIdx)); partition(aIdx, s) = targetNgh; } } @@ -402,7 +432,7 @@ class Partitioner1D { public: Neon::index_3d mDataBlockSize = 0; - int mDiscreteVoxelSpacing = 0; + int mMultiResDiscreteIdxSpacing = 0; Neon::domain::Stencil mStencil; Neon::index_3d mDomainSize; Neon::int32_3d block3DSpan; From 019db4d6a03e771b8309d3d2291ccb151e071e98 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 19 Jun 2023 17:38:57 -0400 Subject: [PATCH 10/94] Fixing grid spacing in bGrid. 
--- .../Neon/domain/details/bGrid/bField.h | 10 +-------- .../Neon/domain/details/bGrid/bField_imp.h | 21 ++++++++++++++----- .../include/Neon/domain/details/bGrid/bGrid.h | 15 ++++++++++--- .../Neon/domain/details/bGrid/bGrid_imp.h | 19 +++++++++++++---- .../Neon/domain/details/bGrid/bPartition.h | 1 + .../domain/details/bGrid/bPartition_imp.h | 5 ++++- .../include/Neon/domain/tools/Partitioner1D.h | 6 +++--- 7 files changed, 52 insertions(+), 25 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h index d4d663fd..565ae518 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField.h @@ -84,13 +84,6 @@ class bField : public Neon::domain::interface::FieldBaseTemplate void; - // - // enum PartitionBackend - // { - // cpu = 0, - // gpu = 1, - // }; - struct Data { Data() = default; @@ -112,8 +105,7 @@ class bField : public Neon::domain::interface::FieldBaseTemplate grid; BlockViewField memoryField; - - int mCardinality; + int cardinality; // Neon::domain::tool::HaloTable1DPartitioning latticeHaloUpdateTable; Neon::domain::tool::HaloTable1DPartitioning soaHaloUpdateTable; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h index 29a71248..52802f1c 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h @@ -79,11 +79,22 @@ template auto bField::getReference(const Neon::index_3d& cartesianIdx, const int& cardinality) -> T& { - auto& grid = this->getGrid(); - auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx); - auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); - auto& result = partition(bIdx, cardinality); - return result; + if constexpr (SBlock::isMultiResMode) { + auto& grid = 
this->getGrid(); + auto uniformCartesianIdx = cartesianIdx / grid.helpGetMultiResFactor(); + auto uniformCartesianIdxTruncation = cartesianIdx % grid.helpGetMultiResFactor(); + static_assert(uniformCartesianIdxTruncation == 0); + auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(uniformCartesianIdx); + auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); + auto& result = partition(bIdx, cardinality); + return result; + } else { + auto& grid = this->getGrid(); + auto [setIdx, bIdx] = grid.helpGetSetIdxAndGridIdx(cartesianIdx); + auto& partition = getPartition(Neon::Execution::host, setIdx, Neon::DataView::STANDARD); + auto& result = partition(bIdx, cardinality); + return result; + } } template diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h index 62ae8ad6..d94d1aa1 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h @@ -72,9 +72,10 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, const ActiveCellLambda activeCellLambda /**< Function that identify the user domain inside the boxed Cartesian discretization */, const Neon::domain::Stencil& stencil /**< union of tall the stencil that will be used in the computation */, const int multiResDiscreteIdxSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing and (i+1)* voxelSpacing. 
- * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1*/, - const double_3d& spacingData = double_3d(1, 1, 1) /** Physical spacing between two consecutive data points in the Cartesian domain */, - const double_3d& origin = double_3d(0, 0, 0) /** Physical location in space of the origin of the Cartesian discretization */); + * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1 */ + , + const double_3d& spacingData /** Physical spacing between two consecutive data points in the Cartesian domain */, + const double_3d& origin /** Physical location in space of the origin of the Cartesian discretization */); /** * Returns some properties for a given cartesian in the Cartesian domain. @@ -159,12 +160,20 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, */ auto getBlockViewGrid() const -> BlockView::Grid&; + /** * Retrieve the block vew grid internally used. * This grid can be leverage to allocate data at the block level. 
*/ auto getActiveBitMask() const -> BlockView::Field&; + /** + * Helper function to retrieve the discrete index spacing used for the multi-resolution + */ + template + auto helGetMultiResDiscreteIdxSpacing() const -> std::enable_if_t; + + /** * Help function to retrieve the block connectivity as a BlockViewGrid field */ diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h index 85da8a62..bde200e3 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h @@ -58,7 +58,7 @@ bGrid::bGrid(const Neon::Backend& backend, SBlock::memBlockSize3D.template newType(), domainSize, Neon::domain::Stencil::s27_t(false), - 1); + multiResDiscreteIdxSpacing); mData->mDataBlockOriginField = mData->partitioner1D.getGlobalMapping(); mData->mStencil3dTo1dOffset = mData->partitioner1D.getStencil3dTo1dOffset(); @@ -95,7 +95,7 @@ bGrid::bGrid(const Neon::Backend& backend, .getGrid() .template newContainer( "activeBitMaskInit", - [&](Neon::set::Loader& loader) { + [&, this](Neon::set::Loader& loader) { auto bitMaskPartition = loader.load(mData->activeBitField); return [&, bitMaskPartition](const auto& bitMaskIdx) mutable { auto prtIdx = bitMaskPartition.prtID(); @@ -107,9 +107,9 @@ bGrid::bGrid(const Neon::Backend& backend, for (int k = 0; k < SBlock::memBlockSize3D.template newType().z; k++) { for (int j = 0; j < SBlock::memBlockSize3D.template newType().y; j++) { for (int i = 0; i < SBlock::memBlockSize3D.template newType().x; i++) { - auto globalPosition = blockOrigin + Neon::int32_3d(i, j, k); + auto globalPosition = blockOrigin + Neon::int32_3d(i, j, k); bool const isInDomain = globalPosition < domainSize; - bool const isActive = activeCellLambda(globalPosition); + bool const isActive = activeCellLambda(globalPosition * mData->mMultiResDiscreteIdxSpacing); if (isActive && isInDomain) { countActive++; bitMask.setActive(i, j, k); 
@@ -319,6 +319,17 @@ auto bGrid:: return mData->activeBitField; } +/** + * Helper function to retrieve the discrete index spacing used for the multi-resolution + */ +template +template +auto bGrid::helGetMultiResDiscreteIdxSpacing() const + -> std::enable_if_t +{ + return mData->mMultiResDiscreteIdxSpacing; +} + template auto bGrid:: helpGetBlockConnectivity() diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index 35abdc50..73ccb914 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -141,6 +141,7 @@ class bPartition typename SBlock::BitMask const* NEON_RESTRICT mMask; Neon::int32_3d const* NEON_RESTRICT mOrigin; int mSetIdx; + int mMultiResDiscreteIdxSpacing = 1; }; } // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h index d8bbef08..dc4c5880 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h @@ -45,7 +45,10 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: location.x += gidx.mInDataBlockIdx.x; location.y += gidx.mInDataBlockIdx.y; location.z += gidx.mInDataBlockIdx.z; - return location; + if constexpr (SBlock::isMultiResMode){ + return location * mMultiResDiscreteIdxSpacing; + } + return location ; } template diff --git a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h index 0204098c..ac49dc6f 100644 --- a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h +++ b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h @@ -126,7 +126,7 @@ class Partitioner1D NEON_DIVIDE_UP(domainSize.y, dataBlockSize.y), NEON_DIVIDE_UP(domainSize.z, dataBlockSize.z)); - std::vector 
nBlockProjectedToZ(block3DSpan.z); + std::vector nBlockProjectedToZ(mData->block3DSpan.z); auto block3dIdxToBlockOrigin = [&](Neon::int32_3d const& block3dIdx) { Neon::int32_3d blockOrigin(block3dIdx.x * dataBlockSize.x * multiResDiscreteIdxSpacing, @@ -148,7 +148,7 @@ class Partitioner1D activeIndexLambda, block3dIdxToBlockOrigin, getVoxelAbsolute3DIdx, - block3DSpan, + mData->block3DSpan, dataBlockSize, domainSize, multiResDiscreteIdxSpacing); @@ -159,7 +159,7 @@ class Partitioner1D bcLambda, block3dIdxToBlockOrigin, getVoxelAbsolute3DIdx, - block3DSpan, + mData->block3DSpan, dataBlockSize, domainSize, stencil, From 588b74601d393955714aead173d82dc161327182 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 22 Jun 2023 18:51:30 -0400 Subject: [PATCH 11/94] WIP --- .../src/RunCavityTwoPop.cu | 4 + .../Neon/core/types/vec/vec4d_integer.tdecl.h | 1 + .../Neon/domain/details/dGrid/dPartition.h | 40 +- .../Neon/domain/details/dGridSoA/dGridSoA.h | 97 +++++ .../Neon/domain/details/dGridSoA/dIndexSoA.h | 53 +++ .../domain/details/dGridSoA/dIndexSoA_imp.h | 50 +++ .../domain/details/dGridSoA/dPartitionSoA.h | 342 ++++++++++++++++++ .../Neon/domain/details/dGridSoA/dSpanSoA.h | 52 +++ .../domain/details/dGridSoA/dSpanSoA_imp.h | 71 ++++ 9 files changed, 696 insertions(+), 14 deletions(-) create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h create mode 100644 libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index c603415c..d28688d1 100644 --- 
a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -3,6 +3,7 @@ #include "Neon/domain/bGrid.h" #include "Neon/domain/dGrid.h" #include "Neon/domain/eGrid.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "CellType.h" #include "LbmIteration.h" @@ -313,5 +314,8 @@ auto run(Config& config, if (config.gridType == "bGrid") { return details::runFilterStoreType(config, report); } + if (config.gridType == "dGridSoA") { + return details::runFilterStoreType(config, report); + } } } // namespace CavityTwoPop diff --git a/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h b/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h index 788291a6..940c6d2c 100644 --- a/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h +++ b/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h @@ -58,6 +58,7 @@ template class Vec_4d { public: + using Integer = IntegerType_ta; using element_t = IntegerType_ta; using self_t = Vec_4d; diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h index 196f6b70..31e480aa 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h @@ -150,13 +150,13 @@ class dPartition return NghData(val, isValidNeighbour); } - template + template NEON_CUDA_HOST_DEVICE inline auto getNghData(const Idx& eId, int card, LambdaVALID funIfValid, LambdaNOTValid funIfNOTValid = nullptr) - const -> std::enable_if_t , void> + const -> std::enable_if_t, void> { Idx cellNgh; const bool isValidNeighbour = nghIdx(eId, cellNgh); @@ -419,19 +419,31 @@ class dPartition return; } + auto getDataView() + -> Neon::DataView + { + return m_dataView; + } + + auto helpGetGlobalToLocalOffets() const + -> NghIdx const* + { + return mStencil; + } + private: - Neon::DataView m_dataView; - T* m_mem; - 
Neon::index_3d m_dim; - int m_zHaloRadius; - int m_zBoundaryRadius; - Pitch m_pitch; - int m_prtID; - Neon::index_3d m_origin; - int m_cardinality; - Neon::index_3d m_fullGridSize; - bool mPeriodicZ; - NghIdx* mStencil; + Neon::DataView m_dataView; + T* NEON_RESTRICT m_mem; + Neon::index_3d m_dim; + int m_zHaloRadius; + int m_zBoundaryRadius; + Pitch m_pitch; + int m_prtID; + Neon::index_3d m_origin; + int m_cardinality; + Neon::index_3d m_fullGridSize; + bool mPeriodicZ; + NghIdx* NEON_RESTRICT mStencil; }; diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h new file mode 100644 index 00000000..61b182b2 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h @@ -0,0 +1,97 @@ +#pragma once +#include + +#include "Neon/core/core.h" +#include "Neon/core/types/DataUse.h" +#include "Neon/core/types/Macros.h" + +#include "Neon/set/BlockConfig.h" +#include "Neon/set/Containter.h" +#include "Neon/set/DevSet.h" +#include "Neon/set/MemoryOptions.h" + +#include "Neon/sys/memory/MemDevice.h" + +#include "Neon/domain/aGrid.h" + +#include "Neon/domain/interface/GridBaseTemplate.h" +#include "Neon/domain/interface/GridConcept.h" +#include "Neon/domain/interface/KernelConfig.h" +#include "Neon/domain/interface/LaunchConfig.h" +#include "Neon/domain/interface/Stencil.h" +#include "Neon/domain/interface/common.h" + +#include "Neon/domain/tools/GridTransformer.h" +#include "Neon/domain/tools/SpanTable.h" + +#include "Neon/domain/details/eGrid/eGrid.h" +#include "Neon/domain/patterns/PatternScalar.h" + +#include "dPartitionSoA.h" + +namespace Neon::domain::details::dGridSoA { + +namespace details { +struct dGridSoATransformation +{ + template + using Partition = dPartitionSoA; + using Span = Neon::domain::details::eGrid::eSpan; + static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on; + + 
using FoundationGrid = Neon::domain::details::eGrid::eGrid; + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = FoundationGrid::executionThreadSpan; + using ExecutionThreadSpanIndexType = int32_t; + using Idx = FoundationGrid::Idx; + + static auto getDefaultBlock(FoundationGrid& foundationGrid) -> Neon::index_3d const& + { + return foundationGrid.getDefaultBlock(); + } + + static auto initSpan(FoundationGrid& foundationGrid, Neon::domain::tool::SpanTable& spanTable) -> void + { + spanTable.forEachConfiguration([&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Span& span) { + span = foundationGrid.getSpan(execution, setIdx, dw); + }); + } + + static auto initLaunchParameters(FoundationGrid& foundationGrid, + Neon::DataView dataView, + const Neon::index_3d& blockSize, + const size_t& shareMem) -> Neon::set::LaunchParameters + { + return foundationGrid.getLaunchParameters(dataView, blockSize, shareMem); + } + + static auto helpGetGridIdx(FoundationGrid&, + Neon::SetIdx const&, + FoundationGrid::Idx const& fgIdx) + -> GridTransformation::Idx + { + GridTransformation::Idx tgIdx = fgIdx; + return tgIdx; + } + + template + static auto initFieldPartition(FoundationGrid::Field& foundationField, + Neon::domain::tool::PartitionTable>& partitionTable) -> void + { + partitionTable.forEachConfiguration( + [&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Partition& partition) { + auto& foundationPartition = foundationField.getPartition(execution, setIdx, dw); + partition = Partition(foundationPartition); + }); + } +}; + +} // namespace details +using dGridSoA = Neon::domain::tool::GridTransformer::Grid; + +} // namespace Neon::domain::details::dGridSoA diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h new file mode 100644 index 00000000..2ed82d86 --- /dev/null +++ 
b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h @@ -0,0 +1,53 @@ +#pragma once + +#include "Neon/core/core.h" +#include "Neon/domain/details/dGridSoA/dIndexSoA.h" + +namespace Neon::domain::details::dGridSoA { + +// Common forward declarations +class dSpanSoA; +template +class dPartitionSoA; + +struct dIndexSoA +{ + using OuterIdx = dIndexSoA; + + template + friend class dPartition; + friend dSpanSoA; + + template + friend class dField; + + // dGrid specific types + using Offset = int32_t; + using Location = index_3d; + using Count = int32_t; + + dIndexSoA() = default; + Location mLocation = 0; + Offset mOffset = 0; + + NEON_CUDA_HOST_DEVICE inline explicit dIndexSoA(Location const& location, + Offset const& offset); + + NEON_CUDA_HOST_DEVICE inline explicit dIndexSoA(Location::Integer const& x, + Location::Integer const& y, + Location::Integer const& z, + Offset const& offset); + + NEON_CUDA_HOST_DEVICE inline auto setLocation() -> Location&; + + NEON_CUDA_HOST_DEVICE inline auto setOffset() -> Offset&; + + NEON_CUDA_HOST_DEVICE inline auto getLocation() const -> const Location&; + + NEON_CUDA_HOST_DEVICE inline auto getOffset() const -> const Offset&; +}; + +} // namespace Neon::domain::details::dGridSoA + +#include "dIndexSoA_imp.h" diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h new file mode 100644 index 00000000..790608c7 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h @@ -0,0 +1,50 @@ +#pragma once +#include "Neon/core/core.h" + +namespace Neon::domain::details::dGridSoA { + +NEON_CUDA_HOST_DEVICE inline dIndexSoA:: + dIndexSoA(const Location& location, + Offset const& offset) +{ + mLocation = location; + mOffset = offset; +} + +NEON_CUDA_HOST_DEVICE inline dIndexSoA:: + dIndexSoA(const Location::Integer& x, + const Location::Integer& y, + const Location::Integer& z, + Offset const& offset) +{ + 
mLocation.x = x; + mLocation.y = y; + mLocation.z = z; + mOffset = offset; +} + +NEON_CUDA_HOST_DEVICE inline auto dIndexSoA:: + setLocation() -> Location& +{ + return mLocation; +} + +NEON_CUDA_HOST_DEVICE inline auto dIndexSoA:: + setOffset() -> Offset& +{ + return mOffset; +} + +NEON_CUDA_HOST_DEVICE inline auto dIndexSoA:: + getLocation() const -> const Location& +{ + return mLocation; +} + +NEON_CUDA_HOST_DEVICE inline auto dIndexSoA:: + getOffset() + const -> const Offset& +{ + return mOffset; +} +} // namespace Neon::domain::details::dGridSoA \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h new file mode 100644 index 00000000..fc4c3642 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h @@ -0,0 +1,342 @@ +#pragma once +#include +#include "Neon/core/core.h" +#include "Neon/core/types/Macros.h" +#include "Neon/domain/details/dGrid/dGrid.h" +#include "Neon/domain/interface/NghData.h" +#include "Neon/set/DevSet.h" +#include "Neon/sys/memory/CudaIntrinsics.h" +#include "cuda_fp16.h" +#include "dIndexSoA.h" + +namespace Neon::domain::details::dGridSoA { + +template +class dPartitionSoA +{ + public: + using Idx = dIndexSoA; + using NghData = Neon::domain::NghData; + using Pitch = uint32_4d; + + dPartitionSoA() + { + } + + dPartitionSoA(Neon::domain::details::dGrid::dPartition const& dPartitionOriginal) + { + mDataView = dPartitionOriginal.getDataView(); + mMem = dPartitionOriginal.mem(); + mDim = dPartitionOriginal.dim(); + mZHaloRadius = dPartitionOriginal.halo().z; + mPitch = dPartitionOriginal.getPitchData().template newType(); + mPrtID = dPartitionOriginal.prtID(); + mOrigin = dPartitionOriginal.origin(); + mCardinality = dPartitionOriginal.cardinality(); + mFullGridSize = dPartitionOriginal.fullGridSize(); + NghIdx* mStencil = dPartitionOriginal.helpGetGlobalToLocalOffets(); + } + + inline 
NEON_CUDA_HOST_DEVICE auto + prtID() + const -> int + { + return mPrtID(); + } + + inline NEON_CUDA_HOST_DEVICE auto + cardinality() + const -> int + { + return mCardinality(); + } + + inline NEON_CUDA_HOST_DEVICE auto + getPitchData() + const -> const Pitch& + { + return mPitch; + } + + inline NEON_CUDA_HOST_DEVICE auto + getPitch(const Idx& idx, + int cardinality) + -> Idx::Offset + { + return idx.getLocationOffset() + cardinality * mPitch.w; + } + + inline NEON_CUDA_HOST_DEVICE auto + dim() + const -> const Neon::index_3d + { + return mDim(); + } + + inline NEON_CUDA_HOST_DEVICE auto + halo() + const -> const Neon::index_3d + { + return mDPartition.halo(); + } + + inline NEON_CUDA_HOST_DEVICE auto + origin() + const -> const Neon::index_3d + { + return m_ormDPartition.origin(); + } + + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + NghIdx nghOffset, + int card, + const T& alternativeVal) + const -> NghData + { + Idx gidxNgh; + const bool isValidNeighbour = nghIdx(gidx, nghOffset, gidxNgh); + T val = alternativeVal; + if (isValidNeighbour) { + val = operator()(gidxNgh, card); + } + return NghData(val, isValidNeighbour); + } + + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + NghIdx nghOffset, + int card) + const -> NghData + { + Idx gidxNgh; + const bool isValidNeighbour = nghIdx(gidx, nghOffset, gidxNgh); + T val; + if (isValidNeighbour) { + val = operator()(gidxNgh, card); + } + return NghData(val, isValidNeighbour); + } + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid = nullptr) + const -> std::enable_if_t, void> + { + Idx gidxNgh; + const bool isValidNeighbour = nghIdx(gidx, gidxNgh); + if (isValidNeighbour) { + T val = this->operator()(gidxNgh, card); + funIfValid(val); + } + if constexpr (!std::is_same_v) { + if (!isValidNeighbour) { + funIfNOTValid(); + } + } + } + + template + NEON_CUDA_HOST_DEVICE inline auto + 
getNghData(const Idx& gidx, + int card) + const -> NghData + { + NghData res; + Idx gidxNgh; + const bool isValidNeighbour = nghIdx(gidx, gidxNgh); + if (isValidNeighbour) { + T val = operator()(gidxNgh, card); + res.set(val, true); + } else { + res.invalidate(); + } + return res; + } + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + T const& defaultValue) + const -> NghData + { + NghData res(defaultValue, false); + Idx gidxNgh; + const bool isValidNeighbour = nghIdx(gidx, gidxNgh); + if (isValidNeighbour) { + T val = operator()(gidxNgh, card); + res.set(val, true); + } + return res; + } + + NEON_CUDA_HOST_DEVICE inline auto + nghVal(const Idx& gidx, + uint8_t nghID, + int card, + const T& alternativeVal) + const -> NghData + { + NghIdx nghOffset = mStencil[nghID]; + return getNghData(gidx, nghOffset, card, alternativeVal); + } + + /** + * Get the index of the neighbor given the offset + * @tparam dataView_ta + * @param[in] gidx Index of the current element + * @param[in] nghOffset Offset of the neighbor of interest from the current element + * @param[in,out] neighbourIdx Index of the neighbor + * @return Whether the neighbour is valid + */ + NEON_CUDA_HOST_DEVICE inline auto + nghIdx(const Idx& gidx, + const NghIdx& nghOffset, + Idx& neighbourIdx) + const -> bool + { + Neon::index_3d cartesian(gidx.get().x + nghOffset.x, + gidx.get().y + nghOffset.y, + gidx.get().z + nghOffset.z); + + neighbourIdx = Idx(cartesian, + gidx.getOffset() + nghOffset.x * getPitchData().x + + nghOffset.y * getPitchData().y + + nghOffset.z * getPitchData().z); + + Idx::Location nghCartesianGlobal = getGlobalIndex(gidxNgh); + + bool isValidNeighbour = true; + + isValidNeighbour = (gidxNghGlobal.x >= 0) && + (gidxNghGlobal.y >= 0) && + (gidxNghGlobal.z >= 0); + + isValidNeighbour = (gidxNghGlobal.x < m_fullGridSize.x) && + (gidxNghGlobal.y < m_fullGridSize.y) && + (gidxNghGlobal.z < m_fullGridSize.z) && + isValidNeighbour; + + return 
isValidNeighbour; + } + + template + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& gidx, + Idx& gidxNgh) + const -> bool + { + Neon::index_3d cartesian(gidx.get().x + xOff, + gidx.get().y + yOff, + gidx.get().z + zOff); + gidxNgh = Idx(cartesian, + gidx.getOffset() + xOff * getPitchData().x + + yOff * getPitchData().y + + zOff * getPitchData().z); + + Idx::Location nghCartesianGlobal(getGlobalIndex(gidxNgh)); + + bool isValidNeighbour = true; + if constexpr (xOff > 0) { + isValidNeighbour = cellNgh.get().x < (m_dim.x) && isValidNeighbour; + isValidNeighbour = nghCartesianGlobal.x <= mDPartition.m_fullGridSize.x && isValidNeighbour; + } + if constexpr (xOff < 0) { + isValidNeighbour = nghCartesianGlobal.x >= 0 && isValidNeighbour; + } + if constexpr (yOff > 0) { + isValidNeighbour = cellNgh.get().y < (m_dim.y) && isValidNeighbour; + isValidNeighbour = nghCartesianGlobal.y <= mDPartition.m_fullGridSize.y && isValidNeighbour; + } + if constexpr (yOff < 0) { + isValidNeighbour = nghCartesianGlobal.y >= 0 && isValidNeighbour; + } + if constexpr (zOff > 0) { + isValidNeighbour = cellNgh.get().z < (m_dim.z + m_zHaloRadius * 2) && isValidNeighbour; + isValidNeighbour = nghCartesianGlobal.z <= mDPartition.m_fullGridSize.z && isValidNeighbour; + } + if constexpr (zOff < 0) { + isValidNeighbour = nghCartesianGlobal.z >= mDPartition.m_zHaloRadius && isValidNeighbour; + } + return isValidNeighbour; + } + + NEON_CUDA_HOST_DEVICE inline auto + mem() + -> T* + { + return mDPartition.m_mem; + } + + NEON_CUDA_HOST_DEVICE inline auto + mem() const + -> const T* + { + return mDPartition.m_mem; + } + + NEON_CUDA_HOST_DEVICE inline auto + mem(const Idx& cell, + int cardinalityIdx) + -> T* + { + Idx::Offset p = getPitch(cell, cardinalityIdx); + return mDPartition.m_mem[p]; + } + + NEON_CUDA_HOST_DEVICE inline auto + operator()(const Idx& cell, + int cardinalityIdx) + -> T& + { + Idx::Offset p = getPitch(cell, cardinalityIdx); + return mDPartition.m_mem[p]; + } + + 
NEON_CUDA_HOST_DEVICE inline auto + operator()(const Idx& cell, + int cardinalityIdx) + const -> const T& + { + Idx::Offset p = getPitch(cell, cardinalityIdx); + return mDPartition.m_mem[p]; + } + + NEON_CUDA_HOST_DEVICE inline auto getGlobalIndex(const Idx& local) + const -> Neon::index_3d + { + Neon::index_3d result = local.mLocation + m_origin; + result.z -= mDPartition.m_zHaloRadius; + return result; + } + + NEON_CUDA_HOST_DEVICE inline auto getDomainSize() + const -> Neon::index_3d + { + return mDPartition.m_fullGridSize; + } + + Neon::DataView mDataView; + T* NEON_RESTRICT mMem; + Neon::index_3d mDim; + int mZHaloRadius; + int mZBoundaryRadius; + Pitch mPitch; + int mPrtID; + Neon::index_3d mOrigin; + int mCardinality; + Neon::index_3d mFullGridSize; + bool mPeriodicZ; + NghIdx* NEON_RESTRICT mStencil; +}; + +} // namespace Neon::domain::details::dGridSoA diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h new file mode 100644 index 00000000..83d5a2dc --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h @@ -0,0 +1,52 @@ +#pragma once +#include "Neon/set/DevSet.h" +#include "dIndexSoA.h" +namespace Neon::domain::details::dGridSoA { + +/** + * Abstraction that represents the Cell space of a partition + * This abstraction is used by the neon lambda executor to + * run a containers on aGrid + */ +class dSpanSoA +{ + public: + using Idx = dIndexSoA; + + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = Neon::set::details::ExecutionThreadSpan::d3; + using ExecutionThreadSpanIndexType = int32_t; + + + NEON_CUDA_HOST_DEVICE inline auto + setAndValidate(Idx& idx, + const uint32_t& x, + const uint32_t& y, + const uint32_t& z) const + -> bool; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetDataView() + const -> Neon::DataView const&; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetZHaloRadius() + const -> int const&; + + 
NEON_CUDA_HOST_DEVICE inline auto + helpGetZBoundaryRadius() + const -> int const&; + + NEON_CUDA_HOST_DEVICE inline auto + helpGetDim() + const -> Neon::index_3d const&; + + private: + Neon::DataView mDataView; + int mZHaloRadius; + int mZBoundaryRadius; + Neon::index_3d mDim /** Dimension of the span, its values depends on the mDataView*/; +}; + +} // namespace Neon::domain::details::dGrid + +#include "dSpanSoA_imp.h" \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h new file mode 100644 index 00000000..a3dff4cf --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h @@ -0,0 +1,71 @@ +#pragma once + +namespace Neon::domain::details::dGridSoA { + +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::setAndValidate(Idx& idx, + const uint32_t& x, + const uint32_t& y, + const uint32_t& z) + const -> bool +{ + bool res = false; + idx.setLocation().x = int(x); + idx.setLocation().y = int(y); + idx.setLocation().z = int(z); + + if (idx.get() < mDim) { + res = true; + } + + switch (mDataView) { + case Neon::DataView::STANDARD: { + idx.setLocation().z += mZHaloRadius; + idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; + return res; + } + case Neon::DataView::INTERNAL: { + idx.setLocation().z += mZHaloRadius + mZBoundaryRadius; + idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; + return res; + } + case Neon::DataView::BOUNDARY: { + + idx.setLocation().z += idx.getLocation().z < mZBoundaryRadius + ? 
0 + : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); + idx.setLocation().z += mZHaloRadius; + idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; + return res; + } + default: { + } + } + return false; +} + +NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetDataView() + const -> Neon::DataView const& +{ + return mDataView; +} + +NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetZHaloRadius() + const -> int const& +{ + return mZHaloRadius; +} + +NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetZBoundaryRadius() + const -> int const& +{ + return mZBoundaryRadius; +} + +NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetDim() + const -> Neon::index_3d const& +{ + return mDim; +} + +} // namespace Neon::domain::details::dGrid \ No newline at end of file From 9a87088f549eee95154e7ab5e11a555a2db203b7 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 22 Jun 2023 09:34:01 -0400 Subject: [PATCH 12/94] Fixing report filename for benchmarks scripts --- .../lbm-lid-driven-cavity-flow.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py index 5aebe104..90a55ad2 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py @@ -4,7 +4,7 @@ GRID_LIST = "dGrid bGrid eGrid".split() STORAGE_FP_LIST = "double float".split() COMPUTE_FP_LIST = "double float".split() -OCC_LIST = "nOCC".split() +OCC_LIST = "nOCC sOCC".split() WARM_UP_ITER = 10 MAX_ITER = 100 REPETITIONS = 5 @@ -48,17 +48,18 @@ def countAll(): SAMPLES = countAll() counter = 0 command = './lbm-lid-driven-cavity-flow' +# command = 'echo' with open(command + '.log', 'w') as fp: for DEVICE_TYPE in 
DEVICE_TYPE_LIST: DEVICE_SET_LIST = [DEVICE_ID_LIST[0]] if DEVICE_TYPE == 'gpu': for DEVICE in DEVICE_ID_LIST[1:]: DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) - for OCC in OCC_LIST: - for DOMAIN_SIZE in DOMAIN_SIZE_LIST: - for STORAGE_FP in STORAGE_FP_LIST: - for COMPUTE_FP in COMPUTE_FP_LIST: - for DEVICE_SET in DEVICE_SET_LIST: + for DEVICE_SET in DEVICE_SET_LIST: + for OCC in OCC_LIST: + for DOMAIN_SIZE in DOMAIN_SIZE_LIST: + for STORAGE_FP in STORAGE_FP_LIST: + for COMPUTE_FP in COMPUTE_FP_LIST: for GRID in GRID_LIST: if STORAGE_FP == 'double' and COMPUTE_FP == 'float': continue @@ -73,9 +74,12 @@ def countAll(): parameters.append('--max-iter ' + str(MAX_ITER)) parameters.append( '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + - DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' + - STORAGE_FP + '_' + COMPUTE_FP + '_' + - DEVICE_SET.replace(' ', '_') + '_' + OCC) + DEVICE_TYPE + '_' + + DEVICE_SET.replace(' ', '_') + '-' + + GRID + '_' + + DOMAIN_SIZE + '-' + + STORAGE_FP + '-' + COMPUTE_FP + '-' + + OCC) parameters.append('--computeFP ' + COMPUTE_FP) parameters.append('--storageFP ' + STORAGE_FP) parameters.append('--benchmark') @@ -91,6 +95,7 @@ def countAll(): fp.write(' '.join(commandList)) fp.write("\n-------------------------------------------\n") fp.flush() + print(' '.join(commandList)) subprocess.run(commandList, text=True, stdout=fp) counter += 1 From 1168cc2105986b9f07537f3dc379d5135cbefa47 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 23 Jun 2023 11:58:57 -0400 Subject: [PATCH 13/94] Adding halo option. 
--- .../lbm-lid-driven-cavity-flow.py | 80 ++++++++++--------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py index 90a55ad2..795cb046 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py @@ -5,6 +5,7 @@ STORAGE_FP_LIST = "double float".split() COMPUTE_FP_LIST = "double float".split() OCC_LIST = "nOCC sOCC".split() +HU_LIST = "huGrid huLattice".split() WARM_UP_ITER = 10 MAX_ITER = 100 REPETITIONS = 5 @@ -38,10 +39,11 @@ def countAll(): for COMPUTE_FP in COMPUTE_FP_LIST: for DEVICE_SET in DEVICE_SET_LIST: for GRID in GRID_LIST: - if STORAGE_FP == 'double' and COMPUTE_FP == 'float': - continue + for HU in HU_LIST: + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue - counter += 1 + counter += 1 return counter @@ -61,42 +63,44 @@ def countAll(): for STORAGE_FP in STORAGE_FP_LIST: for COMPUTE_FP in COMPUTE_FP_LIST: for GRID in GRID_LIST: - if STORAGE_FP == 'double' and COMPUTE_FP == 'float': - continue + for HU in HU_LIST: - parameters = [] - parameters.append('--deviceType ' + DEVICE_TYPE) - parameters.append('--deviceIds ' + DEVICE_SET) - parameters.append('--grid ' + GRID) - parameters.append('--domain-size ' + DOMAIN_SIZE) - parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) - parameters.append('--repetitions ' + str(REPETITIONS)) - parameters.append('--max-iter ' + str(MAX_ITER)) - parameters.append( - '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + - DEVICE_TYPE + '_' + - DEVICE_SET.replace(' ', '_') + '-' + - GRID + '_' + - DOMAIN_SIZE + '-' + - STORAGE_FP + '-' + COMPUTE_FP + '-' + - OCC) - parameters.append('--computeFP ' + COMPUTE_FP) - parameters.append('--storageFP ' + STORAGE_FP) - parameters.append('--benchmark') - parameters.append('--' + OCC) + if STORAGE_FP == 'double' and 
COMPUTE_FP == 'float': + continue + + parameters = [] + parameters.append('--deviceType ' + DEVICE_TYPE) + parameters.append('--deviceIds ' + DEVICE_SET) + parameters.append('--grid ' + GRID) + parameters.append('--domain-size ' + DOMAIN_SIZE) + parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) + parameters.append('--repetitions ' + str(REPETITIONS)) + parameters.append('--max-iter ' + str(MAX_ITER)) + parameters.append( + '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + + DEVICE_TYPE + '_' + + DEVICE_SET.replace(' ', '_') + '-' + + GRID + '_' + + DOMAIN_SIZE + '-' + + STORAGE_FP + '-' + COMPUTE_FP + '-' + + OCC) + parameters.append('--computeFP ' + COMPUTE_FP) + parameters.append('--storageFP ' + STORAGE_FP) + parameters.append('--benchmark') + parameters.append('--' + OCC) - commandList = [] - commandList.append(command) - for el in parameters: - for s in el.split(): - commandList.append(s) + commandList = [] + commandList.append(command) + for el in parameters: + for s in el.split(): + commandList.append(s) - fp.write("\n-------------------------------------------\n") - fp.write(' '.join(commandList)) - fp.write("\n-------------------------------------------\n") - fp.flush() - print(' '.join(commandList)) - subprocess.run(commandList, text=True, stdout=fp) + fp.write("\n-------------------------------------------\n") + fp.write(' '.join(commandList)) + fp.write("\n-------------------------------------------\n") + fp.flush() + print(' '.join(commandList)) + subprocess.run(commandList, text=True, stdout=fp) - counter += 1 - printProgressBar(counter * 100.0 / SAMPLES, 'Progress') + counter += 1 + printProgressBar(counter * 100.0 / SAMPLES, 'Progress') From 0bdce94ec294e0a6e142b704625882939906894e Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 23 Jun 2023 13:00:21 -0400 Subject: [PATCH 14/94] Adding halo option. 
--- .../lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py index 795cb046..677aefba 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py @@ -67,7 +67,7 @@ def countAll(): if STORAGE_FP == 'double' and COMPUTE_FP == 'float': continue - + parameters = [] parameters.append('--deviceType ' + DEVICE_TYPE) parameters.append('--deviceIds ' + DEVICE_SET) @@ -88,6 +88,7 @@ def countAll(): parameters.append('--storageFP ' + STORAGE_FP) parameters.append('--benchmark') parameters.append('--' + OCC) + parameters.append('--' + HU) commandList = [] commandList.append(command) From 3dc808eaff2f0eb39423c76176223224087784e1 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 23 Jun 2023 18:52:44 -0400 Subject: [PATCH 15/94] WIP --- .../src/RunCavityTwoPop.cu | 2 +- .../Neon/core/types/vec/vec3d_integer.tdecl.h | 6 +- libNeonDomain/include/Neon/domain/Grids.h | 1 + libNeonDomain/include/Neon/domain/dGridSoA.h | 7 + .../Neon/domain/details/dGrid/dIndex.h | 4 +- .../Neon/domain/details/dGrid/dIndex_imp.h | 4 +- .../Neon/domain/details/dGrid/dPartition.h | 268 +++++++++--------- .../Neon/domain/details/dGrid/dSpan_imp.h | 16 +- .../Neon/domain/details/dGridSoA/dGridSoA.h | 29 +- .../domain/details/dGridSoA/dPartitionSoA.h | 140 +++++---- .../Neon/domain/details/dGridSoA/dSpanSoA.h | 5 + .../domain/details/dGridSoA/dSpanSoA_imp.h | 17 +- .../Neon/domain/details/eGrid/ePartition.h | 2 +- .../Neon/domain/tools/GridTransformer.h | 7 +- .../Neon/domain/tools/gridTransformer/tGrid.h | 11 +- .../domain/tools/gridTransformer/tGrid_ti.h | 28 ++ .../tests/domain-globalIdx/src/globalIdx.cu | 22 +- .../tests/domain-globalIdx/src/globalIdx.h | 5 +- 
.../tests/domain-globalIdx/src/gtests.cpp | 15 +- libNeonDomain/tests/domain-map/src/gtests.cpp | 9 + libNeonDomain/tests/domain-map/src/map.cu | 2 + libNeonDomain/tests/domain-map/src/map.h | 3 + 22 files changed, 365 insertions(+), 238 deletions(-) create mode 100644 libNeonDomain/include/Neon/domain/dGridSoA.h diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index d28688d1..29c7573d 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -315,7 +315,7 @@ auto run(Config& config, return details::runFilterStoreType(config, report); } if (config.gridType == "dGridSoA") { - return details::runFilterStoreType(config, report); + return details::runFilterStoreType(config, report); } } } // namespace CavityTwoPop diff --git a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h index acdae410..ae475c6e 100644 --- a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h +++ b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h @@ -56,6 +56,10 @@ class Vec_3d num_axis = 3 }; + static constexpr int directionX = axis_e::x_axis; + static constexpr int directionY = axis_e::y_axis; + static constexpr int directionZ = axis_e::z_axis; + union { Integer v[axis_e::num_axis]{0, 0, 0}; @@ -120,7 +124,7 @@ class Vec_3d NEON_CUDA_HOST_DEVICE inline void constexpr set(Integer p[self_t::num_axis]); - NEON_CUDA_HOST_DEVICE inline void constexpr set(const self_t& other); + NEON_CUDA_HOST_DEVICE inline void constexpr set(const self_t& other); NEON_CUDA_HOST_DEVICE inline void constexpr set(const Integer& xyz); diff --git a/libNeonDomain/include/Neon/domain/Grids.h b/libNeonDomain/include/Neon/domain/Grids.h index aad0cda5..7c899b98 100644 --- a/libNeonDomain/include/Neon/domain/Grids.h +++ 
b/libNeonDomain/include/Neon/domain/Grids.h @@ -3,3 +3,4 @@ #include "Neon/domain/aGrid.h" #include "Neon/domain/eGrid.h" #include "Neon/domain/bGrid.h" +#include "Neon/domain/dGridSoA.h" diff --git a/libNeonDomain/include/Neon/domain/dGridSoA.h b/libNeonDomain/include/Neon/domain/dGridSoA.h new file mode 100644 index 00000000..bdd63f25 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/dGridSoA.h @@ -0,0 +1,7 @@ +#pragma once +#include "Neon/domain/details/dGridSoA/dGridSoA.h" + + +namespace Neon { +using dGridSoA = Neon::domain::details::dGridSoA::dGridSoA; +} \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h index 3291e622..a2c57cdb 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h @@ -37,9 +37,9 @@ struct dIndex NEON_CUDA_HOST_DEVICE inline explicit dIndex(const Location& location); - NEON_CUDA_HOST_DEVICE inline auto set() -> Location&; + NEON_CUDA_HOST_DEVICE inline auto setLocation() -> Location&; - NEON_CUDA_HOST_DEVICE inline auto get() const -> const Location&; + NEON_CUDA_HOST_DEVICE inline auto getLocation() const -> const Location&; }; } // namespace Neon::domain::details::dGrid diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h index 4389fb3f..6426e43a 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h @@ -16,11 +16,11 @@ NEON_CUDA_HOST_DEVICE inline dIndex::dIndex(const Location::Integer &x, mLocation.z = z; } -NEON_CUDA_HOST_DEVICE inline auto dIndex::set() -> Location& +NEON_CUDA_HOST_DEVICE inline auto dIndex::setLocation() -> Location& { return mLocation; } -NEON_CUDA_HOST_DEVICE inline auto dIndex::get() const -> const Location& +NEON_CUDA_HOST_DEVICE inline auto 
dIndex::getLocation() const -> const Location& { return mLocation; } diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h index 31e480aa..86faf619 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h @@ -44,16 +44,16 @@ class dPartition int cardinality, Neon::index_3d fullGridSize, NghIdx* stencil = nullptr) - : m_dataView(dataView), - m_mem(mem), - m_dim(dim), - m_zHaloRadius(zHaloRadius), - m_zBoundaryRadius(zBoundaryRadius), - m_pitch(pitch), - m_prtID(prtID), - m_origin(origin), - m_cardinality(cardinality), - m_fullGridSize(fullGridSize), + : mDataView(dataView), + mMem(mem), + mDim(dim), + mZHaloRadius(zHaloRadius), + mZBoundaryRadius(zBoundaryRadius), + mPitch(pitch), + mPrtID(prtID), + mOrigin(origin), + mCardinality(cardinality), + mFullGridSize(fullGridSize), mPeriodicZ(false), mStencil(stencil) { @@ -70,21 +70,21 @@ class dPartition prtID() const -> int { - return m_prtID; + return mPrtID; } inline NEON_CUDA_HOST_DEVICE auto cardinality() const -> int { - return m_cardinality; + return mCardinality; } inline NEON_CUDA_HOST_DEVICE auto getPitchData() const -> const Pitch& { - return m_pitch; + return mPitch; } inline NEON_CUDA_HOST_DEVICE auto @@ -92,76 +92,76 @@ class dPartition int cardinalityIdx = 0) const -> int64_t { - return idx.get().x * int64_t(m_pitch.x) + - idx.get().y * int64_t(m_pitch.y) + - idx.get().z * int64_t(m_pitch.z) + - cardinalityIdx * int64_t(m_pitch.w); + return idx.getLocation().x * int64_t(mPitch.x) + + idx.getLocation().y * int64_t(mPitch.y) + + idx.getLocation().z * int64_t(mPitch.z) + + cardinalityIdx * int64_t(mPitch.w); } inline NEON_CUDA_HOST_DEVICE auto dim() const -> const Neon::index_3d { - return m_dim; + return mDim; } inline NEON_CUDA_HOST_DEVICE auto halo() const -> const Neon::index_3d { - return Neon::index_3d(0, 0, m_zHaloRadius); + return 
Neon::index_3d(0, 0, mZHaloRadius); } inline NEON_CUDA_HOST_DEVICE auto origin() const -> const Neon::index_3d { - return m_origin; + return mOrigin; } NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, NghIdx nghOffset, int card, const T& alternativeVal) const -> NghData { - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, nghOffset, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh); T val = alternativeVal; if (isValidNeighbour) { - val = operator()(cellNgh, card); + val = operator()(gidxNgh, card); } return NghData(val, isValidNeighbour); } NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, NghIdx nghOffset, int card) const -> NghData { - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, nghOffset, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh); T val; if (isValidNeighbour) { - val = operator()(cellNgh, card); + val = operator()(gidxNgh, card); } return NghData(val, isValidNeighbour); } template NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, int card, LambdaVALID funIfValid, LambdaNOTValid funIfNOTValid = nullptr) const -> std::enable_if_t, void> { - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); if (isValidNeighbour) { - T val = this->operator()(cellNgh, card); + T val = this->operator()(gidxNgh, card); funIfValid(val); } if constexpr (!std::is_same_v) { @@ -171,131 +171,130 @@ class dPartition } } - template + template NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, int card) const -> NghData { - NghData res; - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); + T val; if (isValidNeighbour) { 
- T val = operator()(cellNgh, card); - res.set(val, true); - } else { - res.invalidate(); + val = operator()(gidxNgh, card); } - return res; + return NghData(val, isValidNeighbour); } template NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, int card, T const& defaultValue) const -> NghData { NghData res(defaultValue, false); - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); if (isValidNeighbour) { - T val = operator()(cellNgh, card); + T val = operator()(gidxNgh, card); res.set(val, true); } return res; } NEON_CUDA_HOST_DEVICE inline auto - nghVal(const Idx& eId, + nghVal(const Idx& gidx, uint8_t nghID, int card, const T& alternativeVal) const -> NghData { NghIdx nghOffset = mStencil[nghID]; - return getNghData(eId, nghOffset, card, alternativeVal); + return getNghData(gidx, nghOffset, card, alternativeVal); } /** * Get the index of the neighbor given the offset * @tparam dataView_ta - * @param[in] eId Index of the current element + * @param[in] gidx Index of the current element * @param[in] nghOffset Offset of the neighbor of interest from the current element * @param[in,out] neighbourIdx Index of the neighbor * @return Whether the neighbour is valid */ NEON_CUDA_HOST_DEVICE inline auto - nghIdx(const Idx& eId, - const NghIdx& nghOffset, - Idx& neighbourIdx) + helpGetNghIdx(const Idx& gidx, + const NghIdx& nghOffset, + Idx& neighbourIdx) const -> bool { - Idx cellNgh(eId.get().x + nghOffset.x, - eId.get().y + nghOffset.y, - eId.get().z + nghOffset.z); + Idx gidxNgh(gidx.getLocation().x + nghOffset.x, + gidx.getLocation().y + nghOffset.y, + gidx.getLocation().z + nghOffset.z); - const auto cellNghGlobal = getGlobalIndex(cellNgh); + const auto gidxNghGlobal = getGlobalIndex(gidxNgh); bool isValidNeighbour = true; - if (mPeriodicZ) { - printf("Error, periodic not implemented yet"); - assert(false); - } - - isValidNeighbour = 
(cellNghGlobal.x >= 0) && - (cellNghGlobal.y >= 0) && - (cellNghGlobal.z >= 0); - - // isValidNeighbour = (cellNgh.get().x < m_dim.x) && - // (cellNgh.get().y < m_dim.y) && - // (cellNgh.get().z < m_dim.z + 2 * m_zHaloRadius) && isValidNeighbour; + isValidNeighbour = (gidxNghGlobal.x >= 0) && + (gidxNghGlobal.y >= 0) && + (gidxNghGlobal.z >= 0); - isValidNeighbour = (cellNghGlobal.x < m_fullGridSize.x) && - (cellNghGlobal.y < m_fullGridSize.y) && - (cellNghGlobal.z < m_fullGridSize.z) && + isValidNeighbour = (gidxNghGlobal.x < mFullGridSize.x) && + (gidxNghGlobal.y < mFullGridSize.y) && + (gidxNghGlobal.z < mFullGridSize.z) && isValidNeighbour; if (isValidNeighbour) { - neighbourIdx = cellNgh; + neighbourIdx = gidxNgh; } return isValidNeighbour; } template NEON_CUDA_HOST_DEVICE inline auto - nghIdx(const Idx& eId, - Idx& cellNgh) + helpGetNghIdx(const Idx& gidx, + Idx& gidxNgh) const -> bool { - cellNgh = Idx(eId.get().x + xOff, - eId.get().y + yOff, - eId.get().z + zOff); - Idx cellNgh_global(cellNgh.get() + m_origin); - // const bool isValidNeighbour = (cellNgh_global >= 0 && cellNgh < (m_dim + m_halo) && cellNgh_global < m_fullGridSize); - bool isValidNeighbour = true; - if constexpr (xOff > 0) { - isValidNeighbour = cellNgh.get().x < (m_dim.x) && isValidNeighbour; - isValidNeighbour = cellNgh_global.get().x <= m_fullGridSize.x && isValidNeighbour; - } - if constexpr (xOff < 0) { - isValidNeighbour = cellNgh_global.get().x >= 0 && isValidNeighbour; - } - if constexpr (yOff > 0) { - isValidNeighbour = cellNgh.get().y < (m_dim.y) && isValidNeighbour; - isValidNeighbour = cellNgh_global.get().y <= m_fullGridSize.y && isValidNeighbour; - } - if constexpr (yOff < 0) { - isValidNeighbour = cellNgh_global.get().y >= 0 && isValidNeighbour; - } - if constexpr (zOff > 0) { - isValidNeighbour = cellNgh.get().z < (m_dim.z + m_zHaloRadius * 2) && isValidNeighbour; - isValidNeighbour = cellNgh_global.get().z <= m_fullGridSize.z && isValidNeighbour; - } - if constexpr (zOff < 
0) { - isValidNeighbour = cellNgh_global.get().z >= m_zHaloRadius && isValidNeighbour; - } - return isValidNeighbour; + return helpGetNghIdx(gidx, NghIdx{xOff, yOff, zOff}, gidxNgh); + // gidxNgh = Idx(gidx.getLocation().x + xOff, + // gidx.getLocation().y + yOff, + // gidx.getLocation().z + zOff); + // + // bool isValidNeighbour = true; + // if constexpr (xOff > 0) { + // int constexpr direction = Neon::index_3d::directionX; + // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + // isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + // } + // if constexpr (xOff < 0) { + // int constexpr direction = Neon::index_3d::directionX; + // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + // isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + // } + // if constexpr (yOff > 0) { + // int constexpr direction = Neon::index_3d::directionY; + // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + // isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + // } + // if constexpr (yOff < 0) { + // int constexpr direction = Neon::index_3d::directionY; + // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + // isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + // } + // if constexpr (zOff > 0) { + // int constexpr direction = Neon::index_3d::directionZ; + // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + // isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + // } + // if constexpr (zOff < 0) { + // int constexpr direction = Neon::index_3d::directionZ; + // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + // isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + // } + // return isValidNeighbour; } @@ -303,7 +302,7 @@ class dPartition mem() -> T* { - return m_mem; + return mMem; } 
NEON_CUDA_HOST_DEVICE inline auto @@ -311,7 +310,7 @@ class dPartition const -> const T* { - return m_mem; + return mMem; } NEON_CUDA_HOST_DEVICE inline auto @@ -319,7 +318,7 @@ class dPartition int cardinalityIdx) -> T* { int64_t p = getPitch(cell, cardinalityIdx); - return m_mem[p]; + return mMem[p]; } NEON_CUDA_HOST_DEVICE inline auto @@ -327,7 +326,7 @@ class dPartition int cardinalityIdx) -> T& { int64_t p = getPitch(cell, cardinalityIdx); - return m_mem[p]; + return mMem[p]; } NEON_CUDA_HOST_DEVICE inline auto @@ -335,7 +334,7 @@ class dPartition int cardinalityIdx) const -> const T& { int64_t p = getPitch(cell, cardinalityIdx); - return m_mem[p]; + return mMem[p]; } template @@ -386,22 +385,35 @@ class dPartition // local.mLocation.y < m_dim.y && // local.mLocation.z < m_dim.z + m_zHaloRadius); - Neon::index_3d result = local.mLocation + m_origin; - result.z -= m_zHaloRadius; + Neon::index_3d result = local.mLocation; + result.z = result.z + mOrigin.z - mZHaloRadius; return result; } + template + NEON_CUDA_HOST_DEVICE inline auto getGlobalIndexByDirection(const Idx& local) + const -> int + { + if constexpr (Neon::index_3d::directionZ != direction) { + return local.mLocation.v[direction]; + } else { + return local.mLocation.v[Neon::index_3d::directionZ] + + mOrigin.v[Neon::index_3d::directionZ] - + mZHaloRadius; + } + } + NEON_CUDA_HOST_DEVICE inline auto getDomainSize() const -> Neon::index_3d { - return m_fullGridSize; + return mFullGridSize; } auto ioToVti(std::string const& fname, std::string const& fieldName) { - auto fnameCommplete = fname + "_" + std::to_string(m_prtID); - auto haloOrigin = Vec_3d(m_origin.x, m_origin.y, m_origin.z - m_zHaloRadius); - auto haloDim = m_dim + Neon::index_3d(0, 0, 2 * m_zHaloRadius) + 1; + auto fnameCommplete = fname + "_" + std::to_string(mPrtID); + auto haloOrigin = Vec_3d(mOrigin.x, mOrigin.y, mOrigin.z - mZHaloRadius); + auto haloDim = mDim + Neon::index_3d(0, 0, 2 * mZHaloRadius) + 1; IoToVTK io(fnameCommplete, 
haloDim, @@ -413,35 +425,35 @@ class dPartition io.addField([&](const Neon::index_3d& idx, int i) { return operator()(dIndex(idx), i); }, - m_cardinality, "Partition", ioToVTKns::VtiDataType_e::voxel); + mCardinality, "Partition", ioToVTKns::VtiDataType_e::voxel); io.flushAndClear(); return; } auto getDataView() - -> Neon::DataView + const -> Neon::DataView { - return m_dataView; + return mDataView; } - auto helpGetGlobalToLocalOffets() const - -> NghIdx const* + auto helpGetGlobalToLocalOffets() + const -> NghIdx* { return mStencil; } private: - Neon::DataView m_dataView; - T* NEON_RESTRICT m_mem; - Neon::index_3d m_dim; - int m_zHaloRadius; - int m_zBoundaryRadius; - Pitch m_pitch; - int m_prtID; - Neon::index_3d m_origin; - int m_cardinality; - Neon::index_3d m_fullGridSize; + Neon::DataView mDataView; + T* NEON_RESTRICT mMem; + Neon::index_3d mDim; + int mZHaloRadius; + int mZBoundaryRadius; + Pitch mPitch; + int mPrtID; + Neon::index_3d mOrigin; + int mCardinality; + Neon::index_3d mFullGridSize; bool mPeriodicZ; NghIdx* NEON_RESTRICT mStencil; }; diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h index 8f6f9fea..9fb56572 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h @@ -10,29 +10,29 @@ dSpan::setAndValidate(Idx& idx, const -> bool { bool res = false; - idx.set().x = int(x); - idx.set().y = int(y); - idx.set().z = int(z); + idx.setLocation().x = int(x); + idx.setLocation().y = int(y); + idx.setLocation().z = int(z); - if (idx.get() < mDim) { + if (idx.getLocation() < mDim) { res = true; } switch (mDataView) { case Neon::DataView::STANDARD: { - idx.set().z += mZHaloRadius; + idx.setLocation().z += mZHaloRadius; return res; } case Neon::DataView::INTERNAL: { - idx.set().z += mZHaloRadius + mZBoundaryRadius; + idx.setLocation().z += mZHaloRadius + mZBoundaryRadius; return res; } case 
Neon::DataView::BOUNDARY: { - idx.set().z += idx.get().z < mZBoundaryRadius + idx.setLocation().z += idx.getLocation().z < mZBoundaryRadius ? 0 : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); - idx.set().z += mZHaloRadius; + idx.setLocation().z += mZHaloRadius; return res; } diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h index 61b182b2..7ce3e582 100644 --- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h @@ -28,21 +28,22 @@ #include "Neon/domain/patterns/PatternScalar.h" #include "dPartitionSoA.h" +#include "dSpanSoA.h" namespace Neon::domain::details::dGridSoA { namespace details { struct dGridSoATransformation { + using FoundationGrid = Neon::domain::details::dGrid::dGrid; + using Idx = dIndexSoA; + using Span = dSpanSoA; template using Partition = dPartitionSoA; - using Span = Neon::domain::details::eGrid::eSpan; - static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on; - using FoundationGrid = Neon::domain::details::eGrid::eGrid; - static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = FoundationGrid::executionThreadSpan; + static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on; + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = FoundationGrid::executionThreadSpan; using ExecutionThreadSpanIndexType = int32_t; - using Idx = FoundationGrid::Idx; static auto getDefaultBlock(FoundationGrid& foundationGrid) -> Neon::index_3d const& { @@ -55,7 +56,7 @@ struct dGridSoATransformation Neon::SetIdx setIdx, Neon::DataView dw, Span& span) { - span = foundationGrid.getSpan(execution, setIdx, dw); 
+ span.helpInit(foundationGrid.getSpan(execution, setIdx, dw)); }); } @@ -67,14 +68,14 @@ struct dGridSoATransformation return foundationGrid.getLaunchParameters(dataView, blockSize, shareMem); } - static auto helpGetGridIdx(FoundationGrid&, - Neon::SetIdx const&, - FoundationGrid::Idx const& fgIdx) - -> GridTransformation::Idx - { - GridTransformation::Idx tgIdx = fgIdx; - return tgIdx; - } + // static auto helpGetGridIdx(FoundationGrid&, + // Neon::SetIdx const&, + // FoundationGrid::Idx const& fgIdx) + // -> dGridSoATransformation::Idx + // { + // dGridSoATransformation::Idx tgIdx = fgIdx; + // return tgIdx; + // } template static auto initFieldPartition(FoundationGrid::Field& foundationField, diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h index fc4c3642..1cdd75db 100644 --- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h @@ -19,12 +19,13 @@ class dPartitionSoA using Idx = dIndexSoA; using NghData = Neon::domain::NghData; using Pitch = uint32_4d; + using NghIdx = int8_3d; dPartitionSoA() { } - dPartitionSoA(Neon::domain::details::dGrid::dPartition const& dPartitionOriginal) + dPartitionSoA(Neon::domain::details::dGrid::dPartition& dPartitionOriginal) { mDataView = dPartitionOriginal.getDataView(); mMem = dPartitionOriginal.mem(); @@ -34,22 +35,22 @@ class dPartitionSoA mPrtID = dPartitionOriginal.prtID(); mOrigin = dPartitionOriginal.origin(); mCardinality = dPartitionOriginal.cardinality(); - mFullGridSize = dPartitionOriginal.fullGridSize(); - NghIdx* mStencil = dPartitionOriginal.helpGetGlobalToLocalOffets(); + mFullGridSize = dPartitionOriginal.getDomainSize(); + mStencil = dPartitionOriginal.helpGetGlobalToLocalOffets(); } inline NEON_CUDA_HOST_DEVICE auto prtID() const -> int { - return mPrtID(); + return mPrtID; } inline NEON_CUDA_HOST_DEVICE auto cardinality() 
const -> int { - return mCardinality(); + return mCardinality; } inline NEON_CUDA_HOST_DEVICE auto @@ -62,30 +63,30 @@ class dPartitionSoA inline NEON_CUDA_HOST_DEVICE auto getPitch(const Idx& idx, int cardinality) - -> Idx::Offset + const -> Idx::Offset { - return idx.getLocationOffset() + cardinality * mPitch.w; + return idx.getOffset() + cardinality * mPitch.w; } inline NEON_CUDA_HOST_DEVICE auto dim() const -> const Neon::index_3d { - return mDim(); + return mDim; } inline NEON_CUDA_HOST_DEVICE auto halo() const -> const Neon::index_3d { - return mDPartition.halo(); + return Neon::index_3d(0, 0, mZHaloRadius); } inline NEON_CUDA_HOST_DEVICE auto origin() const -> const Neon::index_3d { - return m_ormDPartition.origin(); + return mOrigin; } NEON_CUDA_HOST_DEVICE inline auto @@ -96,7 +97,7 @@ class dPartitionSoA const -> NghData { Idx gidxNgh; - const bool isValidNeighbour = nghIdx(gidx, nghOffset, gidxNgh); + const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh); T val = alternativeVal; if (isValidNeighbour) { val = operator()(gidxNgh, card); @@ -111,7 +112,7 @@ class dPartitionSoA const -> NghData { Idx gidxNgh; - const bool isValidNeighbour = nghIdx(gidx, nghOffset, gidxNgh); + const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh); T val; if (isValidNeighbour) { val = operator()(gidxNgh, card); @@ -132,7 +133,7 @@ class dPartitionSoA const -> std::enable_if_t, void> { Idx gidxNgh; - const bool isValidNeighbour = nghIdx(gidx, gidxNgh); + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); if (isValidNeighbour) { T val = this->operator()(gidxNgh, card); funIfValid(val); @@ -152,7 +153,7 @@ class dPartitionSoA { NghData res; Idx gidxNgh; - const bool isValidNeighbour = nghIdx(gidx, gidxNgh); + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); if (isValidNeighbour) { T val = operator()(gidxNgh, card); res.set(val, true); @@ -171,7 +172,7 @@ class dPartitionSoA { NghData res(defaultValue, false); Idx gidxNgh; - 
const bool isValidNeighbour = nghIdx(gidx, gidxNgh); + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); if (isValidNeighbour) { T val = operator()(gidxNgh, card); res.set(val, true); @@ -199,31 +200,31 @@ class dPartitionSoA * @return Whether the neighbour is valid */ NEON_CUDA_HOST_DEVICE inline auto - nghIdx(const Idx& gidx, - const NghIdx& nghOffset, - Idx& neighbourIdx) + helpGetNghIdx(const Idx& gidx, + const NghIdx& nghOffset, + Idx& neighbourIdx) const -> bool { - Neon::index_3d cartesian(gidx.get().x + nghOffset.x, - gidx.get().y + nghOffset.y, - gidx.get().z + nghOffset.z); + Neon::index_3d cartesian(gidx.getLocation().x + nghOffset.x, + gidx.getLocation().y + nghOffset.y, + gidx.getLocation().z + nghOffset.z); - neighbourIdx = Idx(cartesian, - gidx.getOffset() + nghOffset.x * getPitchData().x + - nghOffset.y * getPitchData().y + - nghOffset.z * getPitchData().z); + neighbourIdx = Idx(cartesian, gidx.getOffset() + + nghOffset.x * getPitchData().x + + nghOffset.y * getPitchData().y + + nghOffset.z * getPitchData().z); - Idx::Location nghCartesianGlobal = getGlobalIndex(gidxNgh); + Neon::index_3d const nghCartesianIdx = getGlobalIndex(neighbourIdx); bool isValidNeighbour = true; - isValidNeighbour = (gidxNghGlobal.x >= 0) && - (gidxNghGlobal.y >= 0) && - (gidxNghGlobal.z >= 0); + isValidNeighbour = (nghCartesianIdx.x >= 0) && + (nghCartesianIdx.y >= 0) && + (nghCartesianIdx.z >= 0); - isValidNeighbour = (gidxNghGlobal.x < m_fullGridSize.x) && - (gidxNghGlobal.y < m_fullGridSize.y) && - (gidxNghGlobal.z < m_fullGridSize.z) && + isValidNeighbour = (nghCartesianIdx.x < mFullGridSize.x) && + (nghCartesianIdx.y < mFullGridSize.y) && + (nghCartesianIdx.z < mFullGridSize.z) && isValidNeighbour; return isValidNeighbour; @@ -235,37 +236,46 @@ class dPartitionSoA Idx& gidxNgh) const -> bool { - Neon::index_3d cartesian(gidx.get().x + xOff, - gidx.get().y + yOff, - gidx.get().z + zOff); - gidxNgh = Idx(cartesian, - gidx.getOffset() + xOff * getPitchData().x 
+ - yOff * getPitchData().y + - zOff * getPitchData().z); - - Idx::Location nghCartesianGlobal(getGlobalIndex(gidxNgh)); + { + Neon::index_3d cartesian(gidx.getLocation().x + xOff, + gidx.getLocation().y + yOff, + gidx.getLocation().z + zOff); + gidxNgh = Idx(cartesian, gidx.getOffset() + + xOff * getPitchData().x + + yOff * getPitchData().y + + zOff * getPitchData().z); + } bool isValidNeighbour = true; if constexpr (xOff > 0) { - isValidNeighbour = cellNgh.get().x < (m_dim.x) && isValidNeighbour; - isValidNeighbour = nghCartesianGlobal.x <= mDPartition.m_fullGridSize.x && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionX; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; } if constexpr (xOff < 0) { - isValidNeighbour = nghCartesianGlobal.x >= 0 && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionX; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; } if constexpr (yOff > 0) { - isValidNeighbour = cellNgh.get().y < (m_dim.y) && isValidNeighbour; - isValidNeighbour = nghCartesianGlobal.y <= mDPartition.m_fullGridSize.y && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionY; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; } if constexpr (yOff < 0) { - isValidNeighbour = nghCartesianGlobal.y >= 0 && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionY; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; } if constexpr (zOff > 0) { - isValidNeighbour = cellNgh.get().z < (m_dim.z + m_zHaloRadius * 2) && isValidNeighbour; - isValidNeighbour = nghCartesianGlobal.z <= 
mDPartition.m_fullGridSize.z && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionZ; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; } if constexpr (zOff < 0) { - isValidNeighbour = nghCartesianGlobal.z >= mDPartition.m_zHaloRadius && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionZ; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; } return isValidNeighbour; } @@ -274,14 +284,14 @@ class dPartitionSoA mem() -> T* { - return mDPartition.m_mem; + return mMem; } NEON_CUDA_HOST_DEVICE inline auto mem() const -> const T* { - return mDPartition.m_mem; + return mMem; } NEON_CUDA_HOST_DEVICE inline auto @@ -290,7 +300,7 @@ class dPartitionSoA -> T* { Idx::Offset p = getPitch(cell, cardinalityIdx); - return mDPartition.m_mem[p]; + return mMem[p]; } NEON_CUDA_HOST_DEVICE inline auto @@ -299,7 +309,7 @@ class dPartitionSoA -> T& { Idx::Offset p = getPitch(cell, cardinalityIdx); - return mDPartition.m_mem[p]; + return mMem[p]; } NEON_CUDA_HOST_DEVICE inline auto @@ -308,21 +318,35 @@ class dPartitionSoA const -> const T& { Idx::Offset p = getPitch(cell, cardinalityIdx); - return mDPartition.m_mem[p]; + return mMem[p]; } NEON_CUDA_HOST_DEVICE inline auto getGlobalIndex(const Idx& local) const -> Neon::index_3d { - Neon::index_3d result = local.mLocation + m_origin; - result.z -= mDPartition.m_zHaloRadius; + Neon::index_3d result = local.mLocation + mOrigin; + result.z -= mZHaloRadius; return result; } + template + NEON_CUDA_HOST_DEVICE inline auto getGlobalIndexByDirection(const Idx& local) + const -> int + { + if constexpr (Neon::index_3d::directionZ != direction) { + return local.mLocation.v[direction] + + mOrigin.v[direction]; + } else { + return local.mLocation.v[Neon::index_3d::directionZ] + + 
mOrigin.v[Neon::index_3d::directionZ] - + mZHaloRadius; + } + } + NEON_CUDA_HOST_DEVICE inline auto getDomainSize() const -> Neon::index_3d { - return mDPartition.m_fullGridSize; + return mFullGridSize; } Neon::DataView mDataView; diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h index 83d5a2dc..3aee038c 100644 --- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h @@ -1,6 +1,8 @@ #pragma once #include "Neon/set/DevSet.h" #include "dIndexSoA.h" +#include "Neon/domain/details/dGrid/dSpan.h" + namespace Neon::domain::details::dGridSoA { /** @@ -40,6 +42,9 @@ class dSpanSoA helpGetDim() const -> Neon::index_3d const&; + NEON_CUDA_HOST_DEVICE inline auto + helpInit(Neon::domain::details::dGrid::dSpan const&) ->void; + private: Neon::DataView mDataView; int mZHaloRadius; diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h index a3dff4cf..421a3f27 100644 --- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h @@ -14,19 +14,19 @@ dSpanSoA::setAndValidate(Idx& idx, idx.setLocation().y = int(y); idx.setLocation().z = int(z); - if (idx.get() < mDim) { + if (idx.getLocation() < mDim) { res = true; } switch (mDataView) { case Neon::DataView::STANDARD: { idx.setLocation().z += mZHaloRadius; - idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; + idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; return res; } case Neon::DataView::INTERNAL: { idx.setLocation().z += mZHaloRadius + mZBoundaryRadius; - idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * 
mDim.y; + idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; return res; } case Neon::DataView::BOUNDARY: { @@ -35,7 +35,7 @@ dSpanSoA::setAndValidate(Idx& idx, ? 0 : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); idx.setLocation().z += mZHaloRadius; - idx.setLocationOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; + idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; return res; } default: { @@ -68,4 +68,13 @@ NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetDim() return mDim; } +NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpInit(Neon::domain::details::dGrid::dSpan const& dspan) ->void +{ + mDataView = dspan.helpGetDataView(); + mZHaloRadius = dspan.helpGetZHaloRadius(); + mZBoundaryRadius = dspan.helpGetZBoundaryRadius(); + mDim = dspan.helpGetDim(); +} + + } // namespace Neon::domain::details::dGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h index 012a3588..62b75981 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h @@ -59,7 +59,7 @@ class ePartition * | * | Connectivity table has the same layout of a field with cardinality equal to * | the number of neighbours and an SoA layout. Let's call this field nghField. - * | nghField(e, nghIdx) is the eIdx_t of the neighbour element as in a STANDARD + * | nghField(e, helpGetNghIdx) is the eIdx_t of the neighbour element as in a STANDARD * | view. 
* |--) */ diff --git a/libNeonDomain/include/Neon/domain/tools/GridTransformer.h b/libNeonDomain/include/Neon/domain/tools/GridTransformer.h index 90556fb9..47518f7a 100644 --- a/libNeonDomain/include/Neon/domain/tools/GridTransformer.h +++ b/libNeonDomain/include/Neon/domain/tools/GridTransformer.h @@ -1,10 +1,10 @@ #pragma once +#include "Neon/domain/tools/PartitionTable.h" +#include "Neon/domain/tools/SpanTable.h" #include "Neon/domain/tools/gridTransformer/tField.h" #include "Neon/domain/tools/gridTransformer/tGrid.h" #include "Neon/domain/tools/gridTransformer/tGrid_ti.h" -#include "Neon/domain/tools/PartitionTable.h" -#include "Neon/domain/tools/SpanTable.h" namespace Neon::domain::tool { @@ -24,9 +24,10 @@ template class GridTransformer { public: + using Idx = typename GridTransformation::Idx; + using Span = typename GridTransformation::Span; template using Partition = typename GridTransformation::template Partition; - using Span = typename GridTransformation::Span; using FoundationGrid = typename GridTransformation::FoundationGrid; using Grid = details::tGrid; diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h index d6d98be1..bd28e8f5 100644 --- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h +++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h @@ -54,6 +54,15 @@ class tGrid : public Neon::domain::interface::GridBaseTemplate + tGrid(const Neon::Backend& backend /**< Target for computation */, + const Neon::int32_3d& dimension /**< Dimension of the bounding box containing the domain */, + const SparsityPattern& activeCellLambda /**< InOrOutLambda({x,y,z}->{true, false}) */, + const Neon::domain::Stencil& stencil /**< Stencil used by any computation on the grid */, + const Vec_3d& spacing = Vec_3d(1, 1, 1) /**< Spacing, i.e. 
size of a voxel */, + const Vec_3d& origin = Vec_3d(0, 0, 0) /**< Origin */); + tGrid(const tGrid& other); // copy constructor tGrid(tGrid&& other) noexcept; // move constructor tGrid& operator=(const tGrid& other); // copy assignment @@ -109,7 +118,7 @@ class tGrid : public Neon::domain::interface::GridBaseTemplate(bk); } diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h index 4ba1403d..0a0249d7 100644 --- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h +++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h @@ -30,6 +30,34 @@ tGrid::tGrid(FoundationGrid& foundationGrid) foundationGrid.getOrigin()); } +template +template +tGrid::tGrid(const Neon::Backend& bk, + const Neon::int32_3d& dimension, + const SparsityPattern& activeCellLambda, + const Neon::domain::Stencil& stencil, + const Vec_3d& spacing, + const Vec_3d& origin) +{ + mData = std::make_shared(bk); + mData->foundationGrid = FoundationGrid(bk, + dimension, + activeCellLambda, + stencil, + spacing, + origin); + GridTransformation::initSpan(mData->foundationGrid, + NEON_OUT mData->spanTable); + tGrid::GridBase::init("tGrid", + bk, + mData->foundationGrid.getDimension(), + mData->foundationGrid.getStencil(), + mData->foundationGrid.getNumActiveCellsPerPartition(), + mData->foundationGrid.getDefaultBlock(), + mData->foundationGrid.getSpacing(), + mData->foundationGrid.getOrigin()); +} + template tGrid::tGrid() { diff --git a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu index 158d3e05..1b94b566 100644 --- a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu +++ b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu @@ -1,5 +1,6 @@ #include #include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" #include "TestInformation.h" @@ -27,18 +28,18 @@ 
auto defContainer(int streamIdx, return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable { // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val); Neon::index_3d globalPoint = a.getGlobalIndex(e); - a(e, 0) = globalPoint.x ; + a(e, 0) = globalPoint.x; b(e, 0) = globalPoint.y; c(e, 0) = globalPoint.z; -// if constexpr (std::is_same_v) { -// printf("Block %d Th %d %d %d Loc %d %d %d\n", e.mDataBlockIdx, -// e.mInDataBlockIdx.x, -// e.mInDataBlockIdx.y, -// e.mInDataBlockIdx.z, -// globalPoint.x, -// globalPoint.y, -// globalPoint.z); -// } + // if constexpr (std::is_same_v) { + // printf("Block %d Th %d %d %d Loc %d %d %d\n", e.mDataBlockIdx, + // e.mInDataBlockIdx.x, + // e.mInDataBlockIdx.y, + // e.mInDataBlockIdx.z, + // globalPoint.x, + // globalPoint.y, + // globalPoint.z); + // } }; }); } @@ -98,5 +99,6 @@ auto run(TestData& data) -> void template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; } // namespace globalIdx \ No newline at end of file diff --git a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h index 0a3b87eb..c766f7ca 100644 --- a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h +++ b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h @@ -3,9 +3,9 @@ #include #include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" - namespace globalIdx { using namespace Neon::domain::tool::testing; @@ -15,6 +15,7 @@ auto run(TestData& data) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; -} // namespace map +} // namespace globalIdx diff --git a/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp index 783830ca..f0ecce78 
100644 --- a/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp +++ b/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp @@ -4,7 +4,7 @@ #include "globalIdx.h" #include "runHelper.h" -TEST(domain_unit_test_globalIdx, dGrid) +TEST(domain_globalIdx, dGrid) { int nGpus = 3; using Type = int64_t; @@ -13,7 +13,7 @@ TEST(domain_unit_test_globalIdx, dGrid) 1); } -TEST(domain_unit_test_globalIdx, eGrid) +TEST(domain_globalIdx, eGrid) { int nGpus = 3; using Type = int64_t; @@ -22,7 +22,7 @@ TEST(domain_unit_test_globalIdx, eGrid) 1); } -TEST(domain_unit_test_globalIdx, bGrid) +TEST(domain_globalIdx, bGrid) { int nGpus = 3; using Type = int64_t; @@ -31,6 +31,15 @@ TEST(domain_unit_test_globalIdx, bGrid) 1); } +TEST(domain_globalIdx, dGridSoA) +{ + int nGpus = 3; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::run), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-map/src/gtests.cpp b/libNeonDomain/tests/domain-map/src/gtests.cpp index d0d43b60..50d6e34d 100644 --- a/libNeonDomain/tests/domain-map/src/gtests.cpp +++ b/libNeonDomain/tests/domain-map/src/gtests.cpp @@ -31,6 +31,15 @@ TEST(domain_map, bGrid) 1); } +TEST(domain_map, dGridSoA) +{ + int nGpus = 1; + using Type = int64_t; + runAllTestConfiguration(std::function(map::run), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-map/src/map.cu b/libNeonDomain/tests/domain-map/src/map.cu index bd25f178..b001d832 100644 --- a/libNeonDomain/tests/domain-map/src/map.cu +++ b/libNeonDomain/tests/domain-map/src/map.cu @@ -4,6 +4,7 @@ #include "Neon/domain/tools/TestData.h" #include "TestInformation.h" #include "gtest/gtest.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" namespace map { @@ -75,6 +76,7 @@ auto run(TestData& data) -> void template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto 
run(TestData&) -> void; +template auto run(TestData&) -> void; } // namespace map \ No newline at end of file diff --git a/libNeonDomain/tests/domain-map/src/map.h b/libNeonDomain/tests/domain-map/src/map.h index 611f2046..16073657 100644 --- a/libNeonDomain/tests/domain-map/src/map.h +++ b/libNeonDomain/tests/domain-map/src/map.h @@ -4,6 +4,7 @@ #include "Neon/domain/Grids.h" #include "Neon/domain/tools/TestData.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" namespace map { @@ -14,6 +15,8 @@ auto run(TestData& data) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; } // namespace map From ceab2a6f62dd72d4faedfadaea2be33b3ab4f565 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 26 Jun 2023 11:32:35 -0400 Subject: [PATCH 16/94] domain_neighbour_globalIdx for dGridSoA --- .../Neon/domain/details/dGrid/dPartition.h | 75 +++--- .../domain-neighbour-globalIdx/src/gtests.cpp | 55 ++++- .../src/runHelper.h | 1 + .../src/testsAndContainers.cu | 220 ++++++++++++++++-- .../src/testsAndContainers.h | 9 + 5 files changed, 306 insertions(+), 54 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h index 86faf619..2becc97d 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h @@ -258,43 +258,44 @@ class dPartition Idx& gidxNgh) const -> bool { - return helpGetNghIdx(gidx, NghIdx{xOff, yOff, zOff}, gidxNgh); - // gidxNgh = Idx(gidx.getLocation().x + xOff, - // gidx.getLocation().y + yOff, - // gidx.getLocation().z + zOff); - // - // bool isValidNeighbour = true; - // if constexpr (xOff > 0) { - // int constexpr direction = Neon::index_3d::directionX; - // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); - // 
isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; - // } - // if constexpr (xOff < 0) { - // int constexpr direction = Neon::index_3d::directionX; - // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); - // isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; - // } - // if constexpr (yOff > 0) { - // int constexpr direction = Neon::index_3d::directionY; - // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); - // isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; - // } - // if constexpr (yOff < 0) { - // int constexpr direction = Neon::index_3d::directionY; - // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); - // isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; - // } - // if constexpr (zOff > 0) { - // int constexpr direction = Neon::index_3d::directionZ; - // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); - // isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; - // } - // if constexpr (zOff < 0) { - // int constexpr direction = Neon::index_3d::directionZ; - // int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); - // isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; - // } - // return isValidNeighbour; + // NghIdx offset(xOff, yOff, zOff); + // return helpGetNghIdx(gidx, offset, gidxNgh); + gidxNgh = Idx(gidx.getLocation().x + xOff, + gidx.getLocation().y + yOff, + gidx.getLocation().z + zOff); + + bool isValidNeighbour = true; + if constexpr (xOff > 0) { + int constexpr direction = Neon::index_3d::directionX; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + } + if constexpr (xOff < 0) { + int constexpr direction = Neon::index_3d::directionX; + int const cartesianByDirection = 
getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + } + if constexpr (yOff > 0) { + int constexpr direction = Neon::index_3d::directionY; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + } + if constexpr (yOff < 0) { + int constexpr direction = Neon::index_3d::directionY; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + } + if constexpr (zOff > 0) { + int constexpr direction = Neon::index_3d::directionZ; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + } + if constexpr (zOff < 0) { + int constexpr direction = Neon::index_3d::directionZ; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + } + return isValidNeighbour; } diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp index feba5a9b..21bba9b5 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp @@ -1,10 +1,10 @@ +#include "./testsAndContainers.h" #include "Neon/Neon.h" #include "gtest/gtest.h" -#include "./testsAndContainers.h" #include "runHelper.h" -TEST(domain_unit_test_globalIdx, dGrid) +TEST(domain_neighbour_globalIdx, dGrid) { int nGpus = 5; using Type = int64_t; @@ -13,7 +13,7 @@ TEST(domain_unit_test_globalIdx, dGrid) 1); } -TEST(domain_unit_test_globalIdx, eGrid) +TEST(domain_neighbour_globalIdx, eGrid) { int nGpus = 5; using Type = int64_t; @@ -22,7 +22,7 @@ TEST(domain_unit_test_globalIdx, eGrid) 1); } -TEST(domain_unit_test_globalIdx, bGrid) 
+TEST(domain_neighbour_globalIdx, bGrid) { int nGpus = 5; using Type = int64_t; @@ -31,6 +31,53 @@ TEST(domain_unit_test_globalIdx, bGrid) 1); } +TEST(domain_neighbour_globalIdx, dGridSoA) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::run), + nGpus, + 1); +} + +/////////////////////////////////////////// + +TEST(domain_neighbour_globalIdx, dGrid_template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + +TEST(domain_neighbour_globalIdx, eGrid_template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + +TEST(domain_neighbour_globalIdx, bGrid_template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + +TEST(domain_neighbour_globalIdx, dGridSoA_template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h index 0014594c..32a078d6 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h @@ -9,6 +9,7 @@ #include "Neon/domain/dGrid.h" #include "Neon/domain/eGrid.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/Geometries.h" #include "Neon/domain/tools/TestData.h" diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu index 49dd3bd2..7b2c3fef 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu +++ 
b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu @@ -1,5 +1,6 @@ #include #include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" #include "TestInformation.h" @@ -61,15 +62,15 @@ auto checkNeighbourData(Field const& filedA, Field const& filedB, Field const& filedC, Neon::index_3d testDirection, - Field const& checkFlatA, - Field const& checkFlatB, - Field const& checkFlatC) + Field& checkFlatA, + Field& checkFlatB, + Field& checkFlatC) -> Neon::set::Container { const auto& grid = filedA.getGrid(); return grid.newContainer( "defContainer", - [&](Neon::set::Loader& loader) { + [&, testDirection](Neon::set::Loader& loader) { auto a = loader.load(filedA, Neon::Pattern::STENCIL); auto b = loader.load(filedB, Neon::Pattern::STENCIL); auto c = loader.load(filedC, Neon::Pattern::STENCIL); @@ -102,6 +103,58 @@ auto checkNeighbourData(Field const& filedA, }); } +template +auto checkNeighbourDataTemplate(Field const& filedA, + Field const& filedB, + Field const& filedC, + Field& checkFlatA, + Field& checkFlatB, + Field& checkFlatC) + -> Neon::set::Container +{ + const auto& grid = filedA.getGrid(); + return grid.newContainer( + "defContainer", + [&](Neon::set::Loader& loader) { + auto a = loader.load(filedA, Neon::Pattern::STENCIL); + auto b = loader.load(filedB, Neon::Pattern::STENCIL); + auto c = loader.load(filedC, Neon::Pattern::STENCIL); + + auto resA = loader.load(checkFlatA, Neon::Pattern::MAP); + auto resB = loader.load(checkFlatB, Neon::Pattern::MAP); + auto resC = loader.load(checkFlatC, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable { + constexpr Neon::index_3d testDirection(xOff, yOff, zOff); + + // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val); + Neon::index_3d globalPoint = a.getGlobalIndex(e); + auto ngh = globalPoint + testDirection; + + decltype(a)* nghInfo[3] = {&a, &b, &c}; + decltype(a)* results[3] = 
{&resA, &resB, &resC}; + + for (int i = 0; i < 3; i++) { + auto d = nghInfo[i]->template getNghData(e, 0); + // auto d = nghInfo[i]->getNghData(e, testDirection.newType(), 0); + + if (d.isValid()) { + results[i]->operator()(e, 0) = d.getData() == ngh.v[i] ? +1 : -1; + if (d.getData() != ngh.v[i]) { + printf("ERROR: %d %d %d %d %d %d\n", globalPoint.x, globalPoint.y, globalPoint.z, ngh.v[0], ngh.v[1], ngh.v[2]); + d = nghInfo[i]->getNghData(e, testDirection.newType(), 0); + } + } else { + results[i]->operator()(e, 0) = 0; + } + } + }; + }); +} + using namespace Neon::domain::tool::testing; template @@ -165,15 +218,15 @@ auto run(TestData& data) -> void X, Y, Z); }; - // constexpr std::array - // stencil{Ngh3DIdx(1, 0, 0), - // Ngh3DIdx(-1, 0, 0), - // Ngh3DIdx(0, 1, 0), - // Ngh3DIdx(0, -1, 0), - // Ngh3DIdx(0, 0, 1), - // Ngh3DIdx(0, 0, -1)}; - constexpr std::array - stencil{Ngh3DIdx(0, 0, -1)}; + constexpr std::array + stencil{Ngh3DIdx(1, 0, 0), + Ngh3DIdx(-1, 0, 0), + Ngh3DIdx(0, 1, 0), + Ngh3DIdx(0, -1, 0), + Ngh3DIdx(0, 0, 1), + Ngh3DIdx(0, 0, -1)}; + // constexpr std::array + // stencil{Ngh3DIdx(0, 0, -1)}; for (auto const& direction : stencil) { reset(aField, bField, cField).run(Neon::Backend::mainStreamIdx); @@ -214,8 +267,149 @@ auto run(TestData& data) -> void } } +template +auto runTemplate(TestData& data) -> void +{ + + using Type = typename TestData::Type; + auto& grid = data.getGrid(); + const std::string appName = TestInformation::fullName(grid.getImplementationName()); + + data.resetValuesToLinear(1, 100); + + auto aField = grid.template newField("a", 1, 0); + auto bField = grid.template newField("a", 1, 0); + auto cField = grid.template newField("a", 1, 0); + + auto& X = data.getField(FieldNames::X); + auto& Y = data.getField(FieldNames::Y); + auto& Z = data.getField(FieldNames::Z); + + const Neon::index_3d dim = grid.getDimension(); + auto bk = grid.getBackend(); + + { // NEON + { + initData(aField, bField, 
cField).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + } + } + using Ngh3DIdx = Neon::int32_3d; + + auto setGolden = [&](Ngh3DIdx const& direction) { // Golden data + auto& X = data.getIODomain(FieldNames::X); + auto& Y = data.getIODomain(FieldNames::Y); + auto& Z = data.getIODomain(FieldNames::Z); + + data.forEachActiveIODomain([&](const Neon::index_3d& idx, + int cardinality, + Type& a, + Type& b, + Type& c) { + a = 1; + b = 1; + c = 1; + auto ngh = direction + idx; + if (!(ngh >= 0)) { + a = 0; + b = 0; + c = 0; + } + if (!(dim > ngh)) { + a = 0; + b = 0; + c = 0; + } + }, + X, Y, Z); + }; + + constexpr std::array + stencil{Ngh3DIdx(1, 0, 0), + Ngh3DIdx(-1, 0, 0), + Ngh3DIdx(0, 1, 0), + Ngh3DIdx(0, -1, 0), + Ngh3DIdx(0, 0, 1), + Ngh3DIdx(0, 0, -1)}; + // constexpr std::array + // stencil{Ngh3DIdx(0, 0, -1)}; + + for (auto const& direction : stencil) { + reset(aField, bField, cField).run(Neon::Backend::mainStreamIdx); + reset(X, Y, Z).run(Neon::Backend::mainStreamIdx); + { // Updating halo with wrong data + bk.sync(Neon::Backend::mainStreamIdx); + aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, 
Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + } + { + initData(aField, bField, cField).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + } + + + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + + if (direction == Neon::index_3d(1, 0, 0)) { + checkNeighbourDataTemplate<1, 0, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(-1, 0, 0)) { + checkNeighbourDataTemplate<-1, 0, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(0, 1, 0)) { + checkNeighbourDataTemplate<0, 1, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(0, -1, 0)) { + checkNeighbourDataTemplate<0, -1, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(0, 0, 1)) { + checkNeighbourDataTemplate<0, 0, 1>(aField, bField, cField, X, Y, 
Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(0, 0, -1)) { + checkNeighbourDataTemplate<0, 0, -1>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else { + std::cout << "Direction not implemented " << direction << std::endl; + exit(99); + } + setGolden(direction); + + bk.sync(Neon::Backend::mainStreamIdx); + bool isOk = data.compare(FieldNames::X); + isOk = isOk && data.compare(FieldNames::Y); + isOk = isOk && data.compare(FieldNames::Z); + + if (!isOk) { + std::cout << "Direction with errors " << direction << std::endl; + data.getField(FieldNames::X).ioToVtk(grid.getImplementationName() + "X", "X", true); + data.getField(FieldNames::Y).ioToVtk(grid.getImplementationName() + "Y", "Y", true); + data.getField(FieldNames::Z).ioToVtk(grid.getImplementationName() + "Z", "Z", true); + exit(77); + ASSERT_TRUE(isOk); + } + } +} + + template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; + + +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; } // namespace globalIdx \ No newline at end of file diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h index 0a3b87eb..bcf503f2 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h @@ -4,6 +4,7 @@ #include "Neon/domain/Grids.h" #include "Neon/domain/tools/TestData.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" namespace 
globalIdx { @@ -12,9 +13,17 @@ using namespace Neon::domain::tool::testing; template auto run(TestData& data) -> void; +template +auto runTemplate(TestData& data) -> void; + extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; } // namespace map From 13377a4af18430dfc9bf7ec16afe2fcb2d209520 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 27 Jun 2023 10:08:19 -0400 Subject: [PATCH 17/94] Testing block sizes on bGrid --- .../lbm-lid-driven-cavity-flow/src/LbmTools.h | 8 +- .../src/LbmToolsTemplateOnly.h | 440 ++++++++++++++++++ .../src/RunCavityTwoPop.cu | 27 +- .../domain/details/dGridSoA/dSpanSoA_imp.h | 52 ++- .../tests/domain-map/src/runHelper.h | 4 +- 5 files changed, 501 insertions(+), 30 deletions(-) create mode 100644 benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h index 5728a5d3..ab79ed2a 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h @@ -35,7 +35,6 @@ struct LbmContainers(i, GOid, 0.0).value; \ @@ -101,8 +100,6 @@ struct LbmContainers; constexpr std::array stencil{ @@ -160,7 +157,6 @@ struct LbmContainers(pop_out_06); + fOut(i, 16) = static_cast(pop_out_opp_06); COMPUTE_GO_AND_BACK(7, 17) COMPUTE_GO_AND_BACK(8, 18) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h new file mode 100644 index 00000000..fc4d7806 --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h @@ 
-0,0 +1,440 @@ +#include "CellType.h" +#include "D3Q19.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + +#define COMPUTE_CAST(VAR) static_cast((VAR)) + +template +struct LbmContainersTemplateOnly +{ +}; + +/** + * Specialization for Lattice + * @tparam PopulationField + * @tparam LbmComputeType + */ +template +struct LbmContainersTemplateOnly, + PopulationField, + LbmComputeType> +{ + using LbmStoreType = typename PopulationField::Type; + using CellTypeField = typename PopulationField::Grid::template Field; + using Lattice = D3Q19Template; + using Idx = typename PopulationField::Idx; + using Grid = typename PopulationField::Grid; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + +#define LOADPOP(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ + { \ + { /*GO*/ \ + if (wallBitFlag & (uint32_t(1) << GOid)) { \ + /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \ + popIn[GOid] = fin.template read(gidx); \ + } else { \ + popIn[GOid] = fin.template nghVal(gidx).value; \ + } \ + } \ + { /*BK*/ \ + if (wallBitFlag & (uint32_t(1) << BKid)) { \ + popIn[BKid] = fin.template read(gidx); \ + } else { \ + popIn[BKid] = fin.template nghVal(gidx).value; \ + } \ + } \ + } + static inline NEON_CUDA_HOST_DEVICE auto + loadPopulation(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopulationField::Partition const& fin, + NEON_OUT LbmStoreType popIn[19]) + { + // #pragma omp critical + // { + + LOADPOP(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10); + LOADPOP(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11); + LOADPOP(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12); + LOADPOP(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13); + LOADPOP(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14); + LOADPOP(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15); + LOADPOP(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16); + 
LOADPOP(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17); + LOADPOP(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18); + // } + // Treat the case of the center (c[k] = {0, 0, 0,}). + { + popIn[Lattice::centerDirection] = fin(i, Lattice::centerDirection); + } + } +#undef LOADPOP + +#define PULL_STREAM(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ + { \ + { /*GO*/ \ + if (wallBitFlag & (uint32_t(1) << GOid)) { \ + /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \ + popIn[GOid] = fin(gidx, BKid) + \ + fin.template getNghData(gidx, BKid)(); \ + } else { \ + popIn[GOid] = fin.template getNghData(gidx, GOid)(); \ + } \ + } \ + { /*BK*/ \ + if (wallBitFlag & (uint32_t(1) << BKid)) { \ + popIn[BKid] = fin(gidx, GOid) + fin.template getNghData(gidx, GOid)(); \ + } else { \ + popIn[BKid] = fin.template getNghData(gidx, BKid)(); \ + } \ + } \ + } + + static inline NEON_CUDA_HOST_DEVICE auto + pullStream(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopulationField::Partition const& fin, + NEON_OUT LbmStoreType popIn[19]) + { + // #pragma omp critical + // { +#if 0 + using TopologyByDirection = std::tuple; + constexpr std::array stencil{ + std::make_tuple(Neon::int32_3d(-1, 0, 0), /* GOid */ 0, /* --- */ Neon::int32_3d(1, 0, 0), /* BKid */ 10), + std::make_tuple(Neon::int32_3d(0, -1, 0), /* GOid */ 1, /* --- */ Neon::int32_3d(0, 1, 0), /* BKid */ 11), + std::make_tuple(Neon::int32_3d(0, 0, -1), /* GOid */ 2, /* --- */ Neon::int32_3d(0, 0, 1), /* BKid */ 12), + std::make_tuple(Neon::int32_3d(-1, -1, 0), /* GOid */ 3, /* --- */ Neon::int32_3d(1, 1, 0), /* BKid */ 13), + std::make_tuple(Neon::int32_3d(-1, 1, 0), /* GOid */ 4, /* --- */ Neon::int32_3d(1, -1, 0), /* BKid */ 14), + std::make_tuple(Neon::int32_3d(-1, 0, -1), /* GOid */ 5, /* --- */ Neon::int32_3d(1, 0, 1), /* BKid */ 15), + std::make_tuple(Neon::int32_3d(-1, 0, 1), /* GOid */ 6, /* --- */ Neon::int32_3d(1, 0, -1), /* BKid */ 16), + 
std::make_tuple(Neon::int32_3d(0, -1, -1), /* GOid */ 7, /* --- */ Neon::int32_3d(0, 1, 1), /* BKid */ 17), + std::make_tuple(Neon::int32_3d(0, -1, 1), /* GOid */ 8, /* --- */ Neon::int32_3d(0, 1, -1), /* BKid */ 18)}; + + + auto pullStream = [&]() { + static_assert(stencilIdx < 9); + constexpr int GOid = std::get<1>(stencil[stencilIdx]); + constexpr int BKid = std::get<3>(stencil[stencilIdx]); + constexpr Neon::int32_3d GoOffset = std::get<0>(stencil[stencilIdx]); + constexpr Neon::int32_3d BkOffset = std::get<2>(stencil[stencilIdx]); + { + if (wallBitFlag & (uint32_t(1) << GOid)) { + popIn[GOid] = fin(gidx, BKid) + + fin.template getNghData(gidx, BKid)(); + } else { + popIn[GOid] = fin.template getNghData(gidx, GOid)(); + } + } + { /*BK*/ + if (wallBitFlag & (uint32_t(1) << BKid)) { + popIn[BKid] = fin(gidx, GOid) + + fin.template getNghData(gidx, GOid)(); + } else { + popIn[BKid] = fin.template getNghData(gidx, BKid)(); + } + } + }; + pullStream.template operator()<0>(); + pullStream.template operator()<1>(); + pullStream.template operator()<2>(); + pullStream.template operator()<3>(); + pullStream.template operator()<4>(); + pullStream.template operator()<5>(); + pullStream.template operator()<6>(); + pullStream.template operator()<7>(); + pullStream.template operator()<8>(); +#endif + PULL_STREAM(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10); + PULL_STREAM(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11); + PULL_STREAM(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12); + PULL_STREAM(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13); + PULL_STREAM(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14); + PULL_STREAM(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15); + PULL_STREAM(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16); + PULL_STREAM(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17); + PULL_STREAM(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18); + + // } + // Treat the 
case of the center (c[k] = {0, 0, 0,}). + { + popIn[Lattice::centerDirection] = fin(gidx, Lattice::centerDirection); + } + } +#undef PULL_STREAM + + static inline NEON_CUDA_HOST_DEVICE auto + macroscopic(const LbmStoreType pop[Lattice::Q], + NEON_OUT LbmComputeType& rho, + NEON_OUT std::array& u) + -> void + { +#define POP(IDX) static_cast(pop[IDX]) + + const LbmComputeType X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6); + const LbmComputeType X_P1 = POP(10) + POP(13) + POP(14) + POP(15) + POP(16); + const LbmComputeType X_0 = POP(9) + POP(1) + POP(2) + POP(7) + POP(8) + POP(11) + POP(12) + POP(17) + POP(18); + + const LbmComputeType Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(14); + const LbmComputeType Y_P1 = POP(4) + POP(11) + POP(13) + POP(17) + POP(18); + + const LbmComputeType Z_M1 = POP(2) + POP(5) + POP(7) + POP(16) + POP(18); + const LbmComputeType Z_P1 = POP(6) + POP(8) + POP(12) + POP(15) + POP(17); + +#undef POP + + rho = X_M1 + X_P1 + X_0; + u[0] = (X_P1 - X_M1) / rho; + u[1] = (Y_P1 - Y_M1) / rho; + u[2] = (Z_P1 - Z_M1) / rho; + } + + + static inline NEON_CUDA_HOST_DEVICE auto + collideBgkUnrolled(Idx const& i /*! LbmComputeType iterator */, + const LbmStoreType pop[Lattice::Q], + LbmComputeType const& rho /*! Density */, + std::array const& u /*! Velocity */, + LbmComputeType const& usqr /*! Usqr */, + LbmComputeType const& omega /*! Omega */, + typename PopulationField::Partition& fOut /*! Population */) + + -> void + { + const LbmComputeType ck_u03 = u[0] + u[1]; + const LbmComputeType ck_u04 = u[0] - u[1]; + const LbmComputeType ck_u05 = u[0] + u[2]; + const LbmComputeType ck_u06 = u[0] - u[2]; + const LbmComputeType ck_u07 = u[1] + u[2]; + const LbmComputeType ck_u08 = u[1] - u[2]; + + const LbmComputeType eq_00 = rho * (1. / 18.) * (1. - 3. * u[0] + 4.5 * u[0] * u[0] - usqr); + const LbmComputeType eq_01 = rho * (1. / 18.) * (1. - 3. * u[1] + 4.5 * u[1] * u[1] - usqr); + const LbmComputeType eq_02 = rho * (1. / 18.) * (1. - 3. 
* u[2] + 4.5 * u[2] * u[2] - usqr); + const LbmComputeType eq_03 = rho * (1. / 36.) * (1. - 3. * ck_u03 + 4.5 * ck_u03 * ck_u03 - usqr); + const LbmComputeType eq_04 = rho * (1. / 36.) * (1. - 3. * ck_u04 + 4.5 * ck_u04 * ck_u04 - usqr); + const LbmComputeType eq_05 = rho * (1. / 36.) * (1. - 3. * ck_u05 + 4.5 * ck_u05 * ck_u05 - usqr); + const LbmComputeType eq_06 = rho * (1. / 36.) * (1. - 3. * ck_u06 + 4.5 * ck_u06 * ck_u06 - usqr); + const LbmComputeType eq_07 = rho * (1. / 36.) * (1. - 3. * ck_u07 + 4.5 * ck_u07 * ck_u07 - usqr); + const LbmComputeType eq_08 = rho * (1. / 36.) * (1. - 3. * ck_u08 + 4.5 * ck_u08 * ck_u08 - usqr); + + const LbmComputeType eqopp_00 = eq_00 + rho * (1. / 18.) * 6. * u[0]; + const LbmComputeType eqopp_01 = eq_01 + rho * (1. / 18.) * 6. * u[1]; + const LbmComputeType eqopp_02 = eq_02 + rho * (1. / 18.) * 6. * u[2]; + const LbmComputeType eqopp_03 = eq_03 + rho * (1. / 36.) * 6. * ck_u03; + const LbmComputeType eqopp_04 = eq_04 + rho * (1. / 36.) * 6. * ck_u04; + const LbmComputeType eqopp_05 = eq_05 + rho * (1. / 36.) * 6. * ck_u05; + const LbmComputeType eqopp_06 = eq_06 + rho * (1. / 36.) * 6. * ck_u06; + const LbmComputeType eqopp_07 = eq_07 + rho * (1. / 36.) * 6. * ck_u07; + const LbmComputeType eqopp_08 = eq_08 + rho * (1. / 36.) * 6. * ck_u08; + + const LbmComputeType pop_out_00 = (1. - omega) * static_cast(pop[0]) + omega * eq_00; + const LbmComputeType pop_out_01 = (1. - omega) * static_cast(pop[1]) + omega * eq_01; + const LbmComputeType pop_out_02 = (1. - omega) * static_cast(pop[2]) + omega * eq_02; + const LbmComputeType pop_out_03 = (1. - omega) * static_cast(pop[3]) + omega * eq_03; + const LbmComputeType pop_out_04 = (1. - omega) * static_cast(pop[4]) + omega * eq_04; + const LbmComputeType pop_out_05 = (1. - omega) * static_cast(pop[5]) + omega * eq_05; + const LbmComputeType pop_out_06 = (1. - omega) * static_cast(pop[6]) + omega * eq_06; + const LbmComputeType pop_out_07 = (1. 
- omega) * static_cast(pop[7]) + omega * eq_07; + const LbmComputeType pop_out_08 = (1. - omega) * static_cast(pop[8]) + omega * eq_08; + + const LbmComputeType pop_out_opp_00 = (1. - omega) * static_cast(pop[10]) + omega * eqopp_00; + const LbmComputeType pop_out_opp_01 = (1. - omega) * static_cast(pop[11]) + omega * eqopp_01; + const LbmComputeType pop_out_opp_02 = (1. - omega) * static_cast(pop[12]) + omega * eqopp_02; + const LbmComputeType pop_out_opp_03 = (1. - omega) * static_cast(pop[13]) + omega * eqopp_03; + const LbmComputeType pop_out_opp_04 = (1. - omega) * static_cast(pop[14]) + omega * eqopp_04; + const LbmComputeType pop_out_opp_05 = (1. - omega) * static_cast(pop[15]) + omega * eqopp_05; + const LbmComputeType pop_out_opp_06 = (1. - omega) * static_cast(pop[16]) + omega * eqopp_06; + const LbmComputeType pop_out_opp_07 = (1. - omega) * static_cast(pop[17]) + omega * eqopp_07; + const LbmComputeType pop_out_opp_08 = (1. - omega) * static_cast(pop[18]) + omega * eqopp_08; + + +#define COMPUTE_GO_AND_BACK(GOid, BKid) \ + { \ + fOut(i, GOid) = static_cast(pop_out_0##GOid); \ + fOut(i, BKid) = static_cast(pop_out_opp_0##GOid); \ + } + + COMPUTE_GO_AND_BACK(0, 10) + COMPUTE_GO_AND_BACK(1, 11) + COMPUTE_GO_AND_BACK(2, 12) + COMPUTE_GO_AND_BACK(3, 13) + COMPUTE_GO_AND_BACK(4, 14) + COMPUTE_GO_AND_BACK(5, 15) + COMPUTE_GO_AND_BACK(6, 16) + COMPUTE_GO_AND_BACK(7, 17) + COMPUTE_GO_AND_BACK(8, 18) + +#undef COMPUTE_GO_AND_BACK + + { + const LbmComputeType eq_09 = rho * (1. / 3.) * (1. - usqr); + const LbmComputeType pop_out_09 = (1. - omega) * + static_cast(pop[Lattice::centerDirection]) + + omega * eq_09; + fOut(i, Lattice::centerDirection) = static_cast(pop_out_09); + } + } + + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopulationField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const LbmComputeType omega /*! LBM omega parameter */, + PopulationField& fOutField /*! 
output Population field */) + -> Neon::set::Container + { + + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + LbmStoreType popIn[Lattice::Q]; + pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + + LbmComputeType rho; + std::array u{.0, .0, .0}; + macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + LbmComputeType usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + collideBgkUnrolled(gidx, + popIn, + rho, u, + usqr, omega, + NEON_OUT fOut); + } + }; + }); + return container; + } + +#define COMPUTE_MASK_WALL(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ + { \ + { /*GO*/ \ + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); \ + if (nghCellType.classification != CellType::bulk) { \ + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOid)); \ + } \ + } \ + { /*BK*/ \ + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); \ + if (nghCellType.classification != CellType::bulk) { \ + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << BKid)); \ + } \ + } \ + } + + static auto + computeWallNghMask(const CellTypeField& infoInField, + CellTypeField& infoOutpeField) + + -> Neon::set::Container + { + Neon::set::Container container = infoInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& infoIn = L.load(infoInField, + Neon::Pattern::STENCIL); + auto& infoOut = L.load(infoOutpeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + 
CellType cellType = infoIn(gidx, 0); + cellType.wallNghBitflag = 0; + + if (cellType.classification == CellType::bulk) { + COMPUTE_MASK_WALL(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10) + COMPUTE_MASK_WALL(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11) + COMPUTE_MASK_WALL(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12) + COMPUTE_MASK_WALL(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13) + COMPUTE_MASK_WALL(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14) + COMPUTE_MASK_WALL(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15) + COMPUTE_MASK_WALL(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16) + COMPUTE_MASK_WALL(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17) + COMPUTE_MASK_WALL(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18) + + infoOut(gidx, 0) = cellType; + } + }; + }); + return container; + } +#undef COMPUTE_MASK_WALL + +#define BC_LOAD(GOID, DKID) \ + popIn[GOID] = fIn(gidx, GOID); \ + popIn[DKID] = fIn(gidx, DKID); + + static auto + computeRhoAndU([[maybe_unused]] const PopulationField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! 
output Population field */) + + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + LbmComputeType rho = 0; + std::array u{.0, .0, .0}; + LbmStoreType popIn[Lattice::Q]; + + if (cellInfo.classification == CellType::bulk) { + pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + } else { + if (cellInfo.classification == CellType::movingWall) { + BC_LOAD(0, 10) + BC_LOAD(1, 11) + BC_LOAD(2, 12) + BC_LOAD(3, 13) + BC_LOAD(4, 14) + BC_LOAD(5, 15) + BC_LOAD(6, 16) + BC_LOAD(7, 17) + BC_LOAD(8, 18) + popIn[9] = fIn(gidx, 9); + + rho = 1.0; + u = std::array{COMPUTE_CAST(popIn[0]) / COMPUTE_CAST(6. * 1. / 18.), + COMPUTE_CAST(popIn[1]) / COMPUTE_CAST(6. * 1. / 18.), + COMPUTE_CAST(popIn[2]) / COMPUTE_CAST(6. * 1. 
/ 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } +}; + +#undef COMPUTE_CAST \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index 29c7573d..e91055f9 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -2,8 +2,8 @@ #include "D3Q19.h" #include "Neon/domain/bGrid.h" #include "Neon/domain/dGrid.h" -#include "Neon/domain/eGrid.h" #include "Neon/domain/details/dGridSoA/dGridSoA.h" +#include "Neon/domain/eGrid.h" #include "CellType.h" #include "LbmIteration.h" @@ -314,6 +314,31 @@ auto run(Config& config, if (config.gridType == "bGrid") { return details::runFilterStoreType(config, report); } + if (config.gridType == "bGrid_4_4_4") { + using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } + if (config.gridType == "bGrid_32_8_4") { + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } + if (config.gridType == "bGrid_32_8_4") { + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 4, 8>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } + if (config.gridType == "bGrid_32_2_8") { + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 2, 8>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } + if (config.gridType == "bGrid_32_8_2") { + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 2>; + using Grid = 
Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } if (config.gridType == "dGridSoA") { return details::runFilterStoreType(config, report); } diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h index 421a3f27..f760adb5 100644 --- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h @@ -4,71 +4,77 @@ namespace Neon::domain::details::dGridSoA { NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::setAndValidate(Idx& idx, - const uint32_t& x, - const uint32_t& y, - const uint32_t& z) + const uint32_t& x, + const uint32_t& y, + const uint32_t& z) const -> bool { - bool res = false; idx.setLocation().x = int(x); idx.setLocation().y = int(y); idx.setLocation().z = int(z); - if (idx.getLocation() < mDim) { - res = true; - } + bool isValid = idx.getLocation() < mDim; switch (mDataView) { case Neon::DataView::STANDARD: { idx.setLocation().z += mZHaloRadius; - idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; - return res; + idx.setOffset() = idx.getLocation().x + + idx.getLocation().y * mDim.x + + idx.getLocation().z * mDim.x * mDim.y; + break ; } case Neon::DataView::INTERNAL: { idx.setLocation().z += mZHaloRadius + mZBoundaryRadius; - idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; - return res; + idx.setOffset() = idx.getLocation().x + + idx.getLocation().y * mDim.x + + idx.getLocation().z * mDim.x * mDim.y; + break ; } case Neon::DataView::BOUNDARY: { - idx.setLocation().z += idx.getLocation().z < mZBoundaryRadius - ? 0 - : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); + ? 
0 + : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); idx.setLocation().z += mZHaloRadius; - idx.setOffset() = idx.getLocation().x + idx.getLocation().y * mDim.x + idx.getLocation().z * mDim.x * mDim.y; - return res; + idx.setOffset() = idx.getLocation().x + + idx.getLocation().y * mDim.x + + idx.getLocation().z * mDim.x * mDim.y; + break ; } default: { } } - return false; + return isValid; } -NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetDataView() +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::helpGetDataView() const -> Neon::DataView const& { return mDataView; } -NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetZHaloRadius() +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::helpGetZHaloRadius() const -> int const& { return mZHaloRadius; } -NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetZBoundaryRadius() +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::helpGetZBoundaryRadius() const -> int const& { return mZBoundaryRadius; } -NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpGetDim() +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::helpGetDim() const -> Neon::index_3d const& { return mDim; } -NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpInit(Neon::domain::details::dGrid::dSpan const& dspan) ->void +NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpInit(Neon::domain::details::dGrid::dSpan const& dspan) -> void { mDataView = dspan.helpGetDataView(); mZHaloRadius = dspan.helpGetZHaloRadius(); @@ -77,4 +83,4 @@ NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpInit(Neon::domain::details::dGr } -} // namespace Neon::domain::details::dGrid \ No newline at end of file +} // namespace Neon::domain::details::dGridSoA \ No newline at end of file diff --git a/libNeonDomain/tests/domain-map/src/runHelper.h b/libNeonDomain/tests/domain-map/src/runHelper.h index 53ea8681..593e31c2 100644 --- a/libNeonDomain/tests/domain-map/src/runHelper.h +++ b/libNeonDomain/tests/domain-map/src/runHelper.h @@ 
-31,7 +31,7 @@ void runAllTestConfiguration( nGpuTest.push_back(i); } // std::vector nGpuTest{2,4,6,8}; - std::vector cardinalityTest{1}; + std::vector cardinalityTest{1,3,19}; std::vector dimTest{{10, 17, 13}, {1, 1, 100}, {17, 1, 77}}; std::vector runtimeE{Neon::Runtime::openmp}; @@ -95,6 +95,7 @@ void runAllTestConfiguration( } } +#if 0 template void runOneTestConfiguration(const std::string& gname, @@ -144,3 +145,4 @@ void runOneTestConfiguration(const std::string& gname, } } } +#endif \ No newline at end of file From 3a36f0c81e830a170712227b463a8c4d7631cf26 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 28 Jun 2023 12:55:22 -0400 Subject: [PATCH 18/94] Adding dGridSoA to the stencil tests --- .../Neon/domain/details/dGridSoA/dPartitionSoA.h | 2 -- .../Neon/domain/tools/gridTransformer/tField.h | 1 + libNeonDomain/tests/domain-stencil/src/gtests.cpp | 11 ++++++++++- libNeonDomain/tests/domain-stencil/src/runHelper.h | 2 +- libNeonDomain/tests/domain-stencil/src/stencil.cu | 1 + libNeonDomain/tests/domain-stencil/src/stencil.h | 3 ++- 6 files changed, 15 insertions(+), 5 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h index 1cdd75db..62fdc9a4 100644 --- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h @@ -353,13 +353,11 @@ class dPartitionSoA T* NEON_RESTRICT mMem; Neon::index_3d mDim; int mZHaloRadius; - int mZBoundaryRadius; Pitch mPitch; int mPrtID; Neon::index_3d mOrigin; int mCardinality; Neon::index_3d mFullGridSize; - bool mPeriodicZ; NghIdx* NEON_RESTRICT mStencil; }; diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h index c9ca59b9..a1b4c90d 100644 --- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h +++ 
b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h @@ -26,6 +26,7 @@ class tField : public Neon::domain::interface::FieldBaseTemplate; using Idx = typename Partition::Idx; using NghIdx = typename Partition::NghIdx; // for compatibility with eGrid + using NghData = typename Partition::NghData; // for compatibility with eGrid private: using FoundationGrid = typename GridTransformation::FoundationGrid; diff --git a/libNeonDomain/tests/domain-stencil/src/gtests.cpp b/libNeonDomain/tests/domain-stencil/src/gtests.cpp index ec6f892a..15816da3 100644 --- a/libNeonDomain/tests/domain-stencil/src/gtests.cpp +++ b/libNeonDomain/tests/domain-stencil/src/gtests.cpp @@ -22,7 +22,7 @@ TEST(domain_stencil, eGrid) 1); } -TEST(domain_stencil, bGri ) +TEST(domain_stencil, bGri) { int nGpus = 5; using Type = int64_t; @@ -31,6 +31,15 @@ TEST(domain_stencil, bGri ) 1); } +TEST(domain_stencil, dGridSoA) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(map::run), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-stencil/src/runHelper.h b/libNeonDomain/tests/domain-stencil/src/runHelper.h index e8f286ae..16cefb0f 100644 --- a/libNeonDomain/tests/domain-stencil/src/runHelper.h +++ b/libNeonDomain/tests/domain-stencil/src/runHelper.h @@ -33,7 +33,7 @@ void runAllTestConfiguration( // std::vector nGpuTest{2,4,6,8}; std::vector cardinalityTest{1}; - std::vector dimTest{{10, 17, 13}, {1, 1, 100}, {17, 1, 77}}; + std::vector dimTest{{10, 17, 90}, {1, 1, 100}, {17, 1, 77}}; std::vector runtimeE{Neon::Runtime::openmp}; if (Neon::sys::globalSpace::gpuSysObjStorage.numDevs() > 0) { runtimeE.push_back(Neon::Runtime::stream); diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu index a86f1def..d0f19c67 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.cu +++ 
b/libNeonDomain/tests/domain-stencil/src/stencil.cu @@ -203,6 +203,7 @@ auto run(TestData& data) -> void template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; } // namespace map \ No newline at end of file diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.h b/libNeonDomain/tests/domain-stencil/src/stencil.h index a35d8011..7d74196a 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.h +++ b/libNeonDomain/tests/domain-stencil/src/stencil.h @@ -15,5 +15,6 @@ auto run(TestData& data) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; - +extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; } // namespace map From a49b27aeaeb83dfdd1ed47debba0fed99221a834 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 29 Jun 2023 11:27:58 -0400 Subject: [PATCH 19/94] WIP --- .../Neon/domain/details/bGrid/bPartition.h | 15 ++ .../domain/details/bGrid/bPartition_imp.h | 34 +++- .../domain/details/dGridSoA/dPartitionSoA.h | 1 + .../Neon/domain/details/eGrid/ePartition.h | 13 ++ .../domain/details/eGrid/ePartition_imp.h | 87 ++++++---- .../tests/domain-stencil/src/gtests.cpp | 52 +++++- .../tests/domain-stencil/src/stencil.cu | 158 +++++++++++++----- .../tests/domain-stencil/src/stencil.h | 20 ++- 8 files changed, 291 insertions(+), 89 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index 73ccb914..a03af559 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -98,6 +98,19 @@ class bPartition T defaultValue) const -> NghData; + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid = nullptr) + const 
-> std::enable_if_t &&( std::is_invocable_v || std::is_same_v), void>; + + /** * Gets the global coordinates of the cartesian point. */ @@ -134,6 +147,8 @@ class bPartition helpGetNghIdx(const Idx& idx) const -> Idx; + + int mCardinality; T* mMem; NghIdx const* NEON_RESTRICT mStencilNghIndex; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h index dc4c5880..5fa6f260 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h @@ -45,10 +45,10 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: location.x += gidx.mInDataBlockIdx.x; location.y += gidx.mInDataBlockIdx.y; location.z += gidx.mInDataBlockIdx.z; - if constexpr (SBlock::isMultiResMode){ + if constexpr (SBlock::isMultiResMode) { return location * mMultiResDiscreteIdxSpacing; } - return location ; + return location; } template @@ -354,4 +354,34 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: result.set(value, true); return result; } + +template + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void> +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(gidx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + + if (isValid) { + auto const& value = mMem[pitch]; + funIfValid(value); + return; + } + + if constexpr (!std::is_same_v) { + funIfNOTValid(); + } + return; +} } // namespace Neon::domain::details::bGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h index 62fdc9a4..0572302b 100644 --- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h +++ 
b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h @@ -20,6 +20,7 @@ class dPartitionSoA using NghData = Neon::domain::NghData; using Pitch = uint32_4d; using NghIdx = int8_3d; + using Type = T; dPartitionSoA() { diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h index 62b75981..05f3101b 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h @@ -188,6 +188,19 @@ class ePartition int card, T defaultValue) const -> NghData; + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid = nullptr) + const -> std::enable_if_t &&( std::is_invocable_v || std::is_same_v), void>; + /** * Check is the * @tparam dataView_ta diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h index 0063ee9e..8565cdc1 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h @@ -37,34 +37,34 @@ ePartition::cardinality() const template NEON_CUDA_HOST_DEVICE inline auto -ePartition::operator()(eIndex eId, int cardinalityIdx) const +ePartition::operator()(eIndex gidx, int cardinalityIdx) const -> T { - Offset jump = getOffset(eId, cardinalityIdx); + Offset jump = getOffset(gidx, cardinalityIdx); return mMem[jump]; } template NEON_CUDA_HOST_DEVICE inline auto -ePartition::operator()(eIndex eId, int cardinalityIdx) -> T& +ePartition::operator()(eIndex gidx, int cardinalityIdx) -> T& { - Offset jump = getOffset(eId, cardinalityIdx); + Offset jump = getOffset(gidx, cardinalityIdx); return mMem[jump]; } template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghData(eIndex eId, +ePartition::getNghData(eIndex gidx, NghIdx nghIdx, int card) const -> 
NghData { - eIndex eIdxNgh; - const bool isValidNeighbour = isValidNgh(eId, nghIdx, eIdxNgh); + eIndex gidxxNgh; + const bool isValidNeighbour = isValidNgh(gidx, nghIdx, gidxxNgh); if (isValidNeighbour) { - T val = this->operator()(eIdxNgh, card); + T val = this->operator()(gidxxNgh, card); return NghData(val, isValidNeighbour); } return NghData(isValidNeighbour); @@ -73,7 +73,7 @@ ePartition::getNghData(eIndex eId, template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghData(eIndex eId, +ePartition::getNghData(eIndex gidx, const Neon::int8_3d& ngh3dIdx, int card) const -> NghData @@ -82,7 +82,7 @@ ePartition::getNghData(eIndex eId, (ngh3dIdx.y + mStencilRadius) * mStencilTableYPitch + (ngh3dIdx.z + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; - NghData res = getNghData(eId, nghIdx, card); + NghData res = getNghData(gidx, nghIdx, card); return res; } @@ -91,15 +91,15 @@ template template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghData(eIndex eId, - int card) +ePartition::getNghData(eIndex gidx, + int card) const -> NghData { int tablePithc = (xOff + mStencilRadius) + (yOff + mStencilRadius) * mStencilTableYPitch + (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; - NghData res = getNghData(eId, nghIdx, card); + NghData res = getNghData(gidx, nghIdx, card); return res; } @@ -108,37 +108,66 @@ template template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghData(eIndex eId, - int card, - T defaultVal) +ePartition::getNghData(eIndex gidx, + int card, + T defaultVal) const -> NghData { int tablePithc = (xOff + mStencilRadius) + (yOff + mStencilRadius) * mStencilTableYPitch + (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; - NghData res = getNghData(eId, nghIdx, card); + NghData res = getNghData(gidx, nghIdx, card); if (!res.isValid()) { 
res.set(defaultVal, false); } return res; } +template +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void> +{ + int tablePithc = (xOff + mStencilRadius) + + (yOff + mStencilRadius) * mStencilTableYPitch + + (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; + NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; + NghData res = getNghData(gidx, nghIdx, card); + if (res.isValid()) { + funIfValid(res.getData()); + return; + } + if constexpr (!std::is_same_v) { + funIfNOTValid(); + } + return; +} + template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghIndex(eIndex eId, +ePartition::getNghIndex(eIndex gidx, const Neon::int8_3d& ngh3dIdx, - eIndex& eIdxNgh) const -> bool + eIndex& gidxxNgh) const -> bool { int tablePithc = (ngh3dIdx.x + mStencilRadius) + (ngh3dIdx.y + mStencilRadius) * mStencilTableYPitch + (ngh3dIdx.z + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; eIndex tmpEIdxNgh; - const bool isValidNeighbour = isValidNgh(eId, nghIdx, tmpEIdxNgh); + const bool isValidNeighbour = isValidNgh(gidx, nghIdx, tmpEIdxNgh); if (isValidNeighbour) { - eIdxNgh = tmpEIdxNgh; + gidxxNgh = tmpEIdxNgh; } return isValidNeighbour; } @@ -146,17 +175,17 @@ ePartition::getNghIndex(eIndex eId, template NEON_CUDA_HOST_DEVICE inline auto -ePartition::isValidNgh(eIndex eId, +ePartition::isValidNgh(eIndex gidx, NghIdx nghIdx, eIndex& neighbourIdx) const -> bool { - const eIndex::Offset connectivityJumo = mCountAllocated * nghIdx + eId.helpGet(); + const eIndex::Offset connectivityJumo = mCountAllocated * nghIdx + gidx.helpGet(); neighbourIdx.helpSet() = NEON_CUDA_CONST_LOAD((mConnectivity + connectivityJumo)); const bool isValidNeighbour = (neighbourIdx.mIdx > -1); - // printf("(prtId %d) getNghData id %d eIdxNgh %d 
connectivityJumo %d\n", + // printf("(prtId %d) getNghData id %d gidxxNgh %d connectivityJumo %d\n", // mPrtID, - // eId.mIdx, neighbourIdx.mIdx, connectivityJumo); + // gidx.mIdx, neighbourIdx.mIdx, connectivityJumo); return isValidNeighbour; } @@ -201,20 +230,20 @@ ePartition::ePartition(int prtId, template NEON_CUDA_HOST_DEVICE auto -ePartition::pointer(eIndex eId, int cardinalityIdx) const +ePartition::pointer(eIndex gidx, int cardinalityIdx) const -> const Type* { - Offset jump = getOffset(eId, cardinalityIdx); + Offset jump = getOffset(gidx, cardinalityIdx); return mMem + jump; } template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getOffset(eIndex eId, int cardinalityIdx) const +ePartition::getOffset(eIndex gidx, int cardinalityIdx) const -> Offset { - return Offset(eId.helpGet() * mPitch.x + cardinalityIdx * mPitch.y); + return Offset(gidx.helpGet() * mPitch.x + cardinalityIdx * mPitch.y); } template ), + runAllTestConfiguration(std::function(map::runNoTemplate), nGpus, 1); } -TEST(domain_stencil, eGrid) +TEST(domain_stencil, eGrid_NoTemplate) { int nGpus = 3; using Type = int64_t; - runAllTestConfiguration(std::function(map::run), + runAllTestConfiguration(std::function(map::runNoTemplate), nGpus, 1); } -TEST(domain_stencil, bGri) +TEST(domain_stencil, bGri_NoTemplate) { int nGpus = 5; using Type = int64_t; - runAllTestConfiguration(std::function(map::run), + runAllTestConfiguration(std::function(map::runNoTemplate), nGpus, 1); } -TEST(domain_stencil, dGridSoA) +TEST(domain_stencil, dGridSoA_NoTemplate) { int nGpus = 5; using Type = int64_t; - runAllTestConfiguration(std::function(map::run), + runAllTestConfiguration(std::function(map::runNoTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, dGrid_Template) +{ + int nGpus = 3; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, eGrid_Template) +{ + int nGpus = 3; + using Type = int64_t; + 
runAllTestConfiguration(std::function(map::runTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, bGri_Template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, dGridSoA_Template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runTemplate), nGpus, 1); } diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu index d0f19c67..926153fa 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.cu +++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu @@ -9,8 +9,8 @@ namespace map { template -auto stencilContainer_laplace(const Field& filedA, - Field& fieldB) +auto laplaceNoTemplate(const Field& filedA, + Field& fieldB) -> Neon::set::Container { const auto& grid = filedA.getGrid(); @@ -59,15 +59,22 @@ static constexpr std::array stencil{ Ngh3DIdx(0, 0, 1), Ngh3DIdx(0, 0, -1)}; -template -inline auto viaTemplate (const IDX& idx, int i, const Field& a, int& partial, int& count){ - a.template getNghData(idx, i, - [&](typename Field::Type const& val) { - partial += val; - count++; - }); +template +NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Partition& a, Partial& partial, int& count) +{ + Neon::index_3d direction(X, Y, Z); + auto nghData = a.getNghData(idx, direction.newType(), i); + if (nghData.isValid()) { + partial += nghData.getData(); + count++; + } + // a.template getNghData(idx, i, + // [&](typename Partition::Type const& val) { + // partial += val; + // count++; + // }); }; template @@ -88,36 +95,18 @@ auto stencilContainerLaplaceTemplate(const Field& filedA, // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val); typename Field::Type partial = 0; int count = 0; + using Ngh3DIdx = Neon::int8_3d; - constexpr std::array stencil{ - Ngh3DIdx(1, 0, 0), - Ngh3DIdx(-1, 0, 0), - Ngh3DIdx(0, 1, 0), - Ngh3DIdx(0, -1, 0), - Ngh3DIdx(0, 0, 
1), - Ngh3DIdx(0, 0, -1)}; -#if 0 - auto viaTemplate = [&]() { - if constexpr (std::is_same_v) { - a.template getNghData(idx, i, - [&](Field::Type const& val) { - partial += val; - count++; - }); - } - }; -#endif - viaTemplate<0>(idx, i, a, partial, count); - viaTemplate<1>(idx, i, a, partial, count); - viaTemplate<2>(idx, i, a, partial, count); - viaTemplate<3>(idx, i, a, partial, count); - viaTemplate<4>(idx, i, a, partial, count); - viaTemplate<5>(idx, i, a, partial, count); + viaTemplate<1, 0, 0>(idx, i, a, partial, count); + viaTemplate<-1, 0, 0>(idx, i, a, partial, count); + viaTemplate<0, 1, 0>(idx, i, a, partial, count); + viaTemplate<0, -1, 0>(idx, i, a, partial, count); + viaTemplate<0, 0, 1>(idx, i, a, partial, count); + viaTemplate<0, 0, -1>(idx, i, a, partial, count); - b(idx, i) = a(idx, i) - count * partial; + + b(idx, i) = a(idx, i) - count * partial ; } }; }); @@ -126,7 +115,82 @@ auto stencilContainerLaplaceTemplate(const Field& filedA, using namespace Neon::domain::tool::testing; template -auto run(TestData& data) -> void +auto runNoTemplate(TestData& data) -> void +{ + + using Type = typename TestData::Type; + auto& grid = data.getGrid(); + const std::string appName = TestInformation::fullName(grid.getImplementationName()); + const int maxIters = 1; + + NEON_INFO(grid.toString()); + + // data.resetValuesToLinear(1, 100); + data.resetValuesToMasked(1); + + { // NEON + const Neon::index_3d dim = grid.getDimension(); + std::vector elements; + auto bk = grid.getBackend(); + auto& X = data.getField(FieldNames::X); + auto& Y = data.getField(FieldNames::Y); + for (int iter = maxIters; iter > 0; iter--) { + bk.sync(Neon::Backend::mainStreamIdx); + X.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::put, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); + + bk.sync(Neon::Backend::mainStreamIdx); + laplaceNoTemplate(X, Y).run(Neon::Backend::mainStreamIdx); + + bk.sync(Neon::Backend::mainStreamIdx); + 
Y.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::get, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); + + bk.sync(Neon::Backend::mainStreamIdx); + laplaceNoTemplate(Y, X).run(Neon::Backend::mainStreamIdx); + } + data.getBackend().sync(0); + } + + { // Golden data + auto& X = data.getIODomain(FieldNames::X); + auto& Y = data.getIODomain(FieldNames::Y); + for (int iter = maxIters; iter > 0; iter--) { + data.laplace(X, Y); + data.laplace(Y, X); + } + } + + data.updateHostData(); + + data.getField(FieldNames::X).ioToVtk("X", "X", true); + // data.getField(FieldNames::Y).ioToVtk("Y", "Y", false); + // data.getField(FieldNames::Z).ioToVtk("Z", "Z", false); + // + data.getIODomain(FieldNames::X).ioToVti("X_", "X_"); + // data.getField(FieldNames::Y).ioVtiAllocator("Y_"); + // data.getField(FieldNames::Z).ioVtiAllocator("Z_"); + + bool isOk = data.compare(FieldNames::X); + isOk = data.compare(FieldNames::Y); + if (!isOk) { + auto flagField = data.compareAndGetField(FieldNames::X); + flagField.ioToVti("X_diffFlag", "X_diffFlag"); + flagField = data.compareAndGetField(FieldNames::Y); + flagField.ioToVti("Y_diffFlag", "Y_diffFlag"); + } + ASSERT_TRUE(isOk); + if (!isOk) { + exit(99); + } +} + +template +auto runTemplate(TestData& data) -> void { using Type = typename TestData::Type; @@ -153,7 +217,7 @@ auto run(TestData& data) -> void .run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); - stencilContainer_laplace(X, Y).run(Neon::Backend::mainStreamIdx); + stencilContainerLaplaceTemplate(X, Y).run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); Y.newHaloUpdate(Neon::set::StencilSemantic::standard, @@ -162,7 +226,7 @@ auto run(TestData& data) -> void .run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); - stencilContainer_laplace(Y, X).run(Neon::Backend::mainStreamIdx); + laplaceNoTemplate(Y, X).run(Neon::Backend::mainStreamIdx); } data.getBackend().sync(0); } @@ 
-200,10 +264,14 @@ auto run(TestData& data) -> void } } -template auto run(TestData&) -> void; -template auto run(TestData&) -> void; -template auto run(TestData&) -> void; -template auto run(TestData&) -> void; +template auto runNoTemplate(TestData&) -> void; +template auto runNoTemplate(TestData&) -> void; +template auto runNoTemplate(TestData&) -> void; +template auto runNoTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; } // namespace map \ No newline at end of file diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.h b/libNeonDomain/tests/domain-stencil/src/stencil.h index 7d74196a..456f5f01 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.h +++ b/libNeonDomain/tests/domain-stencil/src/stencil.h @@ -11,10 +11,20 @@ namespace map { using namespace Neon::domain::tool::testing; template -auto run(TestData& data) -> void; +auto runNoTemplate(TestData& data) -> void; + +template +auto runTemplate(TestData& data) -> void; + + +extern template auto runNoTemplate(TestData&) -> void; +extern template auto runNoTemplate(TestData&) -> void; +extern template auto runNoTemplate(TestData&) -> void; +extern template auto runNoTemplate(TestData&) -> void; + +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; -extern template auto run(TestData&) -> void; -extern template auto run(TestData&) -> void; -extern template auto run(TestData&) -> void; -extern template auto run(TestData&) -> void; } // namespace map From fde014d67b87529c5ae18e297b307e4381b4bd65 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 29 Jun 2023 11:33:43 -0400 Subject: [PATCH 20/94] Extending unit test for stencil to dGridSoA --- 
.../tests/domain-stencil/src/stencil.cu | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu index 926153fa..14ae82b1 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.cu +++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu @@ -59,22 +59,22 @@ static constexpr std::array stencil{ Ngh3DIdx(0, 0, 1), Ngh3DIdx(0, 0, -1)}; -template +template NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Partition& a, Partial& partial, int& count) { - Neon::index_3d direction(X, Y, Z); - auto nghData = a.getNghData(idx, direction.newType(), i); - if (nghData.isValid()) { - partial += nghData.getData(); - count++; - } - // a.template getNghData(idx, i, - // [&](typename Partition::Type const& val) { - // partial += val; - // count++; - // }); + // Neon::index_3d direction(X, Y, Z); + // auto nghData = a.getNghData(idx, direction.newType(), i); + // if (nghData.isValid()) { + // partial += nghData.getData(); + // count++; + // } + a.template getNghData(idx, i, + [&](typename Partition::Type const& val) { + partial += val; + count++; + }); }; template @@ -98,15 +98,15 @@ auto stencilContainerLaplaceTemplate(const Field& filedA, using Ngh3DIdx = Neon::int8_3d; - viaTemplate<1, 0, 0>(idx, i, a, partial, count); - viaTemplate<-1, 0, 0>(idx, i, a, partial, count); - viaTemplate<0, 1, 0>(idx, i, a, partial, count); - viaTemplate<0, -1, 0>(idx, i, a, partial, count); - viaTemplate<0, 0, 1>(idx, i, a, partial, count); - viaTemplate<0, 0, -1>(idx, i, a, partial, count); + viaTemplate<0>(idx, i, a, partial, count); + viaTemplate<1>(idx, i, a, partial, count); + viaTemplate<2>(idx, i, a, partial, count); + viaTemplate<3>(idx, i, a, partial, count); + viaTemplate<4>(idx, i, a, partial, count); + viaTemplate<5>(idx, i, a, partial, count); - b(idx, i) = a(idx, i) - count * partial ; + b(idx, i) = a(idx, i) - count * partial; 
} }; }); From b0e74e6c3dc62179c84a9d7d899efa461ecbc115 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 29 Jun 2023 17:14:38 -0400 Subject: [PATCH 21/94] WIP --- libNeonDomain/tests/domain-stencil/src/stencil.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu index 14ae82b1..31e937e1 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.cu +++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu @@ -78,7 +78,7 @@ NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Parti }; template -auto stencilContainerLaplaceTemplate(const Field& filedA, +auto laplaceTemplate(const Field& filedA, Field& fieldB) -> Neon::set::Container { @@ -217,7 +217,7 @@ auto runTemplate(TestData& data) -> void .run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); - stencilContainerLaplaceTemplate(X, Y).run(Neon::Backend::mainStreamIdx); + laplaceTemplate(X, Y).run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); Y.newHaloUpdate(Neon::set::StencilSemantic::standard, @@ -226,7 +226,7 @@ auto runTemplate(TestData& data) -> void .run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); - laplaceNoTemplate(Y, X).run(Neon::Backend::mainStreamIdx); + laplaceTemplate(Y, X).run(Neon::Backend::mainStreamIdx); } data.getBackend().sync(0); } From 1dd5abc612caa5b3dc6f0896fea36f02e73f42dc Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 30 Jun 2023 09:07:11 -0400 Subject: [PATCH 22/94] WIP --- .../include/Neon/domain/details/bGrid/bPartition_imp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h index 5fa6f260..9a0bab8e 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h +++ 
b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h @@ -100,7 +100,7 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition:: helpGetValidIdxPitchExplicit(const Idx& idx, int card) const -> uint32_t { - uint32_t const blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; + uint32_t constexpr blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; uint32_t const inBlockInCardPitch = idx.mInDataBlockIdx.x + SBlock::memBlockSizeX * idx.mInDataBlockIdx.y + (SBlock::memBlockSizeX * SBlock::memBlockSizeY) * idx.mInDataBlockIdx.z; From 81b352696731adfd70786292d8f0e107a3f0958d Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 30 Jun 2023 10:43:24 -0400 Subject: [PATCH 23/94] WIP --- .../tests/domain-stencil/src/stencil.cu | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu index 31e937e1..f6865999 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.cu +++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu @@ -77,9 +77,19 @@ NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Parti }); }; + +template +constexpr void constexpr_for(F&& f) +{ + if constexpr (Start < End) { + f(std::integral_constant()); + constexpr_for(f); + } +} + template auto laplaceTemplate(const Field& filedA, - Field& fieldB) + Field& fieldB) -> Neon::set::Container { const auto& grid = filedA.getGrid(); @@ -97,13 +107,23 @@ auto laplaceTemplate(const Field& filedA, int count = 0; using Ngh3DIdx = Neon::int8_3d; - - viaTemplate<0>(idx, i, a, partial, count); - viaTemplate<1>(idx, i, a, partial, count); - viaTemplate<2>(idx, i, a, partial, count); - viaTemplate<3>(idx, i, a, partial, count); - viaTemplate<4>(idx, i, a, partial, count); - viaTemplate<5>(idx, i, a, partial, count); + constexpr_for<0, 6, 1>([&](auto sIdx) { + a.template getNghData(idx, 
i, + [&](auto const& val) { + partial += val; + count++; + }); + }); + + +// viaTemplate<0>(idx, i, a, partial, count); +// viaTemplate<1>(idx, i, a, partial, count); +// viaTemplate<2>(idx, i, a, partial, count); +// viaTemplate<3>(idx, i, a, partial, count); +// viaTemplate<4>(idx, i, a, partial, count); +// viaTemplate<5>(idx, i, a, partial, count); b(idx, i) = a(idx, i) - count * partial; From 2a2caf7d83bb0c401cc5d7839e2d212132a966c1 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 30 Jun 2023 11:04:00 -0400 Subject: [PATCH 24/94] WIP --- .../include/Neon/core/tools/metaprogramming.h | 1 + .../core/tools/metaprogramming/ConstexprFor.h | 14 +++++++++ .../tests/domain-stencil/src/stencil.cu | 29 +++++++------------ 3 files changed, 25 insertions(+), 19 deletions(-) create mode 100644 libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h diff --git a/libNeonCore/include/Neon/core/tools/metaprogramming.h b/libNeonCore/include/Neon/core/tools/metaprogramming.h index 53678ed6..ea004a43 100644 --- a/libNeonCore/include/Neon/core/tools/metaprogramming.h +++ b/libNeonCore/include/Neon/core/tools/metaprogramming.h @@ -4,3 +4,4 @@ #include "Neon/core/tools/metaprogramming/debugHelp.h" #include "Neon/core/tools/metaprogramming/extractTupleVecType.h" #include "Neon/core/tools/metaprogramming/tupleVecTable.h" +#include "Neon/core/tools/metaprogramming/ConstexprFor.h" \ No newline at end of file diff --git a/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h b/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h new file mode 100644 index 00000000..2e8161e6 --- /dev/null +++ b/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h @@ -0,0 +1,14 @@ +#pragma once + +namespace Neon { + +template +constexpr void ConstexprFor(F&& f) +{ + if constexpr (Start < End) { + f(std::integral_constant()); + ConstexprFor(f); + } +} + +} // namespace Neon \ No newline at end of file diff --git 
a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu index f6865999..6cd4f6ff 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.cu +++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu @@ -78,14 +78,14 @@ NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Parti }; -template -constexpr void constexpr_for(F&& f) -{ - if constexpr (Start < End) { - f(std::integral_constant()); - constexpr_for(f); - } -} +//template +//constexpr void constexpr_for(F&& f) +//{ +// if constexpr (Start < End) { +// f(std::integral_constant()); +// constexpr_for(f); +// } +//} template auto laplaceTemplate(const Field& filedA, @@ -107,7 +107,7 @@ auto laplaceTemplate(const Field& filedA, int count = 0; using Ngh3DIdx = Neon::int8_3d; - constexpr_for<0, 6, 1>([&](auto sIdx) { + Neon::ConstexprFor<0, 6, 1>([&](auto sIdx) { a.template getNghData(idx, i, @@ -116,16 +116,7 @@ auto laplaceTemplate(const Field& filedA, count++; }); }); - - -// viaTemplate<0>(idx, i, a, partial, count); -// viaTemplate<1>(idx, i, a, partial, count); -// viaTemplate<2>(idx, i, a, partial, count); -// viaTemplate<3>(idx, i, a, partial, count); -// viaTemplate<4>(idx, i, a, partial, count); -// viaTemplate<5>(idx, i, a, partial, count); - - + b(idx, i) = a(idx, i) - count * partial; } }; From 103034584aa2a069c83f5861bc7f4f841ee6dd50 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 30 Jun 2023 11:41:36 -0400 Subject: [PATCH 25/94] Adding documentation to ConstexprFor --- .../core/tools/metaprogramming/ConstexprFor.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h b/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h index 2e8161e6..a6d8767e 100644 --- a/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h +++ b/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h @@ -2,7 
+2,24 @@ namespace Neon { -template +/** + * Implementation of a constexpr for loop. + * Reference: https://artificial-mind.net/blog/2020/10/31/constexpr-for + * + * The loop is implemented as a recursive template function. + * It is equicalent to the following code: + * + * for(int i = Start; i < End; i += Inc) { + * f(i); + * // do something + * // ... + * // ... + * } + */ +template constexpr void ConstexprFor(F&& f) { if constexpr (Start < End) { From 73b063ec1a51d7cdac657656a60d29a2e4a1fb57 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 30 Jun 2023 15:27:30 -0400 Subject: [PATCH 26/94] WIP --- .../src/ContainersD3Q19.h | 353 ++++++++++++++ .../lbm-lid-driven-cavity-flow/src/LbmTools.h | 438 ------------------ 2 files changed, 353 insertions(+), 438 deletions(-) create mode 100644 benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h delete mode 100644 benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h new file mode 100644 index 00000000..a29953f1 --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h @@ -0,0 +1,353 @@ +#include "CellType.h" +#include "D3Q19.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + +#define COMPUTE_CAST(VAR) static_cast((VAR)) + +template +struct LbmContainers +{ +}; + +/** + * Specialization for Lattice + * @tparam PopulationField + * @tparam Compute + */ +template +struct LbmContainers, + Grid_> +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + +#define PULL_STREAM(GOx, GOy, GOz, 
GOid, BKx, BKy, BKz, BKid) \ + { \ + { /*GO*/ \ + if (wallBitFlag & (uint32_t(1) << GOid)) { \ + /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \ + popIn[GOid] = fin(gidx, BKid) + \ + fin.template getNghData(gidx, BKid)(); \ + } else { \ + popIn[GOid] = fin.template getNghData(gidx, GOid)(); \ + } \ + } \ + { /*BK*/ \ + if (wallBitFlag & (uint32_t(1) << BKid)) { \ + popIn[BKid] = fin(gidx, GOid) + fin.template getNghData(gidx, GOid)(); \ + } else { \ + popIn[BKid] = fin.template getNghData(gidx, BKid)(); \ + } \ + } \ + } + + static inline NEON_CUDA_HOST_DEVICE auto + pullStream(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopulationField::Partition const& fin, + NEON_OUT Storage popIn[Lattice::Q]) + { + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOid) { + constexpr int BKid = Lattice::oppositeDirection[GOid]; + constexpr int BKx = Lattice::stencil[BKid].x; + constexpr int BKy = Lattice::stencil[BKid].y; + constexpr int BKz = Lattice::stencil[BKid].z; + + if (wallBitFlag & (uint32_t(1) << GOid)) { + popIn[GOid] = fin(gidx, BKid) + + fin.template getNghData(gidx, BKid)(); + } else { + popIn[GOid] = fin.template getNghData(gidx, GOid)(); + } + }); + + { + popIn[Lattice::centerDirection] = fin(gidx, Lattice::centerDirection); + } + } + + static inline NEON_CUDA_HOST_DEVICE auto + macroscopic(const Storage pop[Lattice::Q], + NEON_OUT Compute& rho, + NEON_OUT std::array& u) + -> void + { +#define POP(IDX) static_cast(pop[IDX]) + + const Compute X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6); + const Compute X_P1 = POP(10) + POP(13) + POP(14) + POP(15) + POP(16); + const Compute X_0 = POP(9) + POP(1) + POP(2) + POP(7) + POP(8) + POP(11) + POP(12) + POP(17) + POP(18); + + const Compute Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(14); + const Compute Y_P1 = POP(4) + POP(11) + POP(13) + POP(17) + POP(18); + + const Compute Z_M1 = POP(2) + POP(5) + POP(7) + POP(16) + POP(18); + const Compute 
Z_P1 = POP(6) + POP(8) + POP(12) + POP(15) + POP(17); + +#undef POP + + rho = X_M1 + X_P1 + X_0; + u[0] = (X_P1 - X_M1) / rho; + u[1] = (Y_P1 - Y_M1) / rho; + u[2] = (Z_P1 - Z_M1) / rho; + } + + + static inline NEON_CUDA_HOST_DEVICE auto + collideBgkUnrolled(Idx const& i /*! Compute iterator */, + const Storage pop[Lattice::Q], + Compute const& rho /*! Density */, + std::array const& u /*! Velocity */, + Compute const& usqr /*! Usqr */, + Compute const& omega /*! Omega */, + typename PopulationField::Partition& fOut /*! Population */) + + -> void + { + const Compute ck_u03 = u[0] + u[1]; + const Compute ck_u04 = u[0] - u[1]; + const Compute ck_u05 = u[0] + u[2]; + const Compute ck_u06 = u[0] - u[2]; + const Compute ck_u07 = u[1] + u[2]; + const Compute ck_u08 = u[1] - u[2]; + + const Compute eq_00 = rho * (1. / 18.) * (1. - 3. * u[0] + 4.5 * u[0] * u[0] - usqr); + const Compute eq_01 = rho * (1. / 18.) * (1. - 3. * u[1] + 4.5 * u[1] * u[1] - usqr); + const Compute eq_02 = rho * (1. / 18.) * (1. - 3. * u[2] + 4.5 * u[2] * u[2] - usqr); + const Compute eq_03 = rho * (1. / 36.) * (1. - 3. * ck_u03 + 4.5 * ck_u03 * ck_u03 - usqr); + const Compute eq_04 = rho * (1. / 36.) * (1. - 3. * ck_u04 + 4.5 * ck_u04 * ck_u04 - usqr); + const Compute eq_05 = rho * (1. / 36.) * (1. - 3. * ck_u05 + 4.5 * ck_u05 * ck_u05 - usqr); + const Compute eq_06 = rho * (1. / 36.) * (1. - 3. * ck_u06 + 4.5 * ck_u06 * ck_u06 - usqr); + const Compute eq_07 = rho * (1. / 36.) * (1. - 3. * ck_u07 + 4.5 * ck_u07 * ck_u07 - usqr); + const Compute eq_08 = rho * (1. / 36.) * (1. - 3. * ck_u08 + 4.5 * ck_u08 * ck_u08 - usqr); + + const Compute eqopp_00 = eq_00 + rho * (1. / 18.) * 6. * u[0]; + const Compute eqopp_01 = eq_01 + rho * (1. / 18.) * 6. * u[1]; + const Compute eqopp_02 = eq_02 + rho * (1. / 18.) * 6. * u[2]; + const Compute eqopp_03 = eq_03 + rho * (1. / 36.) * 6. * ck_u03; + const Compute eqopp_04 = eq_04 + rho * (1. / 36.) * 6. * ck_u04; + const Compute eqopp_05 = eq_05 + rho * (1. / 36.) 
* 6. * ck_u05; + const Compute eqopp_06 = eq_06 + rho * (1. / 36.) * 6. * ck_u06; + const Compute eqopp_07 = eq_07 + rho * (1. / 36.) * 6. * ck_u07; + const Compute eqopp_08 = eq_08 + rho * (1. / 36.) * 6. * ck_u08; + + const Compute pop_out_00 = (1. - omega) * static_cast(pop[0]) + omega * eq_00; + const Compute pop_out_01 = (1. - omega) * static_cast(pop[1]) + omega * eq_01; + const Compute pop_out_02 = (1. - omega) * static_cast(pop[2]) + omega * eq_02; + const Compute pop_out_03 = (1. - omega) * static_cast(pop[3]) + omega * eq_03; + const Compute pop_out_04 = (1. - omega) * static_cast(pop[4]) + omega * eq_04; + const Compute pop_out_05 = (1. - omega) * static_cast(pop[5]) + omega * eq_05; + const Compute pop_out_06 = (1. - omega) * static_cast(pop[6]) + omega * eq_06; + const Compute pop_out_07 = (1. - omega) * static_cast(pop[7]) + omega * eq_07; + const Compute pop_out_08 = (1. - omega) * static_cast(pop[8]) + omega * eq_08; + + const Compute pop_out_opp_00 = (1. - omega) * static_cast(pop[10]) + omega * eqopp_00; + const Compute pop_out_opp_01 = (1. - omega) * static_cast(pop[11]) + omega * eqopp_01; + const Compute pop_out_opp_02 = (1. - omega) * static_cast(pop[12]) + omega * eqopp_02; + const Compute pop_out_opp_03 = (1. - omega) * static_cast(pop[13]) + omega * eqopp_03; + const Compute pop_out_opp_04 = (1. - omega) * static_cast(pop[14]) + omega * eqopp_04; + const Compute pop_out_opp_05 = (1. - omega) * static_cast(pop[15]) + omega * eqopp_05; + const Compute pop_out_opp_06 = (1. - omega) * static_cast(pop[16]) + omega * eqopp_06; + const Compute pop_out_opp_07 = (1. - omega) * static_cast(pop[17]) + omega * eqopp_07; + const Compute pop_out_opp_08 = (1. 
- omega) * static_cast(pop[18]) + omega * eqopp_08; + + +#define COMPUTE_GO_AND_BACK(GOid, BKid) \ + { \ + fOut(i, GOid) = static_cast(pop_out_0##GOid); \ + fOut(i, BKid) = static_cast(pop_out_opp_0##GOid); \ + } + + COMPUTE_GO_AND_BACK(0, 10) + COMPUTE_GO_AND_BACK(1, 11) + COMPUTE_GO_AND_BACK(2, 12) + COMPUTE_GO_AND_BACK(3, 13) + COMPUTE_GO_AND_BACK(4, 14) + COMPUTE_GO_AND_BACK(5, 15) + COMPUTE_GO_AND_BACK(6, 16) + COMPUTE_GO_AND_BACK(7, 17) + COMPUTE_GO_AND_BACK(8, 18) + +#undef COMPUTE_GO_AND_BACK + + { + const Compute eq_09 = rho * (1. / 3.) * (1. - usqr); + const Compute pop_out_09 = (1. - omega) * + static_cast(pop[Lattice::centerDirection]) + + omega * eq_09; + fOut(i, Lattice::centerDirection) = static_cast(pop_out_09); + } + } + + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopulationField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopulationField& fOutField /*! 
Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popIn[Lattice::Q]; + pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + + Compute rho; + std::array u{.0, .0, .0}; + macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + collideBgkUnrolled(gidx, + popIn, + rho, u, + usqr, omega, + NEON_OUT fOut); + } + }; + }); + return container; + } + +#define COMPUTE_MASK_WALL(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ + { \ + { /*GO*/ \ + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); \ + if (nghCellType.classification != CellType::bulk) { \ + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOid)); \ + } \ + } \ + { /*BK*/ \ + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); \ + if (nghCellType.classification != CellType::bulk) { \ + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << BKid)); \ + } \ + } \ + } + + static auto + computeWallNghMask(const CellTypeField& infoInField, + CellTypeField& infoOutpeField) + + -> Neon::set::Container + { + Neon::set::Container container = infoInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& infoIn = L.load(infoInField, + Neon::Pattern::STENCIL); + auto& infoOut = L.load(infoOutpeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + CellType cellType = 
infoIn(gidx, 0); + cellType.wallNghBitflag = 0; + + if (cellType.classification == CellType::bulk) { + COMPUTE_MASK_WALL(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10) + COMPUTE_MASK_WALL(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11) + COMPUTE_MASK_WALL(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12) + COMPUTE_MASK_WALL(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13) + COMPUTE_MASK_WALL(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14) + COMPUTE_MASK_WALL(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15) + COMPUTE_MASK_WALL(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16) + COMPUTE_MASK_WALL(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17) + COMPUTE_MASK_WALL(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18) + + infoOut(gidx, 0) = cellType; + } + }; + }); + return container; + } +#undef COMPUTE_MASK_WALL + +#define BC_LOAD(GOID, DKID) \ + popIn[GOID] = fIn(gidx, GOID); \ + popIn[DKID] = fIn(gidx, DKID); + + static auto + computeRhoAndU([[maybe_unused]] const PopulationField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! 
output Population field */) + + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + Compute rho = 0; + std::array u{.0, .0, .0}; + Storage popIn[Lattice::Q]; + + if (cellInfo.classification == CellType::bulk) { + pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + } else { + if (cellInfo.classification == CellType::movingWall) { + BC_LOAD(0, 10) + BC_LOAD(1, 11) + BC_LOAD(2, 12) + BC_LOAD(3, 13) + BC_LOAD(4, 14) + BC_LOAD(5, 15) + BC_LOAD(6, 16) + BC_LOAD(7, 17) + BC_LOAD(8, 18) + popIn[9] = fIn(gidx, 9); + + rho = 1.0; + u = std::array{COMPUTE_CAST(popIn[0]) / COMPUTE_CAST(6. * 1. / 18.), + COMPUTE_CAST(popIn[1]) / COMPUTE_CAST(6. * 1. / 18.), + COMPUTE_CAST(popIn[2]) / COMPUTE_CAST(6. * 1. 
/ 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } +}; + +#undef COMPUTE_CAST \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h deleted file mode 100644 index ab79ed2a..00000000 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h +++ /dev/null @@ -1,438 +0,0 @@ -#include "CellType.h" -#include "D3Q19.h" -#include "Neon/Neon.h" -#include "Neon/set/Containter.h" - -#define COMPUTE_CAST(VAR) static_cast((VAR)) - -template -struct LbmContainers -{ -}; - -/** - * Specialization for Lattice - * @tparam PopulationField - * @tparam LbmComputeType - */ -template -struct LbmContainers, - PopulationField, - LbmComputeType> -{ - using LbmStoreType = typename PopulationField::Type; - using CellTypeField = typename PopulationField::Grid::template Field; - using Lattice = D3Q19Template; - using Idx = typename PopulationField::Idx; - using Grid = typename PopulationField::Grid; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; - -#define LOADPOP(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ - { \ - { /*GO*/ \ - if (wallBitFlag & (uint32_t(1) << GOid)) { \ - popIn[GOid] = fin(i, BKid); \ - } else { \ - popIn[GOid] = fin.template nghVal(i, GOid, 0.0).value; \ - } \ - } \ - { /*BK*/ \ - if (wallBitFlag & (uint32_t(1) << BKid)) { \ - popIn[BKid] = fin(i, GOid); \ - } else { \ - popIn[BKid] = fin.template nghVal(i, BKid, 0.0).value; \ - } \ - } \ - } - static inline NEON_CUDA_HOST_DEVICE auto - loadPopulation(Idx const& i, - const uint32_t& wallBitFlag, - typename PopulationField::Partition const& fin, - NEON_OUT LbmStoreType popIn[19]) - { - // #pragma omp critical - // { - - LOADPOP(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10); - LOADPOP(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* 
BKid */ 11); - LOADPOP(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12); - LOADPOP(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13); - LOADPOP(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14); - LOADPOP(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15); - LOADPOP(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16); - LOADPOP(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17); - LOADPOP(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18); - // } - // Treat the case of the center (c[k] = {0, 0, 0,}). - { - popIn[Lattice::centerDirection] = fin(i, Lattice::centerDirection); - } - } -#undef LOADPOP - -#define PULL_STREAM(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ - { \ - { /*GO*/ \ - if (wallBitFlag & (uint32_t(1) << GOid)) { \ - /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \ - popIn[GOid] = fin(gidx, BKid) + \ - fin.template getNghData(gidx, BKid)(); \ - } else { \ - popIn[GOid] = fin.template getNghData(gidx, GOid)(); \ - } \ - } \ - { /*BK*/ \ - if (wallBitFlag & (uint32_t(1) << BKid)) { \ - popIn[BKid] = fin(gidx, GOid) + fin.template getNghData(gidx, GOid)(); \ - } else { \ - popIn[BKid] = fin.template getNghData(gidx, BKid)(); \ - } \ - } \ - } - - static inline NEON_CUDA_HOST_DEVICE auto - pullStream(Idx const& gidx, - const uint32_t& wallBitFlag, - typename PopulationField::Partition const& fin, - NEON_OUT LbmStoreType popIn[19]) - { -#if 0 - using TopologyByDirection = std::tuple; - constexpr std::array stencil{ - std::make_tuple(Neon::int32_3d(-1, 0, 0), /* GOid */ 0, /* --- */ Neon::int32_3d(1, 0, 0), /* BKid */ 10), - std::make_tuple(Neon::int32_3d(0, -1, 0), /* GOid */ 1, /* --- */ Neon::int32_3d(0, 1, 0), /* BKid */ 11), - std::make_tuple(Neon::int32_3d(0, 0, -1), /* GOid */ 2, /* --- */ Neon::int32_3d(0, 0, 1), /* BKid */ 12), - std::make_tuple(Neon::int32_3d(-1, -1, 0), /* GOid */ 3, /* --- */ Neon::int32_3d(1, 1, 0), /* BKid */ 13), - 
std::make_tuple(Neon::int32_3d(-1, 1, 0), /* GOid */ 4, /* --- */ Neon::int32_3d(1, -1, 0), /* BKid */ 14), - std::make_tuple(Neon::int32_3d(-1, 0, -1), /* GOid */ 5, /* --- */ Neon::int32_3d(1, 0, 1), /* BKid */ 15), - std::make_tuple(Neon::int32_3d(-1, 0, 1), /* GOid */ 6, /* --- */ Neon::int32_3d(1, 0, -1), /* BKid */ 16), - std::make_tuple(Neon::int32_3d(0, -1, -1), /* GOid */ 7, /* --- */ Neon::int32_3d(0, 1, 1), /* BKid */ 17), - std::make_tuple(Neon::int32_3d(0, -1, 1), /* GOid */ 8, /* --- */ Neon::int32_3d(0, 1, -1), /* BKid */ 18)}; - - - auto pullStream = [&]() { - static_assert(stencilIdx < 9); - constexpr int GOid = std::get<1>(stencil[stencilIdx]); - constexpr int BKid = std::get<3>(stencil[stencilIdx]); - constexpr Neon::int32_3d GoOffset = std::get<0>(stencil[stencilIdx]); - constexpr Neon::int32_3d BkOffset = std::get<2>(stencil[stencilIdx]); - { - if (wallBitFlag & (uint32_t(1) << GOid)) { - popIn[GOid] = fin(gidx, BKid) + - fin.template getNghData(gidx, BKid)(); - } else { - popIn[GOid] = fin.template getNghData(gidx, GOid)(); - } - } - { /*BK*/ - if (wallBitFlag & (uint32_t(1) << BKid)) { - popIn[BKid] = fin(gidx, GOid) + - fin.template getNghData(gidx, GOid)(); - } else { - popIn[BKid] = fin.template getNghData(gidx, BKid)(); - } - } - }; - pullStream.template operator()<0>(); - pullStream.template operator()<1>(); - pullStream.template operator()<2>(); - pullStream.template operator()<3>(); - pullStream.template operator()<4>(); - pullStream.template operator()<5>(); - pullStream.template operator()<6>(); - pullStream.template operator()<7>(); - pullStream.template operator()<8>(); -#endif - PULL_STREAM(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10); - PULL_STREAM(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11); - PULL_STREAM(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12); - PULL_STREAM(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13); - PULL_STREAM(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid 
*/ 14); - PULL_STREAM(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15); - PULL_STREAM(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16); - PULL_STREAM(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17); - PULL_STREAM(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18); - - // Treat the case of the center (c[k] = {0, 0, 0,}). - { - popIn[Lattice::centerDirection] = fin(gidx, Lattice::centerDirection); - } - } -#undef PULL_STREAM - - static inline NEON_CUDA_HOST_DEVICE auto - macroscopic(const LbmStoreType pop[Lattice::Q], - NEON_OUT LbmComputeType& rho, - NEON_OUT std::array& u) - -> void - { -#define POP(IDX) static_cast(pop[IDX]) - - const LbmComputeType X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6); - const LbmComputeType X_P1 = POP(10) + POP(13) + POP(14) + POP(15) + POP(16); - const LbmComputeType X_0 = POP(9) + POP(1) + POP(2) + POP(7) + POP(8) + POP(11) + POP(12) + POP(17) + POP(18); - - const LbmComputeType Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(14); - const LbmComputeType Y_P1 = POP(4) + POP(11) + POP(13) + POP(17) + POP(18); - - const LbmComputeType Z_M1 = POP(2) + POP(5) + POP(7) + POP(16) + POP(18); - const LbmComputeType Z_P1 = POP(6) + POP(8) + POP(12) + POP(15) + POP(17); - -#undef POP - - rho = X_M1 + X_P1 + X_0; - u[0] = (X_P1 - X_M1) / rho; - u[1] = (Y_P1 - Y_M1) / rho; - u[2] = (Z_P1 - Z_M1) / rho; - } - - - static inline NEON_CUDA_HOST_DEVICE auto - collideBgkUnrolled(Idx const& i /*! LbmComputeType iterator */, - const LbmStoreType pop[Lattice::Q], - LbmComputeType const& rho /*! Density */, - std::array const& u /*! Velocity */, - LbmComputeType const& usqr /*! Usqr */, - LbmComputeType const& omega /*! Omega */, - typename PopulationField::Partition& fOut /*! 
Population */) - - -> void - { - const LbmComputeType ck_u03 = u[0] + u[1]; - const LbmComputeType ck_u04 = u[0] - u[1]; - const LbmComputeType ck_u05 = u[0] + u[2]; - const LbmComputeType ck_u06 = u[0] - u[2]; - const LbmComputeType ck_u07 = u[1] + u[2]; - const LbmComputeType ck_u08 = u[1] - u[2]; - - const LbmComputeType eq_00 = rho * (1. / 18.) * (1. - 3. * u[0] + 4.5 * u[0] * u[0] - usqr); - const LbmComputeType eq_01 = rho * (1. / 18.) * (1. - 3. * u[1] + 4.5 * u[1] * u[1] - usqr); - const LbmComputeType eq_02 = rho * (1. / 18.) * (1. - 3. * u[2] + 4.5 * u[2] * u[2] - usqr); - const LbmComputeType eq_03 = rho * (1. / 36.) * (1. - 3. * ck_u03 + 4.5 * ck_u03 * ck_u03 - usqr); - const LbmComputeType eq_04 = rho * (1. / 36.) * (1. - 3. * ck_u04 + 4.5 * ck_u04 * ck_u04 - usqr); - const LbmComputeType eq_05 = rho * (1. / 36.) * (1. - 3. * ck_u05 + 4.5 * ck_u05 * ck_u05 - usqr); - const LbmComputeType eq_06 = rho * (1. / 36.) * (1. - 3. * ck_u06 + 4.5 * ck_u06 * ck_u06 - usqr); - const LbmComputeType eq_07 = rho * (1. / 36.) * (1. - 3. * ck_u07 + 4.5 * ck_u07 * ck_u07 - usqr); - const LbmComputeType eq_08 = rho * (1. / 36.) * (1. - 3. * ck_u08 + 4.5 * ck_u08 * ck_u08 - usqr); - - const LbmComputeType eqopp_00 = eq_00 + rho * (1. / 18.) * 6. * u[0]; - const LbmComputeType eqopp_01 = eq_01 + rho * (1. / 18.) * 6. * u[1]; - const LbmComputeType eqopp_02 = eq_02 + rho * (1. / 18.) * 6. * u[2]; - const LbmComputeType eqopp_03 = eq_03 + rho * (1. / 36.) * 6. * ck_u03; - const LbmComputeType eqopp_04 = eq_04 + rho * (1. / 36.) * 6. * ck_u04; - const LbmComputeType eqopp_05 = eq_05 + rho * (1. / 36.) * 6. * ck_u05; - const LbmComputeType eqopp_06 = eq_06 + rho * (1. / 36.) * 6. * ck_u06; - const LbmComputeType eqopp_07 = eq_07 + rho * (1. / 36.) * 6. * ck_u07; - const LbmComputeType eqopp_08 = eq_08 + rho * (1. / 36.) * 6. * ck_u08; - - const LbmComputeType pop_out_00 = (1. - omega) * static_cast(pop[0]) + omega * eq_00; - const LbmComputeType pop_out_01 = (1. 
- omega) * static_cast(pop[1]) + omega * eq_01; - const LbmComputeType pop_out_02 = (1. - omega) * static_cast(pop[2]) + omega * eq_02; - const LbmComputeType pop_out_03 = (1. - omega) * static_cast(pop[3]) + omega * eq_03; - const LbmComputeType pop_out_04 = (1. - omega) * static_cast(pop[4]) + omega * eq_04; - const LbmComputeType pop_out_05 = (1. - omega) * static_cast(pop[5]) + omega * eq_05; - const LbmComputeType pop_out_06 = (1. - omega) * static_cast(pop[6]) + omega * eq_06; - const LbmComputeType pop_out_07 = (1. - omega) * static_cast(pop[7]) + omega * eq_07; - const LbmComputeType pop_out_08 = (1. - omega) * static_cast(pop[8]) + omega * eq_08; - - const LbmComputeType pop_out_opp_00 = (1. - omega) * static_cast(pop[10]) + omega * eqopp_00; - const LbmComputeType pop_out_opp_01 = (1. - omega) * static_cast(pop[11]) + omega * eqopp_01; - const LbmComputeType pop_out_opp_02 = (1. - omega) * static_cast(pop[12]) + omega * eqopp_02; - const LbmComputeType pop_out_opp_03 = (1. - omega) * static_cast(pop[13]) + omega * eqopp_03; - const LbmComputeType pop_out_opp_04 = (1. - omega) * static_cast(pop[14]) + omega * eqopp_04; - const LbmComputeType pop_out_opp_05 = (1. - omega) * static_cast(pop[15]) + omega * eqopp_05; - const LbmComputeType pop_out_opp_06 = (1. - omega) * static_cast(pop[16]) + omega * eqopp_06; - const LbmComputeType pop_out_opp_07 = (1. - omega) * static_cast(pop[17]) + omega * eqopp_07; - const LbmComputeType pop_out_opp_08 = (1. 
- omega) * static_cast(pop[18]) + omega * eqopp_08; - - -#define COMPUTE_GO_AND_BACK(GOid, BKid) \ - { \ - fOut(i, GOid) = static_cast(pop_out_0##GOid); \ - fOut(i, BKid) = static_cast(pop_out_opp_0##GOid); \ - } - - COMPUTE_GO_AND_BACK(0, 10) - COMPUTE_GO_AND_BACK(1, 11) - COMPUTE_GO_AND_BACK(2, 12) - COMPUTE_GO_AND_BACK(3, 13) - COMPUTE_GO_AND_BACK(4, 14) - COMPUTE_GO_AND_BACK(5, 15) - // COMPUTE_GO_AND_BACK(6, 16) - fOut(i, 6) = static_cast(pop_out_06); - fOut(i, 16) = static_cast(pop_out_opp_06); - COMPUTE_GO_AND_BACK(7, 17) - COMPUTE_GO_AND_BACK(8, 18) - -#undef COMPUTE_GO_AND_BACK - - { - const LbmComputeType eq_09 = rho * (1. / 3.) * (1. - usqr); - const LbmComputeType pop_out_09 = (1. - omega) * - static_cast(pop[Lattice::centerDirection]) + - omega * eq_09; - fOut(i, Lattice::centerDirection) = static_cast(pop_out_09); - } - } - - static auto - iteration(Neon::set::StencilSemantic stencilSemantic, - const PopulationField& fInField /*! inpout population field */, - const CellTypeField& cellTypeField /*! Cell type field */, - const LbmComputeType omega /*! LBM omega parameter */, - PopulationField& fOutField /*! 
output Population field */) - -> Neon::set::Container - { - - Neon::set::Container container = fInField.getGrid().newContainer( - "LBM_iteration", - [&, omega](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, - Neon::Pattern::STENCIL, stencilSemantic); - auto& fOut = L.load(fOutField); - const auto& cellInfoPartition = L.load(cellTypeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { - CellType cellInfo = cellInfoPartition(gidx, 0); - if (cellInfo.classification == CellType::bulk) { - - LbmStoreType popIn[Lattice::Q]; - pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); - - LbmComputeType rho; - std::array u{.0, .0, .0}; - macroscopic(popIn, NEON_OUT rho, NEON_OUT u); - - LbmComputeType usqr = 1.5 * (u[0] * u[0] + - u[1] * u[1] + - u[2] * u[2]); - - collideBgkUnrolled(gidx, - popIn, - rho, u, - usqr, omega, - NEON_OUT fOut); - } - }; - }); - return container; - } - -#define COMPUTE_MASK_WALL(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ - { \ - { /*GO*/ \ - CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); \ - if (nghCellType.classification != CellType::bulk) { \ - cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOid)); \ - } \ - } \ - { /*BK*/ \ - CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); \ - if (nghCellType.classification != CellType::bulk) { \ - cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << BKid)); \ - } \ - } \ - } - - static auto - computeWallNghMask(const CellTypeField& infoInField, - CellTypeField& infoOutpeField) - - -> Neon::set::Container - { - Neon::set::Container container = infoInField.getGrid().newContainer( - "LBM_iteration", - [&](Neon::set::Loader& L) -> auto { - auto& infoIn = L.load(infoInField, - Neon::Pattern::STENCIL); - auto& infoOut = L.load(infoOutpeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { - 
CellType cellType = infoIn(gidx, 0); - cellType.wallNghBitflag = 0; - - if (cellType.classification == CellType::bulk) { - COMPUTE_MASK_WALL(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10) - COMPUTE_MASK_WALL(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11) - COMPUTE_MASK_WALL(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12) - COMPUTE_MASK_WALL(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13) - COMPUTE_MASK_WALL(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14) - COMPUTE_MASK_WALL(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15) - COMPUTE_MASK_WALL(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16) - COMPUTE_MASK_WALL(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17) - COMPUTE_MASK_WALL(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18) - - infoOut(gidx, 0) = cellType; - } - }; - }); - return container; - } -#undef COMPUTE_MASK_WALL - -#define BC_LOAD(GOID, DKID) \ - popIn[GOID] = fIn(gidx, GOID); \ - popIn[DKID] = fIn(gidx, DKID); - - static auto - computeRhoAndU([[maybe_unused]] const PopulationField& fInField /*! inpout population field */, - const CellTypeField& cellTypeField /*! Cell type field */, - Rho& rhoField /*! output Population field */, - U& uField /*! 
output Population field */) - - -> Neon::set::Container - { - Neon::set::Container container = fInField.getGrid().newContainer( - "LBM_iteration", - [&](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, - Neon::Pattern::STENCIL); - auto& rhoXpu = L.load(rhoField); - auto& uXpu = L.load(uField); - - const auto& cellInfoPartition = L.load(cellTypeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { - CellType cellInfo = cellInfoPartition(gidx, 0); - LbmComputeType rho = 0; - std::array u{.0, .0, .0}; - LbmStoreType popIn[Lattice::Q]; - - if (cellInfo.classification == CellType::bulk) { - pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); - macroscopic(popIn, NEON_OUT rho, NEON_OUT u); - } else { - if (cellInfo.classification == CellType::movingWall) { - BC_LOAD(0, 10) - BC_LOAD(1, 11) - BC_LOAD(2, 12) - BC_LOAD(3, 13) - BC_LOAD(4, 14) - BC_LOAD(5, 15) - BC_LOAD(6, 16) - BC_LOAD(7, 17) - BC_LOAD(8, 18) - popIn[9] = fIn(gidx, 9); - - rho = 1.0; - u = std::array{COMPUTE_CAST(popIn[0]) / COMPUTE_CAST(6. * 1. / 18.), - COMPUTE_CAST(popIn[1]) / COMPUTE_CAST(6. * 1. / 18.), - COMPUTE_CAST(popIn[2]) / COMPUTE_CAST(6. * 1. 
/ 18.)}; - } - } - - rhoXpu(gidx, 0) = static_cast(rho); - uXpu(gidx, 0) = static_cast(u[0]); - uXpu(gidx, 1) = static_cast(u[1]); - uXpu(gidx, 2) = static_cast(u[2]); - }; - }); - return container; - } -}; - -#undef COMPUTE_CAST \ No newline at end of file From 1169538fcd6a4bb045251c3a854b367f96d932be Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 30 Jun 2023 15:27:36 -0400 Subject: [PATCH 27/94] WIP --- .../src/ContainerFactory.h | 13 + .../src/ContainersD3Q19.h | 284 ++++-------------- .../lbm-lid-driven-cavity-flow/src/D3Q19.h | 196 +++--------- .../src/DeviceD3Q19.h | 169 +++++++++++ .../src/LbmIteration.h | 32 +- .../src/LbmToolsTemplateOnly.h | 4 +- .../src/Precision.h | 13 + .../src/RunCavityTwoPop.cu | 4 +- 8 files changed, 317 insertions(+), 398 deletions(-) create mode 100644 benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h create mode 100644 benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h create mode 100644 benchmarks/lbm-lid-driven-cavity-flow/src/Precision.h diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h new file mode 100644 index 00000000..6ee35bbb --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h @@ -0,0 +1,13 @@ +#include "CellType.h" +#include "D3Q19.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + +template +struct LbmContainers +{ +}; + +#include "ContainersD3Q19.h" \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h index a29953f1..0381cda2 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h @@ -1,26 +1,23 @@ #include "CellType.h" #include "D3Q19.h" +#include "DeviceD3Q19.h" #include "Neon/Neon.h" #include "Neon/set/Containter.h" -#define COMPUTE_CAST(VAR) static_cast((VAR)) - 
template -struct LbmContainers +struct ContainerFactory { }; /** - * Specialization for Lattice - * @tparam PopulationField - * @tparam Compute + * Specialization for D3Q19 */ template -struct LbmContainers, - Grid_> +struct ContainerFactory, + Grid_> { using Lattice = D3Q19; using Precision = Precision_; @@ -35,223 +32,50 @@ struct LbmContainers; using U = typename Grid::template Field; - -#define PULL_STREAM(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ - { \ - { /*GO*/ \ - if (wallBitFlag & (uint32_t(1) << GOid)) { \ - /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \ - popIn[GOid] = fin(gidx, BKid) + \ - fin.template getNghData(gidx, BKid)(); \ - } else { \ - popIn[GOid] = fin.template getNghData(gidx, GOid)(); \ - } \ - } \ - { /*BK*/ \ - if (wallBitFlag & (uint32_t(1) << BKid)) { \ - popIn[BKid] = fin(gidx, GOid) + fin.template getNghData(gidx, GOid)(); \ - } else { \ - popIn[BKid] = fin.template getNghData(gidx, BKid)(); \ - } \ - } \ - } - - static inline NEON_CUDA_HOST_DEVICE auto - pullStream(Idx const& gidx, - const uint32_t& wallBitFlag, - typename PopulationField::Partition const& fin, - NEON_OUT Storage popIn[Lattice::Q]) - { - - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOid) { - constexpr int BKid = Lattice::oppositeDirection[GOid]; - constexpr int BKx = Lattice::stencil[BKid].x; - constexpr int BKy = Lattice::stencil[BKid].y; - constexpr int BKz = Lattice::stencil[BKid].z; - - if (wallBitFlag & (uint32_t(1) << GOid)) { - popIn[GOid] = fin(gidx, BKid) + - fin.template getNghData(gidx, BKid)(); - } else { - popIn[GOid] = fin.template getNghData(gidx, GOid)(); - } - }); - - { - popIn[Lattice::centerDirection] = fin(gidx, Lattice::centerDirection); - } - } - - static inline NEON_CUDA_HOST_DEVICE auto - macroscopic(const Storage pop[Lattice::Q], - NEON_OUT Compute& rho, - NEON_OUT std::array& u) - -> void - { -#define POP(IDX) static_cast(pop[IDX]) - - const Compute X_M1 = POP(0) + POP(3) + 
POP(4) + POP(5) + POP(6); - const Compute X_P1 = POP(10) + POP(13) + POP(14) + POP(15) + POP(16); - const Compute X_0 = POP(9) + POP(1) + POP(2) + POP(7) + POP(8) + POP(11) + POP(12) + POP(17) + POP(18); - - const Compute Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(14); - const Compute Y_P1 = POP(4) + POP(11) + POP(13) + POP(17) + POP(18); - - const Compute Z_M1 = POP(2) + POP(5) + POP(7) + POP(16) + POP(18); - const Compute Z_P1 = POP(6) + POP(8) + POP(12) + POP(15) + POP(17); - -#undef POP - - rho = X_M1 + X_P1 + X_0; - u[0] = (X_P1 - X_M1) / rho; - u[1] = (Y_P1 - Y_M1) / rho; - u[2] = (Z_P1 - Z_M1) / rho; - } - - - static inline NEON_CUDA_HOST_DEVICE auto - collideBgkUnrolled(Idx const& i /*! Compute iterator */, - const Storage pop[Lattice::Q], - Compute const& rho /*! Density */, - std::array const& u /*! Velocity */, - Compute const& usqr /*! Usqr */, - Compute const& omega /*! Omega */, - typename PopulationField::Partition& fOut /*! Population */) - - -> void - { - const Compute ck_u03 = u[0] + u[1]; - const Compute ck_u04 = u[0] - u[1]; - const Compute ck_u05 = u[0] + u[2]; - const Compute ck_u06 = u[0] - u[2]; - const Compute ck_u07 = u[1] + u[2]; - const Compute ck_u08 = u[1] - u[2]; - - const Compute eq_00 = rho * (1. / 18.) * (1. - 3. * u[0] + 4.5 * u[0] * u[0] - usqr); - const Compute eq_01 = rho * (1. / 18.) * (1. - 3. * u[1] + 4.5 * u[1] * u[1] - usqr); - const Compute eq_02 = rho * (1. / 18.) * (1. - 3. * u[2] + 4.5 * u[2] * u[2] - usqr); - const Compute eq_03 = rho * (1. / 36.) * (1. - 3. * ck_u03 + 4.5 * ck_u03 * ck_u03 - usqr); - const Compute eq_04 = rho * (1. / 36.) * (1. - 3. * ck_u04 + 4.5 * ck_u04 * ck_u04 - usqr); - const Compute eq_05 = rho * (1. / 36.) * (1. - 3. * ck_u05 + 4.5 * ck_u05 * ck_u05 - usqr); - const Compute eq_06 = rho * (1. / 36.) * (1. - 3. * ck_u06 + 4.5 * ck_u06 * ck_u06 - usqr); - const Compute eq_07 = rho * (1. / 36.) * (1. - 3. * ck_u07 + 4.5 * ck_u07 * ck_u07 - usqr); - const Compute eq_08 = rho * (1. / 36.) * (1. 
- 3. * ck_u08 + 4.5 * ck_u08 * ck_u08 - usqr); - - const Compute eqopp_00 = eq_00 + rho * (1. / 18.) * 6. * u[0]; - const Compute eqopp_01 = eq_01 + rho * (1. / 18.) * 6. * u[1]; - const Compute eqopp_02 = eq_02 + rho * (1. / 18.) * 6. * u[2]; - const Compute eqopp_03 = eq_03 + rho * (1. / 36.) * 6. * ck_u03; - const Compute eqopp_04 = eq_04 + rho * (1. / 36.) * 6. * ck_u04; - const Compute eqopp_05 = eq_05 + rho * (1. / 36.) * 6. * ck_u05; - const Compute eqopp_06 = eq_06 + rho * (1. / 36.) * 6. * ck_u06; - const Compute eqopp_07 = eq_07 + rho * (1. / 36.) * 6. * ck_u07; - const Compute eqopp_08 = eq_08 + rho * (1. / 36.) * 6. * ck_u08; - - const Compute pop_out_00 = (1. - omega) * static_cast(pop[0]) + omega * eq_00; - const Compute pop_out_01 = (1. - omega) * static_cast(pop[1]) + omega * eq_01; - const Compute pop_out_02 = (1. - omega) * static_cast(pop[2]) + omega * eq_02; - const Compute pop_out_03 = (1. - omega) * static_cast(pop[3]) + omega * eq_03; - const Compute pop_out_04 = (1. - omega) * static_cast(pop[4]) + omega * eq_04; - const Compute pop_out_05 = (1. - omega) * static_cast(pop[5]) + omega * eq_05; - const Compute pop_out_06 = (1. - omega) * static_cast(pop[6]) + omega * eq_06; - const Compute pop_out_07 = (1. - omega) * static_cast(pop[7]) + omega * eq_07; - const Compute pop_out_08 = (1. - omega) * static_cast(pop[8]) + omega * eq_08; - - const Compute pop_out_opp_00 = (1. - omega) * static_cast(pop[10]) + omega * eqopp_00; - const Compute pop_out_opp_01 = (1. - omega) * static_cast(pop[11]) + omega * eqopp_01; - const Compute pop_out_opp_02 = (1. - omega) * static_cast(pop[12]) + omega * eqopp_02; - const Compute pop_out_opp_03 = (1. - omega) * static_cast(pop[13]) + omega * eqopp_03; - const Compute pop_out_opp_04 = (1. - omega) * static_cast(pop[14]) + omega * eqopp_04; - const Compute pop_out_opp_05 = (1. - omega) * static_cast(pop[15]) + omega * eqopp_05; - const Compute pop_out_opp_06 = (1. 
- omega) * static_cast(pop[16]) + omega * eqopp_06; - const Compute pop_out_opp_07 = (1. - omega) * static_cast(pop[17]) + omega * eqopp_07; - const Compute pop_out_opp_08 = (1. - omega) * static_cast(pop[18]) + omega * eqopp_08; - - -#define COMPUTE_GO_AND_BACK(GOid, BKid) \ - { \ - fOut(i, GOid) = static_cast(pop_out_0##GOid); \ - fOut(i, BKid) = static_cast(pop_out_opp_0##GOid); \ - } - - COMPUTE_GO_AND_BACK(0, 10) - COMPUTE_GO_AND_BACK(1, 11) - COMPUTE_GO_AND_BACK(2, 12) - COMPUTE_GO_AND_BACK(3, 13) - COMPUTE_GO_AND_BACK(4, 14) - COMPUTE_GO_AND_BACK(5, 15) - COMPUTE_GO_AND_BACK(6, 16) - COMPUTE_GO_AND_BACK(7, 17) - COMPUTE_GO_AND_BACK(8, 18) - -#undef COMPUTE_GO_AND_BACK - - { - const Compute eq_09 = rho * (1. / 3.) * (1. - usqr); - const Compute pop_out_09 = (1. - omega) * - static_cast(pop[Lattice::centerDirection]) + - omega * eq_09; - fOut(i, Lattice::centerDirection) = static_cast(pop_out_09); - } - } + using Functions = DeviceD3Q19; static auto iteration(Neon::set::StencilSemantic stencilSemantic, - const PopulationField& fInField /*! Input population field */, + const PopField& fInField /*! Input population field */, const CellTypeField& cellTypeField /*! Cell type field */, const Compute omega /*! LBM omega parameter */, - PopulationField& fOutField /*! Output Population field */) + PopField& fOutField /*! 
Output Population field */) -> Neon::set::Container { Neon::set::Container container = fInField.getGrid().newContainer( - "LBM_iteration", + "D3Q19_TwoPop", [&, omega](Neon::set::Loader& L) -> auto { auto& fIn = L.load(fInField, Neon::Pattern::STENCIL, stencilSemantic); auto& fOut = L.load(fOutField); const auto& cellInfoPartition = L.load(cellTypeField); - return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { CellType cellInfo = cellInfoPartition(gidx, 0); if (cellInfo.classification == CellType::bulk) { Storage popIn[Lattice::Q]; - pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + Functions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); Compute rho; std::array u{.0, .0, .0}; - macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + Functions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); Compute usqr = 1.5 * (u[0] * u[0] + u[1] * u[1] + u[2] * u[2]); - collideBgkUnrolled(gidx, - popIn, - rho, u, - usqr, omega, - NEON_OUT fOut); + Functions::collideBgkUnrolled(gidx, + popIn, + rho, u, + usqr, omega, + NEON_OUT fOut); } }; }); return container; } -#define COMPUTE_MASK_WALL(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ - { \ - { /*GO*/ \ - CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); \ - if (nghCellType.classification != CellType::bulk) { \ - cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOid)); \ - } \ - } \ - { /*BK*/ \ - CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); \ - if (nghCellType.classification != CellType::bulk) { \ - cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << BKid)); \ - } \ - } \ - } static auto computeWallNghMask(const CellTypeField& infoInField, @@ -262,24 +86,27 @@ struct LbmContainers auto { - auto& infoIn = L.load(infoInField, - Neon::Pattern::STENCIL); + auto& infoIn = L.load(infoInField, 
Neon::Pattern::STENCIL); auto& infoOut = L.load(infoOutpeField); - return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { CellType cellType = infoIn(gidx, 0); cellType.wallNghBitflag = 0; if (cellType.classification == CellType::bulk) { - COMPUTE_MASK_WALL(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10) - COMPUTE_MASK_WALL(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11) - COMPUTE_MASK_WALL(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12) - COMPUTE_MASK_WALL(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13) - COMPUTE_MASK_WALL(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14) - COMPUTE_MASK_WALL(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15) - COMPUTE_MASK_WALL(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16) - COMPUTE_MASK_WALL(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17) - COMPUTE_MASK_WALL(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18) + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOid) { + if constexpr (GOid != Lattice::center) { + constexpr int BKid = Lattice::oppositeDirection[GOid]; + constexpr int BKx = Lattice::stencil[BKid].x; + constexpr int BKy = Lattice::stencil[BKid].y; + constexpr int BKz = Lattice::stencil[BKid].z; + + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); + if (nghCellType.classification != CellType::bulk) { + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOid)); + } + } + }); infoOut(gidx, 0) = cellType; } @@ -287,17 +114,13 @@ struct LbmContainers Neon::set::Container { @@ -311,32 +134,31 @@ struct LbmContainers u{.0, .0, .0}; Storage popIn[Lattice::Q]; if (cellInfo.classification == CellType::bulk) { - pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); - macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + Functions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, 
NEON_OUT popIn); + Functions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + } else { if (cellInfo.classification == CellType::movingWall) { - BC_LOAD(0, 10) - BC_LOAD(1, 11) - BC_LOAD(2, 12) - BC_LOAD(3, 13) - BC_LOAD(4, 14) - BC_LOAD(5, 15) - BC_LOAD(6, 16) - BC_LOAD(7, 17) - BC_LOAD(8, 18) - popIn[9] = fIn(gidx, 9); + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOid) { + if constexpr (GOid == Lattice::center) { + popIn[Lattice::center] = fIn(gidx, Lattice::center); + } else { + popIn[GOid] = fIn(gidx, GOid); + } + }); rho = 1.0; - u = std::array{COMPUTE_CAST(popIn[0]) / COMPUTE_CAST(6. * 1. / 18.), - COMPUTE_CAST(popIn[1]) / COMPUTE_CAST(6. * 1. / 18.), - COMPUTE_CAST(popIn[2]) / COMPUTE_CAST(6. * 1. / 18.)}; + u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), + static_cast(popIn[1]) / static_cast(6. * 1. / 18.), + static_cast(popIn[2]) / static_cast(6. * 1. / 18.)}; } } @@ -349,5 +171,3 @@ struct LbmContainers -struct D3Q19Template +template +struct D3Q19 { public: static constexpr int Q = 19; /** number of directions */ static constexpr int D = 3; /** Space dimension */ - - static constexpr int centerDirection = 9; /** Position of direction {0,0,0} */ - static constexpr int goRangeBegin = 0; /** Symmetry is represented as "go" direction and the "back" their opposite */ + using Precision = Precision_; + + static constexpr std::array stencil{ + Neon::index_3d(-1, 0, 0), + Neon::index_3d(0, -1, 0), + Neon::index_3d(0, 0, -1), + Neon::index_3d(-1, -1, 0), + Neon::index_3d(-1, 1, 0), + Neon::index_3d(-1, 0, -1), + Neon::index_3d(-1, 0, 1), + Neon::index_3d(0, -1, -1), + Neon::index_3d(0, -1, 1), + Neon::index_3d(0, 0, 0), + Neon::index_3d(1, 0, 0), + Neon::index_3d(0, 1, 0), + Neon::index_3d(0, 0, 1), + Neon::index_3d(1, 1, 0), + Neon::index_3d(1, -1, 0), + Neon::index_3d(1, 0, 1), + Neon::index_3d(1, 0, -1), + Neon::index_3d(0, 1, 1), + Neon::index_3d(0, 1, -1)}; + + private: + static constexpr int center = 9; /** Position of direction {0,0,0} 
*/ + static constexpr int goRangeBegin = 0; /** Symmetry is represented as "go" direction and the "back" their opposite */ static constexpr int goRangeEnd = 8; static constexpr int goBackOffset = 10; /** Offset to compute apply symmetry */ - - explicit D3Q19Template(const Neon::Backend& backend) + public: + explicit D3Q19(const Neon::Backend& backend) { - // The discrete velocities of the Lattice mesh. - c_vect = std::vector( - { - {-1, 0, 0} /*! 0 Symmetry first section (GO) */, - {0, -1, 0} /*! 1 */, - {0, 0, -1} /*! 2 */, - {-1, -1, 0} /*! 3 */, - {-1, 1, 0} /*! 4 */, - {-1, 0, -1} /*! 5 */, - {-1, 0, 1} /*! 6 */, - {0, -1, -1} /*! 7 */, - {0, -1, 1} /*! 8 */, - {0, 0, 0} /*! 9 The center */, - {1, 0, 0} /*! 10 Symmetry mirror section (BK) */, - {0, 1, 0} /*! 11 */, - {0, 0, 1} /*! 12 */, - {1, 1, 0} /*! 13 */, - {1, -1, 0} /*! 14 */, - {1, 0, 1} /*! 15 */, - {1, 0, -1} /*! 16 */, - {0, 1, 1} /*! 17 */, - {0, 1, -1} /*! 18 */, - }); - - auto c_neon = backend.devSet().newMemSet( - Neon::DataUse::HOST_DEVICE, - 1, - Neon::MemoryOptions(), - backend.devSet().newDataSet([&](Neon::SetIdx const&, auto& val) { - val = c_vect.size(); - })); - - for (Neon::SetIdx i = 0; i < backend.devSet().setCardinality(); i++) { - for (int j = 0; j < int(c_vect.size()); j++) { - c_neon.eRef(i, j).x = static_cast(c_vect[j].x); - c_neon.eRef(i, j).y = static_cast(c_vect[j].y); - c_neon.eRef(i, j).z = static_cast(c_vect[j].z); - } - } - // The opposite of a given direction. - std::vector opp_vect = { - 10 /*! 0 */, - 11 /*! 1 */, - 12 /*! 2 */, - 13 /*! 3 */, - 14 /*! 4 */, - 15 /*! 5 */, - 16 /*! 6 */, - 17 /*! 7 */, - 18 /*! 8 */, - 9 /*! 9 */, - 0 /*! 10 */, - 1 /*! 11 */, - 2 /*! 12 */, - 3 /*! 13 */, - 4 /*! 14 */, - 5 /*! 15 */, - 6 /*! 16 */, - 7 /*! 17 */, - 8 /*! 
18 */, - }; - - { // Check correctness of opposite - for (int i = 0; i < static_cast(c_vect.size()); i++) { - auto point = c_vect[i]; - auto opposite = point * -1; - if (opposite != c_vect[opp_vect[i]]) { - Neon::NeonException exp(""); - exp << "Incompatible opposite"; - NEON_THROW(exp); - } - } - } - - this->opp = backend.devSet().newMemSet( - Neon::DataUse::HOST_DEVICE, - 1, - Neon::MemoryOptions(), - backend.devSet().newDataSet([&](Neon::SetIdx const&, auto& val) { - val = opp_vect.size(); - })); - - - for (Neon::SetIdx i = 0; i < backend.devSet().setCardinality(); i++) { - for (size_t j = 0; j < opp_vect.size(); j++) { - this->opp.eRef(i, j, 0) = opp_vect[j]; - } - } - - // The lattice weights. - t_vect = { - 1. / 18. /*! 0 */, - 1. / 18. /*! 1 */, - 1. / 18. /*! 2 */, - 1. / 36. /*! 3 */, - 1. / 36. /*! 4 */, - 1. / 36. /*! 5 */, - 1. / 36. /*! 6 */, - 1. / 36. /*! 7 */, - 1. / 36. /*! 8 */, - 1. / 3. /*! 9 */, - 1. / 18. /*! 10 */, - 1. / 18. /*! 11 */, - 1. / 18. /*! 12 */, - 1. / 36. /*! 13 */, - 1. / 36. /*! 14 */, - 1. / 36. /*! 15 */, - 1. / 36. /*! 16 */, - 1. / 36. /*! 17 */, - 1. / 36. /*! 
18 */, - }; - - this->t = backend.devSet().newMemSet( - Neon::DataUse::HOST_DEVICE, - 1, - Neon::MemoryOptions(), - backend.devSet().newDataSet([&](Neon::SetIdx const&, auto&val) { - val= opp_vect.size(); - })); - - - for (Neon::SetIdx i = 0; i < backend.devSet().setCardinality(); i++) { - for (size_t j = 0; j < t_vect.size(); j++) { - this->t.eRef(i, j, 0) = t_vect[j]; - } - } - - if (backend.runtime() == Neon::Runtime::stream) { - this->c.template update(backend.streamSet(0), Neon::DeviceType::CUDA); - this->opp.template update(backend.streamSet(0), Neon::DeviceType::CUDA); - this->t.template update(backend.streamSet(0), Neon::DeviceType::CUDA); - } } - template static constexpr auto getOpposite() -> int { - if constexpr (go == centerDirection) - return centerDirection; + if constexpr (go == center) + return center; if constexpr (go <= goRangeEnd) return go + goBackOffset; if constexpr (go <= goRangeEnd + goBackOffset) return go - goBackOffset; } + static constexpr std::array opposite = []() { + std::array resOpposite; + for (int i = 0; i < Q; ++i) { + resOpposite[i] = getOpposite(i); + } + return resOpposite; + }(); - Neon::set::MemSet c; - Neon::set::MemSet opp; - Neon::set::MemSet t; - std::vector t_vect; - std::vector c_vect; + static auto getDirectionAsVector() + -> std::vector + { + std::vector vec; + for (auto const& a : stencil) { + vec.push_back(a); + } + return vec; + } }; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h new file mode 100644 index 00000000..97a6b9e0 --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h @@ -0,0 +1,169 @@ +#include "CellType.h" +#include "D3Q19.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + + +template +struct DeviceD3Q19 +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using 
PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + + static inline NEON_CUDA_HOST_DEVICE auto + pullStream(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopField::Partition const& fin, + NEON_OUT Storage popIn[Lattice::Q]) + { + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOid) { + if constexpr (GOid == Lattice::center) { + popIn[Lattice::center] = fin(gidx, Lattice::center); + } else { + constexpr int BKid = Lattice::oppositeDirection[GOid]; + constexpr int BKx = Lattice::stencil[BKid].x; + constexpr int BKy = Lattice::stencil[BKid].y; + constexpr int BKz = Lattice::stencil[BKid].z; + + if (wallBitFlag & (uint32_t(1) << GOid)) { + popIn[GOid] = fin(gidx, BKid) + + fin.template getNghData(gidx, BKid)(); + } else { + popIn[GOid] = fin.template getNghData(gidx, GOid)(); + } + } + }); + } + + static inline NEON_CUDA_HOST_DEVICE auto + macroscopic(const Storage pop[Lattice::Q], + NEON_OUT Compute& rho, + NEON_OUT std::array& u) + -> void + { + +#define POP(IDX) static_cast(pop[IDX]) + const Compute X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6); + const Compute X_P1 = POP(10) + POP(13) + POP(14) + POP(15) + POP(16); + const Compute X_0 = POP(9) + POP(1) + POP(2) + POP(7) + POP(8) + POP(11) + POP(12) + POP(17) + POP(18); + + const Compute Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(14); + const Compute Y_P1 = POP(4) + POP(11) + POP(13) + POP(17) + POP(18); + + const Compute Z_M1 = POP(2) + POP(5) + POP(7) + POP(16) + POP(18); + const Compute Z_P1 = POP(6) + POP(8) + POP(12) + POP(15) + POP(17); +#undef POP + + rho = X_M1 + X_P1 + X_0; + u[0] = (X_P1 - X_M1) / rho; + u[1] = (Y_P1 - Y_M1) / rho; + u[2] = (Z_P1 - Z_M1) / rho; + } + + + static inline NEON_CUDA_HOST_DEVICE auto + collideBgkUnrolled(Idx const& i /*! 
Compute iterator */, + const Storage pop[Lattice::Q], + Compute const& rho /*! Density */, + std::array const& u /*! Velocity */, + Compute const& usqr /*! Usqr */, + Compute const& omega /*! Omega */, + typename PopField::Partition& fOut /*! Population */) + + -> void + { + const Compute ck_u03 = u[0] + u[1]; + const Compute ck_u04 = u[0] - u[1]; + const Compute ck_u05 = u[0] + u[2]; + const Compute ck_u06 = u[0] - u[2]; + const Compute ck_u07 = u[1] + u[2]; + const Compute ck_u08 = u[1] - u[2]; + + constexpr Compute c1over18 = 1. / 18.; + constexpr Compute c1over36 = 1. / 36.; + constexpr Compute c4dot5 = 4.5; + constexpr Compute c3 = 3.; + constexpr Compute c1 = 1.; + constexpr Compute c6 = 6.; + + const Compute eq_00 = rho * c1over18 * (c1 - c6 * u[0] + c4dot5 * u[0] * u[0] - usqr); + const Compute eq_01 = rho * c1over18 * (c1 - c6 * u[1] + c4dot5 * u[1] * u[1] - usqr); + const Compute eq_02 = rho * c1over18 * (c1 - c6 * u[2] + c4dot5 * u[2] * u[2] - usqr); + const Compute eq_03 = rho * c1over36 * (c1 - c6 * ck_u03 + c4dot5 * ck_u03 * ck_u03 - usqr); + const Compute eq_04 = rho * c1over36 * (c1 - c6 * ck_u04 + c4dot5 * ck_u04 * ck_u04 - usqr); + const Compute eq_05 = rho * c1over36 * (c1 - c6 * ck_u05 + c4dot5 * ck_u05 * ck_u05 - usqr); + const Compute eq_06 = rho * c1over36 * (c1 - c6 * ck_u06 + c4dot5 * ck_u06 * ck_u06 - usqr); + const Compute eq_07 = rho * c1over36 * (c1 - c6 * ck_u07 + c4dot5 * ck_u07 * ck_u07 - usqr); + const Compute eq_08 = rho * c1over36 * (c1 - c6 * ck_u08 + c4dot5 * ck_u08 * ck_u08 - usqr); + + const Compute eqopp_00 = eq_00 + rho * c1over18 * c6 * u[0]; + const Compute eqopp_01 = eq_01 + rho * c1over18 * c6 * u[1]; + const Compute eqopp_02 = eq_02 + rho * c1over18 * c6 * u[2]; + const Compute eqopp_03 = eq_03 + rho * c1over36 * c6 * ck_u03; + const Compute eqopp_04 = eq_04 + rho * c1over36 * c6 * ck_u04; + const Compute eqopp_05 = eq_05 + rho * c1over36 * c6 * ck_u05; + const Compute eqopp_06 = eq_06 + rho * c1over36 * c6 * ck_u06; + 
const Compute eqopp_07 = eq_07 + rho * c1over36 * c6 * ck_u07; + const Compute eqopp_08 = eq_08 + rho * c1over36 * c6 * ck_u08; + + const Compute pop_out_00 = (c1 - omega) * static_cast(pop[0]) + omega * eq_00; + const Compute pop_out_01 = (c1 - omega) * static_cast(pop[1]) + omega * eq_01; + const Compute pop_out_02 = (c1 - omega) * static_cast(pop[2]) + omega * eq_02; + const Compute pop_out_03 = (c1 - omega) * static_cast(pop[3]) + omega * eq_03; + const Compute pop_out_04 = (c1 - omega) * static_cast(pop[4]) + omega * eq_04; + const Compute pop_out_05 = (c1 - omega) * static_cast(pop[5]) + omega * eq_05; + const Compute pop_out_06 = (c1 - omega) * static_cast(pop[6]) + omega * eq_06; + const Compute pop_out_07 = (c1 - omega) * static_cast(pop[7]) + omega * eq_07; + const Compute pop_out_08 = (c1 - omega) * static_cast(pop[8]) + omega * eq_08; + + const Compute pop_out_opp_00 = (c1 - omega) * static_cast(pop[10]) + omega * eqopp_00; + const Compute pop_out_opp_01 = (c1 - omega) * static_cast(pop[11]) + omega * eqopp_01; + const Compute pop_out_opp_02 = (c1 - omega) * static_cast(pop[12]) + omega * eqopp_02; + const Compute pop_out_opp_03 = (c1 - omega) * static_cast(pop[13]) + omega * eqopp_03; + const Compute pop_out_opp_04 = (c1 - omega) * static_cast(pop[14]) + omega * eqopp_04; + const Compute pop_out_opp_05 = (c1 - omega) * static_cast(pop[15]) + omega * eqopp_05; + const Compute pop_out_opp_06 = (c1 - omega) * static_cast(pop[16]) + omega * eqopp_06; + const Compute pop_out_opp_07 = (c1 - omega) * static_cast(pop[17]) + omega * eqopp_07; + const Compute pop_out_opp_08 = (c1 - omega) * static_cast(pop[18]) + omega * eqopp_08; + + +#define COMPUTE_GO_AND_BACK(GOid, BKid) \ + { \ + fOut(i, GOid) = static_cast(pop_out_0##GOid); \ + fOut(i, BKid) = static_cast(pop_out_opp_0##GOid); \ + } + + COMPUTE_GO_AND_BACK(0, 10) + COMPUTE_GO_AND_BACK(1, 11) + COMPUTE_GO_AND_BACK(2, 12) + COMPUTE_GO_AND_BACK(3, 13) + COMPUTE_GO_AND_BACK(4, 14) + COMPUTE_GO_AND_BACK(5, 15) 
+ COMPUTE_GO_AND_BACK(6, 16) + COMPUTE_GO_AND_BACK(7, 17) + COMPUTE_GO_AND_BACK(8, 18) + +#undef COMPUTE_GO_AND_BACK + + { + const Compute eq_09 = rho * (c1 / c3) * (c1 - usqr); + const Compute pop_out_09 = (c1 - omega) * static_cast(pop[Lattice::center]) + + omega * eq_09; + fOut(i, Lattice::center) = static_cast(pop_out_09); + } + } +}; + +#undef CAST_TO_COMPUTE \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmIteration.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmIteration.h index b92d9acc..2e8684b2 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmIteration.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmIteration.h @@ -1,36 +1,38 @@ #include "CellType.h" +#include "ContainersD3Q19.h" #include "D3Q19.h" -#include "LbmTools.h" #include "Neon/Neon.h" #include "Neon/set/Backend.h" #include "Neon/set/Containter.h" #include "Neon/skeleton/Skeleton.h" +#include "ContainerFactory.h" -template +template struct LbmSkeleton { }; -template -struct LbmIterationD3Q19 +template +struct LbmSkeleton, + Grid_> { using LbmStoreType = typename PopulationField::Type; using CellTypeField = typename PopulationField::Grid::template Field; - using D3Q19 = D3Q19Template; + using D3Q19 = D3Q19; using LbmTools = LbmContainers; - LbmIterationD3Q19(Neon::set::StencilSemantic stencilSemantic, - Neon::skeleton::Occ occ, - Neon::set::TransferMode transfer, - PopulationField& fIn /*! inpout population field */, - PopulationField& fOut, - CellTypeField& cellTypeField /*! Cell type field */, - LbmComputeType omega /*! LBM omega parameter */) + LbmSkeleton(Neon::set::StencilSemantic stencilSemantic, + Neon::skeleton::Occ occ, + Neon::set::TransferMode transfer, + PopulationField& fIn /*! inpout population field */, + PopulationField& fOut, + CellTypeField& cellTypeField /*! Cell type field */, + LbmComputeType omega /*! 
LBM omega parameter */) { pop[0] = fIn; pop[1] = fOut; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h index fc4d7806..489b3782 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h @@ -19,13 +19,13 @@ struct LbmContainersTemplateOnly */ template -struct LbmContainersTemplateOnly, +struct LbmContainersTemplateOnly, PopulationField, LbmComputeType> { using LbmStoreType = typename PopulationField::Type; using CellTypeField = typename PopulationField::Grid::template Field; - using Lattice = D3Q19Template; + using Lattice = D3Q19; using Idx = typename PopulationField::Idx; using Grid = typename PopulationField::Grid; using Rho = typename Grid::template Field; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Precision.h b/benchmarks/lbm-lid-driven-cavity-flow/src/Precision.h new file mode 100644 index 00000000..a45ff69e --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Precision.h @@ -0,0 +1,13 @@ +#pragma once + +#include "Neon/Neon.h" +#include "Neon/set/Backend.h" +#include "Neon/set/memory/memSet.h" + +template +struct Precision +{ + using Storage = StorageFP; + using Compute = ComputeFP; +}; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index e91055f9..5640aa7e 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -21,7 +21,7 @@ template void { - using Lattice = D3Q19Template; + using Lattice = D3Q19; using PopulationField = typename Grid::template Field; @@ -52,7 +52,7 @@ auto run(Config& config, Grid grid( bk, {config.N, config.N, config.N}, [](const Neon::index_3d&) { return true; }, - lattice.c_vect); + lattice.getDirectionAsVector()); PopulationField pop0 = grid.template 
newField("Population", Lattice::Q, StorageFP(0.0)); PopulationField pop1 = grid.template newField("Population", Lattice::Q, StorageFP(0.0)); From 3404f038b42a1ed3274faae247ec32a73bb47911 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 30 Jun 2023 17:25:24 -0400 Subject: [PATCH 28/94] WIP --- .../src/ContainersD3Q19.h | 8 ++- .../lbm-lid-driven-cavity-flow/src/D3Q19.h | 33 +++++++--- .../src/DeviceD3Q19.h | 9 +-- .../src/{LbmIteration.h => LbmSkeleton.h} | 40 +++++++----- .../src/RunCavityTwoPop.cu | 65 +++++++++++-------- 5 files changed, 97 insertions(+), 58 deletions(-) rename benchmarks/lbm-lid-driven-cavity-flow/src/{LbmIteration.h => LbmSkeleton.h} (69%) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h index 0381cda2..808c94de 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h @@ -1,3 +1,5 @@ +#pragma once + #include "CellType.h" #include "D3Q19.h" #include "DeviceD3Q19.h" @@ -25,12 +27,12 @@ struct ContainerFactory; + using PopField = typename Grid::template Field; using CellTypeField = typename Grid::template Field; using Idx = typename PopField::Idx; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; using Functions = DeviceD3Q19; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h index 4e5aa52d..b9a1a404 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h @@ -12,7 +12,7 @@ struct D3Q19 static constexpr int Q = 19; /** number of directions */ static constexpr int D = 3; /** Space dimension */ using Precision = Precision_; - + using Self = D3Q19; static constexpr std::array stencil{ Neon::index_3d(-1, 0, 0), 
Neon::index_3d(0, -1, 0), @@ -34,8 +34,8 @@ struct D3Q19 Neon::index_3d(0, 1, 1), Neon::index_3d(0, 1, -1)}; - private: static constexpr int center = 9; /** Position of direction {0,0,0} */ + private: static constexpr int goRangeBegin = 0; /** Symmetry is represented as "go" direction and the "back" their opposite */ static constexpr int goRangeEnd = 8; static constexpr int goBackOffset = 10; /** Offset to compute apply symmetry */ @@ -57,13 +57,28 @@ struct D3Q19 return go - goBackOffset; } - static constexpr std::array opposite = []() { - std::array resOpposite; - for (int i = 0; i < Q; ++i) { - resOpposite[i] = getOpposite(i); - } - return resOpposite; - }(); + static constexpr std::array opposite { + Self::template getOpposite<0>(), + Self::template getOpposite<1>(), + Self::template getOpposite<2>(), + Self::template getOpposite<3>(), + Self::template getOpposite<4>(), + Self::template getOpposite<5>(), + Self::template getOpposite<6>(), + Self::template getOpposite<7>(), + Self::template getOpposite<8>(), + Self::template getOpposite<9>(), + Self::template getOpposite<10>(), + Self::template getOpposite<11>(), + Self::template getOpposite<12>(), + Self::template getOpposite<13>(), + Self::template getOpposite<14>(), + Self::template getOpposite<15>(), + Self::template getOpposite<16>(), + Self::template getOpposite<17>(), + Self::template getOpposite<18>() + }; + static auto getDirectionAsVector() -> std::vector diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h index 97a6b9e0..6333106a 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h @@ -1,3 +1,4 @@ +#pragma once #include "CellType.h" #include "D3Q19.h" #include "Neon/Neon.h" @@ -13,12 +14,12 @@ struct DeviceD3Q19 using Storage = typename Precision::Storage; using Grid = Grid_; - using PopField = typename Grid::template Field; + using PopField = 
typename Grid::template Field; using CellTypeField = typename Grid::template Field; using Idx = typename PopField::Idx; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; static inline NEON_CUDA_HOST_DEVICE auto @@ -32,7 +33,7 @@ struct DeviceD3Q19 if constexpr (GOid == Lattice::center) { popIn[Lattice::center] = fin(gidx, Lattice::center); } else { - constexpr int BKid = Lattice::oppositeDirection[GOid]; + constexpr int BKid = Lattice::opposite[GOid]; constexpr int BKx = Lattice::stencil[BKid].x; constexpr int BKy = Lattice::stencil[BKid].y; constexpr int BKz = Lattice::stencil[BKid].z; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmIteration.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h similarity index 69% rename from benchmarks/lbm-lid-driven-cavity-flow/src/LbmIteration.h rename to benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h index 2e8684b2..9dfb7355 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmIteration.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h @@ -1,11 +1,11 @@ #include "CellType.h" +#include "ContainerFactory.h" #include "ContainersD3Q19.h" #include "D3Q19.h" #include "Neon/Neon.h" #include "Neon/set/Backend.h" #include "Neon/set/Containter.h" #include "Neon/skeleton/Skeleton.h" -#include "ContainerFactory.h" template , Grid_> { - using LbmStoreType = typename PopulationField::Type; - using CellTypeField = typename PopulationField::Grid::template Field; - using D3Q19 = D3Q19; - using LbmTools = LbmContainers; + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; 
+ using U = typename Grid::template Field; + + using ContainerFactory = ContainerFactory; LbmSkeleton(Neon::set::StencilSemantic stencilSemantic, Neon::skeleton::Occ occ, Neon::set::TransferMode transfer, - PopulationField& fIn /*! inpout population field */, - PopulationField& fOut, + PopField& fIn /*! inpout population field */, + PopField& fOut, CellTypeField& cellTypeField /*! Cell type field */, - LbmComputeType omega /*! LBM omega parameter */) + Compute omega /*! LBM omega parameter */) { pop[0] = fIn; pop[1] = fOut; @@ -42,14 +51,15 @@ struct LbmSkeleton PopulationField& + -> PopField& { return pop[parity]; } auto getOutput() - -> PopulationField& + -> PopField& { int other = parity == 0 ? 1 : 0; return pop[other]; @@ -79,15 +89,15 @@ struct LbmSkeleton ops; lbmTwoPop[target] = Neon::skeleton::Skeleton(inField.getBackend()); Neon::skeleton::Options opt(occ, transfer); - ops.push_back(LbmTools::iteration(stencilSemantic, + ops.push_back(ContainerFactory::iteration(stencilSemantic, inField, cellTypeField, omega, @@ -98,6 +108,6 @@ struct LbmSkeleton + typename Storage_, + typename Compute_> auto run(Config& config, Report& report) -> void { - using Lattice = D3Q19; - using PopulationField = typename Grid::template Field; + using Storage = Storage_; + using Compute = Compute_; + using Precision = Precision; + using Lattice = D3Q19; + using PopulationField = typename Grid::template Field; + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using RhoField = typename Grid::template Field; + using UField = typename Grid::template Field; + + using Skeleton = LbmSkeleton; + using ContainerFactory = ContainerFactory; Neon::Backend bk = [&] { if (config.deviceType == "cpu") { @@ -54,31 +66,30 @@ auto run(Config& config, [](const Neon::index_3d&) { return true; }, lattice.getDirectionAsVector()); - PopulationField pop0 = grid.template newField("Population", Lattice::Q, 
StorageFP(0.0)); - PopulationField pop1 = grid.template newField("Population", Lattice::Q, StorageFP(0.0)); + PopulationField pop0 = grid.template newField("Population", Lattice::Q, Storage(0.0)); + PopulationField pop1 = grid.template newField("Population", Lattice::Q, Storage(0.0)); - typename Grid::template Field rho; - typename Grid::template Field u; + typename Grid::template Field rho; + typename Grid::template Field u; if (!config.benchmark) { std::cout << "Allocating rho and u" << std::endl; - rho = grid.template newField("rho", 1, StorageFP(0.0)); - u = grid.template newField("u", 3, StorageFP(0.0)); + rho = grid.template newField("rho", 1, Storage(0.0)); + u = grid.template newField("u", 3, Storage(0.0)); } CellType defaultCelltype; auto flag = grid.template newField("Material", 1, defaultCelltype); - auto lbmParameters = config.getLbmParameters(); + auto lbmParameters = config.getLbmParameters(); - LbmIterationD3Q19 - iteration(config.stencilSemantic, - config.occ, - config.transferMode, - pop0, - pop1, - flag, - lbmParameters.omega); + Skeleton iteration(config.stencilSemantic, + config.occ, + config.transferMode, + pop0, + pop1, + flag, + lbmParameters.omega); auto exportRhoAndU = [&bk, &rho, &u, &iteration, &flag, &grid, &ulid](int iterationId) { if ((iterationId) % 100 == 0) { @@ -92,7 +103,7 @@ auto run(Config& config, bk.syncAll(); } - auto container = LbmContainers::computeRhoAndU(f, flag, rho, u); + auto container = ContainerFactory::computeRhoAndU(f, flag, rho, u); container.run(Neon::Backend::mainStreamIdx); u.updateHostData(Neon::Backend::mainStreamIdx); rho.updateHostData(Neon::Backend::mainStreamIdx); @@ -168,7 +179,7 @@ auto run(Config& config, inPop.forEachActiveCell([&c, &t, &dim, &flag, &ulid, &config](const Neon::index_3d& idx, const int& k, - StorageFP& val) { + Storage& val) { val = t.at(k); if (idx.x == 0 || idx.x == dim.x - 1 || @@ -188,7 +199,7 @@ auto run(Config& config, outPop.forEachActiveCell([&c, &t, &dim, &flag, &ulid, 
&config](const Neon::index_3d& idx, const int& k, - StorageFP& val) { + Storage& val) { val = t.at(k); if (idx.x == 0 || idx.x == dim.x - 1 || @@ -237,7 +248,7 @@ auto run(Config& config, bk.syncAll(); } - auto container = LbmContainers::computeWallNghMask(flag, flag); + auto container = LbmContainers::computeWallNghMask(flag, flag); container.run(Neon::Backend::mainStreamIdx); bk.syncAll(); } @@ -276,14 +287,14 @@ auto run(Config& config, metrics::recordMetrics(bk, config, report, start, clock_iter); } -template +template auto runFilterComputeType(Config& config, Report& report) -> void { if (config.computeType == "double") { - return run(config, report); + return run(config, report); } if (config.computeType == "float") { - return run(config, report); + return run(config, report); } NEON_DEV_UNDER_CONSTRUCTION(""); } From a2ed8f6fdb0daea43f688d3f7becf5de7cca9c9f Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 4 Jul 2023 11:55:01 -0400 Subject: [PATCH 29/94] Refactoring of the LBM benchmark --- .../src/ContainerFactory.h | 2 +- .../src/ContainersD3Q19.h | 35 +-- .../lbm-lid-driven-cavity-flow/src/D3Q19.h | 241 ++++++++++++++---- .../src/DeviceD3Q19.h | 45 ++-- .../src/LbmSkeleton.h | 6 +- .../src/RunCavityTwoPop.cu | 10 +- .../Neon/core/types/vec/vec3d_integer.tdecl.h | 14 +- .../Neon/core/types/vec/vec3d_integer.timp.h | 14 +- .../domain/details/dGridSoA/dPartitionSoA.h | 6 +- 9 files changed, 261 insertions(+), 112 deletions(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h index 6ee35bbb..c4f9a107 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h @@ -6,7 +6,7 @@ template -struct LbmContainers +struct ContainerFactory { }; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h index 
808c94de..f5c1778b 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h @@ -6,13 +6,6 @@ #include "Neon/Neon.h" #include "Neon/set/Containter.h" -template -struct ContainerFactory -{ -}; - /** * Specialization for D3Q19 */ @@ -27,12 +20,12 @@ struct ContainerFactory; + using PopField = typename Grid::template Field; using CellTypeField = typename Grid::template Field; using Idx = typename PopField::Idx; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; using Functions = DeviceD3Q19; @@ -96,16 +89,16 @@ struct ContainerFactory([&](auto GOid) { - if constexpr (GOid != Lattice::center) { - constexpr int BKid = Lattice::oppositeDirection[GOid]; - constexpr int BKx = Lattice::stencil[BKid].x; - constexpr int BKy = Lattice::stencil[BKid].y; - constexpr int BKz = Lattice::stencil[BKid].z; + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { + if constexpr (GOMemoryId != Lattice::Memory::center) { + constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId]; + constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x; + constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y; + constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z; CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); if (nghCellType.classification != CellType::bulk) { - cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOid)); + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOMemoryId)); } } }); @@ -149,11 +142,11 @@ struct ContainerFactory([&](auto GOid) { - if constexpr (GOid == Lattice::center) { - popIn[Lattice::center] = fIn(gidx, Lattice::center); + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GORegisterId) { + if constexpr (GORegisterId == Lattice::Registers::center) { + 
popIn[Lattice::Registers::center] = fIn(gidx, Lattice::Memory::center); } else { - popIn[GOid] = fIn(gidx, GOid); + popIn[GORegisterId] = fIn(gidx, Lattice::Memory::template mapFromRegisters()); } }); diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h index b9a1a404..7e43e410 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h @@ -5,59 +5,177 @@ #include "Neon/set/memory/memSet.h" #include "Precision.h" + +/** In each lattice we define two indexing schema + * - the first one works at the register level. For this one we relay on the same sequence of directions defined by the STLBM code. + * - the second one works at the memory (RAM) level and it defines how lattice direction are stored in Neon fields. + * + * We keep this two aspect separate to experiment on different order of directions in memory as the order will impact the number of halo update transitions. 
+ * + */ template struct D3Q19 { public: + D3Q19() = delete; + static constexpr int Q = 19; /** number of directions */ static constexpr int D = 3; /** Space dimension */ using Precision = Precision_; using Self = D3Q19; - static constexpr std::array stencil{ - Neon::index_3d(-1, 0, 0), - Neon::index_3d(0, -1, 0), - Neon::index_3d(0, 0, -1), - Neon::index_3d(-1, -1, 0), - Neon::index_3d(-1, 1, 0), - Neon::index_3d(-1, 0, -1), - Neon::index_3d(-1, 0, 1), - Neon::index_3d(0, -1, -1), - Neon::index_3d(0, -1, 1), - Neon::index_3d(0, 0, 0), - Neon::index_3d(1, 0, 0), - Neon::index_3d(0, 1, 0), - Neon::index_3d(0, 0, 1), - Neon::index_3d(1, 1, 0), - Neon::index_3d(1, -1, 0), - Neon::index_3d(1, 0, 1), - Neon::index_3d(1, 0, -1), - Neon::index_3d(0, 1, 1), - Neon::index_3d(0, 1, -1)}; - - static constexpr int center = 9; /** Position of direction {0,0,0} */ - private: - static constexpr int goRangeBegin = 0; /** Symmetry is represented as "go" direction and the "back" their opposite */ - static constexpr int goRangeEnd = 8; - static constexpr int goBackOffset = 10; /** Offset to compute apply symmetry */ - public: - explicit D3Q19(const Neon::Backend& backend) + static constexpr int RegisterMapping = 1; + static constexpr int MemoryMapping = 2; + + struct Registers { - } + using Self = D3Q19::Registers; + static constexpr std::array stencil{ + Neon::index_3d(-1, 0, 0), + Neon::index_3d(0, -1, 0), + Neon::index_3d(0, 0, -1), + Neon::index_3d(-1, -1, 0), + Neon::index_3d(-1, 1, 0), + Neon::index_3d(-1, 0, -1), + Neon::index_3d(-1, 0, 1), + Neon::index_3d(0, -1, -1), + Neon::index_3d(0, -1, 1), + Neon::index_3d(0, 0, 0), + Neon::index_3d(1, 0, 0), + Neon::index_3d(0, 1, 0), + Neon::index_3d(0, 0, 1), + Neon::index_3d(1, 1, 0), + Neon::index_3d(1, -1, 0), + Neon::index_3d(1, 0, 1), + Neon::index_3d(1, 0, -1), + Neon::index_3d(0, 1, 1), + Neon::index_3d(0, 1, -1)}; + + static constexpr int center = 9; /** Position of direction {0,0,0} */ + + template + static constexpr auto 
getOpposite() + -> int + { + auto opposite3d = stencil[go] * -1; + for (int i = 0; i < Q; ++i) { + if (stencil[i] == opposite3d) { + return i; + } + } + } + + static constexpr std::array opposite{ + Self::template getOpposite<0>(), + Self::template getOpposite<1>(), + Self::template getOpposite<2>(), + Self::template getOpposite<3>(), + Self::template getOpposite<4>(), + Self::template getOpposite<5>(), + Self::template getOpposite<6>(), + Self::template getOpposite<7>(), + Self::template getOpposite<8>(), + Self::template getOpposite<9>(), + Self::template getOpposite<10>(), + Self::template getOpposite<11>(), + Self::template getOpposite<12>(), + Self::template getOpposite<13>(), + Self::template getOpposite<14>(), + Self::template getOpposite<15>(), + Self::template getOpposite<16>(), + Self::template getOpposite<17>(), + Self::template getOpposite<18>()}; - template - static constexpr auto getOpposite() - -> int + static constexpr std::array t{ + 1. / 18. /*! 0 */, + 1. / 18. /*! 1 */, + 1. / 18. /*! 2 */, + 1. / 36. /*! 3 */, + 1. / 36. /*! 4 */, + 1. / 36. /*! 5 */, + 1. / 36. /*! 6 */, + 1. / 36. /*! 7 */, + 1. / 36. /*! 8 */, + 1. / 3. /*! 9 */, + 1. / 18. /*! 10 */, + 1. / 18. /*! 11 */, + 1. / 18. /*! 12 */, + 1. / 36. /*! 13 */, + 1. / 36. /*! 14 */, + 1. / 36. /*! 15 */, + 1. / 36. /*! 16 */, + 1. / 36. /*! 17 */, + 1. / 36. /*! 
18 */, + }; + }; + + struct Memory { - if constexpr (go == center) - return center; - if constexpr (go <= goRangeEnd) - return go + goBackOffset; - if constexpr (go <= goRangeEnd + goBackOffset) - return go - goBackOffset; - } + using Self = D3Q19::Memory; + static constexpr std::array stencil{ + Neon::index_3d(-1, 0, 0), + Neon::index_3d(0, -1, 0), + Neon::index_3d(0, 0, -1), + Neon::index_3d(-1, -1, 0), + Neon::index_3d(-1, 1, 0), + Neon::index_3d(-1, 0, -1), + Neon::index_3d(-1, 0, 1), + Neon::index_3d(0, -1, -1), + Neon::index_3d(0, -1, 1), + Neon::index_3d(0, 0, 0), + Neon::index_3d(1, 0, 0), + Neon::index_3d(0, 1, 0), + Neon::index_3d(0, 0, 1), + Neon::index_3d(1, 1, 0), + Neon::index_3d(1, -1, 0), + Neon::index_3d(1, 0, 1), + Neon::index_3d(1, 0, -1), + Neon::index_3d(0, 1, 1), + Neon::index_3d(0, 1, -1)}; + + + static constexpr int center = 9; /** Position of direction {0,0,0} */ + static constexpr int goRangeBegin = 0; /** Symmetry is represented as "go" direction and the "back" their opposite */ + static constexpr int goRangeEnd = 8; + static constexpr int goBackOffset = 10; /** Offset to compute apply symmetry */ - static constexpr std::array opposite { + template + static constexpr auto mapToRegisters() + -> int + { + auto direction = stencil[go]; + for (int i = 0; i < Q; ++i) { + if (Registers::stencil[i] == direction) { + return i; + } + } + } + + template + static constexpr auto mapFromRegisters() + -> int + { + auto direction = Registers::stencil[go]; + for (int i = 0; i < Q; ++i) { + if (Self::stencil[i] == direction) { + return i; + } + } + } + + template + static constexpr auto getOpposite() + -> int + { + auto opposite3d = stencil[go] * -1; + for (int i = 0; i < Q; ++i) { + if (stencil[i] == opposite3d) { + return i; + } + } + } + + static constexpr std::array opposite{ Self::template getOpposite<0>(), Self::template getOpposite<1>(), Self::template getOpposite<2>(), @@ -76,16 +194,53 @@ struct D3Q19 Self::template getOpposite<15>(), 
Self::template getOpposite<16>(), Self::template getOpposite<17>(), - Self::template getOpposite<18>() + Self::template getOpposite<18>()}; + + template + static constexpr auto helpGetValueforT() + -> typename Precision::Storage + { + auto goInRegisterSpace = Self::template mapToRegisters(); + return Registers::t[goInRegisterSpace]; + } + + static constexpr std::array t{ + Self::template helpGetValueforT<0>(), + Self::template helpGetValueforT<1>(), + Self::template helpGetValueforT<2>(), + Self::template helpGetValueforT<3>(), + Self::template helpGetValueforT<4>(), + Self::template helpGetValueforT<5>(), + Self::template helpGetValueforT<6>(), + Self::template helpGetValueforT<7>(), + Self::template helpGetValueforT<8>(), + Self::template helpGetValueforT<9>(), + Self::template helpGetValueforT<10>(), + Self::template helpGetValueforT<11>(), + Self::template helpGetValueforT<12>(), + Self::template helpGetValueforT<13>(), + Self::template helpGetValueforT<14>(), + Self::template helpGetValueforT<15>(), + Self::template helpGetValueforT<16>(), + Self::template helpGetValueforT<17>(), + Self::template helpGetValueforT<18>()}; }; + public: + template static auto getDirectionAsVector() -> std::vector { std::vector vec; - for (auto const& a : stencil) { - vec.push_back(a); + if constexpr (mappingType == RegisterMapping) { + for (auto const& a : Registers::stencil) { + vec.push_back(a); + } + } else if constexpr (mappingType == MemoryMapping) { + for (auto const& a : Memory::stencil) { + vec.push_back(a); + } } return vec; } diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h index 6333106a..2ad6a62a 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h @@ -14,12 +14,12 @@ struct DeviceD3Q19 using Storage = typename Precision::Storage; using Grid = Grid_; - using PopField = typename Grid::template Field; + using 
PopField = typename Grid::template Field; using CellTypeField = typename Grid::template Field; using Idx = typename PopField::Idx; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; static inline NEON_CUDA_HOST_DEVICE auto @@ -29,20 +29,23 @@ struct DeviceD3Q19 NEON_OUT Storage popIn[Lattice::Q]) { - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOid) { - if constexpr (GOid == Lattice::center) { - popIn[Lattice::center] = fin(gidx, Lattice::center); + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { + if constexpr (GOMemoryId == Lattice::Memory::center) { + popIn[Lattice::Registers::center] = fin(gidx, Lattice::Memory::center); } else { - constexpr int BKid = Lattice::opposite[GOid]; - constexpr int BKx = Lattice::stencil[BKid].x; - constexpr int BKy = Lattice::stencil[BKid].y; - constexpr int BKz = Lattice::stencil[BKid].z; - - if (wallBitFlag & (uint32_t(1) << GOid)) { - popIn[GOid] = fin(gidx, BKid) + - fin.template getNghData(gidx, BKid)(); + constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId]; + constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x; + constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y; + constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z; + constexpr int GORegistersId = Lattice::Memory::template mapToRegisters(); + + if (wallBitFlag & (uint32_t(1) << GOMemoryId)) { + popIn[GORegistersId] = + fin(gidx, BKMemoryId) + + fin.template getNghData(gidx, BKMemoryId)(); } else { - popIn[GOid] = fin.template getNghData(gidx, GOid)(); + popIn[GORegistersId] = + fin.template getNghData(gidx, GOMemoryId)(); } } }); @@ -140,10 +143,10 @@ struct DeviceD3Q19 const Compute pop_out_opp_08 = (c1 - omega) * static_cast(pop[18]) + omega * eqopp_08; -#define COMPUTE_GO_AND_BACK(GOid, BKid) \ - { \ - fOut(i, GOid) = static_cast(pop_out_0##GOid); \ - fOut(i, BKid) = static_cast(pop_out_opp_0##GOid); 
\ +#define COMPUTE_GO_AND_BACK(GOid, BKid) \ + { \ + fOut(i, Lattice::Memory::template mapFromRegisters()) = static_cast(pop_out_0##GOid); \ + fOut(i, Lattice::Memory::template mapFromRegisters()) = static_cast(pop_out_opp_0##GOid); \ } COMPUTE_GO_AND_BACK(0, 10) @@ -160,9 +163,9 @@ struct DeviceD3Q19 { const Compute eq_09 = rho * (c1 / c3) * (c1 - usqr); - const Compute pop_out_09 = (c1 - omega) * static_cast(pop[Lattice::center]) + + const Compute pop_out_09 = (c1 - omega) * static_cast(pop[Lattice::Registers::center]) + omega * eq_09; - fOut(i, Lattice::center) = static_cast(pop_out_09); + fOut(i, Lattice::Memory::center) = static_cast(pop_out_09); } } }; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h index 9dfb7355..3408bee4 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h @@ -26,12 +26,12 @@ struct LbmSkeleton; + using PopField = typename Grid::template Field; using CellTypeField = typename Grid::template Field; using Idx = typename PopField::Idx; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; using ContainerFactory = ContainerFactory; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index cf868d2c..a3840db4 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -57,14 +57,12 @@ auto run(Config& config, } Neon::double_3d ulid(1., 0., 0.); - Lattice lattice(bk); - // Neon Grid and Fields initialization auto [start, clock_iter] = metrics::restartClock(bk, true); Grid grid( bk, {config.N, config.N, config.N}, [](const Neon::index_3d&) { return true; }, - lattice.getDirectionAsVector()); + 
Lattice::template getDirectionAsVector()); PopulationField pop0 = grid.template newField("Population", Lattice::Q, Storage(0.0)); PopulationField pop1 = grid.template newField("Population", Lattice::Q, Storage(0.0)); @@ -174,8 +172,8 @@ auto run(Config& config, Neon::index_3d dim(config.N, config.N, config.N); - const auto& t = lattice.t_vect; - const auto& c = lattice.c_vect; + const auto& t = Lattice::Memory::t; + const auto& c = Lattice::Memory::stencil; inPop.forEachActiveCell([&c, &t, &dim, &flag, &ulid, &config](const Neon::index_3d& idx, const int& k, @@ -248,7 +246,7 @@ auto run(Config& config, bk.syncAll(); } - auto container = LbmContainers::computeWallNghMask(flag, flag); + auto container = ContainerFactory::computeWallNghMask(flag, flag); container.run(Neon::Backend::mainStreamIdx); bk.syncAll(); } diff --git a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h index ae475c6e..3193d63d 100644 --- a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h +++ b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h @@ -328,10 +328,10 @@ class Vec_3d * @return Resulting point is C =(A.x / B.x, A.y / B.y, A.z / B.z) * */ template - NEON_CUDA_HOST_DEVICE inline self_t operator*(const Vec_3d& B) const; + NEON_CUDA_HOST_DEVICE inline constexpr self_t operator*(const Vec_3d& B) const; template - NEON_CUDA_HOST_DEVICE inline self_t operator*(const K_tt& alpha) const; + NEON_CUDA_HOST_DEVICE inline constexpr self_t operator*(const K_tt& alpha) const; /** * Compute the division between two points A and B, component by component (A.x/B.x, A.y/B.y, A.z/B.z). * Be careful!!! if the type is int, the division will be an integer division!!! @@ -368,15 +368,15 @@ class Vec_3d * @param[in] B: second point for the operation. 
* @return True if A.x <= B.x && A.y <= B.y && A.z <= B.z */ - NEON_CUDA_HOST_DEVICE inline bool operator==(const self_t& B) const; + NEON_CUDA_HOST_DEVICE inline constexpr bool operator==(const self_t& B) const; - NEON_CUDA_HOST_DEVICE inline bool operator==(const Integer other[self_t::num_axis]) const; + NEON_CUDA_HOST_DEVICE inline constexpr bool operator==(const Integer other[self_t::num_axis]) const; - NEON_CUDA_HOST_DEVICE inline bool operator==(const Integer otherScalar) const; + NEON_CUDA_HOST_DEVICE inline constexpr bool operator==(const Integer otherScalar) const; - NEON_CUDA_HOST_DEVICE inline bool operator!=(const self_t& B) const; + NEON_CUDA_HOST_DEVICE inline constexpr bool operator!=(const self_t& B) const; - NEON_CUDA_HOST_DEVICE inline bool operator!=(const Integer other[self_t::num_axis]) const; + NEON_CUDA_HOST_DEVICE inline constexpr bool operator!=(const Integer other[self_t::num_axis]) const; NEON_CUDA_HOST_DEVICE inline self_t operator-() const; diff --git a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.timp.h b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.timp.h index fe7222eb..c5ceea55 100644 --- a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.timp.h +++ b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.timp.h @@ -458,7 +458,7 @@ NEON_CUDA_HOST_DEVICE inline Vec_3d Vec_3d template -NEON_CUDA_HOST_DEVICE inline Vec_3d Vec_3d::operator*(const Vec_3d& B) const +NEON_CUDA_HOST_DEVICE inline constexpr Vec_3d Vec_3d::operator*(const Vec_3d& B) const { const Vec_3d& A = *this; // Vec_3d C((Integer)(A.x * B.x), (Integer)(A.y * B.y), (Integer)(A.z * B.z)); @@ -468,7 +468,7 @@ NEON_CUDA_HOST_DEVICE inline Vec_3d Vec_3d template -NEON_CUDA_HOST_DEVICE inline Vec_3d Vec_3d::operator*(const K_tt& alpha) const +NEON_CUDA_HOST_DEVICE inline constexpr Vec_3d Vec_3d::operator*(const K_tt& alpha) const { const Vec_3d& A = *this; const auto alpha_c = static_cast(alpha); @@ -526,35 +526,35 @@ NEON_CUDA_HOST_DEVICE inline bool 
Vec_3d::operator< template -NEON_CUDA_HOST_DEVICE inline bool Vec_3d::operator==(const Vec_3d& B) const +NEON_CUDA_HOST_DEVICE inline constexpr bool Vec_3d::operator==(const Vec_3d& B) const { const Vec_3d& A = *this; return A.x == B.x && A.y == B.y && A.z == B.z; } template -NEON_CUDA_HOST_DEVICE inline bool Vec_3d::operator==(const IntegerType_ta other[Vec_3d::num_axis]) const +NEON_CUDA_HOST_DEVICE inline constexpr bool Vec_3d::operator==(const IntegerType_ta other[Vec_3d::num_axis]) const { const Vec_3d& A = *this; return A.x == other[0] && A.y == other[1] && A.z == other[2]; } template -NEON_CUDA_HOST_DEVICE inline bool Vec_3d::operator==(const IntegerType_ta otherScalar) const +NEON_CUDA_HOST_DEVICE inline constexpr bool Vec_3d::operator==(const IntegerType_ta otherScalar) const { const Vec_3d& A = *this; return A.x == otherScalar && A.y == otherScalar && A.z == otherScalar; } template -NEON_CUDA_HOST_DEVICE inline bool Vec_3d::operator!=(const Vec_3d& B) const +NEON_CUDA_HOST_DEVICE inline constexpr bool Vec_3d::operator!=(const Vec_3d& B) const { const Vec_3d& A = *this; return !(A == B); } template -NEON_CUDA_HOST_DEVICE inline bool Vec_3d::operator!=(const IntegerType_ta other[Vec_3d::num_axis]) const +NEON_CUDA_HOST_DEVICE inline constexpr bool Vec_3d::operator!=(const IntegerType_ta other[Vec_3d::num_axis]) const { const Vec_3d& A = *this; return A.x != other[0] || A.y != other[1] || A.z != other[2]; diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h index 0572302b..15c914a3 100644 --- a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h @@ -242,9 +242,9 @@ class dPartitionSoA gidx.getLocation().y + yOff, gidx.getLocation().z + zOff); gidxNgh = Idx(cartesian, gidx.getOffset() + - xOff * getPitchData().x + - yOff * getPitchData().y + - zOff * getPitchData().z); + xOff * 
static_cast(getPitchData().x) + + yOff * static_cast(getPitchData().y) + + zOff * static_cast(getPitchData().z)); } bool isValidNeighbour = true; From 0a56cf468db4e026eae1e2368348931cf2877c1d Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 5 Jul 2023 09:00:34 -0400 Subject: [PATCH 30/94] WIP - D3Q27 --- .../src/ContainerFactory.h | 3 +- .../src/ContainersD3Q19.h | 58 ++++ .../src/ContainersD3Q27.h | 226 +++++++++++++++ .../lbm-lid-driven-cavity-flow/src/D3Q19.h | 16 ++ .../lbm-lid-driven-cavity-flow/src/D3Q27.h | 268 ++++++++++++++++++ .../src/DeviceD3Q27.h | 217 ++++++++++++++ .../src/RunCavityTwoPop.cu | 77 +---- .../Neon/domain/details/bGrid/bField_imp.h | 3 +- .../Neon/domain/details/bGrid/bPartition.h | 12 +- .../domain/details/bGrid/bPartition_imp.h | 14 +- .../Neon/domain/details/dGrid/dPartition.h | 3 +- .../Neon/domain/details/eGrid/eField_imp.h | 3 +- .../Neon/domain/details/eGrid/ePartition.h | 17 +- .../domain/details/eGrid/ePartition_imp.h | 13 +- 14 files changed, 848 insertions(+), 82 deletions(-) create mode 100644 benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h create mode 100644 benchmarks/lbm-lid-driven-cavity-flow/src/D3Q27.h create mode 100644 benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q27.h diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h index c4f9a107..26f7a5a4 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h @@ -10,4 +10,5 @@ struct ContainerFactory { }; -#include "ContainersD3Q19.h" \ No newline at end of file +#include "ContainersD3Q19.h" +#include "ContainersD3Q27.h" \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h index f5c1778b..d2ca08eb 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h +++ 
b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h @@ -165,4 +165,62 @@ struct ContainerFactory Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&, ulid, ulb](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, Neon::Pattern::MAP); + auto& fOut = L.load(fOutField, Neon::Pattern::MAP); + auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + const auto globlalIdx = fIn.getGlobalIndex(gidx); + const auto domainDim = fIn.getDomainSize(); + CellType flagVal; + flagVal.classification = CellType::bulk; + flagVal.wallNghBitflag = 0; + typename Lattice::Precision::Storage val = 0; + + if (globlalIdx.x == 0 || globlalIdx.x == domainDim.x - 1 || + globlalIdx.y == 0 || globlalIdx.y == domainDim.y - 1 || + globlalIdx.z == 0 || globlalIdx.z == domainDim.z - 1) { + flagVal.classification = CellType::bounceBack; + + if (globlalIdx.y == domainDim.y - 1) { + flagVal.classification = CellType::movingWall; + } + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + if (globlalIdx.y == domainDim.y - 1) { + val = -6. 
* Lattice::Memory::template getT() * ulb * + (Lattice::Memory::template getDirection().v[0] * ulid.v[0] + + Lattice::Memory::template getDirection().v[1] * ulid.v[1] + + Lattice::Memory::template getDirection().v[2] * ulid.v[2]); + } else { + val = 0; + } + fIn(gidx, q) = val; + fOut(gidx, q) = val; + }); + } else { + flagVal.classification = CellType::bulk; + cellInfoPartition(gidx, 0) = flagVal; + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + fIn(gidx, q) = Lattice::Memory::template getT(); + fOut(gidx, q) = Lattice::Memory::template getT(); + }); + } + }; + }); + return container; + } }; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h new file mode 100644 index 00000000..deea13ab --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h @@ -0,0 +1,226 @@ +#pragma once + +#include "CellType.h" +#include "D3Q27.h" +#include "DeviceD3Q27.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + +/** + * Specialization for D3Q27 + */ +template +struct ContainerFactory, + Grid_> +{ + using Lattice = D3Q27; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + using Functions = DeviceD3Q19; + + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! 
Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "D3Q19_TwoPop", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popIn[Lattice::Q]; + Functions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + + Compute rho; + std::array u{.0, .0, .0}; + Functions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + Functions::collideBgkUnrolled(gidx, + popIn, + rho, u, + usqr, omega, + NEON_OUT fOut); + } + }; + }); + return container; + } + + + static auto + computeWallNghMask(const CellTypeField& infoInField, + CellTypeField& infoOutpeField) + + -> Neon::set::Container + { + Neon::set::Container container = infoInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& infoIn = L.load(infoInField, Neon::Pattern::STENCIL); + auto& infoOut = L.load(infoOutpeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellType = infoIn(gidx, 0); + cellType.wallNghBitflag = 0; + + if (cellType.classification == CellType::bulk) { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { + if constexpr (GOMemoryId != Lattice::Memory::center) { + constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId]; + constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x; + constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y; + constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z; + + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); + if 
(nghCellType.classification != CellType::bulk) { + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOMemoryId)); + } + } + }); + + infoOut(gidx, 0) = cellType; + } + }; + }); + return container; + } + + + static auto + computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! output Population field */) + + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + Compute rho = 0; + std::array u{.0, .0, .0}; + Storage popIn[Lattice::Q]; + + if (cellInfo.classification == CellType::bulk) { + + Functions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + Functions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + } else { + if (cellInfo.classification == CellType::movingWall) { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GORegisterId) { + if constexpr (GORegisterId == Lattice::Registers::center) { + popIn[Lattice::Registers::center] = fIn(gidx, Lattice::Memory::center); + } else { + popIn[GORegisterId] = fIn(gidx, Lattice::Memory::template mapFromRegisters()); + } + }); + + rho = 1.0; + u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), + static_cast(popIn[1]) / static_cast(6. * 1. / 18.), + static_cast(popIn[2]) / static_cast(6. * 1. 
/ 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } + + static auto + problemSetup(PopField& fInField /*! inpout population field */, + PopField& fOutField, + CellTypeField& cellTypeField, + Neon::double_3d ulid, + double ulb) + + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&, ulid, ulb](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, Neon::Pattern::MAP); + auto& fOut = L.load(fOutField, Neon::Pattern::MAP); + auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + const auto globlalIdx = fIn.getGlobalIndex(gidx); + const auto domainDim = fIn.getDomainSize(); + CellType flagVal; + flagVal.classification = CellType::bulk; + flagVal.wallNghBitflag = 0; + typename Lattice::Precision::Storage val = 0; + + if (globlalIdx.x == 0 || globlalIdx.x == domainDim.x - 1 || + globlalIdx.y == 0 || globlalIdx.y == domainDim.y - 1 || + globlalIdx.z == 0 || globlalIdx.z == domainDim.z - 1) { + flagVal.classification = CellType::bounceBack; + + if (globlalIdx.y == domainDim.y - 1) { + flagVal.classification = CellType::movingWall; + } + + for (int q = 0; q < Lattice::Q; q++) { + if (globlalIdx.y == domainDim.y - 1) { + val = -6. 
* Lattice::Memory::t.at(q) * ulb * + (Lattice::Memory::stencil.at(q).v[0] * ulid.v[0] + + Lattice::Memory::stencil.at(q).v[1] * ulid.v[1] + + Lattice::Memory::stencil.at(q).v[2] * ulid.v[2]); + } else { + val = 0; + } + fIn(gidx, q) = val; + fOut(gidx, q) = val; + } + } else { + flagVal.classification = CellType::bulk; + cellInfoPartition(gidx, 0) = flagVal; + for (int q = 0; q < Lattice::Q; q++) { + fIn(gidx, q) = Lattice::Memory::t.at(q); + fOut(gidx, q) = Lattice::Memory::t.at(q); + } + } + }; + }); + return container; + } +}; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h index 7e43e410..52035a35 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h @@ -112,6 +112,7 @@ struct D3Q19 struct Memory { using Self = D3Q19::Memory; + static constexpr std::array stencil{ Neon::index_3d(-1, 0, 0), Neon::index_3d(0, -1, 0), @@ -224,8 +225,23 @@ struct D3Q19 Self::template helpGetValueforT<16>(), Self::template helpGetValueforT<17>(), Self::template helpGetValueforT<18>()}; + + template + NEON_CUDA_HOST_DEVICE static constexpr auto getT() + -> typename Precision::Storage + { + return t[direction]; + } + template + NEON_CUDA_HOST_DEVICE static constexpr auto getDirection() + -> typename Neon::index_3d + { + return stencil[direction]; + } }; + + public: template diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q27.h b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q27.h new file mode 100644 index 00000000..803f06d2 --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q27.h @@ -0,0 +1,268 @@ +#pragma once + +#include "Neon/Neon.h" +#include "Neon/set/Backend.h" +#include "Neon/set/memory/memSet.h" +#include "Precision.h" + + +/** In each lattice we define two indexing schema + * - the first one works at the register level. For this one we relay on the same sequence of directions defined by the STLBM code. 
+ * - the second one works at the memory (RAM) level and it defines how lattice direction are stored in Neon fields. + * + * We keep this two aspect separate to experiment on different order of directions in memory as the order will impact the number of halo update transitions. + * + */ +template +struct D3Q27 +{ + public: + D3Q27() = delete; + + static constexpr int Q = 27; /** number of directions */ + static constexpr int D = 3; /** Space dimension */ + using Precision = Precision_; + using Self = D3Q27; + + static constexpr int RegisterMapping = 1; + static constexpr int MemoryMapping = 2; + + struct Registers + { + using Self = D3Q27::Registers; + static constexpr std::array stencil{ + Neon::index_3d(-1, 0, 0), + Neon::index_3d(0, -1, 0), + Neon::index_3d(0, 0, -1), + Neon::index_3d(-1, -1, 0), + Neon::index_3d(-1, 1, 0), + Neon::index_3d(-1, 0, -1), + Neon::index_3d(-1, 0, 1), + Neon::index_3d(0, -1, -1), + Neon::index_3d(0, -1, 1), + Neon::index_3d(-1, -1, -1), + Neon::index_3d(-1, -1, 1), + Neon::index_3d(-1, 1, -1), + Neon::index_3d(-1, 1, 1), + Neon::index_3d(0, 0, 0), + Neon::index_3d(1, 0, 0), + Neon::index_3d(0, 1, 0), + Neon::index_3d(0, 0, 1), + Neon::index_3d(1, 1, 0), + Neon::index_3d(1, -1, 0), + Neon::index_3d(1, 0, 1), + Neon::index_3d(1, 0, -1), + Neon::index_3d(0, 1, 1), + Neon::index_3d(0, 1, -1), + Neon::index_3d(1, 1, 1), + Neon::index_3d(1, 1, -1), + Neon::index_3d(1, -1, 1), + Neon::index_3d(1, -1, -1)}; + + static constexpr int center = 13; /** Position of direction {0,0,0} */ + + template + static constexpr auto getOpposite() + -> int + { + auto opposite3d = stencil[go] * -1; + for (int i = 0; i < Q; ++i) { + if (stencil[i] == opposite3d) { + return i; + } + } + } + + static constexpr std::array opposite{ + Self::template getOpposite<0>(), + Self::template getOpposite<1>(), + Self::template getOpposite<2>(), + Self::template getOpposite<3>(), + Self::template getOpposite<4>(), + Self::template getOpposite<5>(), + Self::template 
getOpposite<6>(), + Self::template getOpposite<7>(), + Self::template getOpposite<8>(), + Self::template getOpposite<9>(), + Self::template getOpposite<10>(), + Self::template getOpposite<11>(), + Self::template getOpposite<12>(), + Self::template getOpposite<13>(), + Self::template getOpposite<14>(), + Self::template getOpposite<15>(), + Self::template getOpposite<16>(), + Self::template getOpposite<17>(), + Self::template getOpposite<18>(), + Self::template getOpposite<19>(), + Self::template getOpposite<20>(), + Self::template getOpposite<21>(), + Self::template getOpposite<22>(), + Self::template getOpposite<23>(), + Self::template getOpposite<24>(), + Self::template getOpposite<25>(), + Self::template getOpposite<26>()}; + + static constexpr std::array t{ + 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., + 1. / 216., 1. / 216., 1. / 216., 1. / 216., + 8. / 27., + 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., + 1. / 216., 1. / 216., 1. / 216., 1. 
/ 216.}; + }; + + struct Memory + { + using Self = D3Q27::Memory; + static constexpr std::array stencil{ + Neon::index_3d(-1, 0, 0), + Neon::index_3d(0, -1, 0), + Neon::index_3d(0, 0, -1), + Neon::index_3d(-1, -1, 0), + Neon::index_3d(-1, 1, 0), + Neon::index_3d(-1, 0, -1), + Neon::index_3d(-1, 0, 1), + Neon::index_3d(0, -1, -1), + Neon::index_3d(0, -1, 1), + Neon::index_3d(-1, -1, -1), + Neon::index_3d(-1, -1, 1), + Neon::index_3d(-1, 1, -1), + Neon::index_3d(-1, 1, 1), + Neon::index_3d(0, 0, 0), + Neon::index_3d(1, 0, 0), + Neon::index_3d(0, 1, 0), + Neon::index_3d(0, 0, 1), + Neon::index_3d(1, 1, 0), + Neon::index_3d(1, -1, 0), + Neon::index_3d(1, 0, 1), + Neon::index_3d(1, 0, -1), + Neon::index_3d(0, 1, 1), + Neon::index_3d(0, 1, -1), + Neon::index_3d(1, 1, 1), + Neon::index_3d(1, 1, -1), + Neon::index_3d(1, -1, 1), + Neon::index_3d(1, -1, -1)}; + + + static constexpr int center = 13; /** Position of direction {0,0,0} */ + + template + static constexpr auto mapToRegisters() + -> int + { + auto direction = stencil[go]; + for (int i = 0; i < Q; ++i) { + if (Registers::stencil[i] == direction) { + return i; + } + } + } + + template + static constexpr auto mapFromRegisters() + -> int + { + auto direction = Registers::stencil[go]; + for (int i = 0; i < Q; ++i) { + if (Self::stencil[i] == direction) { + return i; + } + } + } + + template + static constexpr auto getOpposite() + -> int + { + auto opposite3d = stencil[go] * -1; + for (int i = 0; i < Q; ++i) { + if (stencil[i] == opposite3d) { + return i; + } + } + } + + static constexpr std::array opposite{ + Self::template getOpposite<0>(), + Self::template getOpposite<1>(), + Self::template getOpposite<2>(), + Self::template getOpposite<3>(), + Self::template getOpposite<4>(), + Self::template getOpposite<5>(), + Self::template getOpposite<6>(), + Self::template getOpposite<7>(), + Self::template getOpposite<8>(), + Self::template getOpposite<9>(), + Self::template getOpposite<10>(), + Self::template 
getOpposite<11>(), + Self::template getOpposite<12>(), + Self::template getOpposite<13>(), + Self::template getOpposite<14>(), + Self::template getOpposite<15>(), + Self::template getOpposite<16>(), + Self::template getOpposite<17>(), + Self::template getOpposite<18>(), + Self::template getOpposite<19>(), + Self::template getOpposite<20>(), + Self::template getOpposite<21>(), + Self::template getOpposite<22>(), + Self::template getOpposite<23>(), + Self::template getOpposite<24>(), + Self::template getOpposite<25>(), + Self::template getOpposite<26>()}; + + template + static constexpr auto helpGetValueforT() + -> typename Precision::Storage + { + auto goInRegisterSpace = Self::template mapToRegisters(); + return Registers::t[goInRegisterSpace]; + } + + static constexpr std::array t{ + Self::template helpGetValueforT<0>(), + Self::template helpGetValueforT<1>(), + Self::template helpGetValueforT<2>(), + Self::template helpGetValueforT<3>(), + Self::template helpGetValueforT<4>(), + Self::template helpGetValueforT<5>(), + Self::template helpGetValueforT<6>(), + Self::template helpGetValueforT<7>(), + Self::template helpGetValueforT<8>(), + Self::template helpGetValueforT<9>(), + Self::template helpGetValueforT<10>(), + Self::template helpGetValueforT<11>(), + Self::template helpGetValueforT<12>(), + Self::template helpGetValueforT<13>(), + Self::template helpGetValueforT<14>(), + Self::template helpGetValueforT<15>(), + Self::template helpGetValueforT<16>(), + Self::template helpGetValueforT<17>(), + Self::template helpGetValueforT<18>(), + Self::template helpGetValueforT<19>(), + Self::template helpGetValueforT<20>(), + Self::template helpGetValueforT<21>(), + Self::template helpGetValueforT<22>(), + Self::template helpGetValueforT<23>(), + Self::template helpGetValueforT<24>(), + Self::template helpGetValueforT<25>(), + Self::template helpGetValueforT<26>()}; + }; + + public: + template + static auto getDirectionAsVector() + -> std::vector + { + std::vector vec; + 
if constexpr (mappingType == RegisterMapping) { + for (auto const& a : Registers::stencil) { + vec.push_back(a); + } + } else if constexpr (mappingType == MemoryMapping) { + for (auto const& a : Memory::stencil) { + vec.push_back(a); + } + } + return vec; + } +}; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q27.h b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q27.h new file mode 100644 index 00000000..f977492b --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q27.h @@ -0,0 +1,217 @@ +#pragma once +#include "CellType.h" +#include "D3Q27.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + + +template +struct DeviceD3Q27 +{ + using Lattice = D3Q27; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + + static inline NEON_CUDA_HOST_DEVICE auto + pullStream(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopField::Partition const& fin, + NEON_OUT Storage popIn[Lattice::Q]) + { + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { + if constexpr (GOMemoryId == Lattice::Memory::center) { + popIn[Lattice::Registers::center] = fin(gidx, Lattice::Memory::center); + } else { + constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId]; + constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x; + constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y; + constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z; + constexpr int GORegistersId = Lattice::Memory::template mapToRegisters(); + + if (wallBitFlag & (uint32_t(1) << GOMemoryId)) { + popIn[GORegistersId] = + fin(gidx, BKMemoryId) + + fin.template getNghData(gidx, BKMemoryId)(); + } else { + 
popIn[GORegistersId] = + fin.template getNghData(gidx, GOMemoryId)(); + } + } + }); + } + + static inline NEON_CUDA_HOST_DEVICE auto + macroscopic(const Storage pop[Lattice::Q], + NEON_OUT Compute& rho, + NEON_OUT std::array& u) + -> void + { + +#define POP(IDX) static_cast(pop[IDX]) + const Compute X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6) + POP(9) + POP(10) + POP(11) + POP(12); + const Compute X_P1 = POP(14) + POP(17) + POP(18) + POP(19) + POP(20) + POP(23) + POP(24) + POP(25) + POP(26); + const Compute X_0 = POP(1) + POP(2) + POP(7) + POP(8) + POP(13) + POP(15) + POP(16) + POP(21) + POP(22); + + const Compute Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(9) + POP(10) + POP(18) + POP(25) + POP(26); + const Compute Y_P1 = POP(15) + POP(17) + POP(21) + POP(22) + POP(23) + POP(24) + POP(4) + POP(11) + POP(12); + + const Compute Z_M1 = POP(2) + POP(5) + POP(7) + POP(9) + POP(11) + POP(20) + POP(22) + POP(24) + POP(26); + const Compute Z_P1 = POP(16) + POP(19) + POP(21) + POP(23) + POP(25) + POP(6) + POP(8) + POP(10) + POP(12); +#undef POP + + rho = X_M1 + X_P1 + X_0; + u[0] = (X_P1 - X_M1) / rho; + u[0] = (Y_P1 - Y_M1) / rho; + u[0] = (Z_P1 - Z_M1) / rho; + } + + + static inline NEON_CUDA_HOST_DEVICE auto + collideBgkUnrolled(Idx const& i /*! Compute iterator */, + const Storage pop[Lattice::Q], + Compute const& rho /*! Density */, + std::array const& u /*! Velocity */, + Compute const& usqr /*! Usqr */, + Compute const& omega /*! Omega */, + typename PopField::Partition& fOut /*! 
Population */) + + -> void + { + const Compute cku1 = u[0] + u[1]; + const Compute cku2 = -u[0] + u[1]; + const Compute cku3 = u[0] + u[2]; + const Compute cku4 = -u[0] + u[2]; + const Compute cku5 = u[1] + u[2]; + const Compute cku6 = -u[1] + u[2]; + const Compute cku7 = u[0] + u[1] + u[2]; + const Compute cku8 = -u[0] + u[1] + u[2]; + const Compute cku9 = u[0] - u[1] + u[2]; + const Compute cku0 = u[0] + u[1] - u[2]; + + std::array feqRM; + + constexpr int F000 = 13; + constexpr int FM00 = 0; + constexpr int F0M0 = 1; + constexpr int F00M = 2; + constexpr int FMM0 = 3; + constexpr int FMP0 = 4; + constexpr int FM0M = 5; + constexpr int FM0P = 6; + constexpr int F0MM = 7; + constexpr int F0MP = 8; + constexpr int FMMM = 9; + constexpr int FMMP = 10; + constexpr int FMPM = 11; + constexpr int FMPP = 12; + constexpr int FP00 = 14; + constexpr int F0P0 = 15; + constexpr int F00P = 16; + constexpr int FPP0 = 17; + constexpr int FPM0 = 18; + constexpr int FP0P = 19; + constexpr int FP0M = 20; + constexpr int F0PP = 21; + constexpr int F0PM = 22; + constexpr int FPPP = 23; + constexpr int FPPM = 24; + constexpr int FPMP = 25; + constexpr int FPMM = 26; + + constexpr Compute c1over18 = 1. / 18.; + constexpr Compute c1over36 = 1. 
/ 36.; + constexpr Compute c4dot5 = 4.5; + constexpr Compute c3 = 3.; + constexpr Compute c1 = 1.; + constexpr Compute c6 = 6.; + + feqRM[F000] = rho * Lattice::Registers::t[F000] * (c1- usqr); + + feqRM[FM00] = rho * Lattice::Registers::t[FM00] * (c1- c3* u[0] + c4dot5* u[0] * u[0] - usqr); + feqRM[FP00] = rho * Lattice::Registers::t[FP00] * (c6 * u[0]) + feqRM[FM00]; + + feqRM[F0M0] = rho * Lattice::Registers::t[F0M0] * (c1- c3* u[1] + c4dot5* u[1] * u[1] - usqr); + feqRM[F0P0] = rho * Lattice::Registers::t[F0P0] * (c6 * u[1]) + feqRM[F0M0]; + + feqRM[F00M] = rho * Lattice::Registers::t[F00M] * (c1- c3* u[2] + c4dot5* u[2] * u[2] - usqr); + feqRM[F00P] = rho * Lattice::Registers::t[F00P] * (c6 * u[2]) + feqRM[F00M]; + + feqRM[FMM0] = rho * Lattice::Registers::t[FMM0] * (c1- c3* cku1 + c4dot5* cku1 * cku1 - usqr); + feqRM[FPP0] = rho * Lattice::Registers::t[FPP0] * (c6 * cku1) + feqRM[FMM0]; + feqRM[FPM0] = rho * Lattice::Registers::t[FPM0] * (c1- c3* cku2 + c4dot5* cku2 * cku2 - usqr); + feqRM[FMP0] = rho * Lattice::Registers::t[FMP0] * (c6 * cku2) + feqRM[FPM0]; + + feqRM[FM0M] = rho * Lattice::Registers::t[FM0M] * (c1- c3* cku3 + c4dot5* cku3 * cku3 - usqr); + feqRM[FP0P] = rho * Lattice::Registers::t[FP0P] * (c6 * cku3) + feqRM[FM0M]; + feqRM[FP0M] = rho * Lattice::Registers::t[FP0M] * (c1- c3* cku4 + c4dot5* cku4 * cku4 - usqr); + feqRM[FM0P] = rho * Lattice::Registers::t[FM0P] * (c6 * cku4) + feqRM[FP0M]; + + feqRM[F0MM] = rho * Lattice::Registers::t[F0MM] * (c1- c3* cku5 + c4dot5* cku5 * cku5 - usqr); + feqRM[F0PP] = rho * Lattice::Registers::t[F0PP] * (c6 * cku5) + feqRM[F0MM]; + feqRM[F0PM] = rho * Lattice::Registers::t[F0PM] * (c1- c3* cku6 + c4dot5* cku6 * cku6 - usqr); + feqRM[F0MP] = rho * Lattice::Registers::t[F0MP] * (c6 * cku6) + feqRM[F0PM]; + + feqRM[FMMM] = rho * Lattice::Registers::t[FMMM] * (c1- c3* cku7 + c4dot5* cku7 * cku7 - usqr); + feqRM[FPPP] = rho * Lattice::Registers::t[FPPP] * (c6 * cku7) + feqRM[FMMM]; + feqRM[FPMM] = rho * 
Lattice::Registers::t[FPMM] * (c1- c3* cku8 + c4dot5* cku8 * cku8 - usqr); + feqRM[FMPP] = rho * Lattice::Registers::t[FMPP] * (c6 * cku8) + feqRM[FPMM]; + feqRM[FMPM] = rho * Lattice::Registers::t[FMPM] * (c1- c3* cku9 + c4dot5* cku9 * cku9 - usqr); + feqRM[FPMP] = rho * Lattice::Registers::t[FPMP] * (c6 * cku9) + feqRM[FMPM]; + feqRM[FMMP] = rho * Lattice::Registers::t[FMMP] * (c1- c3* cku0 + c4dot5* cku0 * cku0 - usqr); + feqRM[FPPM] = rho * Lattice::Registers::t[FPPM] * (c6 * cku0) + feqRM[FMMP]; + + // BGK Collision based on the second-order equilibrium + std::array foutRM; + + foutRM[F000] = (c1- omega) * static_cast(pop[F000]) + omega * feqRM[F000]; + + foutRM[FP00] = (c1- omega) * static_cast(pop[FP00]) + omega * feqRM[FP00]; + foutRM[FM00] = (c1- omega) * static_cast(pop[FM00]) + omega * feqRM[FM00]; + + foutRM[F0P0] = (c1- omega) * static_cast(pop[F0P0]) + omega * feqRM[F0P0]; + foutRM[F0M0] = (c1- omega) * static_cast(pop[F0M0]) + omega * feqRM[F0M0]; + + foutRM[F00P] = (c1- omega) * static_cast(pop[F00P]) + omega * feqRM[F00P]; + foutRM[F00M] = (c1- omega) * static_cast(pop[F00M]) + omega * feqRM[F00M]; + + foutRM[FPP0] = (c1- omega) * static_cast(pop[FPP0]) + omega * feqRM[FPP0]; + foutRM[FMP0] = (c1- omega) * static_cast(pop[FMP0]) + omega * feqRM[FMP0]; + foutRM[FPM0] = (c1- omega) * static_cast(pop[FPM0]) + omega * feqRM[FPM0]; + foutRM[FMM0] = (c1- omega) * static_cast(pop[FMM0]) + omega * feqRM[FMM0]; + + foutRM[FP0P] = (c1- omega) * static_cast(pop[FP0P]) + omega * feqRM[FP0P]; + foutRM[FM0P] = (c1- omega) * static_cast(pop[FM0P]) + omega * feqRM[FM0P]; + foutRM[FP0M] = (c1- omega) * static_cast(pop[FP0M]) + omega * feqRM[FP0M]; + foutRM[FM0M] = (c1- omega) * static_cast(pop[FM0M]) + omega * feqRM[FM0M]; + + foutRM[F0PP] = (c1- omega) * static_cast(pop[F0PP]) + omega * feqRM[F0PP]; + foutRM[F0MP] = (c1- omega) * static_cast(pop[F0MP]) + omega * feqRM[F0MP]; + foutRM[F0PM] = (c1- omega) * static_cast(pop[F0PM]) + omega * feqRM[F0PM]; + 
foutRM[F0MM] = (c1- omega) * static_cast(pop[F0MM]) + omega * feqRM[F0MM]; + + foutRM[FPPP] = (c1- omega) * static_cast(pop[FPPP]) + omega * feqRM[FPPP]; + foutRM[FMPP] = (c1- omega) * static_cast(pop[FMPP]) + omega * feqRM[FMPP]; + foutRM[FPMP] = (c1- omega) * static_cast(pop[FPMP]) + omega * feqRM[FPMP]; + foutRM[FPPM] = (c1- omega) * static_cast(pop[FPPM]) + omega * feqRM[FPPM]; + foutRM[FMMP] = (c1- omega) * static_cast(pop[FMMP]) + omega * feqRM[FMMP]; + foutRM[FMPM] = (c1- omega) * static_cast(pop[FMPM]) + omega * feqRM[FMPM]; + foutRM[FPMM] = (c1- omega) * static_cast(pop[FPMM]) + omega * feqRM[FPMM]; + foutRM[FMMM] = (c1- omega) * static_cast(pop[FMMM]) + omega * feqRM[FMMM]; + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { + fOut(i, GOMemoryId) = static_cast(foutRM[Lattice::Memory::template mapToRegisters()]); + }); + } +}; + diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index a3840db4..50c55c15 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -27,7 +27,7 @@ auto run(Config& config, using Lattice = D3Q19; using PopulationField = typename Grid::template Field; - using PopField = typename Grid::template Field; + using PopField = typename Grid::template Field; using CellTypeField = typename Grid::template Field; using Idx = typename PopField::Idx; @@ -172,71 +172,20 @@ auto run(Config& config, Neon::index_3d dim(config.N, config.N, config.N); - const auto& t = Lattice::Memory::t; - const auto& c = Lattice::Memory::stencil; - - inPop.forEachActiveCell([&c, &t, &dim, &flag, &ulid, &config](const Neon::index_3d& idx, - const int& k, - Storage& val) { - val = t.at(k); - - if (idx.x == 0 || idx.x == dim.x - 1 || - idx.y == 0 || idx.y == dim.y - 1 || - idx.z == 0 || idx.z == dim.z - 1) { - - if (idx.y == dim.y - 1) { - val = -6. 
* t.at(k) * config.ulb * - (c.at(k).v[0] * ulid.v[0] + - c.at(k).v[1] * ulid.v[1] + - c.at(k).v[2] * ulid.v[2]); - } else { - val = 0; - } - } - }); - - outPop.forEachActiveCell([&c, &t, &dim, &flag, &ulid, &config](const Neon::index_3d& idx, - const int& k, - Storage& val) { - val = t.at(k); - - if (idx.x == 0 || idx.x == dim.x - 1 || - idx.y == 0 || idx.y == dim.y - 1 || - idx.z == 0 || idx.z == dim.z - 1) { - - if (idx.y == dim.y - 1) { - val = -6. * t.at(k) * config.ulb * - (c.at(k).v[0] * ulid.v[0] + - c.at(k).v[1] * ulid.v[1] + - c.at(k).v[2] * ulid.v[2]); - } else { - val = 0; - } - } - }); - - flag.forEachActiveCell([&dim](const Neon::index_3d& idx, - const int&, - CellType& flagVal) { - flagVal.classification = CellType::bulk; - flagVal.wallNghBitflag = 0; +// const auto& t = Lattice::Memory::t; +// const auto& c = Lattice::Memory::stencil; - if (idx.x == 0 || idx.x == dim.x - 1 || - idx.y == 0 || idx.y == dim.y - 1 || - idx.z == 0 || idx.z == dim.z - 1) { - - flagVal.classification = CellType::bounceBack; - - if (idx.y == dim.y - 1) { - flagVal.classification = CellType::movingWall; - } - } - }); + ContainerFactory::problemSetup(inPop, + outPop, + flag, + ulid, + config.ulb) + .run(Neon::Backend::mainStreamIdx); - inPop.updateDeviceData(Neon::Backend::mainStreamIdx); - outPop.updateDeviceData(Neon::Backend::mainStreamIdx); - flag.updateDeviceData(Neon::Backend::mainStreamIdx); + inPop.updateHostData(Neon::Backend::mainStreamIdx); + outPop.updateHostData(Neon::Backend::mainStreamIdx); + flag.updateHostData(Neon::Backend::mainStreamIdx); { bk.syncAll(); flag.newHaloUpdate(Neon::set::StencilSemantic::standard /*semantic*/, @@ -246,7 +195,7 @@ auto run(Config& config, bk.syncAll(); } - auto container = ContainerFactory::computeWallNghMask(flag, flag); + auto container = ContainerFactory::computeWallNghMask(flag, flag); container.run(Neon::Backend::mainStreamIdx); bk.syncAll(); } diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h 
b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h index 52802f1c..fc48d712 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h @@ -62,7 +62,8 @@ bField::bField(const std::string& fieldUserName, blockConnectivity.mem(), bitmask.mem(), dataBlockOrigins.mem(), - mData->grid->helpGetStencilIdTo3dOffset().rawMem(execution, setIdx)); + mData->grid->helpGetStencilIdTo3dOffset().rawMem(execution, setIdx), + mData->grid->getDimension()); }); } diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index a03af559..8cb9bfe2 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -36,7 +36,8 @@ class bPartition typename Idx::DataBlockIdx* mBlockConnectivity, typename SBlock::BitMask const* NEON_RESTRICT mMask, Neon::int32_3d* mOrigin, - NghIdx* mStencilNghIndex); + NghIdx* mStencilNghIndex, + Neon::int32_3d mDomainSize); /** * Retrieve the cardinality of the field. @@ -108,7 +109,7 @@ class bPartition int card, LambdaVALID funIfValid, LambdaNOTValid funIfNOTValid = nullptr) - const -> std::enable_if_t &&( std::is_invocable_v || std::is_same_v), void>; + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void>; /** @@ -118,6 +119,11 @@ class bPartition getGlobalIndex(const Idx& cell) const -> Neon::index_3d; + + NEON_CUDA_HOST_DEVICE inline auto + getDomainSize() + const -> Neon::index_3d; + /** * Gets the Idx for in the block view space. 
*/ @@ -148,7 +154,6 @@ class bPartition const -> Idx; - int mCardinality; T* mMem; NghIdx const* NEON_RESTRICT mStencilNghIndex; @@ -157,6 +162,7 @@ class bPartition Neon::int32_3d const* NEON_RESTRICT mOrigin; int mSetIdx; int mMultiResDiscreteIdxSpacing = 1; + Neon::int32_3d mDomainSize; }; } // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h index 9a0bab8e..b0e5db1d 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h @@ -25,14 +25,16 @@ bPartition:: typename Idx::DataBlockIdx* blockConnectivity, typename SBlock::BitMask const* NEON_RESTRICT mask, Neon::int32_3d* origin, - NghIdx* stencilNghIndex) + NghIdx* stencilNghIndex, + Neon::int32_3d mDomainSize) : mCardinality(cardinality), mMem(mem), mStencilNghIndex(stencilNghIndex), mBlockConnectivity(blockConnectivity), mMask(mask), mOrigin(origin), - mSetIdx(setIdx) + mSetIdx(setIdx), + mDomainSize(mDomainSize) { } @@ -51,6 +53,14 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: return location; } +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getDomainSize() + const -> Neon::index_3d +{ + return mDomainSize; +} + template NEON_CUDA_HOST_DEVICE inline auto bPartition:: getBlockViewIdx(const Idx& gidx) diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h index 2becc97d..41c64e8b 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h @@ -377,7 +377,8 @@ class dPartition } } - NEON_CUDA_HOST_DEVICE inline auto getGlobalIndex(const Idx& local) const -> Neon::index_3d + NEON_CUDA_HOST_DEVICE inline auto + getGlobalIndex(const Idx& local) const -> Neon::index_3d { // assert(local.mLocation.x >= 0 && // 
local.mLocation.y >= 0 && diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h index 1843c4df..7dc51430 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h @@ -64,7 +64,8 @@ eField::eField(const std::string& fieldUserName, mData->grid->getConnectivityField().getPartition(execution, setIdx, Neon::DataView::STANDARD).mem(), mData->grid->getGlobalMappingField().getPartition(execution, setIdx, Neon::DataView::STANDARD).mem(), mData->grid->getStencil3dTo1dOffset().rawMem(execution, setIdx), - mData->grid->getStencil().getRadius()); + mData->grid->getStencil().getRadius(), + mData->grid->getDimension()); }); } diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h index 05f3101b..4381a24c 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h @@ -186,7 +186,7 @@ class ePartition NEON_CUDA_HOST_DEVICE inline auto getNghData(Idx eId, int card, - T defaultValue) + T defaultValue) const -> NghData; template std::enable_if_t &&( std::is_invocable_v || std::is_same_v), void>; + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void>; /** * Check is the @@ -224,6 +224,10 @@ class ePartition getGlobalIndex(Idx Idx) const -> Neon::index_3d; + NEON_CUDA_HOST_DEVICE inline auto + getDomainSize() + const -> Neon::index_3d; + NEON_CUDA_HOST_DEVICE inline auto mem() const -> const T*; @@ -244,7 +248,8 @@ class ePartition Offset* connRaw, Neon::index_3d* toGlobal, int8_t* stencil3dTo1dOffset, - int32_t stencilRadius); + int32_t stencilRadius, + Neon::index_3d domainSize); /** * Returns a pointer to element eId with target cardinality cardinalityIdx @@ -269,11 +274,6 @@ class ePartition getOffset(Idx eId, int cardinalityIdx) const 
-> Offset; - /** - * Returns raw pointer of the field - * @tparam dataView_ta - * @return - */ protected: //-- [INTERNAL DATA] ---------------------------------------------------------------------------- @@ -291,6 +291,7 @@ class ePartition int8_t* mStencil3dTo1dOffset = {nullptr}; int32_t mStencilTableYPitch; int32_t mStencilRadius; // Shift to be applied to all 3d offset component to access mStencil3dTo1dOffset table + Neon::index_3d mDomainSize; }; } // namespace Neon::domain::details::eGrid diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h index 8565cdc1..1611106f 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h @@ -210,7 +210,8 @@ ePartition::ePartition(int prtId, Offset* connRaw, Neon::index_3d* toGlobal, int8_t* stencil3dTo1dOffset, - int32_t stencilRadius) + int32_t stencilRadius, + Neon::index_3d domainSize) { mPrtID = prtId; mMem = mem; @@ -225,6 +226,7 @@ ePartition::ePartition(int prtId, mStencilTableYPitch = 2 * stencilRadius + 1; mStencilRadius = stencilRadius; + mDomainSize = domainSize; } template ::mem() const return mMem; } +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getDomainSize() + const -> Neon::index_3d +{ + return mDomainSize; +} + } // namespace Neon::domain::details::eGrid From c5520945db84f95e689c3f40b34579abaec6a3eb Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 5 Jul 2023 09:20:03 -0400 Subject: [PATCH 31/94] WIP - D3Q27 --- .../lbm-lid-driven-cavity-flow/src/D3Q19.h | 124 ++++++------------ .../lbm-lid-driven-cavity-flow/src/D3Q27.h | 94 ++----------- .../src/RunCavityTwoPop.cu | 74 ++++++++--- 3 files changed, 104 insertions(+), 188 deletions(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h index 52035a35..83f922df 100644 --- 
a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h @@ -66,25 +66,7 @@ struct D3Q19 } static constexpr std::array opposite{ - Self::template getOpposite<0>(), - Self::template getOpposite<1>(), - Self::template getOpposite<2>(), - Self::template getOpposite<3>(), - Self::template getOpposite<4>(), - Self::template getOpposite<5>(), - Self::template getOpposite<6>(), - Self::template getOpposite<7>(), - Self::template getOpposite<8>(), - Self::template getOpposite<9>(), - Self::template getOpposite<10>(), - Self::template getOpposite<11>(), - Self::template getOpposite<12>(), - Self::template getOpposite<13>(), - Self::template getOpposite<14>(), - Self::template getOpposite<15>(), - Self::template getOpposite<16>(), - Self::template getOpposite<17>(), - Self::template getOpposite<18>()}; + 10, 11, 12, 13, 14, 15, 16, 17, 18, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8}; static constexpr std::array t{ 1. / 18. /*! 0 */, @@ -105,7 +87,7 @@ struct D3Q19 1. / 36. /*! 15 */, 1. / 36. /*! 16 */, 1. / 36. /*! 17 */, - 1. / 36. /*! 18 */, + 1. / 36. /*! 
18 */ }; }; @@ -135,67 +117,37 @@ struct D3Q19 Neon::index_3d(0, 1, -1)}; - static constexpr int center = 9; /** Position of direction {0,0,0} */ - static constexpr int goRangeBegin = 0; /** Symmetry is represented as "go" direction and the "back" their opposite */ - static constexpr int goRangeEnd = 8; - static constexpr int goBackOffset = 10; /** Offset to compute apply symmetry */ + static constexpr int center = 9; /** Position of direction {0,0,0} */ + + static constexpr std::array toRegisters{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; + + static constexpr std::array toMemory{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; template - static constexpr auto mapToRegisters() + NEON_CUDA_HOST_DEVICE static constexpr auto mapToRegisters() -> int { - auto direction = stencil[go]; - for (int i = 0; i < Q; ++i) { - if (Registers::stencil[i] == direction) { - return i; - } - } + return toRegisters[go]; } template - static constexpr auto mapFromRegisters() + NEON_CUDA_HOST_DEVICE static constexpr auto mapFromRegisters() -> int { - auto direction = Registers::stencil[go]; - for (int i = 0; i < Q; ++i) { - if (Self::stencil[i] == direction) { - return i; - } - } + return toMemory[go]; } template - static constexpr auto getOpposite() + NEON_CUDA_HOST_DEVICE static constexpr auto getOpposite() -> int { - auto opposite3d = stencil[go] * -1; - for (int i = 0; i < Q; ++i) { - if (stencil[i] == opposite3d) { - return i; - } - } + return opposite[go]; } static constexpr std::array opposite{ - Self::template getOpposite<0>(), - Self::template getOpposite<1>(), - Self::template getOpposite<2>(), - Self::template getOpposite<3>(), - Self::template getOpposite<4>(), - Self::template getOpposite<5>(), - Self::template getOpposite<6>(), - Self::template getOpposite<7>(), - Self::template getOpposite<8>(), - Self::template getOpposite<9>(), - Self::template getOpposite<10>(), - Self::template getOpposite<11>(), - Self::template 
getOpposite<12>(), - Self::template getOpposite<13>(), - Self::template getOpposite<14>(), - Self::template getOpposite<15>(), - Self::template getOpposite<16>(), - Self::template getOpposite<17>(), - Self::template getOpposite<18>()}; + 10, 11, 12, 13, 14, 15, 16, 17, 18, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8}; template static constexpr auto helpGetValueforT() @@ -206,44 +158,42 @@ struct D3Q19 } static constexpr std::array t{ - Self::template helpGetValueforT<0>(), - Self::template helpGetValueforT<1>(), - Self::template helpGetValueforT<2>(), - Self::template helpGetValueforT<3>(), - Self::template helpGetValueforT<4>(), - Self::template helpGetValueforT<5>(), - Self::template helpGetValueforT<6>(), - Self::template helpGetValueforT<7>(), - Self::template helpGetValueforT<8>(), - Self::template helpGetValueforT<9>(), - Self::template helpGetValueforT<10>(), - Self::template helpGetValueforT<11>(), - Self::template helpGetValueforT<12>(), - Self::template helpGetValueforT<13>(), - Self::template helpGetValueforT<14>(), - Self::template helpGetValueforT<15>(), - Self::template helpGetValueforT<16>(), - Self::template helpGetValueforT<17>(), - Self::template helpGetValueforT<18>()}; + 1. / 18. /*! 0 */, + 1. / 18. /*! 1 */, + 1. / 18. /*! 2 */, + 1. / 36. /*! 3 */, + 1. / 36. /*! 4 */, + 1. / 36. /*! 5 */, + 1. / 36. /*! 6 */, + 1. / 36. /*! 7 */, + 1. / 36. /*! 8 */, + 1. / 3. /*! 9 */, + 1. / 18. /*! 10 */, + 1. / 18. /*! 11 */, + 1. / 18. /*! 12 */, + 1. / 36. /*! 13 */, + 1. / 36. /*! 14 */, + 1. / 36. /*! 15 */, + 1. / 36. /*! 16 */, + 1. / 36. /*! 17 */, + 1. / 36. /*! 
18 */}; template NEON_CUDA_HOST_DEVICE static constexpr auto getT() -> typename Precision::Storage { - return t[direction]; + return t[direction]; } template NEON_CUDA_HOST_DEVICE static constexpr auto getDirection() -> typename Neon::index_3d { - return stencil[direction]; + return stencil[direction]; } }; - public: - template static auto getDirectionAsVector() -> std::vector diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q27.h b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q27.h index 803f06d2..9f2c7f95 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q27.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q27.h @@ -74,33 +74,10 @@ struct D3Q27 } static constexpr std::array opposite{ - Self::template getOpposite<0>(), - Self::template getOpposite<1>(), - Self::template getOpposite<2>(), - Self::template getOpposite<3>(), - Self::template getOpposite<4>(), - Self::template getOpposite<5>(), - Self::template getOpposite<6>(), - Self::template getOpposite<7>(), - Self::template getOpposite<8>(), - Self::template getOpposite<9>(), - Self::template getOpposite<10>(), - Self::template getOpposite<11>(), - Self::template getOpposite<12>(), - Self::template getOpposite<13>(), - Self::template getOpposite<14>(), - Self::template getOpposite<15>(), - Self::template getOpposite<16>(), - Self::template getOpposite<17>(), - Self::template getOpposite<18>(), - Self::template getOpposite<19>(), - Self::template getOpposite<20>(), - Self::template getOpposite<21>(), - Self::template getOpposite<22>(), - Self::template getOpposite<23>(), - Self::template getOpposite<24>(), - Self::template getOpposite<25>(), - Self::template getOpposite<26>()}; + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 13, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 + }; static constexpr std::array t{ 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. 
/ 54., @@ -182,33 +159,10 @@ struct D3Q27 } static constexpr std::array opposite{ - Self::template getOpposite<0>(), - Self::template getOpposite<1>(), - Self::template getOpposite<2>(), - Self::template getOpposite<3>(), - Self::template getOpposite<4>(), - Self::template getOpposite<5>(), - Self::template getOpposite<6>(), - Self::template getOpposite<7>(), - Self::template getOpposite<8>(), - Self::template getOpposite<9>(), - Self::template getOpposite<10>(), - Self::template getOpposite<11>(), - Self::template getOpposite<12>(), - Self::template getOpposite<13>(), - Self::template getOpposite<14>(), - Self::template getOpposite<15>(), - Self::template getOpposite<16>(), - Self::template getOpposite<17>(), - Self::template getOpposite<18>(), - Self::template getOpposite<19>(), - Self::template getOpposite<20>(), - Self::template getOpposite<21>(), - Self::template getOpposite<22>(), - Self::template getOpposite<23>(), - Self::template getOpposite<24>(), - Self::template getOpposite<25>(), - Self::template getOpposite<26>()}; + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 13, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 + }; template static constexpr auto helpGetValueforT() @@ -219,33 +173,11 @@ struct D3Q27 } static constexpr std::array t{ - Self::template helpGetValueforT<0>(), - Self::template helpGetValueforT<1>(), - Self::template helpGetValueforT<2>(), - Self::template helpGetValueforT<3>(), - Self::template helpGetValueforT<4>(), - Self::template helpGetValueforT<5>(), - Self::template helpGetValueforT<6>(), - Self::template helpGetValueforT<7>(), - Self::template helpGetValueforT<8>(), - Self::template helpGetValueforT<9>(), - Self::template helpGetValueforT<10>(), - Self::template helpGetValueforT<11>(), - Self::template helpGetValueforT<12>(), - Self::template helpGetValueforT<13>(), - Self::template helpGetValueforT<14>(), - Self::template helpGetValueforT<15>(), - Self::template helpGetValueforT<16>(), - Self::template helpGetValueforT<17>(), 
- Self::template helpGetValueforT<18>(), - Self::template helpGetValueforT<19>(), - Self::template helpGetValueforT<20>(), - Self::template helpGetValueforT<21>(), - Self::template helpGetValueforT<22>(), - Self::template helpGetValueforT<23>(), - Self::template helpGetValueforT<24>(), - Self::template helpGetValueforT<25>(), - Self::template helpGetValueforT<26>()}; + 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., + 1. / 216., 1. / 216., 1. / 216., 1. / 216., + 8. / 27., + 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., + 1. / 216., 1. / 216., 1. / 216., 1. / 216.}; }; public: diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index 50c55c15..f436e8bc 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -172,8 +172,8 @@ auto run(Config& config, Neon::index_3d dim(config.N, config.N, config.N); -// const auto& t = Lattice::Memory::t; -// const auto& c = Lattice::Memory::stencil; + // const auto& t = Lattice::Memory::t; + // const auto& c = Lattice::Memory::stencil; ContainerFactory::problemSetup(inPop, outPop, @@ -260,6 +260,12 @@ auto runFilterStoreType(Config& config, } } // namespace details +#ifdef NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS +constexpr bool skipTest = true; +#else +constexpr bool skipTest = false; +#endif + auto run(Config& config, Report& report) -> void { @@ -267,38 +273,66 @@ auto run(Config& config, return details::runFilterStoreType(config, report); } if (config.gridType == "eGrid") { - return details::runFilterStoreType(config, report); + if constexpr (!skipTest) { + return details::runFilterStoreType(config, report); + } else { + NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + } } if (config.gridType == "bGrid") { - return details::runFilterStoreType(config, report); + return details::runFilterStoreType(config, report); } if (config.gridType == "bGrid_4_4_4") { - using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>; - using Grid = Neon::domain::details::bGrid::bGrid; - return details::runFilterStoreType(config, report); + if constexpr (!skipTest) { + using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } else { + NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + } } if (config.gridType == "bGrid_32_8_4") { - using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; - using Grid = Neon::domain::details::bGrid::bGrid; - return details::runFilterStoreType(config, report); + if constexpr (!skipTest) { + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } else { + NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + } } if (config.gridType == "bGrid_32_8_4") { - using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 4, 8>; - using Grid = Neon::domain::details::bGrid::bGrid; - return details::runFilterStoreType(config, report); + if constexpr (!skipTest) { + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 4, 8>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } else { + NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + } } if (config.gridType == "bGrid_32_2_8") { - using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 2, 8>; - using Grid = Neon::domain::details::bGrid::bGrid; - return details::runFilterStoreType(config, report); + if constexpr (!skipTest) { + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 2, 8>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } else { + NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + } } if (config.gridType == "bGrid_32_8_2") { - using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 2>; - using Grid = Neon::domain::details::bGrid::bGrid; - return details::runFilterStoreType(config, report); + if constexpr (!skipTest) { + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 2>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } else { + NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + } } if (config.gridType == "dGridSoA") { - return details::runFilterStoreType(config, report); + if constexpr (!skipTest) { + return details::runFilterStoreType(config, report); + } else { + NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + } } } } // namespace CavityTwoPop From 5f07bca628037555d4da811a7efe130cee30a55c Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 5 Jul 2023 11:55:05 -0400 Subject: [PATCH 32/94] WIP - D3Q27 --- .../src/ContainersD3Q27.h | 20 +++++++++---------- .../src/RunCavityTwoPop.cu | 11 +++++++++- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h index deea13ab..e41b9cfe 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h @@ -199,25 +199,25 @@ struct ContainerFactory([&](auto q) { if (globlalIdx.y == domainDim.y - 1) { - val = -6. * Lattice::Memory::t.at(q) * ulb * - (Lattice::Memory::stencil.at(q).v[0] * ulid.v[0] + - Lattice::Memory::stencil.at(q).v[1] * ulid.v[1] + - Lattice::Memory::stencil.at(q).v[2] * ulid.v[2]); + val = -6. 
* Lattice::Memory::template getT() * ulb * + (Lattice::Memory::template getDirection().v[0] * ulid.v[0] + + Lattice::Memory::template getDirection().v[1] * ulid.v[1] + + Lattice::Memory::template getDirection().v[2] * ulid.v[2]); } else { val = 0; } fIn(gidx, q) = val; fOut(gidx, q) = val; - } + }); } else { flagVal.classification = CellType::bulk; cellInfoPartition(gidx, 0) = flagVal; - for (int q = 0; q < Lattice::Q; q++) { - fIn(gidx, q) = Lattice::Memory::t.at(q); - fOut(gidx, q) = Lattice::Memory::t.at(q); - } + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + fIn(gidx, q) = Lattice::Memory::template getT(); + fOut(gidx, q) = Lattice::Memory::template getT(); + }); } }; }); diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index f436e8bc..43aefed3 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -261,7 +261,7 @@ auto runFilterStoreType(Config& config, } // namespace details #ifdef NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS -constexpr bool skipTest = true; +constexpr bool skipTest = false; #else constexpr bool skipTest = false; #endif @@ -291,6 +291,15 @@ auto run(Config& config, NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") } } + if (config.gridType == "bGrid_2_2_2") { + if constexpr (!skipTest) { + using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report); + } else { + NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + } + } if (config.gridType == "bGrid_32_8_4") { if constexpr (!skipTest) { using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; From 2665122971f34f6bdf848b4bd9faa458b1aeb0cb Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 5 Jul 2023 12:37:57 -0400 Subject: [PATCH 33/94] WIP - D3Q27 --- benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index 43aefed3..449e31ce 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -293,7 +293,7 @@ auto run(Config& config, } if (config.gridType == "bGrid_2_2_2") { if constexpr (!skipTest) { - using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>; + using Sblock = Neon::domain::details::bGrid::StaticBlock<2, 2, 2>; using Grid = Neon::domain::details::bGrid::bGrid; return details::runFilterStoreType(config, report); } else { From c4cc53606408baf004ef0877d998e86b285c7bfd Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 6 Jul 2023 09:09:08 -0400 Subject: [PATCH 34/94] WIP - D3Q27 --- .../lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index 449e31ce..6a14cd8f 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -274,13 +274,13 @@ auto run(Config& config, } if (config.gridType == "eGrid") { if constexpr (!skipTest) { - return details::runFilterStoreType(config, report); + return details::runFilterStoreType(config, report); } else { 
NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") } } if (config.gridType == "bGrid") { - return details::runFilterStoreType(config, report); + return details::runFilterStoreType(config, report); } if (config.gridType == "bGrid_4_4_4") { if constexpr (!skipTest) { @@ -311,7 +311,7 @@ auto run(Config& config, } if (config.gridType == "bGrid_32_8_4") { if constexpr (!skipTest) { - using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 4, 8>; + using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; using Grid = Neon::domain::details::bGrid::bGrid; return details::runFilterStoreType(config, report); } else { From 06774be077b647c427d6cd08dc2fd0d93455a3a3 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 11 Jul 2023 16:04:29 -0400 Subject: [PATCH 35/94] Encoding and decoding tools for Morton and Hilbert curves. --- .../include/Neon/domain/tools/SpaceCurves.h | 331 ++++++++++++++++++ .../src/domain/tools/SpaceCurves.cpp | 60 ++++ libNeonDomain/tests/CMakeLists.txt | 1 + .../src/runHelper.h | 6 +- .../CMakeLists.txt | 19 + .../src/TestInformation.h | 17 + .../src/domain-space-filling-curves.cu | 74 ++++ .../src/domain-space-filling-curves.h | 18 + .../src/goldenEncoding.h | 11 + .../src/gtests.cpp | 50 +++ .../src/runHelper.h | 100 ++++++ 11 files changed, 684 insertions(+), 3 deletions(-) create mode 100644 libNeonDomain/include/Neon/domain/tools/SpaceCurves.h create mode 100644 libNeonDomain/src/domain/tools/SpaceCurves.cpp create mode 100644 libNeonDomain/tests/domain-space-filling-curves/CMakeLists.txt create mode 100644 libNeonDomain/tests/domain-space-filling-curves/src/TestInformation.h create mode 100644 libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.cu create mode 100644 libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.h create mode 100644 
libNeonDomain/tests/domain-space-filling-curves/src/goldenEncoding.h create mode 100644 libNeonDomain/tests/domain-space-filling-curves/src/gtests.cpp create mode 100644 libNeonDomain/tests/domain-space-filling-curves/src/runHelper.h diff --git a/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h b/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h new file mode 100644 index 00000000..f1fda5a5 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h @@ -0,0 +1,331 @@ +#pragma once +#include "Neon/Neon.h" + +namespace Neon::domain::tool::spaceCurves { + + +enum struct EncoderType +{ + sweep = 0, + morton = 1, + hilbert = 2, +}; + + +/** + * Set of utilities for EncoderType options. + */ +struct EncoderTypeUtil +{ + /** + * Number of configurations for the enum + */ + static const int nConfig{static_cast(3)}; + + /** + * Convert enum value to string + * + * @param encoderType + * @return + */ + static auto toString(EncoderType encoderType) -> std::string; + + /** + * Returns all valid configurations for EncoderType + * @return + */ + static auto validOptions() -> std::array; + + static auto fromInt(int val) -> EncoderType; + + static auto toInt(EncoderType encoderType) -> int; +}; + + +/** + * operator<< + * + * @param os + * @param m + * @return + */ +std::ostream& operator<<(std::ostream& os, Neon::DataView const& m); + +class Encoder +{ + private: + static constexpr uint8_t mortonToHilbertTable[] = { + 48, + 33, + 27, + 34, + 47, + 78, + 28, + 77, + 66, + 29, + 51, + 52, + 65, + 30, + 72, + 63, + 76, + 95, + 75, + 24, + 53, + 54, + 82, + 81, + 18, + 3, + 17, + 80, + 61, + 4, + 62, + 15, + 0, + 59, + 71, + 60, + 49, + 50, + 86, + 85, + 84, + 83, + 5, + 90, + 79, + 56, + 6, + 89, + 32, + 23, + 1, + 94, + 11, + 12, + 2, + 93, + 42, + 41, + 13, + 14, + 35, + 88, + 36, + 31, + 92, + 37, + 87, + 38, + 91, + 74, + 8, + 73, + 46, + 45, + 9, + 10, + 7, + 20, + 64, + 19, + 70, + 25, + 39, + 16, + 69, + 26, + 44, + 43, + 22, + 55, + 21, + 68, + 57, + 40, + 58, +
67, + }; + + static constexpr uint8_t hilbertToMortonTable[] = { + 48, + 33, + 35, + 26, + 30, + 79, + 77, + 44, + 78, + 68, + 64, + 50, + 51, + 25, + 29, + 63, + 27, + 87, + 86, + 74, + 72, + 52, + 53, + 89, + 83, + 18, + 16, + 1, + 5, + 60, + 62, + 15, + 0, + 52, + 53, + 57, + 59, + 87, + 86, + 66, + 61, + 95, + 91, + 81, + 80, + 2, + 6, + 76, + 32, + 2, + 6, + 12, + 13, + 95, + 91, + 17, + 93, + 41, + 40, + 36, + 38, + 10, + 11, + 31, + 14, + 79, + 77, + 92, + 88, + 33, + 35, + 82, + 70, + 10, + 11, + 23, + 21, + 41, + 40, + 4, + 19, + 25, + 29, + 47, + 46, + 68, + 64, + 34, + 45, + 60, + 62, + 71, + 67, + 18, + 16, + 49, + }; + + static inline auto transformCurve(uint64_t in, uint64_t bits, const uint8_t* lookupTable) + { + uint64_t transform = 0; + uint64_t out = 0; + + for (int32_t i = 3 * (bits - 1); i >= 0; i -= 3) { + transform = lookupTable[transform | ((in >> i) & 7)]; + out = (out << 3) | (transform & 7); + transform &= ~7; + } + + return out; + } + + static inline auto mortonToHilbert3D(uint64_t mortonIndex, uint64_t bits) + { + return transformCurve(mortonIndex, bits, mortonToHilbertTable); + } + + static inline auto hilbertToMorton3D(uint64_t hilbertIndex, uint64_t bits) + { + return transformCurve(hilbertIndex, bits, hilbertToMortonTable); + } + + + static inline auto splitBy3(unsigned int a) + { + uint64_t x = a & 0x1fffff; // we only care about 21 bits + x = (x | x << 32) & 0x1f00000000ffff; // shift left 32 bits, mask out bits 21-31 + x = (x | x << 16) & 0x1f0000ff0000ff; // shift left 16 bits, mask out bits 11-20, 43-52 + x = (x | x << 8) & 0x100f00f00f00f00f; // shift left 8 bits, mask out bits 5-10, 21-26, 37-42, 53-58 + x = (x | x << 4) & 0x10c30c30c30c30c3; // shift left 4 bits, mask out bits 3-4, 11-12, 19-20, 27-28, 35-36, 43-44, 51-52, 59-60 + x = (x | x << 2) & 0x1249249249249249; // shift left 2 bits, mask out bits 2, 6-7, 10, 14-15, 18, 22-23, 26, 30-31, 34, 38-39, 42, 46-47, 50, 54-55, 58 + return x; + } + + public: + + static inline 
auto mortonEncode([[maybe_unused]] Neon::index_3d dim, Neon::index_3d idx) + -> uint64_t + { + auto idxU64 = idx.newType(); + return splitBy3(idxU64.x) | (splitBy3(idxU64.y) << 1) | (splitBy3(idxU64.z) << 2); + } + + static inline auto encodeHilbert(Neon::index_3d dim, Neon::index_3d idx) + -> uint64_t + { + uint64_t mortonEncoded = mortonEncode(dim, idx); + uint64_t bits = std::ceil(std::log2(dim.newType().rMax())); + return mortonToHilbert3D(mortonEncoded, bits); + } + + static inline auto encodeSweep(Neon::index_3d dim, Neon::index_3d idx) + -> uint64_t + { + auto idxU64 = idx.newType(); + auto dimU64 = dim.newType(); + + uint64_t res = idxU64.x + idxU64.y * dimU64.x + idxU64.z * dimU64.x * dimU64.y; + return res; + } + + static inline auto encode(EncoderType type, Neon::index_3d dim, Neon::index_3d idx){ + switch (type) { + case EncoderType::morton: + return mortonEncode(dim, idx); + case EncoderType::hilbert: + return encodeHilbert(dim, idx); + case EncoderType::sweep: + return encodeSweep(dim, idx); + default: + NEON_THROW_UNSUPPORTED_OPERATION("Encoder type not supported"); + } + } +}; +} // namespace Neon::domain::tool::spaceCurves diff --git a/libNeonDomain/src/domain/tools/SpaceCurves.cpp b/libNeonDomain/src/domain/tools/SpaceCurves.cpp new file mode 100644 index 00000000..9bd700dd --- /dev/null +++ b/libNeonDomain/src/domain/tools/SpaceCurves.cpp @@ -0,0 +1,60 @@ +#include "Neon/domain/tools/SpaceCurves.h" +#include "Neon/core/types/Exceptions.h" + +namespace Neon::domain::tool::spaceCurves { + +auto EncoderTypeUtil::validOptions() -> std::array +{ + std::array options = {EncoderType::sweep, + EncoderType::morton, + EncoderType::hilbert}; + return options; +} + +auto EncoderTypeUtil::toString(EncoderType e) -> std::string +{ + switch (e) { + case EncoderType::sweep: { + return "sweep"; + } + case EncoderType::morton: { + return "morton"; + } + case EncoderType::hilbert: { + return "hilbert"; + } + default: { + 
NEON_THROW_UNSUPPORTED_OPTION("EncoderTypeUtil"); + } + } +} + +auto EncoderTypeUtil::fromInt(int val) -> EncoderType +{ + switch (val) { + case static_cast(EncoderType::sweep): { + return EncoderType::sweep; + } + case static_cast(EncoderType::morton): { + return EncoderType::morton; + } + case static_cast(EncoderType::hilbert): { + return EncoderType::hilbert; + } + default: { + NEON_THROW_UNSUPPORTED_OPTION("EncoderTypeUtil"); + } + } +} + +auto EncoderTypeUtil::toInt(EncoderType dataView) -> int +{ + return static_cast(dataView); +} + +std::ostream& operator<<(std::ostream& os, EncoderType const& m) +{ + return os << std::string(EncoderTypeUtil::toString(m)); +} + +} // namespace Neon::domain::tool::spaceCurves diff --git a/libNeonDomain/tests/CMakeLists.txt b/libNeonDomain/tests/CMakeLists.txt index aca5edc9..4f246224 100644 --- a/libNeonDomain/tests/CMakeLists.txt +++ b/libNeonDomain/tests/CMakeLists.txt @@ -7,6 +7,7 @@ add_subdirectory("domain-neighbour-globalIdx") add_subdirectory("domain-halos") add_subdirectory("domain-stencil") add_subdirectory("domain-bGrid-tray") +add_subdirectory("domain-space-filling-curves") add_subdirectory("domainUt_sGrid") add_subdirectory("domain-unit-test-eGrid") diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h index 32a078d6..d74db246 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h @@ -8,8 +8,8 @@ #include "Neon/core/types/DeviceType.h" #include "Neon/domain/dGrid.h" -#include "Neon/domain/eGrid.h" #include "Neon/domain/details/dGridSoA/dGridSoA.h" +#include "Neon/domain/eGrid.h" #include "Neon/domain/tools/Geometries.h" #include "Neon/domain/tools/TestData.h" @@ -83,8 +83,8 @@ void runAllTestConfiguration( if (dim.z < 8 * ngpu * 3) { dim.z = ngpu * 3 * 8; } - if(memoryLayout == Neon::MemoryLayout::arrayOfStructs){ - continue ; + if 
(memoryLayout == Neon::MemoryLayout::arrayOfStructs) { + continue; } } diff --git a/libNeonDomain/tests/domain-space-filling-curves/CMakeLists.txt b/libNeonDomain/tests/domain-space-filling-curves/CMakeLists.txt new file mode 100644 index 00000000..76af1689 --- /dev/null +++ b/libNeonDomain/tests/domain-space-filling-curves/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 3.19 FATAL_ERROR) + +set(APP_NAME domain-space-filling-curves) +file(GLOB_RECURSE SrcFiles src/*.*) + +add_executable(${APP_NAME} ${SrcFiles}) + +target_link_libraries(${APP_NAME} + PUBLIC libNeonDomain + PUBLIC gtest_main) + +set_target_properties(${APP_NAME} PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +set_target_properties(${APP_NAME} PROPERTIES FOLDER "libNeonDomain") +source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "${APP_NAME}" FILES ${SrcFiles}) + +add_test(NAME ${APP_NAME} COMMAND ${APP_NAME}) \ No newline at end of file diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/TestInformation.h b/libNeonDomain/tests/domain-space-filling-curves/src/TestInformation.h new file mode 100644 index 00000000..3ac50ecd --- /dev/null +++ b/libNeonDomain/tests/domain-space-filling-curves/src/TestInformation.h @@ -0,0 +1,17 @@ +#pragma once +namespace { +struct TestInformation +{ + static auto prefix() + -> std::string + { + return "domain-unit-test-map"; + } + + static auto fullName(const std::string& gridName) + -> std::string + { + return prefix() + "-" + gridName; + } +}; +} // namespace \ No newline at end of file diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.cu b/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.cu new file mode 100644 index 00000000..b43ca7f4 --- /dev/null +++ b/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.cu @@ -0,0 +1,74 @@ +#include +#include +#include +#include "Neon/domain/Grids.h" +#include 
"Neon/domain/details/dGridSoA/dGridSoA.h" +#include "Neon/domain/tools/SpaceCurves.h" +#include "Neon/domain/tools/TestData.h" +#include "TestInformation.h" +#include "gtest/gtest.h" + +#include +#include + +namespace space_filling_curves { + +template +auto defHostContainer(Field& filedSweep, + Field& filedMorton, + Field& filedHilbert) + -> Neon::set::Container +{ + const auto& grid = filedSweep.getGrid(); + return grid.template newContainer( + "defContainer", + [&](Neon::set::Loader& loader) { + auto sweep = loader.load(filedSweep); + auto morton = loader.load(filedMorton); + auto hilbert = loader.load(filedHilbert); + + return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& gidx) mutable { + Neon::index_3d p = sweep.getGlobalIndex(gidx); + Neon::index_3d dim = sweep.getDomainSize(); + using namespace Neon::domain::tool::spaceCurves; + sweep(gidx, 0) = Encoder::encode(EncoderType::sweep, dim, p); + morton(gidx, 0) = Encoder::encode(EncoderType::morton, dim, p); + hilbert(gidx, 0) = Encoder::encode(EncoderType::hilbert, dim, p); + }; + }); +} + + +using namespace Neon::domain::tool::testing; + +template +auto run(TestData& data) -> void +{ + + using Type = typename TestData::Type; + auto& grid = data.getGrid(); + const std::string appName = TestInformation::fullName(grid.getImplementationName()); + + data.resetValuesToLinear(1, 100); + + { // NEON + const Neon::index_3d dim = grid.getDimension(); + std::vector elements; + + auto& X = data.getField(FieldNames::X); + auto& Y = data.getField(FieldNames::Y); + auto& Z = data.getField(FieldNames::Z); + + defHostContainer(X, Y, Z).run(0); + data.getBackend().sync(0); + + data.getField(FieldNames::X).ioToVtk("spaceCurveSweep", "code", false); + data.getField(FieldNames::Y).ioToVtk("spaceCurveMorton", "code", false); + data.getField(FieldNames::Z).ioToVtk("spaceCurveHilbert", "code", false); + } +} + +template auto run(TestData&) -> void; + + +} // namespace space_filling_curves \ No newline at end of file diff 
--git a/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.h b/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.h new file mode 100644 index 00000000..a5b9fd3a --- /dev/null +++ b/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.h @@ -0,0 +1,18 @@ + +#pragma once +#include + +#include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" +#include "Neon/domain/tools/TestData.h" + +namespace space_filling_curves { +using namespace Neon::domain::tool::testing; + +template +auto run(TestData& data) -> void; + +extern template auto run(TestData&) -> void; + + +} // namespace space_filling_curves diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/goldenEncoding.h b/libNeonDomain/tests/domain-space-filling-curves/src/goldenEncoding.h new file mode 100644 index 00000000..d6292c4b --- /dev/null +++ b/libNeonDomain/tests/domain-space-filling-curves/src/goldenEncoding.h @@ -0,0 +1,11 @@ + +#include "Neon/Neon.h" +#include "domain-space-filling-curves.h" +#include "gtest/gtest.h" +#include "runHelper.h" + +uint64_t morton_grid_16_16_16[16 * 16* 16] = { + 0, 4, 32, 36, 256, 260, 288, 292, 2048, 2052, 2080, 2084, 2304, 2308, 2336, 2340, 2, 6, 34, 38, 258, 262, 290, 294, 2050, 2054, 2082, 2086, 2306, 2310, 2338, 2342, 16, 20, 48, 52, 272, 276, 304, 308, 2064, 2068, 2096, 2100, 2320, 2324, 2352, 2356, 18, 22, 50, 54, 274, 278, 306, 310, 2066, 2070, 2098, 2102, 2322, 2326, 2354, 2358, 128, 132, 160, 164, 384, 388, 416, 420, 2176, 2180, 2208, 2212, 2432, 2436, 2464, 2468, 130, 134, 162, 166, 386, 390, 418, 422, 2178, 2182, 2210, 2214, 2434, 2438, 2466, 2470, 144, 148, 176, 180, 400, 404, 432, 436, 2192, 2196, 2224, 2228, 2448, 2452, 2480, 2484, 146, 150, 178, 182, 402, 406, 434, 438, 2194, 2198, 2226, 2230, 2450, 2454, 2482, 2486, 1024, 1028, 1056, 1060, 1280, 1284, 1312, 1316, 3072, 3076, 3104, 3108, 3328, 3332, 3360, 3364, 1026, 1030, 1058, 1062, 1282, 1286, 1314,
1318, 3074, 3078, 3106, 3110, 3330, 3334, 3362, 3366, 1040, 1044, 1072, 1076, 1296, 1300, 1328, 1332, 3088, 3092, 3120, 3124, 3344, 3348, 3376, 3380, 1042, 1046, 1074, 1078, 1298, 1302, 1330, 1334, 3090, 3094, 3122, 3126, 3346, 3350, 3378, 3382, 1152, 1156, 1184, 1188, 1408, 1412, 1440, 1444, 3200, 3204, 3232, 3236, 3456, 3460, 3488, 3492, 1154, 1158, 1186, 1190, 1410, 1414, 1442, 1446, 3202, 3206, 3234, 3238, 3458, 3462, 3490, 3494, 1168, 1172, 1200, 1204, 1424, 1428, 1456, 1460, 3216, 3220, 3248, 3252, 3472, 3476, 3504, 3508, 1170, 1174, 1202, 1206, 1426, 1430, 1458, 1462, 3218, 3222, 3250, 3254, 3474, 3478, 3506, 3510, 1, 5, 33, 37, 257, 261, 289, 293, 2049, 2053, 2081, 2085, 2305, 2309, 2337, 2341, 3, 7, 35, 39, 259, 263, 291, 295, 2051, 2055, 2083, 2087, 2307, 2311, 2339, 2343, 17, 21, 49, 53, 273, 277, 305, 309, 2065, 2069, 2097, 2101, 2321, 2325, 2353, 2357, 19, 23, 51, 55, 275, 279, 307, 311, 2067, 2071, 2099, 2103, 2323, 2327, 2355, 2359, 129, 133, 161, 165, 385, 389, 417, 421, 2177, 2181, 2209, 2213, 2433, 2437, 2465, 2469, 131, 135, 163, 167, 387, 391, 419, 423, 2179, 2183, 2211, 2215, 2435, 2439, 2467, 2471, 145, 149, 177, 181, 401, 405, 433, 437, 2193, 2197, 2225, 2229, 2449, 2453, 2481, 2485, 147, 151, 179, 183, 403, 407, 435, 439, 2195, 2199, 2227, 2231, 2451, 2455, 2483, 2487, 1025, 1029, 1057, 1061, 1281, 1285, 1313, 1317, 3073, 3077, 3105, 3109, 3329, 3333, 3361, 3365, 1027, 1031, 1059, 1063, 1283, 1287, 1315, 1319, 3075, 3079, 3107, 3111, 3331, 3335, 3363, 3367, 1041, 1045, 1073, 1077, 1297, 1301, 1329, 1333, 3089, 3093, 3121, 3125, 3345, 3349, 3377, 3381, 1043, 1047, 1075, 1079, 1299, 1303, 1331, 1335, 3091, 3095, 3123, 3127, 3347, 3351, 3379, 3383, 1153, 1157, 1185, 1189, 1409, 1413, 1441, 1445, 3201, 3205, 3233, 3237, 3457, 3461, 3489, 3493, 1155, 1159, 1187, 1191, 1411, 1415, 1443, 1447, 3203, 3207, 3235, 3239, 3459, 3463, 3491, 3495, 1169, 1173, 1201, 1205, 1425, 1429, 1457, 1461, 3217, 3221, 3249, 3253, 3473, 3477, 3505, 3509, 1171, 1175, 
1203, 1207, 1427, 1431, 1459, 1463, 3219, 3223, 3251, 3255, 3475, 3479, 3507, 3511, 8, 12, 40, 44, 264, 268, 296, 300, 2056, 2060, 2088, 2092, 2312, 2316, 2344, 2348, 10, 14, 42, 46, 266, 270, 298, 302, 2058, 2062, 2090, 2094, 2314, 2318, 2346, 2350, 24, 28, 56, 60, 280, 284, 312, 316, 2072, 2076, 2104, 2108, 2328, 2332, 2360, 2364, 26, 30, 58, 62, 282, 286, 314, 318, 2074, 2078, 2106, 2110, 2330, 2334, 2362, 2366, 136, 140, 168, 172, 392, 396, 424, 428, 2184, 2188, 2216, 2220, 2440, 2444, 2472, 2476, 138, 142, 170, 174, 394, 398, 426, 430, 2186, 2190, 2218, 2222, 2442, 2446, 2474, 2478, 152, 156, 184, 188, 408, 412, 440, 444, 2200, 2204, 2232, 2236, 2456, 2460, 2488, 2492, 154, 158, 186, 190, 410, 414, 442, 446, 2202, 2206, 2234, 2238, 2458, 2462, 2490, 2494, 1032, 1036, 1064, 1068, 1288, 1292, 1320, 1324, 3080, 3084, 3112, 3116, 3336, 3340, 3368, 3372, 1034, 1038, 1066, 1070, 1290, 1294, 1322, 1326, 3082, 3086, 3114, 3118, 3338, 3342, 3370, 3374, 1048, 1052, 1080, 1084, 1304, 1308, 1336, 1340, 3096, 3100, 3128, 3132, 3352, 3356, 3384, 3388, 1050, 1054, 1082, 1086, 1306, 1310, 1338, 1342, 3098, 3102, 3130, 3134, 3354, 3358, 3386, 3390, 1160, 1164, 1192, 1196, 1416, 1420, 1448, 1452, 3208, 3212, 3240, 3244, 3464, 3468, 3496, 3500, 1162, 1166, 1194, 1198, 1418, 1422, 1450, 1454, 3210, 3214, 3242, 3246, 3466, 3470, 3498, 3502, 1176, 1180, 1208, 1212, 1432, 1436, 1464, 1468, 3224, 3228, 3256, 3260, 3480, 3484, 3512, 3516, 1178, 1182, 1210, 1214, 1434, 1438, 1466, 1470, 3226, 3230, 3258, 3262, 3482, 3486, 3514, 3518, 9, 13, 41, 45, 265, 269, 297, 301, 2057, 2061, 2089, 2093, 2313, 2317, 2345, 2349, 11, 15, 43, 47, 267, 271, 299, 303, 2059, 2063, 2091, 2095, 2315, 2319, 2347, 2351, 25, 29, 57, 61, 281, 285, 313, 317, 2073, 2077, 2105, 2109, 2329, 2333, 2361, 2365, 27, 31, 59, 63, 283, 287, 315, 319, 2075, 2079, 2107, 2111, 2331, 2335, 2363, 2367, 137, 141, 169, 173, 393, 397, 425, 429, 2185, 2189, 2217, 2221, 2441, 2445, 2473, 2477, 139, 143, 171, 175, 395, 399, 427, 
431, 2187, 2191, 2219, 2223, 2443, 2447, 2475, 2479, 153, 157, 185, 189, 409, 413, 441, 445, 2201, 2205, 2233, 2237, 2457, 2461, 2489, 2493, 155, 159, 187, 191, 411, 415, 443, 447, 2203, 2207, 2235, 2239, 2459, 2463, 2491, 2495, 1033, 1037, 1065, 1069, 1289, 1293, 1321, 1325, 3081, 3085, 3113, 3117, 3337, 3341, 3369, 3373, 1035, 1039, 1067, 1071, 1291, 1295, 1323, 1327, 3083, 3087, 3115, 3119, 3339, 3343, 3371, 3375, 1049, 1053, 1081, 1085, 1305, 1309, 1337, 1341, 3097, 3101, 3129, 3133, 3353, 3357, 3385, 3389, 1051, 1055, 1083, 1087, 1307, 1311, 1339, 1343, 3099, 3103, 3131, 3135, 3355, 3359, 3387, 3391, 1161, 1165, 1193, 1197, 1417, 1421, 1449, 1453, 3209, 3213, 3241, 3245, 3465, 3469, 3497, 3501, 1163, 1167, 1195, 1199, 1419, 1423, 1451, 1455, 3211, 3215, 3243, 3247, 3467, 3471, 3499, 3503, 1177, 1181, 1209, 1213, 1433, 1437, 1465, 1469, 3225, 3229, 3257, 3261, 3481, 3485, 3513, 3517, 1179, 1183, 1211, 1215, 1435, 1439, 1467, 1471, 3227, 3231, 3259, 3263, 3483, 3487, 3515, 3519, 64, 68, 96, 100, 320, 324, 352, 356, 2112, 2116, 2144, 2148, 2368, 2372, 2400, 2404, 66, 70, 98, 102, 322, 326, 354, 358, 2114, 2118, 2146, 2150, 2370, 2374, 2402, 2406, 80, 84, 112, 116, 336, 340, 368, 372, 2128, 2132, 2160, 2164, 2384, 2388, 2416, 2420, 82, 86, 114, 118, 338, 342, 370, 374, 2130, 2134, 2162, 2166, 2386, 2390, 2418, 2422, 192, 196, 224, 228, 448, 452, 480, 484, 2240, 2244, 2272, 2276, 2496, 2500, 2528, 2532, 194, 198, 226, 230, 450, 454, 482, 486, 2242, 2246, 2274, 2278, 2498, 2502, 2530, 2534, 208, 212, 240, 244, 464, 468, 496, 500, 2256, 2260, 2288, 2292, 2512, 2516, 2544, 2548, 210, 214, 242, 246, 466, 470, 498, 502, 2258, 2262, 2290, 2294, 2514, 2518, 2546, 2550, 1088, 1092, 1120, 1124, 1344, 1348, 1376, 1380, 3136, 3140, 3168, 3172, 3392, 3396, 3424, 3428, 1090, 1094, 1122, 1126, 1346, 1350, 1378, 1382, 3138, 3142, 3170, 3174, 3394, 3398, 3426, 3430, 1104, 1108, 1136, 1140, 1360, 1364, 1392, 1396, 3152, 3156, 3184, 3188, 3408, 3412, 3440, 3444, 1106, 1110, 1138, 
1142, 1362, 1366, 1394, 1398, 3154, 3158, 3186, 3190, 3410, 3414, 3442, 3446, 1216, 1220, 1248, 1252, 1472, 1476, 1504, 1508, 3264, 3268, 3296, 3300, 3520, 3524, 3552, 3556, 1218, 1222, 1250, 1254, 1474, 1478, 1506, 1510, 3266, 3270, 3298, 3302, 3522, 3526, 3554, 3558, 1232, 1236, 1264, 1268, 1488, 1492, 1520, 1524, 3280, 3284, 3312, 3316, 3536, 3540, 3568, 3572, 1234, 1238, 1266, 1270, 1490, 1494, 1522, 1526, 3282, 3286, 3314, 3318, 3538, 3542, 3570, 3574, 65, 69, 97, 101, 321, 325, 353, 357, 2113, 2117, 2145, 2149, 2369, 2373, 2401, 2405, 67, 71, 99, 103, 323, 327, 355, 359, 2115, 2119, 2147, 2151, 2371, 2375, 2403, 2407, 81, 85, 113, 117, 337, 341, 369, 373, 2129, 2133, 2161, 2165, 2385, 2389, 2417, 2421, 83, 87, 115, 119, 339, 343, 371, 375, 2131, 2135, 2163, 2167, 2387, 2391, 2419, 2423, 193, 197, 225, 229, 449, 453, 481, 485, 2241, 2245, 2273, 2277, 2497, 2501, 2529, 2533, 195, 199, 227, 231, 451, 455, 483, 487, 2243, 2247, 2275, 2279, 2499, 2503, 2531, 2535, 209, 213, 241, 245, 465, 469, 497, 501, 2257, 2261, 2289, 2293, 2513, 2517, 2545, 2549, 211, 215, 243, 247, 467, 471, 499, 503, 2259, 2263, 2291, 2295, 2515, 2519, 2547, 2551, 1089, 1093, 1121, 1125, 1345, 1349, 1377, 1381, 3137, 3141, 3169, 3173, 3393, 3397, 3425, 3429, 1091, 1095, 1123, 1127, 1347, 1351, 1379, 1383, 3139, 3143, 3171, 3175, 3395, 3399, 3427, 3431, 1105, 1109, 1137, 1141, 1361, 1365, 1393, 1397, 3153, 3157, 3185, 3189, 3409, 3413, 3441, 3445, 1107, 1111, 1139, 1143, 1363, 1367, 1395, 1399, 3155, 3159, 3187, 3191, 3411, 3415, 3443, 3447, 1217, 1221, 1249, 1253, 1473, 1477, 1505, 1509, 3265, 3269, 3297, 3301, 3521, 3525, 3553, 3557, 1219, 1223, 1251, 1255, 1475, 1479, 1507, 1511, 3267, 3271, 3299, 3303, 3523, 3527, 3555, 3559, 1233, 1237, 1265, 1269, 1489, 1493, 1521, 1525, 3281, 3285, 3313, 3317, 3537, 3541, 3569, 3573, 1235, 1239, 1267, 1271, 1491, 1495, 1523, 1527, 3283, 3287, 3315, 3319, 3539, 3543, 3571, 3575, 72, 76, 104, 108, 328, 332, 360, 364, 2120, 2124, 2152, 2156, 2376, 2380, 
2408, 2412, 74, 78, 106, 110, 330, 334, 362, 366, 2122, 2126, 2154, 2158, 2378, 2382, 2410, 2414, 88, 92, 120, 124, 344, 348, 376, 380, 2136, 2140, 2168, 2172, 2392, 2396, 2424, 2428, 90, 94, 122, 126, 346, 350, 378, 382, 2138, 2142, 2170, 2174, 2394, 2398, 2426, 2430, 200, 204, 232, 236, 456, 460, 488, 492, 2248, 2252, 2280, 2284, 2504, 2508, 2536, 2540, 202, 206, 234, 238, 458, 462, 490, 494, 2250, 2254, 2282, 2286, 2506, 2510, 2538, 2542, 216, 220, 248, 252, 472, 476, 504, 508, 2264, 2268, 2296, 2300, 2520, 2524, 2552, 2556, 218, 222, 250, 254, 474, 478, 506, 510, 2266, 2270, 2298, 2302, 2522, 2526, 2554, 2558, 1096, 1100, 1128, 1132, 1352, 1356, 1384, 1388, 3144, 3148, 3176, 3180, 3400, 3404, 3432, 3436, 1098, 1102, 1130, 1134, 1354, 1358, 1386, 1390, 3146, 3150, 3178, 3182, 3402, 3406, 3434, 3438, 1112, 1116, 1144, 1148, 1368, 1372, 1400, 1404, 3160, 3164, 3192, 3196, 3416, 3420, 3448, 3452, 1114, 1118, 1146, 1150, 1370, 1374, 1402, 1406, 3162, 3166, 3194, 3198, 3418, 3422, 3450, 3454, 1224, 1228, 1256, 1260, 1480, 1484, 1512, 1516, 3272, 3276, 3304, 3308, 3528, 3532, 3560, 3564, 1226, 1230, 1258, 1262, 1482, 1486, 1514, 1518, 3274, 3278, 3306, 3310, 3530, 3534, 3562, 3566, 1240, 1244, 1272, 1276, 1496, 1500, 1528, 1532, 3288, 3292, 3320, 3324, 3544, 3548, 3576, 3580, 1242, 1246, 1274, 1278, 1498, 1502, 1530, 1534, 3290, 3294, 3322, 3326, 3546, 3550, 3578, 3582, 73, 77, 105, 109, 329, 333, 361, 365, 2121, 2125, 2153, 2157, 2377, 2381, 2409, 2413, 75, 79, 107, 111, 331, 335, 363, 367, 2123, 2127, 2155, 2159, 2379, 2383, 2411, 2415, 89, 93, 121, 125, 345, 349, 377, 381, 2137, 2141, 2169, 2173, 2393, 2397, 2425, 2429, 91, 95, 123, 127, 347, 351, 379, 383, 2139, 2143, 2171, 2175, 2395, 2399, 2427, 2431, 201, 205, 233, 237, 457, 461, 489, 493, 2249, 2253, 2281, 2285, 2505, 2509, 2537, 2541, 203, 207, 235, 239, 459, 463, 491, 495, 2251, 2255, 2283, 2287, 2507, 2511, 2539, 2543, 217, 221, 249, 253, 473, 477, 505, 509, 2265, 2269, 2297, 2301, 2521, 2525, 2553, 2557, 
219, 223, 251, 255, 475, 479, 507, 511, 2267, 2271, 2299, 2303, 2523, 2527, 2555, 2559, 1097, 1101, 1129, 1133, 1353, 1357, 1385, 1389, 3145, 3149, 3177, 3181, 3401, 3405, 3433, 3437, 1099, 1103, 1131, 1135, 1355, 1359, 1387, 1391, 3147, 3151, 3179, 3183, 3403, 3407, 3435, 3439, 1113, 1117, 1145, 1149, 1369, 1373, 1401, 1405, 3161, 3165, 3193, 3197, 3417, 3421, 3449, 3453, 1115, 1119, 1147, 1151, 1371, 1375, 1403, 1407, 3163, 3167, 3195, 3199, 3419, 3423, 3451, 3455, 1225, 1229, 1257, 1261, 1481, 1485, 1513, 1517, 3273, 3277, 3305, 3309, 3529, 3533, 3561, 3565, 1227, 1231, 1259, 1263, 1483, 1487, 1515, 1519, 3275, 3279, 3307, 3311, 3531, 3535, 3563, 3567, 1241, 1245, 1273, 1277, 1497, 1501, 1529, 1533, 3289, 3293, 3321, 3325, 3545, 3549, 3577, 3581, 1243, 1247, 1275, 1279, 1499, 1503, 1531, 1535, 3291, 3295, 3323, 3327, 3547, 3551, 3579, 3583, 512, 516, 544, 548, 768, 772, 800, 804, 2560, 2564, 2592, 2596, 2816, 2820, 2848, 2852, 514, 518, 546, 550, 770, 774, 802, 806, 2562, 2566, 2594, 2598, 2818, 2822, 2850, 2854, 528, 532, 560, 564, 784, 788, 816, 820, 2576, 2580, 2608, 2612, 2832, 2836, 2864, 2868, 530, 534, 562, 566, 786, 790, 818, 822, 2578, 2582, 2610, 2614, 2834, 2838, 2866, 2870, 640, 644, 672, 676, 896, 900, 928, 932, 2688, 2692, 2720, 2724, 2944, 2948, 2976, 2980, 642, 646, 674, 678, 898, 902, 930, 934, 2690, 2694, 2722, 2726, 2946, 2950, 2978, 2982, 656, 660, 688, 692, 912, 916, 944, 948, 2704, 2708, 2736, 2740, 2960, 2964, 2992, 2996, 658, 662, 690, 694, 914, 918, 946, 950, 2706, 2710, 2738, 2742, 2962, 2966, 2994, 2998, 1536, 1540, 1568, 1572, 1792, 1796, 1824, 1828, 3584, 3588, 3616, 3620, 3840, 3844, 3872, 3876, 1538, 1542, 1570, 1574, 1794, 1798, 1826, 1830, 3586, 3590, 3618, 3622, 3842, 3846, 3874, 3878, 1552, 1556, 1584, 1588, 1808, 1812, 1840, 1844, 3600, 3604, 3632, 3636, 3856, 3860, 3888, 3892, 1554, 1558, 1586, 1590, 1810, 1814, 1842, 1846, 3602, 3606, 3634, 3638, 3858, 3862, 3890, 3894, 1664, 1668, 1696, 1700, 1920, 1924, 1952, 1956, 3712, 
3716, 3744, 3748, 3968, 3972, 4000, 4004, 1666, 1670, 1698, 1702, 1922, 1926, 1954, 1958, 3714, 3718, 3746, 3750, 3970, 3974, 4002, 4006, 1680, 1684, 1712, 1716, 1936, 1940, 1968, 1972, 3728, 3732, 3760, 3764, 3984, 3988, 4016, 4020, 1682, 1686, 1714, 1718, 1938, 1942, 1970, 1974, 3730, 3734, 3762, 3766, 3986, 3990, 4018, 4022, 513, 517, 545, 549, 769, 773, 801, 805, 2561, 2565, 2593, 2597, 2817, 2821, 2849, 2853, 515, 519, 547, 551, 771, 775, 803, 807, 2563, 2567, 2595, 2599, 2819, 2823, 2851, 2855, 529, 533, 561, 565, 785, 789, 817, 821, 2577, 2581, 2609, 2613, 2833, 2837, 2865, 2869, 531, 535, 563, 567, 787, 791, 819, 823, 2579, 2583, 2611, 2615, 2835, 2839, 2867, 2871, 641, 645, 673, 677, 897, 901, 929, 933, 2689, 2693, 2721, 2725, 2945, 2949, 2977, 2981, 643, 647, 675, 679, 899, 903, 931, 935, 2691, 2695, 2723, 2727, 2947, 2951, 2979, 2983, 657, 661, 689, 693, 913, 917, 945, 949, 2705, 2709, 2737, 2741, 2961, 2965, 2993, 2997, 659, 663, 691, 695, 915, 919, 947, 951, 2707, 2711, 2739, 2743, 2963, 2967, 2995, 2999, 1537, 1541, 1569, 1573, 1793, 1797, 1825, 1829, 3585, 3589, 3617, 3621, 3841, 3845, 3873, 3877, 1539, 1543, 1571, 1575, 1795, 1799, 1827, 1831, 3587, 3591, 3619, 3623, 3843, 3847, 3875, 3879, 1553, 1557, 1585, 1589, 1809, 1813, 1841, 1845, 3601, 3605, 3633, 3637, 3857, 3861, 3889, 3893, 1555, 1559, 1587, 1591, 1811, 1815, 1843, 1847, 3603, 3607, 3635, 3639, 3859, 3863, 3891, 3895, 1665, 1669, 1697, 1701, 1921, 1925, 1953, 1957, 3713, 3717, 3745, 3749, 3969, 3973, 4001, 4005, 1667, 1671, 1699, 1703, 1923, 1927, 1955, 1959, 3715, 3719, 3747, 3751, 3971, 3975, 4003, 4007, 1681, 1685, 1713, 1717, 1937, 1941, 1969, 1973, 3729, 3733, 3761, 3765, 3985, 3989, 4017, 4021, 1683, 1687, 1715, 1719, 1939, 1943, 1971, 1975, 3731, 3735, 3763, 3767, 3987, 3991, 4019, 4023, 520, 524, 552, 556, 776, 780, 808, 812, 2568, 2572, 2600, 2604, 2824, 2828, 2856, 2860, 522, 526, 554, 558, 778, 782, 810, 814, 2570, 2574, 2602, 2606, 2826, 2830, 2858, 2862, 536, 540, 568, 572, 
792, 796, 824, 828, 2584, 2588, 2616, 2620, 2840, 2844, 2872, 2876, 538, 542, 570, 574, 794, 798, 826, 830, 2586, 2590, 2618, 2622, 2842, 2846, 2874, 2878, 648, 652, 680, 684, 904, 908, 936, 940, 2696, 2700, 2728, 2732, 2952, 2956, 2984, 2988, 650, 654, 682, 686, 906, 910, 938, 942, 2698, 2702, 2730, 2734, 2954, 2958, 2986, 2990, 664, 668, 696, 700, 920, 924, 952, 956, 2712, 2716, 2744, 2748, 2968, 2972, 3000, 3004, 666, 670, 698, 702, 922, 926, 954, 958, 2714, 2718, 2746, 2750, 2970, 2974, 3002, 3006, 1544, 1548, 1576, 1580, 1800, 1804, 1832, 1836, 3592, 3596, 3624, 3628, 3848, 3852, 3880, 3884, 1546, 1550, 1578, 1582, 1802, 1806, 1834, 1838, 3594, 3598, 3626, 3630, 3850, 3854, 3882, 3886, 1560, 1564, 1592, 1596, 1816, 1820, 1848, 1852, 3608, 3612, 3640, 3644, 3864, 3868, 3896, 3900, 1562, 1566, 1594, 1598, 1818, 1822, 1850, 1854, 3610, 3614, 3642, 3646, 3866, 3870, 3898, 3902, 1672, 1676, 1704, 1708, 1928, 1932, 1960, 1964, 3720, 3724, 3752, 3756, 3976, 3980, 4008, 4012, 1674, 1678, 1706, 1710, 1930, 1934, 1962, 1966, 3722, 3726, 3754, 3758, 3978, 3982, 4010, 4014, 1688, 1692, 1720, 1724, 1944, 1948, 1976, 1980, 3736, 3740, 3768, 3772, 3992, 3996, 4024, 4028, 1690, 1694, 1722, 1726, 1946, 1950, 1978, 1982, 3738, 3742, 3770, 3774, 3994, 3998, 4026, 4030, 521, 525, 553, 557, 777, 781, 809, 813, 2569, 2573, 2601, 2605, 2825, 2829, 2857, 2861, 523, 527, 555, 559, 779, 783, 811, 815, 2571, 2575, 2603, 2607, 2827, 2831, 2859, 2863, 537, 541, 569, 573, 793, 797, 825, 829, 2585, 2589, 2617, 2621, 2841, 2845, 2873, 2877, 539, 543, 571, 575, 795, 799, 827, 831, 2587, 2591, 2619, 2623, 2843, 2847, 2875, 2879, 649, 653, 681, 685, 905, 909, 937, 941, 2697, 2701, 2729, 2733, 2953, 2957, 2985, 2989, 651, 655, 683, 687, 907, 911, 939, 943, 2699, 2703, 2731, 2735, 2955, 2959, 2987, 2991, 665, 669, 697, 701, 921, 925, 953, 957, 2713, 2717, 2745, 2749, 2969, 2973, 3001, 3005, 667, 671, 699, 703, 923, 927, 955, 959, 2715, 2719, 2747, 2751, 2971, 2975, 3003, 3007, 1545, 1549, 1577, 
1581, 1801, 1805, 1833, 1837, 3593, 3597, 3625, 3629, 3849, 3853, 3881, 3885, 1547, 1551, 1579, 1583, 1803, 1807, 1835, 1839, 3595, 3599, 3627, 3631, 3851, 3855, 3883, 3887, 1561, 1565, 1593, 1597, 1817, 1821, 1849, 1853, 3609, 3613, 3641, 3645, 3865, 3869, 3897, 3901, 1563, 1567, 1595, 1599, 1819, 1823, 1851, 1855, 3611, 3615, 3643, 3647, 3867, 3871, 3899, 3903, 1673, 1677, 1705, 1709, 1929, 1933, 1961, 1965, 3721, 3725, 3753, 3757, 3977, 3981, 4009, 4013, 1675, 1679, 1707, 1711, 1931, 1935, 1963, 1967, 3723, 3727, 3755, 3759, 3979, 3983, 4011, 4015, 1689, 1693, 1721, 1725, 1945, 1949, 1977, 1981, 3737, 3741, 3769, 3773, 3993, 3997, 4025, 4029, 1691, 1695, 1723, 1727, 1947, 1951, 1979, 1983, 3739, 3743, 3771, 3775, 3995, 3999, 4027, 4031, 576, 580, 608, 612, 832, 836, 864, 868, 2624, 2628, 2656, 2660, 2880, 2884, 2912, 2916, 578, 582, 610, 614, 834, 838, 866, 870, 2626, 2630, 2658, 2662, 2882, 2886, 2914, 2918, 592, 596, 624, 628, 848, 852, 880, 884, 2640, 2644, 2672, 2676, 2896, 2900, 2928, 2932, 594, 598, 626, 630, 850, 854, 882, 886, 2642, 2646, 2674, 2678, 2898, 2902, 2930, 2934, 704, 708, 736, 740, 960, 964, 992, 996, 2752, 2756, 2784, 2788, 3008, 3012, 3040, 3044, 706, 710, 738, 742, 962, 966, 994, 998, 2754, 2758, 2786, 2790, 3010, 3014, 3042, 3046, 720, 724, 752, 756, 976, 980, 1008, 1012, 2768, 2772, 2800, 2804, 3024, 3028, 3056, 3060, 722, 726, 754, 758, 978, 982, 1010, 1014, 2770, 2774, 2802, 2806, 3026, 3030, 3058, 3062, 1600, 1604, 1632, 1636, 1856, 1860, 1888, 1892, 3648, 3652, 3680, 3684, 3904, 3908, 3936, 3940, 1602, 1606, 1634, 1638, 1858, 1862, 1890, 1894, 3650, 3654, 3682, 3686, 3906, 3910, 3938, 3942, 1616, 1620, 1648, 1652, 1872, 1876, 1904, 1908, 3664, 3668, 3696, 3700, 3920, 3924, 3952, 3956, 1618, 1622, 1650, 1654, 1874, 1878, 1906, 1910, 3666, 3670, 3698, 3702, 3922, 3926, 3954, 3958, 1728, 1732, 1760, 1764, 1984, 1988, 2016, 2020, 3776, 3780, 3808, 3812, 4032, 4036, 4064, 4068, 1730, 1734, 1762, 1766, 1986, 1990, 2018, 2022, 3778, 3782, 
3810, 3814, 4034, 4038, 4066, 4070, 1744, 1748, 1776, 1780, 2000, 2004, 2032, 2036, 3792, 3796, 3824, 3828, 4048, 4052, 4080, 4084, 1746, 1750, 1778, 1782, 2002, 2006, 2034, 2038, 3794, 3798, 3826, 3830, 4050, 4054, 4082, 4086, 577, 581, 609, 613, 833, 837, 865, 869, 2625, 2629, 2657, 2661, 2881, 2885, 2913, 2917, 579, 583, 611, 615, 835, 839, 867, 871, 2627, 2631, 2659, 2663, 2883, 2887, 2915, 2919, 593, 597, 625, 629, 849, 853, 881, 885, 2641, 2645, 2673, 2677, 2897, 2901, 2929, 2933, 595, 599, 627, 631, 851, 855, 883, 887, 2643, 2647, 2675, 2679, 2899, 2903, 2931, 2935, 705, 709, 737, 741, 961, 965, 993, 997, 2753, 2757, 2785, 2789, 3009, 3013, 3041, 3045, 707, 711, 739, 743, 963, 967, 995, 999, 2755, 2759, 2787, 2791, 3011, 3015, 3043, 3047, 721, 725, 753, 757, 977, 981, 1009, 1013, 2769, 2773, 2801, 2805, 3025, 3029, 3057, 3061, 723, 727, 755, 759, 979, 983, 1011, 1015, 2771, 2775, 2803, 2807, 3027, 3031, 3059, 3063, 1601, 1605, 1633, 1637, 1857, 1861, 1889, 1893, 3649, 3653, 3681, 3685, 3905, 3909, 3937, 3941, 1603, 1607, 1635, 1639, 1859, 1863, 1891, 1895, 3651, 3655, 3683, 3687, 3907, 3911, 3939, 3943, 1617, 1621, 1649, 1653, 1873, 1877, 1905, 1909, 3665, 3669, 3697, 3701, 3921, 3925, 3953, 3957, 1619, 1623, 1651, 1655, 1875, 1879, 1907, 1911, 3667, 3671, 3699, 3703, 3923, 3927, 3955, 3959, 1729, 1733, 1761, 1765, 1985, 1989, 2017, 2021, 3777, 3781, 3809, 3813, 4033, 4037, 4065, 4069, 1731, 1735, 1763, 1767, 1987, 1991, 2019, 2023, 3779, 3783, 3811, 3815, 4035, 4039, 4067, 4071, 1745, 1749, 1777, 1781, 2001, 2005, 2033, 2037, 3793, 3797, 3825, 3829, 4049, 4053, 4081, 4085, 1747, 1751, 1779, 1783, 2003, 2007, 2035, 2039, 3795, 3799, 3827, 3831, 4051, 4055, 4083, 4087, 584, 588, 616, 620, 840, 844, 872, 876, 2632, 2636, 2664, 2668, 2888, 2892, 2920, 2924, 586, 590, 618, 622, 842, 846, 874, 878, 2634, 2638, 2666, 2670, 2890, 2894, 2922, 2926, 600, 604, 632, 636, 856, 860, 888, 892, 2648, 2652, 2680, 2684, 2904, 2908, 2936, 2940, 602, 606, 634, 638, 858, 862, 
890, 894, 2650, 2654, 2682, 2686, 2906, 2910, 2938, 2942, 712, 716, 744, 748, 968, 972, 1000, 1004, 2760, 2764, 2792, 2796, 3016, 3020, 3048, 3052, 714, 718, 746, 750, 970, 974, 1002, 1006, 2762, 2766, 2794, 2798, 3018, 3022, 3050, 3054, 728, 732, 760, 764, 984, 988, 1016, 1020, 2776, 2780, 2808, 2812, 3032, 3036, 3064, 3068, 730, 734, 762, 766, 986, 990, 1018, 1022, 2778, 2782, 2810, 2814, 3034, 3038, 3066, 3070, 1608, 1612, 1640, 1644, 1864, 1868, 1896, 1900, 3656, 3660, 3688, 3692, 3912, 3916, 3944, 3948, 1610, 1614, 1642, 1646, 1866, 1870, 1898, 1902, 3658, 3662, 3690, 3694, 3914, 3918, 3946, 3950, 1624, 1628, 1656, 1660, 1880, 1884, 1912, 1916, 3672, 3676, 3704, 3708, 3928, 3932, 3960, 3964, 1626, 1630, 1658, 1662, 1882, 1886, 1914, 1918, 3674, 3678, 3706, 3710, 3930, 3934, 3962, 3966, 1736, 1740, 1768, 1772, 1992, 1996, 2024, 2028, 3784, 3788, 3816, 3820, 4040, 4044, 4072, 4076, 1738, 1742, 1770, 1774, 1994, 1998, 2026, 2030, 3786, 3790, 3818, 3822, 4042, 4046, 4074, 4078, 1752, 1756, 1784, 1788, 2008, 2012, 2040, 2044, 3800, 3804, 3832, 3836, 4056, 4060, 4088, 4092, 1754, 1758, 1786, 1790, 2010, 2014, 2042, 2046, 3802, 3806, 3834, 3838, 4058, 4062, 4090, 4094, 585, 589, 617, 621, 841, 845, 873, 877, 2633, 2637, 2665, 2669, 2889, 2893, 2921, 2925, 587, 591, 619, 623, 843, 847, 875, 879, 2635, 2639, 2667, 2671, 2891, 2895, 2923, 2927, 601, 605, 633, 637, 857, 861, 889, 893, 2649, 2653, 2681, 2685, 2905, 2909, 2937, 2941, 603, 607, 635, 639, 859, 863, 891, 895, 2651, 2655, 2683, 2687, 2907, 2911, 2939, 2943, 713, 717, 745, 749, 969, 973, 1001, 1005, 2761, 2765, 2793, 2797, 3017, 3021, 3049, 3053, 715, 719, 747, 751, 971, 975, 1003, 1007, 2763, 2767, 2795, 2799, 3019, 3023, 3051, 3055, 729, 733, 761, 765, 985, 989, 1017, 1021, 2777, 2781, 2809, 2813, 3033, 3037, 3065, 3069, 731, 735, 763, 767, 987, 991, 1019, 1023, 2779, 2783, 2811, 2815, 3035, 3039, 3067, 3071, 1609, 1613, 1641, 1645, 1865, 1869, 1897, 1901, 3657, 3661, 3689, 3693, 3913, 3917, 3945, 3949, 1611, 
1615, 1643, 1647, 1867, 1871, 1899, 1903, 3659, 3663, 3691, 3695, 3915, 3919, 3947, 3951, 1625, 1629, 1657, 1661, 1881, 1885, 1913, 1917, 3673, 3677, 3705, 3709, 3929, 3933, 3961, 3965, 1627, 1631, 1659, 1663, 1883, 1887, 1915, 1919, 3675, 3679, 3707, 3711, 3931, 3935, 3963, 3967, 1737, 1741, 1769, 1773, 1993, 1997, 2025, 2029, 3785, 3789, 3817, 3821, 4041, 4045, 4073, 4077, 1739, 1743, 1771, 1775, 1995, 1999, 2027, 2031, 3787, 3791, 3819, 3823, 4043, 4047, 4075, 4079, 1753, 1757, 1785, 1789, 2009, 2013, 2041, 2045, 3801, 3805, 3833, 3837, 4057, 4061, 4089, 4093, 1755, 1759, 1787, 1791, 2011, 2015, 2043, 2047, 3803, 3807, 3835, 3839, 4059, 4063, 4091, 4095}; + +uint64_t hilbert_grid_16_16_16[16 * 16* 16] = { + 0, 7, 8, 11, 212, 211, 204, 203, 3892, 3891, 3884, 3883, 4084, 4087, 4088, 4095, 3, 4, 9, 10, 215, 208, 207, 200, 3895, 3888, 3887, 3880, 4085, 4086, 4091, 4092, 60, 59, 54, 53, 216, 219, 198, 199, 3896, 3897, 3876, 3879, 4042, 4041, 4036, 4035, 63, 56, 55, 52, 217, 218, 193, 192, 3903, 3902, 3877, 3878, 4043, 4040, 4039, 4032, 64, 67, 124, 127, 128, 131, 188, 191, 3904, 3907, 3964, 3967, 3968, 3971, 4028, 4031, 65, 66, 125, 126, 129, 130, 189, 190, 3905, 3906, 3965, 3966, 3969, 3970, 4029, 4030, 90, 93, 98, 101, 154, 157, 162, 165, 3930, 3933, 3938, 3941, 3994, 3997, 4002, 4005, 89, 94, 97, 102, 153, 158, 161, 166, 3929, 3934, 3937, 3942, 3993, 3998, 4001, 4006, 1702, 1703, 1704, 1707, 1876, 1879, 1880, 1881, 2214, 2215, 2216, 2219, 2388, 2391, 2392, 2393, 1697, 1696, 1705, 1706, 1877, 1878, 1887, 1886, 2209, 2208, 2217, 2218, 2389, 2390, 2399, 2398, 1694, 1695, 1686, 1685, 1898, 1897, 1888, 1889, 2206, 2207, 2198, 2197, 2410, 2409, 2400, 2401, 1689, 1688, 1687, 1684, 1899, 1896, 1895, 1894, 2201, 2200, 2199, 2196, 2411, 2408, 2407, 2406, 1638, 1639, 1640, 1643, 1940, 1943, 1944, 1945, 2150, 2151, 2152, 2155, 2452, 2455, 2456, 2457, 1633, 1632, 1641, 1642, 1941, 1942, 1951, 1950, 2145, 2144, 2153, 2154, 2453, 2454, 2463, 2462, 1630, 1631, 1622, 1621, 1962, 
1961, 1952, 1953, 2142, 2143, 2134, 2133, 2474, 2473, 2464, 2465, 1625, 1624, 1623, 1620, 1963, 1960, 1959, 1958, 2137, 2136, 2135, 2132, 2475, 2472, 2471, 2470, 1, 6, 15, 12, 213, 210, 205, 202, 3893, 3890, 3885, 3882, 4083, 4080, 4089, 4094, 2, 5, 14, 13, 214, 209, 206, 201, 3894, 3889, 3886, 3881, 4082, 4081, 4090, 4093, 61, 58, 49, 50, 223, 220, 197, 196, 3899, 3898, 3875, 3872, 4045, 4046, 4037, 4034, 62, 57, 48, 51, 222, 221, 194, 195, 3900, 3901, 3874, 3873, 4044, 4047, 4038, 4033, 71, 68, 123, 120, 135, 132, 187, 184, 3911, 3908, 3963, 3960, 3975, 3972, 4027, 4024, 70, 69, 122, 121, 134, 133, 186, 185, 3910, 3909, 3962, 3961, 3974, 3973, 4026, 4025, 91, 92, 99, 100, 155, 156, 163, 164, 3931, 3932, 3939, 3940, 3995, 3996, 4003, 4004, 88, 95, 96, 103, 152, 159, 160, 167, 3928, 3935, 3936, 3943, 3992, 3999, 4000, 4007, 1701, 1700, 1711, 1708, 1875, 1872, 1883, 1882, 2213, 2212, 2223, 2220, 2387, 2384, 2395, 2394, 1698, 1699, 1710, 1709, 1874, 1873, 1884, 1885, 2210, 2211, 2222, 2221, 2386, 2385, 2396, 2397, 1693, 1692, 1681, 1682, 1901, 1902, 1891, 1890, 2205, 2204, 2193, 2194, 2413, 2414, 2403, 2402, 1690, 1691, 1680, 1683, 1900, 1903, 1892, 1893, 2202, 2203, 2192, 2195, 2412, 2415, 2404, 2405, 1637, 1636, 1647, 1644, 1939, 1936, 1947, 1946, 2149, 2148, 2159, 2156, 2451, 2448, 2459, 2458, 1634, 1635, 1646, 1645, 1938, 1937, 1948, 1949, 2146, 2147, 2158, 2157, 2450, 2449, 2460, 2461, 1629, 1628, 1617, 1618, 1965, 1966, 1955, 1954, 2141, 2140, 2129, 2130, 2477, 2478, 2467, 2466, 1626, 1627, 1616, 1619, 1964, 1967, 1956, 1957, 2138, 2139, 2128, 2131, 2476, 2479, 2468, 2469, 26, 27, 16, 19, 234, 237, 242, 245, 3850, 3853, 3858, 3861, 4076, 4079, 4068, 4069, 29, 28, 17, 18, 233, 238, 241, 246, 3849, 3854, 3857, 3862, 4077, 4078, 4067, 4066, 34, 35, 46, 45, 224, 227, 250, 251, 3844, 3845, 3868, 3871, 4050, 4049, 4060, 4061, 37, 36, 47, 44, 225, 226, 253, 252, 3843, 3842, 3869, 3870, 4051, 4048, 4059, 4058, 72, 73, 118, 119, 136, 137, 182, 183, 3912, 3913, 3958, 
3959, 3976, 3977, 4022, 4023, 79, 78, 113, 112, 143, 142, 177, 176, 3919, 3918, 3953, 3952, 3983, 3982, 4017, 4016, 80, 81, 110, 111, 144, 145, 174, 175, 3920, 3921, 3950, 3951, 3984, 3985, 4014, 4015, 87, 86, 105, 104, 151, 150, 169, 168, 3927, 3926, 3945, 3944, 3991, 3990, 4009, 4008, 1726, 1721, 1712, 1715, 1868, 1871, 1862, 1857, 2238, 2233, 2224, 2227, 2380, 2383, 2374, 2369, 1725, 1722, 1713, 1714, 1869, 1870, 1861, 1858, 2237, 2234, 2225, 2226, 2381, 2382, 2373, 2370, 1666, 1669, 1678, 1677, 1906, 1905, 1914, 1917, 2178, 2181, 2190, 2189, 2418, 2417, 2426, 2429, 1665, 1670, 1679, 1676, 1907, 1904, 1913, 1918, 2177, 2182, 2191, 2188, 2419, 2416, 2425, 2430, 1662, 1657, 1648, 1651, 1932, 1935, 1926, 1921, 2174, 2169, 2160, 2163, 2444, 2447, 2438, 2433, 1661, 1658, 1649, 1650, 1933, 1934, 1925, 1922, 2173, 2170, 2161, 2162, 2445, 2446, 2437, 2434, 1602, 1605, 1614, 1613, 1970, 1969, 1978, 1981, 2114, 2117, 2126, 2125, 2482, 2481, 2490, 2493, 1601, 1606, 1615, 1612, 1971, 1968, 1977, 1982, 2113, 2118, 2127, 2124, 2483, 2480, 2489, 2494, 25, 24, 23, 20, 235, 236, 243, 244, 3851, 3852, 3859, 3860, 4075, 4072, 4071, 4070, 30, 31, 22, 21, 232, 239, 240, 247, 3848, 3855, 3856, 3863, 4074, 4073, 4064, 4065, 33, 32, 41, 42, 231, 228, 249, 248, 3847, 3846, 3867, 3864, 4053, 4054, 4063, 4062, 38, 39, 40, 43, 230, 229, 254, 255, 3840, 3841, 3866, 3865, 4052, 4055, 4056, 4057, 75, 74, 117, 116, 139, 138, 181, 180, 3915, 3914, 3957, 3956, 3979, 3978, 4021, 4020, 76, 77, 114, 115, 140, 141, 178, 179, 3916, 3917, 3954, 3955, 3980, 3981, 4018, 4019, 83, 82, 109, 108, 147, 146, 173, 172, 3923, 3922, 3949, 3948, 3987, 3986, 4013, 4012, 84, 85, 106, 107, 148, 149, 170, 171, 3924, 3925, 3946, 3947, 3988, 3989, 4010, 4011, 1727, 1720, 1719, 1716, 1867, 1864, 1863, 1856, 2239, 2232, 2231, 2228, 2379, 2376, 2375, 2368, 1724, 1723, 1718, 1717, 1866, 1865, 1860, 1859, 2236, 2235, 2230, 2229, 2378, 2377, 2372, 2371, 1667, 1668, 1673, 1674, 1909, 1910, 1915, 1916, 2179, 2180, 2185, 2186, 
2421, 2422, 2427, 2428, 1664, 1671, 1672, 1675, 1908, 1911, 1912, 1919, 2176, 2183, 2184, 2187, 2420, 2423, 2424, 2431, 1663, 1656, 1655, 1652, 1931, 1928, 1927, 1920, 2175, 2168, 2167, 2164, 2443, 2440, 2439, 2432, 1660, 1659, 1654, 1653, 1930, 1929, 1924, 1923, 2172, 2171, 2166, 2165, 2442, 2441, 2436, 2435, 1603, 1604, 1609, 1610, 1973, 1974, 1979, 1980, 2115, 2116, 2121, 2122, 2485, 2486, 2491, 2492, 1600, 1607, 1608, 1611, 1972, 1975, 1976, 1983, 2112, 2119, 2120, 2123, 2484, 2487, 2488, 2495, 486, 487, 488, 491, 276, 275, 268, 267, 3828, 3827, 3820, 3819, 3604, 3607, 3608, 3609, 481, 480, 489, 490, 279, 272, 271, 264, 3831, 3824, 3823, 3816, 3605, 3606, 3615, 3614, 478, 479, 470, 469, 280, 283, 262, 263, 3832, 3833, 3812, 3815, 3626, 3625, 3616, 3617, 473, 472, 471, 468, 281, 282, 257, 256, 3839, 3838, 3813, 3814, 3627, 3624, 3623, 3622, 436, 437, 394, 395, 372, 373, 330, 331, 3764, 3765, 3722, 3723, 3700, 3701, 3658, 3659, 435, 434, 397, 396, 371, 370, 333, 332, 3763, 3762, 3725, 3724, 3699, 3698, 3661, 3660, 428, 429, 402, 403, 364, 365, 338, 339, 3756, 3757, 3730, 3731, 3692, 3693, 3666, 3667, 427, 426, 405, 404, 363, 362, 341, 340, 3755, 3754, 3733, 3732, 3691, 3690, 3669, 3668, 1728, 1731, 1788, 1791, 1792, 1795, 1852, 1855, 2240, 2243, 2300, 2303, 2304, 2307, 2364, 2367, 1729, 1730, 1789, 1790, 1793, 1794, 1853, 1854, 2241, 2242, 2301, 2302, 2305, 2306, 2365, 2366, 1754, 1757, 1762, 1765, 1818, 1821, 1826, 1829, 2266, 2269, 2274, 2277, 2330, 2333, 2338, 2341, 1753, 1758, 1761, 1766, 1817, 1822, 1825, 1830, 2265, 2270, 2273, 2278, 2329, 2334, 2337, 2342, 1588, 1587, 1580, 1579, 2004, 2003, 1996, 1995, 2100, 2099, 2092, 2091, 2516, 2515, 2508, 2507, 1591, 1584, 1583, 1576, 2007, 2000, 1999, 1992, 2103, 2096, 2095, 2088, 2519, 2512, 2511, 2504, 1592, 1593, 1572, 1575, 2008, 2011, 1990, 1991, 2104, 2105, 2084, 2087, 2520, 2523, 2502, 2503, 1599, 1598, 1573, 1574, 2009, 2010, 1985, 1984, 2111, 2110, 2085, 2086, 2521, 2522, 2497, 2496, 485, 484, 495, 492, 
277, 274, 269, 266, 3829, 3826, 3821, 3818, 3603, 3600, 3611, 3610, 482, 483, 494, 493, 278, 273, 270, 265, 3830, 3825, 3822, 3817, 3602, 3601, 3612, 3613, 477, 476, 465, 466, 287, 284, 261, 260, 3835, 3834, 3811, 3808, 3629, 3630, 3619, 3618, 474, 475, 464, 467, 286, 285, 258, 259, 3836, 3837, 3810, 3809, 3628, 3631, 3620, 3621, 439, 438, 393, 392, 375, 374, 329, 328, 3767, 3766, 3721, 3720, 3703, 3702, 3657, 3656, 432, 433, 398, 399, 368, 369, 334, 335, 3760, 3761, 3726, 3727, 3696, 3697, 3662, 3663, 431, 430, 401, 400, 367, 366, 337, 336, 3759, 3758, 3729, 3728, 3695, 3694, 3665, 3664, 424, 425, 406, 407, 360, 361, 342, 343, 3752, 3753, 3734, 3735, 3688, 3689, 3670, 3671, 1735, 1732, 1787, 1784, 1799, 1796, 1851, 1848, 2247, 2244, 2299, 2296, 2311, 2308, 2363, 2360, 1734, 1733, 1786, 1785, 1798, 1797, 1850, 1849, 2246, 2245, 2298, 2297, 2310, 2309, 2362, 2361, 1755, 1756, 1763, 1764, 1819, 1820, 1827, 1828, 2267, 2268, 2275, 2276, 2331, 2332, 2339, 2340, 1752, 1759, 1760, 1767, 1816, 1823, 1824, 1831, 2264, 2271, 2272, 2279, 2328, 2335, 2336, 2343, 1589, 1586, 1581, 1578, 2005, 2002, 1997, 1994, 2101, 2098, 2093, 2090, 2517, 2514, 2509, 2506, 1590, 1585, 1582, 1577, 2006, 2001, 1998, 1993, 2102, 2097, 2094, 2089, 2518, 2513, 2510, 2505, 1595, 1594, 1571, 1568, 2015, 2012, 1989, 1988, 2107, 2106, 2083, 2080, 2527, 2524, 2501, 2500, 1596, 1597, 1570, 1569, 2014, 2013, 1986, 1987, 2108, 2109, 2082, 2081, 2526, 2525, 2498, 2499, 510, 505, 496, 499, 298, 301, 306, 309, 3786, 3789, 3794, 3797, 3596, 3599, 3590, 3585, 509, 506, 497, 498, 297, 302, 305, 310, 3785, 3790, 3793, 3798, 3597, 3598, 3589, 3586, 450, 453, 462, 461, 288, 291, 314, 315, 3780, 3781, 3804, 3807, 3634, 3633, 3642, 3645, 449, 454, 463, 460, 289, 290, 317, 316, 3779, 3778, 3805, 3806, 3635, 3632, 3641, 3646, 440, 443, 388, 391, 376, 379, 324, 327, 3768, 3771, 3716, 3719, 3704, 3707, 3652, 3655, 441, 442, 389, 390, 377, 378, 325, 326, 3769, 3770, 3717, 3718, 3705, 3706, 3653, 3654, 420, 419, 412, 411, 
356, 355, 348, 347, 3748, 3747, 3740, 3739, 3684, 3683, 3676, 3675, 423, 416, 415, 408, 359, 352, 351, 344, 3751, 3744, 3743, 3736, 3687, 3680, 3679, 3672, 1736, 1737, 1782, 1783, 1800, 1801, 1846, 1847, 2248, 2249, 2294, 2295, 2312, 2313, 2358, 2359, 1743, 1742, 1777, 1776, 1807, 1806, 1841, 1840, 2255, 2254, 2289, 2288, 2319, 2318, 2353, 2352, 1744, 1745, 1774, 1775, 1808, 1809, 1838, 1839, 2256, 2257, 2286, 2287, 2320, 2321, 2350, 2351, 1751, 1750, 1769, 1768, 1815, 1814, 1833, 1832, 2263, 2262, 2281, 2280, 2327, 2326, 2345, 2344, 1546, 1549, 1554, 1557, 2026, 2029, 2034, 2037, 2058, 2061, 2066, 2069, 2538, 2541, 2546, 2549, 1545, 1550, 1553, 1558, 2025, 2030, 2033, 2038, 2057, 2062, 2065, 2070, 2537, 2542, 2545, 2550, 1540, 1541, 1564, 1567, 2016, 2019, 2042, 2043, 2052, 2053, 2076, 2079, 2528, 2531, 2554, 2555, 1539, 1538, 1565, 1566, 2017, 2018, 2045, 2044, 2051, 2050, 2077, 2078, 2529, 2530, 2557, 2556, 511, 504, 503, 500, 299, 300, 307, 308, 3787, 3788, 3795, 3796, 3595, 3592, 3591, 3584, 508, 507, 502, 501, 296, 303, 304, 311, 3784, 3791, 3792, 3799, 3594, 3593, 3588, 3587, 451, 452, 457, 458, 295, 292, 313, 312, 3783, 3782, 3803, 3800, 3637, 3638, 3643, 3644, 448, 455, 456, 459, 294, 293, 318, 319, 3776, 3777, 3802, 3801, 3636, 3639, 3640, 3647, 447, 444, 387, 384, 383, 380, 323, 320, 3775, 3772, 3715, 3712, 3711, 3708, 3651, 3648, 446, 445, 386, 385, 382, 381, 322, 321, 3774, 3773, 3714, 3713, 3710, 3709, 3650, 3649, 421, 418, 413, 410, 357, 354, 349, 346, 3749, 3746, 3741, 3738, 3685, 3682, 3677, 3674, 422, 417, 414, 409, 358, 353, 350, 345, 3750, 3745, 3742, 3737, 3686, 3681, 3678, 3673, 1739, 1738, 1781, 1780, 1803, 1802, 1845, 1844, 2251, 2250, 2293, 2292, 2315, 2314, 2357, 2356, 1740, 1741, 1778, 1779, 1804, 1805, 1842, 1843, 2252, 2253, 2290, 2291, 2316, 2317, 2354, 2355, 1747, 1746, 1773, 1772, 1811, 1810, 1837, 1836, 2259, 2258, 2285, 2284, 2323, 2322, 2349, 2348, 1748, 1749, 1770, 1771, 1812, 1813, 1834, 1835, 2260, 2261, 2282, 2283, 2324, 2325, 
2346, 2347, 1547, 1548, 1555, 1556, 2027, 2028, 2035, 2036, 2059, 2060, 2067, 2068, 2539, 2540, 2547, 2548, 1544, 1551, 1552, 1559, 2024, 2031, 2032, 2039, 2056, 2063, 2064, 2071, 2536, 2543, 2544, 2551, 1543, 1542, 1563, 1560, 2023, 2020, 2041, 2040, 2055, 2054, 2075, 2072, 2535, 2532, 2553, 2552, 1536, 1537, 1562, 1561, 2022, 2021, 2046, 2047, 2048, 2049, 2074, 2073, 2534, 2533, 2558, 2559, 512, 515, 572, 575, 576, 577, 602, 601, 3494, 3493, 3518, 3519, 3520, 3523, 3580, 3583, 513, 514, 573, 574, 583, 582, 603, 600, 3495, 3492, 3513, 3512, 3521, 3522, 3581, 3582, 538, 541, 546, 549, 584, 591, 592, 599, 3496, 3503, 3504, 3511, 3546, 3549, 3554, 3557, 537, 542, 545, 550, 587, 588, 595, 596, 3499, 3500, 3507, 3508, 3545, 3550, 3553, 3558, 998, 993, 990, 985, 948, 947, 940, 939, 3156, 3155, 3148, 3147, 3110, 3105, 3102, 3097, 997, 994, 989, 986, 951, 944, 943, 936, 3159, 3152, 3151, 3144, 3109, 3106, 3101, 3098, 1022, 1021, 962, 961, 952, 953, 932, 935, 3160, 3163, 3142, 3143, 3134, 3133, 3074, 3073, 1023, 1020, 963, 960, 959, 958, 933, 934, 3161, 3162, 3137, 3136, 3135, 3132, 3075, 3072, 1024, 1027, 1084, 1087, 1088, 1089, 1114, 1113, 2982, 2981, 3006, 3007, 3008, 3011, 3068, 3071, 1025, 1026, 1085, 1086, 1095, 1094, 1115, 1112, 2983, 2980, 3001, 3000, 3009, 3010, 3069, 3070, 1050, 1053, 1058, 1061, 1096, 1103, 1104, 1111, 2984, 2991, 2992, 2999, 3034, 3037, 3042, 3045, 1049, 1054, 1057, 1062, 1099, 1100, 1107, 1108, 2987, 2988, 2995, 2996, 3033, 3038, 3041, 3046, 1510, 1505, 1502, 1497, 1460, 1459, 1452, 1451, 2644, 2643, 2636, 2635, 2598, 2593, 2590, 2585, 1509, 1506, 1501, 1498, 1463, 1456, 1455, 1448, 2647, 2640, 2639, 2632, 2597, 2594, 2589, 2586, 1534, 1533, 1474, 1473, 1464, 1465, 1444, 1447, 2648, 2651, 2630, 2631, 2622, 2621, 2562, 2561, 1535, 1532, 1475, 1472, 1471, 1470, 1445, 1446, 2649, 2650, 2625, 2624, 2623, 2620, 2563, 2560, 519, 516, 571, 568, 579, 578, 605, 606, 3489, 3490, 3517, 3516, 3527, 3524, 3579, 3576, 518, 517, 570, 569, 580, 581, 604, 607, 
3488, 3491, 3514, 3515, 3526, 3525, 3578, 3577, 539, 540, 547, 548, 585, 590, 593, 598, 3497, 3502, 3505, 3510, 3547, 3548, 3555, 3556, 536, 543, 544, 551, 586, 589, 594, 597, 3498, 3501, 3506, 3509, 3544, 3551, 3552, 3559, 999, 992, 991, 984, 949, 946, 941, 938, 3157, 3154, 3149, 3146, 3111, 3104, 3103, 3096, 996, 995, 988, 987, 950, 945, 942, 937, 3158, 3153, 3150, 3145, 3108, 3107, 3100, 3099, 1017, 1018, 965, 966, 955, 954, 931, 928, 3167, 3164, 3141, 3140, 3129, 3130, 3077, 3078, 1016, 1019, 964, 967, 956, 957, 930, 929, 3166, 3165, 3138, 3139, 3128, 3131, 3076, 3079, 1031, 1028, 1083, 1080, 1091, 1090, 1117, 1118, 2977, 2978, 3005, 3004, 3015, 3012, 3067, 3064, 1030, 1029, 1082, 1081, 1092, 1093, 1116, 1119, 2976, 2979, 3002, 3003, 3014, 3013, 3066, 3065, 1051, 1052, 1059, 1060, 1097, 1102, 1105, 1110, 2985, 2990, 2993, 2998, 3035, 3036, 3043, 3044, 1048, 1055, 1056, 1063, 1098, 1101, 1106, 1109, 2986, 2989, 2994, 2997, 3032, 3039, 3040, 3047, 1511, 1504, 1503, 1496, 1461, 1458, 1453, 1450, 2645, 2642, 2637, 2634, 2599, 2592, 2591, 2584, 1508, 1507, 1500, 1499, 1462, 1457, 1454, 1449, 2646, 2641, 2638, 2633, 2596, 2595, 2588, 2587, 1529, 1530, 1477, 1478, 1467, 1466, 1443, 1440, 2655, 2652, 2629, 2628, 2617, 2618, 2565, 2566, 1528, 1531, 1476, 1479, 1468, 1469, 1442, 1441, 2654, 2653, 2626, 2627, 2616, 2619, 2564, 2567, 520, 521, 566, 567, 636, 637, 610, 609, 3486, 3485, 3458, 3459, 3528, 3529, 3574, 3575, 527, 526, 561, 560, 635, 634, 611, 608, 3487, 3484, 3461, 3460, 3535, 3534, 3569, 3568, 528, 529, 558, 559, 630, 625, 622, 617, 3478, 3473, 3470, 3465, 3536, 3537, 3566, 3567, 535, 534, 553, 552, 629, 626, 621, 618, 3477, 3474, 3469, 3466, 3543, 3542, 3561, 3560, 1000, 1001, 982, 983, 906, 909, 914, 917, 3178, 3181, 3186, 3189, 3112, 3113, 3094, 3095, 1007, 1006, 977, 976, 905, 910, 913, 918, 3177, 3182, 3185, 3190, 3119, 3118, 3089, 3088, 1008, 1009, 974, 975, 900, 901, 924, 927, 3168, 3171, 3194, 3195, 3120, 3121, 3086, 3087, 1015, 1014, 969, 968, 899, 
898, 925, 926, 3169, 3170, 3197, 3196, 3127, 3126, 3081, 3080, 1032, 1033, 1078, 1079, 1148, 1149, 1122, 1121, 2974, 2973, 2946, 2947, 3016, 3017, 3062, 3063, 1039, 1038, 1073, 1072, 1147, 1146, 1123, 1120, 2975, 2972, 2949, 2948, 3023, 3022, 3057, 3056, 1040, 1041, 1070, 1071, 1142, 1137, 1134, 1129, 2966, 2961, 2958, 2953, 3024, 3025, 3054, 3055, 1047, 1046, 1065, 1064, 1141, 1138, 1133, 1130, 2965, 2962, 2957, 2954, 3031, 3030, 3049, 3048, 1512, 1513, 1494, 1495, 1418, 1421, 1426, 1429, 2666, 2669, 2674, 2677, 2600, 2601, 2582, 2583, 1519, 1518, 1489, 1488, 1417, 1422, 1425, 1430, 2665, 2670, 2673, 2678, 2607, 2606, 2577, 2576, 1520, 1521, 1486, 1487, 1412, 1413, 1436, 1439, 2656, 2659, 2682, 2683, 2608, 2609, 2574, 2575, 1527, 1526, 1481, 1480, 1411, 1410, 1437, 1438, 2657, 2658, 2685, 2684, 2615, 2614, 2569, 2568, 523, 522, 565, 564, 639, 638, 613, 614, 3481, 3482, 3457, 3456, 3531, 3530, 3573, 3572, 524, 525, 562, 563, 632, 633, 612, 615, 3480, 3483, 3462, 3463, 3532, 3533, 3570, 3571, 531, 530, 557, 556, 631, 624, 623, 616, 3479, 3472, 3471, 3464, 3539, 3538, 3565, 3564, 532, 533, 554, 555, 628, 627, 620, 619, 3476, 3475, 3468, 3467, 3540, 3541, 3562, 3563, 1003, 1002, 981, 980, 907, 908, 915, 916, 3179, 3180, 3187, 3188, 3115, 3114, 3093, 3092, 1004, 1005, 978, 979, 904, 911, 912, 919, 3176, 3183, 3184, 3191, 3116, 3117, 3090, 3091, 1011, 1010, 973, 972, 903, 902, 923, 920, 3175, 3172, 3193, 3192, 3123, 3122, 3085, 3084, 1012, 1013, 970, 971, 896, 897, 922, 921, 3174, 3173, 3198, 3199, 3124, 3125, 3082, 3083, 1035, 1034, 1077, 1076, 1151, 1150, 1125, 1126, 2969, 2970, 2945, 2944, 3019, 3018, 3061, 3060, 1036, 1037, 1074, 1075, 1144, 1145, 1124, 1127, 2968, 2971, 2950, 2951, 3020, 3021, 3058, 3059, 1043, 1042, 1069, 1068, 1143, 1136, 1135, 1128, 2967, 2960, 2959, 2952, 3027, 3026, 3053, 3052, 1044, 1045, 1066, 1067, 1140, 1139, 1132, 1131, 2964, 2963, 2956, 2955, 3028, 3029, 3050, 3051, 1515, 1514, 1493, 1492, 1419, 1420, 1427, 1428, 2667, 2668, 2675, 2676, 
2603, 2602, 2581, 2580, 1516, 1517, 1490, 1491, 1416, 1423, 1424, 1431, 2664, 2671, 2672, 2679, 2604, 2605, 2578, 2579, 1523, 1522, 1485, 1484, 1415, 1414, 1435, 1432, 2663, 2660, 2681, 2680, 2611, 2610, 2573, 2572, 1524, 1525, 1482, 1483, 1408, 1409, 1434, 1433, 2662, 2661, 2686, 2687, 2612, 2613, 2570, 2571, 724, 727, 728, 729, 640, 641, 666, 665, 3430, 3429, 3454, 3455, 3366, 3367, 3368, 3371, 725, 726, 735, 734, 647, 646, 667, 664, 3431, 3428, 3449, 3448, 3361, 3360, 3369, 3370, 746, 745, 736, 737, 648, 655, 656, 663, 3432, 3439, 3440, 3447, 3358, 3359, 3350, 3349, 747, 744, 743, 742, 651, 652, 659, 660, 3435, 3436, 3443, 3444, 3353, 3352, 3351, 3348, 788, 791, 792, 793, 884, 883, 876, 875, 3220, 3219, 3212, 3211, 3302, 3303, 3304, 3307, 789, 790, 799, 798, 887, 880, 879, 872, 3223, 3216, 3215, 3208, 3297, 3296, 3305, 3306, 810, 809, 800, 801, 888, 889, 868, 871, 3224, 3227, 3206, 3207, 3294, 3295, 3286, 3285, 811, 808, 807, 806, 895, 894, 869, 870, 3225, 3226, 3201, 3200, 3289, 3288, 3287, 3284, 1236, 1239, 1240, 1241, 1152, 1153, 1178, 1177, 2918, 2917, 2942, 2943, 2854, 2855, 2856, 2859, 1237, 1238, 1247, 1246, 1159, 1158, 1179, 1176, 2919, 2916, 2937, 2936, 2849, 2848, 2857, 2858, 1258, 1257, 1248, 1249, 1160, 1167, 1168, 1175, 2920, 2927, 2928, 2935, 2846, 2847, 2838, 2837, 1259, 1256, 1255, 1254, 1163, 1164, 1171, 1172, 2923, 2924, 2931, 2932, 2841, 2840, 2839, 2836, 1300, 1303, 1304, 1305, 1396, 1395, 1388, 1387, 2708, 2707, 2700, 2699, 2790, 2791, 2792, 2795, 1301, 1302, 1311, 1310, 1399, 1392, 1391, 1384, 2711, 2704, 2703, 2696, 2785, 2784, 2793, 2794, 1322, 1321, 1312, 1313, 1400, 1401, 1380, 1383, 2712, 2715, 2694, 2695, 2782, 2783, 2774, 2773, 1323, 1320, 1319, 1318, 1407, 1406, 1381, 1382, 2713, 2714, 2689, 2688, 2777, 2776, 2775, 2772, 723, 720, 731, 730, 643, 642, 669, 670, 3425, 3426, 3453, 3452, 3365, 3364, 3375, 3372, 722, 721, 732, 733, 644, 645, 668, 671, 3424, 3427, 3450, 3451, 3362, 3363, 3374, 3373, 749, 750, 739, 738, 649, 654, 657, 662, 
3433, 3438, 3441, 3446, 3357, 3356, 3345, 3346, 748, 751, 740, 741, 650, 653, 658, 661, 3434, 3437, 3442, 3445, 3354, 3355, 3344, 3347, 787, 784, 795, 794, 885, 882, 877, 874, 3221, 3218, 3213, 3210, 3301, 3300, 3311, 3308, 786, 785, 796, 797, 886, 881, 878, 873, 3222, 3217, 3214, 3209, 3298, 3299, 3310, 3309, 813, 814, 803, 802, 891, 890, 867, 864, 3231, 3228, 3205, 3204, 3293, 3292, 3281, 3282, 812, 815, 804, 805, 892, 893, 866, 865, 3230, 3229, 3202, 3203, 3290, 3291, 3280, 3283, 1235, 1232, 1243, 1242, 1155, 1154, 1181, 1182, 2913, 2914, 2941, 2940, 2853, 2852, 2863, 2860, 1234, 1233, 1244, 1245, 1156, 1157, 1180, 1183, 2912, 2915, 2938, 2939, 2850, 2851, 2862, 2861, 1261, 1262, 1251, 1250, 1161, 1166, 1169, 1174, 2921, 2926, 2929, 2934, 2845, 2844, 2833, 2834, 1260, 1263, 1252, 1253, 1162, 1165, 1170, 1173, 2922, 2925, 2930, 2933, 2842, 2843, 2832, 2835, 1299, 1296, 1307, 1306, 1397, 1394, 1389, 1386, 2709, 2706, 2701, 2698, 2789, 2788, 2799, 2796, 1298, 1297, 1308, 1309, 1398, 1393, 1390, 1385, 2710, 2705, 2702, 2697, 2786, 2787, 2798, 2797, 1325, 1326, 1315, 1314, 1403, 1402, 1379, 1376, 2719, 2716, 2693, 2692, 2781, 2780, 2769, 2770, 1324, 1327, 1316, 1317, 1404, 1405, 1378, 1377, 2718, 2717, 2690, 2691, 2778, 2779, 2768, 2771, 716, 719, 710, 705, 700, 701, 674, 673, 3422, 3421, 3394, 3395, 3390, 3385, 3376, 3379, 717, 718, 709, 706, 699, 698, 675, 672, 3423, 3420, 3397, 3396, 3389, 3386, 3377, 3378, 754, 753, 762, 765, 694, 689, 686, 681, 3414, 3409, 3406, 3401, 3330, 3333, 3342, 3341, 755, 752, 761, 766, 693, 690, 685, 682, 3413, 3410, 3405, 3402, 3329, 3334, 3343, 3340, 780, 783, 774, 769, 842, 845, 850, 853, 3242, 3245, 3250, 3253, 3326, 3321, 3312, 3315, 781, 782, 773, 770, 841, 846, 849, 854, 3241, 3246, 3249, 3254, 3325, 3322, 3313, 3314, 818, 817, 826, 829, 836, 837, 860, 863, 3232, 3235, 3258, 3259, 3266, 3269, 3278, 3277, 819, 816, 825, 830, 835, 834, 861, 862, 3233, 3234, 3261, 3260, 3265, 3270, 3279, 3276, 1228, 1231, 1222, 1217, 1212, 1213, 
1186, 1185, 2910, 2909, 2882, 2883, 2878, 2873, 2864, 2867, 1229, 1230, 1221, 1218, 1211, 1210, 1187, 1184, 2911, 2908, 2885, 2884, 2877, 2874, 2865, 2866, 1266, 1265, 1274, 1277, 1206, 1201, 1198, 1193, 2902, 2897, 2894, 2889, 2818, 2821, 2830, 2829, 1267, 1264, 1273, 1278, 1205, 1202, 1197, 1194, 2901, 2898, 2893, 2890, 2817, 2822, 2831, 2828, 1292, 1295, 1286, 1281, 1354, 1357, 1362, 1365, 2730, 2733, 2738, 2741, 2814, 2809, 2800, 2803, 1293, 1294, 1285, 1282, 1353, 1358, 1361, 1366, 2729, 2734, 2737, 2742, 2813, 2810, 2801, 2802, 1330, 1329, 1338, 1341, 1348, 1349, 1372, 1375, 2720, 2723, 2746, 2747, 2754, 2757, 2766, 2765, 1331, 1328, 1337, 1342, 1347, 1346, 1373, 1374, 2721, 2722, 2749, 2748, 2753, 2758, 2767, 2764, 715, 712, 711, 704, 703, 702, 677, 678, 3417, 3418, 3393, 3392, 3391, 3384, 3383, 3380, 714, 713, 708, 707, 696, 697, 676, 679, 3416, 3419, 3398, 3399, 3388, 3387, 3382, 3381, 757, 758, 763, 764, 695, 688, 687, 680, 3415, 3408, 3407, 3400, 3331, 3332, 3337, 3338, 756, 759, 760, 767, 692, 691, 684, 683, 3412, 3411, 3404, 3403, 3328, 3335, 3336, 3339, 779, 776, 775, 768, 843, 844, 851, 852, 3243, 3244, 3251, 3252, 3327, 3320, 3319, 3316, 778, 777, 772, 771, 840, 847, 848, 855, 3240, 3247, 3248, 3255, 3324, 3323, 3318, 3317, 821, 822, 827, 828, 839, 838, 859, 856, 3239, 3236, 3257, 3256, 3267, 3268, 3273, 3274, 820, 823, 824, 831, 832, 833, 858, 857, 3238, 3237, 3262, 3263, 3264, 3271, 3272, 3275, 1227, 1224, 1223, 1216, 1215, 1214, 1189, 1190, 2905, 2906, 2881, 2880, 2879, 2872, 2871, 2868, 1226, 1225, 1220, 1219, 1208, 1209, 1188, 1191, 2904, 2907, 2886, 2887, 2876, 2875, 2870, 2869, 1269, 1270, 1275, 1276, 1207, 1200, 1199, 1192, 2903, 2896, 2895, 2888, 2819, 2820, 2825, 2826, 1268, 1271, 1272, 1279, 1204, 1203, 1196, 1195, 2900, 2899, 2892, 2891, 2816, 2823, 2824, 2827, 1291, 1288, 1287, 1280, 1355, 1356, 1363, 1364, 2731, 2732, 2739, 2740, 2815, 2808, 2807, 2804, 1290, 1289, 1284, 1283, 1352, 1359, 1360, 1367, 2728, 2735, 2736, 2743, 2812, 2811, 
2806, 2805, 1333, 1334, 1339, 1340, 1351, 1350, 1371, 1368, 2727, 2724, 2745, 2744, 2755, 2756, 2761, 2762, 1332, 1335, 1336, 1343, 1344, 1345, 1370, 1369, 2726, 2725, 2750, 2751, 2752, 2759, 2760, 2763}; \ No newline at end of file diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/gtests.cpp b/libNeonDomain/tests/domain-space-filling-curves/src/gtests.cpp new file mode 100644 index 00000000..0a743aeb --- /dev/null +++ b/libNeonDomain/tests/domain-space-filling-curves/src/gtests.cpp @@ -0,0 +1,50 @@ + +#include "Neon/Neon.h" +#include "Neon/domain/tools/SpaceCurves.h" +#include "domain-space-filling-curves.h" +#include "goldenEncoding.h" +#include "gtest/gtest.h" +#include "runHelper.h" + +TEST(domain_space_filling_curves, morton) +{ + Neon::int32_3d dim = {16, 16, 16}; + for (int x = 0; x < dim.x; x++) { + for (int y = 0; y < dim.y; y++) { + for (int z = 0; z < dim.z; z++) { + using namespace Neon::domain::tool::spaceCurves; + Neon::int32_3d idx = {x, y, z}; + auto morton = Encoder::encode(EncoderType::morton, dim, idx); + auto sweep = Encoder::encode(EncoderType::sweep, dim, {z,y,x}); + + ASSERT_EQ(morton_grid_16_16_16[sweep], morton) << dim << " " << idx << " " << morton; + } + } + } +} + +TEST(domain_space_filling_curves, hilbert) +{ + Neon::int32_3d dim = {16, 16, 16}; + for (int x = 0; x < dim.x; x++) { + for (int y = 0; y < dim.y; y++) { + for (int z = 0; z < dim.z; z++) { + + using namespace Neon::domain::tool::spaceCurves; + Neon::int32_3d idx = {x, y, z}; + auto hilbert = Encoder::encode(EncoderType::hilbert, dim, idx); + auto sweep = Encoder::encode(EncoderType::sweep, dim, {z,y,x}); + + ASSERT_EQ(hilbert_grid_16_16_16[sweep], hilbert) << dim << " " << idx << " " << hilbert; + } + } + } +} + + +int main(int argc, char** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + Neon::init(); + return RUN_ALL_TESTS(); +} diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/runHelper.h 
b/libNeonDomain/tests/domain-space-filling-curves/src/runHelper.h new file mode 100644 index 00000000..993bce70 --- /dev/null +++ b/libNeonDomain/tests/domain-space-filling-curves/src/runHelper.h @@ -0,0 +1,100 @@ +#pragma once +#include +#include "gtest/gtest.h" + +#include "Neon/core/core.h" +#include "Neon/core/tools/io/ioToVti.h" +#include "Neon/core/types/DataUse.h" +#include "Neon/core/types/DeviceType.h" + +#include "Neon/domain/dGrid.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" +#include "Neon/domain/eGrid.h" +#include "Neon/domain/tools/Geometries.h" +#include "Neon/domain/tools/TestData.h" + +#include "gtest/gtest.h" + +using namespace Neon; +using namespace Neon::domain; + +using namespace Neon::domain::tool::testing; +using namespace Neon::domain::tool; + +template +void runAllTestConfigurations(std::function&)> f) +{ + std::vector nGpuTest; + nGpuTest.push_back(1); + std::vector cardinalityTest{1}; + + std::vector dimTest{{32,32,32}}; + std::vector runtimeE; + + runtimeE.push_back(Neon::Runtime::openmp); + + + std::vector geos; + std::vector memoryLayoutOptions{Neon::MemoryLayout::structOfArrays}; + + if constexpr (std::is_same_v) { + geos = std::vector{ + Geometry::FullDomain, + }; + } else { + geos = std::vector{ + Geometry::FullDomain, + // Geometry::Sphere, + // Geometry::HollowSphere, + + }; + } + + for (auto dim : dimTest) { + for (const auto& card : cardinalityTest) { + for (auto& geo : geos) { + for (const auto& ngpu : nGpuTest) { + for (const auto& runtime : runtimeE) { + for (const auto& memoryLayout : memoryLayoutOptions) { + + int maxnGPUs = [] { + if (Neon::sys::globalSpace::gpuSysObjStorage.numDevs() > 0) { + return Neon::set::DevSet::maxSet().setCardinality(); + } + return 1; + }(); + + std::vector ids; + for (int i = 0; i < ngpu; i++) { + ids.push_back(i % maxnGPUs); + } + + Neon::Backend backend(ids, runtime); + Neon::MemoryOptions memoryOptions = backend.getMemoryOptions(); + memoryOptions.setOrder(memoryLayout); + + if 
constexpr (std::is_same_v) { + if (dim.z < 8 * ngpu * 3) { + dim.z = ngpu * 3 * 8; + } + if (memoryLayout == Neon::MemoryLayout::arrayOfStructs) { + continue; + } + } + + assert(card == 1); + TestData testData(backend, + dim, + card, + memoryOptions, + geo); + + NEON_INFO(testData.toString()); + f(testData); + } + } + } + } + } + } +} From b3897f08f6494c239c39bf934122e6dfc5bc14c1 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 13 Jul 2023 19:05:19 -0400 Subject: [PATCH 36/94] WIP --- .../domain/details/mGrid/mPartition_imp.h | 2 +- .../include/Neon/domain/tools/Partitioner1D.h | 19 +++--- .../Neon/domain/tools/PointHashTable.h | 2 + .../Neon/domain/tools/PointHashTable_imp.h | 6 ++ .../tools/partitioning/SpanClassifier.h | 65 +++++++++++++++---- .../domain/tools/partitioning/SpanLayout.h | 6 +- .../tools/partitioning/SpanClassifier.cpp | 4 +- 7 files changed, 80 insertions(+), 24 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/mGrid/mPartition_imp.h b/libNeonDomain/include/Neon/domain/details/mGrid/mPartition_imp.h index 9473ca55..4ab44988 100644 --- a/libNeonDomain/include/Neon/domain/details/mGrid/mPartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/mGrid/mPartition_imp.h @@ -33,7 +33,7 @@ mPartition::mPartition(int level, NghIdx* stencilNghIndex, int* refFactors, int* spacing) - : Neon::domain::details::bGrid::bPartition(0, cardinality, mem, neighbourBlocks, mask, origin, stencilNghIndex), + : Neon::domain::details::bGrid::bPartition(0, cardinality, mem, neighbourBlocks, mask, origin, stencilNghIndex, {0,0,0}), mLevel(level), mMemParent(memParent), mMemChild(memChild), diff --git a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h index 67f7d9f7..b30f9129 100644 --- a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h +++ b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h @@ -105,13 +105,14 @@ class Partitioner1D template - 
Partitioner1D(const Neon::Backend& backend, - const ActiveIndexLambda& activeIndexLambda, - const BcLambda& bcLambda, - const Neon::index_3d& dataBlockSize, - const Neon::int32_3d& domainSize, - const Neon::domain::Stencil stencil, - const int& multiResDiscreteIdxSpacing = 1) + Partitioner1D(const Neon::Backend& backend, + const ActiveIndexLambda& activeIndexLambda, + const BcLambda& bcLambda, + const Neon::index_3d& dataBlockSize, + const Neon::int32_3d& domainSize, + const Neon::domain::Stencil stencil, + const int& multiResDiscreteIdxSpacing = 1, + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::HILBERT) { mData = std::make_shared(); @@ -182,7 +183,7 @@ class Partitioner1D { return mData->block3DSpan; } - + auto getMemoryGrid() -> Neon::aGrid& { return mData->mTopologyWithGhost; @@ -288,7 +289,7 @@ class Partitioner1D auto getDenseMeta() -> const DenseMeta& { - //setDenseMeta(); + // setDenseMeta(); return *mData->mDenseMeta; } diff --git a/libNeonDomain/include/Neon/domain/tools/PointHashTable.h b/libNeonDomain/include/Neon/domain/tools/PointHashTable.h index 1b3e547e..d7bca923 100644 --- a/libNeonDomain/include/Neon/domain/tools/PointHashTable.h +++ b/libNeonDomain/include/Neon/domain/tools/PointHashTable.h @@ -61,6 +61,8 @@ class PointHashTable */ auto size() const -> size_t; + auto getBBox() const -> Point const&; + private: using Key = size_t; diff --git a/libNeonDomain/include/Neon/domain/tools/PointHashTable_imp.h b/libNeonDomain/include/Neon/domain/tools/PointHashTable_imp.h index 3a7375af..1c9abbef 100644 --- a/libNeonDomain/include/Neon/domain/tools/PointHashTable_imp.h +++ b/libNeonDomain/include/Neon/domain/tools/PointHashTable_imp.h @@ -105,4 +105,10 @@ auto PointHashTable::size() const -> size_t { return mMap.size(); } + +template +auto PointHashTable::getBBox() const -> Point const&{ + return mBBox; +} + } // namespace Neon::domain::tool \ No newline at end of file diff --git 
a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h index 8833af7a..72d17d4b 100644 --- a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h +++ b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h @@ -2,12 +2,61 @@ #include "Neon/core/core.h" +#include #include "Cassifications.h" #include "Neon/domain/tools/PointHashTable.h" +#include "Neon/domain/tools/SpaceCurves.h" #include "Neon/domain/tools/partitioning/SpanDecomposition.h" - +#include namespace Neon::domain::tool::partitioning { +struct Hash +{ + std::vector id1dTo3d; + Neon::domain::tool::PointHashTable id3dTo1d; + + auto reHash(Neon::domain::tool::spaceCurves::EncoderType encoderType) -> void + { + // Encoding all points w.r.t the encoder type + std::vector code; + for (auto const& point : id1dTo3d) { + code.push_back(Neon::domain::tool::spaceCurves::Encoder::encode(encoderType, point, id3dTo1d.getBBox())); + } + // Sort id1dTo3d w.r.t. 
the codes + std::vector permutation = getSortedPermutation(code, [](uint64_t a, uint64_t b) { + return a < b; + }); + id1dTo3d = applyPermutation(id1dTo3d, permutation); + for(uint64_t i = 0; i < id1dTo3d.size(); i++) { + *(id3dTo1d.getMetadata(id1dTo3d[i])) = i; + } + } + + private: + template + std::vector getSortedPermutation( + const std::vector& vec, + Compare const& compare) + { + std::vector p(vec.size()); + std::iota(p.begin(), p.end(), 0); + std::sort(p.begin(), p.end(), + [&](std::size_t i, std::size_t j) { return compare(vec[i], vec[j]); }); + return p; + } + + template + std::vector applyPermutation( + const std::vector& vec, + const std::vector& p) + { + std::vector sorted_vec(vec.size()); + std::transform(p.begin(), p.end(), sorted_vec.begin(), + [&](std::size_t i) { return vec[i]; }); + return sorted_vec; + } +}; + class SpanClassifier { public: @@ -48,7 +97,7 @@ class SpanClassifier ByPartition, ByDirection, ByDomain) const - -> const Neon::domain::tool::PointHashTable&; + -> const Neon::domain::tool::PointHashTable&; [[nodiscard]] auto countInternal(Neon::SetIdx setIdx, ByDomain byDomain) const -> int; @@ -72,7 +121,7 @@ class SpanClassifier ByPartition, ByDirection, ByDomain) - -> Neon::domain::tool::PointHashTable&; + -> Neon::domain::tool::PointHashTable&; private: auto addPoint(Neon::SetIdx const& setIdx, @@ -82,13 +131,7 @@ class SpanClassifier ByDomain byDomain) -> void; - struct Info - { - std::vector id1dTo3d; - Neon::domain::tool::PointHashTable id3dTo1d; - }; - - using Leve0_Info = Info; + using Leve0_Info = Hash; using Leve1_ByDomain = std::array; using Leve2_ByDirection = std::array; using Leve3_ByPartition = std::array; @@ -129,7 +172,7 @@ SpanClassifier::SpanClassifier(const Neon::Backend& backend, for (auto& level2 : leve3ByPartition) { for (auto& level1 : level2) { for (auto& level0 : level1) { - level0.id3dTo1d = Neon::domain::tool::PointHashTable(block3DSpan); + level0.id3dTo1d = Neon::domain::tool::PointHashTable(block3DSpan); } 
} } diff --git a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanLayout.h b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanLayout.h index a7e86f7c..4a01dd16 100644 --- a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanLayout.h +++ b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanLayout.h @@ -1,7 +1,7 @@ #pragma once #include "Neon/core/core.h" +#include "Neon/domain/tools/SpaceCurves.h" #include "Neon/domain/tools/partitioning/SpanClassifier.h" - namespace Neon::domain::tool::partitioning { class SpanLayout @@ -30,6 +30,10 @@ class SpanLayout std::shared_ptr spanPartitionerPtr, std::shared_ptr spanClassifierPtr); + auto sort(Neon::domain::tool::spaceCurves::EncoderType encoderType, + SpanClassifier& spanClassifier) + -> void; + auto getCount() -> Neon::set::DataSet; diff --git a/libNeonDomain/src/domain/tools/partitioning/SpanClassifier.cpp b/libNeonDomain/src/domain/tools/partitioning/SpanClassifier.cpp index e9c62754..4b372fc9 100644 --- a/libNeonDomain/src/domain/tools/partitioning/SpanClassifier.cpp +++ b/libNeonDomain/src/domain/tools/partitioning/SpanClassifier.cpp @@ -37,7 +37,7 @@ auto SpanClassifier::getMapper1Dto3D(const SetIdx& setIdx, auto SpanClassifier::getMapper3Dto1D(const SetIdx& setIdx, ByPartition byPartition, ByDirection byDirection, - ByDomain byDomain) const -> const Neon::domain::tool::PointHashTable& + ByDomain byDomain) const -> const Neon::domain::tool::PointHashTable& { return mData[setIdx] [static_cast(byPartition)] @@ -63,7 +63,7 @@ auto SpanClassifier::getMapper3Dto1D(const SetIdx& setIdx, ByPartition byPartition, ByDirection byDirection, ByDomain byDomain) - -> Neon::domain::tool::PointHashTable& + -> Neon::domain::tool::PointHashTable& { return mData[setIdx] [static_cast(byPartition)] From 0c8d8cb44f7d5aa8c0b1035b04deb9bfe5c77b8b Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 18 Jul 2023 17:19:07 -0400 Subject: [PATCH 37/94] WIP --- .../include/Neon/domain/details/bGrid/bGrid.h | 
9 ++-- .../Neon/domain/details/bGrid/bGrid_imp.h | 10 +++-- .../include/Neon/domain/details/eGrid/eGrid.h | 3 +- .../Neon/domain/details/eGrid/eGrid_imp.h | 4 +- .../include/Neon/domain/tools/Partitioner1D.h | 5 ++- .../tools/partitioning/SpanClassifier.h | 42 +++++++++++++------ 6 files changed, 49 insertions(+), 24 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h index 59131cd7..e1c7e55d 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h @@ -60,7 +60,8 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, const ActiveCellLambda activeCellLambda, const Neon::domain::Stencil& stencil, const double_3d& spacingData = double_3d(1, 1, 1), - const double_3d& origin = double_3d(0, 0, 0)); + const double_3d& origin = double_3d(0, 0, 0), + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); /** @@ -72,10 +73,10 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, const ActiveCellLambda activeCellLambda /**< Function that identify the user domain inside the boxed Cartesian discretization */, const Neon::domain::Stencil& stencil /**< union of tall the stencil that will be used in the computation */, const int multiResDiscreteIdxSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing and (i+1)* voxelSpacing. 
- * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1 */ - , + * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1 */, const double_3d& spacingData /** Physical spacing between two consecutive data points in the Cartesian domain */, - const double_3d& origin /** Physical location in space of the origin of the Cartesian discretization */); + const double_3d& origin /** Physical location in space of the origin of the Cartesian discretization */, + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); /** * Returns some properties for a given cartesian in the Cartesian domain. diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h index 607237c6..e8dd29b1 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h @@ -1,4 +1,5 @@ #include "Neon/domain/details/bGrid/bGrid.h" +#include "Neon/domain/tools/SpaceCurves.h" namespace Neon::domain::details::bGrid { @@ -9,8 +10,9 @@ bGrid::bGrid(const Neon::Backend& backend, const ActiveCellLambda activeCellLambda, const Neon::domain::Stencil& stencil, const double_3d& spacingData, - const double_3d& origin) - : bGrid(backend, domainSize, activeCellLambda, stencil, 1, spacingData, origin) + const double_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType encoderType) + : bGrid(backend, domainSize, activeCellLambda, stencil, 1, spacingData, origin, encoderType) { } @@ -22,7 +24,8 @@ bGrid::bGrid(const Neon::Backend& backend, const Neon::domain::Stencil& stencil, const int multiResDiscreteIdxSpacing, const double_3d& spacingData, - const double_3d& origin) + const double_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType encoderType ) { @@ -58,6 +61,7 @@ bGrid::bGrid(const Neon::Backend& backend, 
SBlock::memBlockSize3D.template newType(), domainSize, Neon::domain::Stencil::s27_t(false), + encoderType, multiResDiscreteIdxSpacing); mData->mDataBlockOriginField = mData->partitioner1D.getGlobalMapping(); diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/eGrid.h b/libNeonDomain/include/Neon/domain/details/eGrid/eGrid.h index 346c2121..8a6269eb 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/eGrid.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/eGrid.h @@ -84,7 +84,8 @@ class eGrid : public Neon::domain::interface::GridBaseTemplate const SparsityPattern& activeCellLambda /**< InOrOutLambda({x,y,z}->{true, false}) */, const Neon::domain::Stencil& stencil /**< Stencil used by any computation on the grid */, const Vec_3d& spacing = Vec_3d(1, 1, 1) /**< Spacing, i.e. size of a voxel */, - const Vec_3d& origin = Vec_3d(0, 0, 0) /**< Origin */); + const Vec_3d& origin = Vec_3d(0, 0, 0) /**< Origin */, + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); eGrid(const Neon::Backend& backend /**< Target for computation */, const Neon::int32_3d& dimension /**< Dimension of the bounding box containing the domain */, diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/eGrid_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/eGrid_imp.h index a12f87ce..a717510d 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/eGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/eGrid_imp.h @@ -10,7 +10,8 @@ eGrid::eGrid(const Neon::Backend& backend, const ActiveCellLambda& activeCellLambda, const Neon::domain::Stencil& stencil, const Vec_3d& spacing, - const Vec_3d& origin) + const Vec_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType spaceFillingCode ) { mData = std::make_shared(backend); mData->stencil = stencil; @@ -40,6 +41,7 @@ eGrid::eGrid(const Neon::Backend& backend, 1, dimension, stencil, + spaceFillingCode, 1); diff --git 
a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h index b30f9129..87df30a0 100644 --- a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h +++ b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h @@ -111,8 +111,8 @@ class Partitioner1D const Neon::index_3d& dataBlockSize, const Neon::int32_3d& domainSize, const Neon::domain::Stencil stencil, - const int& multiResDiscreteIdxSpacing = 1, - Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::HILBERT) + Neon::domain::tool::spaceCurves::EncoderType spaceFillingType, + const int& multiResDiscreteIdxSpacing = 1) { mData = std::make_shared(); @@ -165,6 +165,7 @@ class Partitioner1D domainSize, stencil, multiResDiscreteIdxSpacing, + spaceFillingType, mData->spanDecomposition); mData->mSpanLayout = std::make_shared( diff --git a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h index 72d17d4b..4777dc59 100644 --- a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h +++ b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h @@ -7,7 +7,6 @@ #include "Neon/domain/tools/PointHashTable.h" #include "Neon/domain/tools/SpaceCurves.h" #include "Neon/domain/tools/partitioning/SpanDecomposition.h" -#include namespace Neon::domain::tool::partitioning { struct Hash @@ -27,7 +26,7 @@ struct Hash return a < b; }); id1dTo3d = applyPermutation(id1dTo3d, permutation); - for(uint64_t i = 0; i < id1dTo3d.size(); i++) { + for (uint64_t i = 0; i < id1dTo3d.size(); i++) { *(id3dTo1d.getMetadata(id1dTo3d[i])) = i; } } @@ -76,6 +75,7 @@ class SpanClassifier const Neon::int32_3d& domainSize, const Neon::domain::Stencil stencil, const int& discreteVoxelSpacing, + Neon::domain::tool::spaceCurves::EncoderType encoderType, std::shared_ptr sp); @@ -146,17 +146,18 @@ template 
-SpanClassifier::SpanClassifier(const Neon::Backend& backend, - const ActiveCellLambda& activeCellLambda, - const BcLambda& bcLambda, - const Block3dIdxToBlockOrigin& block3dIdxToBlockOrigin, - const GetVoxelAbsolute3DIdx& getVoxelAbsolute3DIdx, - const Neon::int32_3d& block3DSpan, - const Neon::int32_3d& dataBlockSize3D, - const Neon::int32_3d& domainSize, - const Neon::domain::Stencil stencil, - const int& discreteVoxelSpacing, - std::shared_ptr spanDecompositionNoUse) +SpanClassifier::SpanClassifier(const Neon::Backend& backend, + const ActiveCellLambda& activeCellLambda, + const BcLambda& bcLambda, + const Block3dIdxToBlockOrigin& block3dIdxToBlockOrigin, + const GetVoxelAbsolute3DIdx& getVoxelAbsolute3DIdx, + const Neon::int32_3d& block3DSpan, + const Neon::int32_3d& dataBlockSize3D, + const Neon::int32_3d& domainSize, + const Neon::domain::Stencil stencil, + const int& discreteVoxelSpacing, + Neon::domain::tool::spaceCurves::EncoderType spaceFillingType, + std::shared_ptr spanDecompositionNoUse) { mData = backend.devSet().newDataSet(); mSpanDecomposition = spanDecompositionNoUse; @@ -279,5 +280,20 @@ SpanClassifier::SpanClassifier(const Neon::Backend& backend, } } }); + + mData.forEachSeq([&](SetIdx, auto& leve3ByPartition) { + // using Leve0_Info = Info; + // using Leve1_ByDomain = std::array; + // using Leve2_ByDirection = std::array; + // using Leve3_ByPartition = std::array; + // using Data = Neon::set::DataSet; + for (auto& level2 : leve3ByPartition) { + for (auto& level1 : level2) { + for (auto& level0 : level1) { + level0.reHash(spaceFillingType); + } + } + } + }); } } // namespace Neon::domain::tool::partitioning From 3cc397c6e27679809ff60eb1b629d04f7c393d89 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 19 Jul 2023 14:43:23 -0400 Subject: [PATCH 38/94] Fixing space filling curves --- .../tools/partitioning/SpanClassifier.h | 37 +++++++++- .../src/gtests.cpp | 70 ++++++++++++++++++- 2 files changed, 104 insertions(+), 3 deletions(-) 
diff --git a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h index 4777dc59..636320b9 100644 --- a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h +++ b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h @@ -16,10 +16,29 @@ struct Hash auto reHash(Neon::domain::tool::spaceCurves::EncoderType encoderType) -> void { + // std::cout << "BEFORE Cartesian "; + // for (int i = 0; i < int(id1dTo3d.size()); i++) { + // std::cout << id1dTo3d[i] << " "; + // } + // std::cout << std::endl + // << " ID "; + // for (int i = 0; i < int(id1dTo3d.size()); i++) { + // std::cout << *id3dTo1d.getMetadata(id1dTo3d[i]) << " "; + // } + // std::cout << std::endl + // << " CODE "; + // for (int i = 0; i < int(id1dTo3d.size()); i++) { + // std::cout << Neon::domain::tool::spaceCurves::Encoder::encode(encoderType, id3dTo1d.getBBox(), id1dTo3d[i]) << " "; + // } + // std::cout << std::endl; + // std::cout << " BOX " << id3dTo1d.getBBox(); + // + // std::cout << std::endl; + // Encoding all points w.r.t the encoder type std::vector code; for (auto const& point : id1dTo3d) { - code.push_back(Neon::domain::tool::spaceCurves::Encoder::encode(encoderType, point, id3dTo1d.getBBox())); + code.push_back(Neon::domain::tool::spaceCurves::Encoder::encode(encoderType, id3dTo1d.getBBox(), point)); } // Sort id1dTo3d w.r.t. 
the codes std::vector permutation = getSortedPermutation(code, [](uint64_t a, uint64_t b) { @@ -29,6 +48,22 @@ struct Hash for (uint64_t i = 0; i < id1dTo3d.size(); i++) { *(id3dTo1d.getMetadata(id1dTo3d[i])) = i; } +// +// std::cout << "AFTER Cartesian "; +// for (int i = 0; i < int(id1dTo3d.size()); i++) { +// std::cout << id1dTo3d[i] << " "; +// } +// std::cout << std::endl +// << " ID "; +// for (int i = 0; i < int(id1dTo3d.size()); i++) { +// std::cout << *id3dTo1d.getMetadata(id1dTo3d[i]) << " "; +// } +// std::cout << std::endl +// << " CODE "; +// for (int i = 0; i < int(id1dTo3d.size()); i++) { +// std::cout << Neon::domain::tool::spaceCurves::Encoder::encode(encoderType, id3dTo1d.getBBox(), id1dTo3d[i]) << " "; +// } +// std::cout << std::endl; } private: diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/gtests.cpp b/libNeonDomain/tests/domain-space-filling-curves/src/gtests.cpp index 0a743aeb..954d4ecd 100644 --- a/libNeonDomain/tests/domain-space-filling-curves/src/gtests.cpp +++ b/libNeonDomain/tests/domain-space-filling-curves/src/gtests.cpp @@ -15,7 +15,7 @@ TEST(domain_space_filling_curves, morton) using namespace Neon::domain::tool::spaceCurves; Neon::int32_3d idx = {x, y, z}; auto morton = Encoder::encode(EncoderType::morton, dim, idx); - auto sweep = Encoder::encode(EncoderType::sweep, dim, {z,y,x}); + auto sweep = Encoder::encode(EncoderType::sweep, dim, {z, y, x}); ASSERT_EQ(morton_grid_16_16_16[sweep], morton) << dim << " " << idx << " " << morton; } @@ -33,7 +33,7 @@ TEST(domain_space_filling_curves, hilbert) using namespace Neon::domain::tool::spaceCurves; Neon::int32_3d idx = {x, y, z}; auto hilbert = Encoder::encode(EncoderType::hilbert, dim, idx); - auto sweep = Encoder::encode(EncoderType::sweep, dim, {z,y,x}); + auto sweep = Encoder::encode(EncoderType::sweep, dim, {z, y, x}); ASSERT_EQ(hilbert_grid_16_16_16[sweep], hilbert) << dim << " " << idx << " " << hilbert; } @@ -41,6 +41,72 @@ TEST(domain_space_filling_curves, 
hilbert) } } +TEST(domain_space_filling_curves, hilbert_hilbert) +{ + auto run = [](Neon::domain::tool::spaceCurves::EncoderType encodingType, int dimEdge) { + // Step 1 -> Neon backend: choosing the hardware for the computation + Neon::init(); + // auto runtime = Neon::Runtime::openmp; + auto runtime = Neon::Runtime::openmp; + // We are overbooking GPU 0 three times + std::vector devIds{0}; + Neon::Backend backend(devIds, runtime); + + // Step 2 -> Neon grid: setting up a dense cartesian domain + Neon::index_3d dim(dimEdge, dimEdge, dimEdge); // Size of the domain + + using Grid = Neon::eGrid; // Selecting one of the grid provided by Neon + Neon::domain::Stencil gradStencil([] { + // We use a center difference scheme to compute the grad + // The order of the points is important, + // as we'll leverage the specific order when computing the grad. + // First positive direction on x, y and z, + // then negative direction on x, y, z respectively. + return std::vector{ + {1, 0, 0}, + {0, 1, 0}, + {0, 0, 1}, + {-1, 0, 0}, + {0, -1, 0}, + {0, 0, -1}}; + }()); + // Actual Neon grid allocation + Grid grid( + backend, + dim, + [&](const Neon::index_3d&) -> bool { + return true; + }, // <- defining the active cells. 
+ gradStencil, + 1.0, + 0.0, encodingType); + + auto field = grid.newField("spaceCode", 1, 0); + + grid.newContainer("DecoceFromId", + [&](Neon::set::Loader& l) { + auto f = l.load(field); + return [=] NEON_CUDA_HOST_DEVICE(const Grid::Idx& gidx) mutable { + auto internalId = gidx.helpGet(); + auto global = f.getGlobalIndex(gidx); +#pragma omp critical + { + using namespace Neon::domain::tool::spaceCurves; + auto encoded = Encoder::encode(encodingType, dim, global); + // std::cout << global << " -> internal " << internalId << " code " << encoded << std::endl; + EXPECT_EQ(internalId, encoded); + } + f(gidx, 0) = internalId; + }; + }) + .run(Neon::Backend::mainStreamIdx); + field.ioToVtk("DecoceFromId", "grad"); + printf("DONE\n"); + }; + run(Neon::domain::tool::spaceCurves::EncoderType::sweep, 32); + run(Neon::domain::tool::spaceCurves::EncoderType::morton,32); + run(Neon::domain::tool::spaceCurves::EncoderType::hilbert,32); +} int main(int argc, char** argv) { From 1e7c890841c70272025873b6fc6d90fb47b23e6c Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 19 Jul 2023 16:42:52 -0400 Subject: [PATCH 39/94] Adding space filling curve parameter to dGrid --- .../include/Neon/domain/details/dGrid/dGrid.h | 5 +-- .../Neon/domain/details/dGrid/dGrid_imp.h | 32 +++++++++++-------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dGrid.h b/libNeonDomain/include/Neon/domain/details/dGrid/dGrid.h index 226b0bb8..5d56e526 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dGrid.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dGrid.h @@ -20,7 +20,7 @@ #include "Neon/domain/interface/LaunchConfig.h" #include "Neon/domain/interface/Stencil.h" #include "Neon/domain/interface/common.h" - +#include "Neon/domain/tools/SpaceCurves.h" #include "Neon/domain/tools/SpanTable.h" #include "Neon/domain/patterns/PatternScalar.h" @@ -84,7 +84,8 @@ class dGrid : public 
Neon::domain::interface::GridBaseTemplate const SparsityPattern& activeCellLambda /**< InOrOutLambda({x,y,z}->{true, false}) */, const Neon::domain::Stencil& stencil /**< Stencil used by any computation on the grid */, const Vec_3d& spacing = Vec_3d(1, 1, 1) /**< Spacing, i.e. size of a voxel */, - const Vec_3d& origin = Vec_3d(0, 0, 0) /**< Origin */); + const Vec_3d& origin = Vec_3d(0, 0, 0) /**< Origin */, + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); /** * Returns a LaunchParameters configured for the specified inputs. diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h index 297971de..9b7a7ac6 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h @@ -8,12 +8,18 @@ template dGrid::dGrid(const Neon::Backend& backend, const Neon::int32_3d& dimension, const ActiveCellLambda& /*activeCellLambda*/, - const Neon::domain::Stencil& stencil, - const Vec_3d& spacing, - const Vec_3d& origin) + const Neon::domain::Stencil& stencil, + const Vec_3d& spacing, + const Vec_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType encoderType) { mData = std::make_shared(backend); const index_3d defaultBlockSize(256, 1, 1); + if (encoderType != Neon::domain::tool::spaceCurves::EncoderType::sweep) { + NeonException exce("dGrid"); + exce << "dGRid only supports sweep space filling curves"; + NEON_THROW(exce); + } { auto nElementsPerPartition = backend.devSet().template newDataSet(0); @@ -224,11 +230,11 @@ auto dGrid::newContainer(const std::string& name, { const Neon::index_3d& defaultBlockSize = getDefaultBlock(); Neon::set::Container c = Neon::set::Container::factory(name, - Neon::set::internal::ContainerAPI::DataViewSupport::on, - *this, - lambda, - defaultBlockSize, - [](const Neon::index_3d&) { return 0; }); + 
Neon::set::internal::ContainerAPI::DataViewSupport::on, + *this, + lambda, + defaultBlockSize, + [](const Neon::index_3d&) { return 0; }); return c; } @@ -242,11 +248,11 @@ auto dGrid::newContainer(const std::string& name, -> Neon::set::Container { Neon::set::Container c = Neon::set::Container::factory(name, - Neon::set::internal::ContainerAPI::DataViewSupport::on, - *this, - lambda, - blockSize, - [sharedMem](const Neon::index_3d&) { return sharedMem; }); + Neon::set::internal::ContainerAPI::DataViewSupport::on, + *this, + lambda, + blockSize, + [sharedMem](const Neon::index_3d&) { return sharedMem; }); return c; } From 9e9dc40c6d4948d35192e1f26a4b72c287f5144c Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 19 Jul 2023 19:34:55 -0400 Subject: [PATCH 40/94] Extending benchmark with space filling curve option. --- .../lbm-lid-driven-cavity-flow.py | 91 ++++++++++--------- .../lbm-lid-driven-cavity-flow/src/Config.cpp | 72 +++++++++------ .../lbm-lid-driven-cavity-flow/src/Config.h | 46 +++++----- .../lbm-lid-driven-cavity-flow/src/Report.cpp | 1 + .../src/RunCavityTwoPop.cu | 4 +- .../Neon/domain/tools/gridTransformer/tGrid.h | 15 +-- .../domain/tools/gridTransformer/tGrid_ti.h | 13 +-- 7 files changed, 137 insertions(+), 105 deletions(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py index 677aefba..fa263aa9 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py @@ -6,8 +6,9 @@ COMPUTE_FP_LIST = "double float".split() OCC_LIST = "nOCC sOCC".split() HU_LIST = "huGrid huLattice".split() +CURVE_LIST = "sweep morton hilbert".split() WARM_UP_ITER = 10 -MAX_ITER = 100 +MAX_ITER = 10000 REPETITIONS = 5 import subprocess @@ -40,10 +41,13 @@ def countAll(): for DEVICE_SET in DEVICE_SET_LIST: for GRID in GRID_LIST: for HU in HU_LIST: - if STORAGE_FP == 
'double' and COMPUTE_FP == 'float': - continue - - counter += 1 + for CURVE in CURVE_LIST: + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + if STORAGE_FP == 'float' and COMPUTE_FP == 'double': + continue + + counter += 1 return counter @@ -64,44 +68,49 @@ def countAll(): for COMPUTE_FP in COMPUTE_FP_LIST: for GRID in GRID_LIST: for HU in HU_LIST: + for CURVE in CURVE_LIST: + + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + if STORAGE_FP == 'float' and COMPUTE_FP == 'double': + continue - if STORAGE_FP == 'double' and COMPUTE_FP == 'float': - continue + parameters = [] + parameters.append('--deviceType ' + DEVICE_TYPE) + parameters.append('--deviceIds ' + DEVICE_SET) + parameters.append('--grid ' + GRID) + parameters.append('--domain-size ' + DOMAIN_SIZE) + parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) + parameters.append('--repetitions ' + str(REPETITIONS)) + parameters.append('--max-iter ' + str(MAX_ITER)) + parameters.append( + '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + + DEVICE_TYPE + '_' + + DEVICE_SET.replace(' ', '_') + '-' + + GRID + '_' + + DOMAIN_SIZE + '-' + + STORAGE_FP + '-' + COMPUTE_FP + '-' + + OCC) + parameters.append('--computeFP ' + COMPUTE_FP) + parameters.append('--storageFP ' + STORAGE_FP) + parameters.append('--curve ' + CURVE) - parameters = [] - parameters.append('--deviceType ' + DEVICE_TYPE) - parameters.append('--deviceIds ' + DEVICE_SET) - parameters.append('--grid ' + GRID) - parameters.append('--domain-size ' + DOMAIN_SIZE) - parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) - parameters.append('--repetitions ' + str(REPETITIONS)) - parameters.append('--max-iter ' + str(MAX_ITER)) - parameters.append( - '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + - DEVICE_TYPE + '_' + - DEVICE_SET.replace(' ', '_') + '-' + - GRID + '_' + - DOMAIN_SIZE + '-' + - STORAGE_FP + '-' + COMPUTE_FP + '-' + - OCC) - parameters.append('--computeFP ' + COMPUTE_FP) - 
parameters.append('--storageFP ' + STORAGE_FP) - parameters.append('--benchmark') - parameters.append('--' + OCC) - parameters.append('--' + HU) + parameters.append('--benchmark') + parameters.append('--' + OCC) + parameters.append('--' + HU) - commandList = [] - commandList.append(command) - for el in parameters: - for s in el.split(): - commandList.append(s) + commandList = [] + commandList.append(command) + for el in parameters: + for s in el.split(): + commandList.append(s) - fp.write("\n-------------------------------------------\n") - fp.write(' '.join(commandList)) - fp.write("\n-------------------------------------------\n") - fp.flush() - print(' '.join(commandList)) - subprocess.run(commandList, text=True, stdout=fp) + fp.write("\n-------------------------------------------\n") + fp.write(' '.join(commandList)) + fp.write("\n-------------------------------------------\n") + fp.flush() + print(' '.join(commandList)) + subprocess.run(commandList, text=True, stdout=fp) - counter += 1 - printProgressBar(counter * 100.0 / SAMPLES, 'Progress') + counter += 1 + printProgressBar(counter * 100.0 / SAMPLES, 'Progress') diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Config.cpp b/benchmarks/lbm-lid-driven-cavity-flow/src/Config.cpp index 165dcff5..115125bd 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/Config.cpp +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Config.cpp @@ -41,6 +41,7 @@ auto Config::toString() const -> std::string s << "......... computeType " << c.computeType << std::endl; s << "........... storeType " << c.storeType << std::endl; + s << "............... curve " << c.curve << std::endl; s << ". ............... occ " << Neon::skeleton::OccUtils::toString(c.occ) << std::endl; s << "....... 
transfer Mode " << Neon::set::TransferModeUtils::toString(c.transferMode) << std::endl; @@ -60,43 +61,58 @@ auto Config::parseArgs(const int argc, char* argv[]) auto& config = *this; auto cli = - ( - clipp::required("--deviceType") & clipp::value("deviceType", config.deviceType) % "Device ids to use", - clipp::required("--deviceIds") & clipp::integers("gpus", config.devices) % "Device ids to use", - clipp::option("--grid") & clipp::value("grid", config.gridType) % "Could be dGrid, eGrid, bGrid", - clipp::option("--domain-size") & clipp::integer("domain_size", config.N) % "Voxels along each dimension of the cube domain", - clipp::option("--warmup-iter") & clipp::integer("warmup_iter", config.benchIniIter) % "Number of iteration for warm up. max_iter = warmup_iter + timed_iters", - clipp::option("--max-iter") & clipp::integer("max_iter", config.benchMaxIter) % "Maximum solver iterations", - clipp::option("--repetitions") & clipp::integer("repetitions", config.repetitions) % "Number of times the benchmark is run.", - clipp::option("--report-filename ") & clipp::value("keeper_filename", config.reportFile) % "Output perf keeper filename", - - clipp::option("--computeFP") & clipp::value("computeFP", config.computeType) % "Could be double or float", - clipp::option("--storageFP") & clipp::value("storageFP", config.storeType) % "Could be double or float", - - ( - (clipp::option("--sOCC").set(config.occ, Neon::skeleton::Occ::standard) % "Standard OCC") | - (clipp::option("--nOCC").set(config.occ, Neon::skeleton::Occ::none) % "No OCC (on by default)")), - ( - (clipp::option("--put").set(config.transferMode, Neon::set::TransferMode::put) % "Set transfer mode to PUT") | - (clipp::option("--get").set(config.transferMode, Neon::set::TransferMode::get) % "Set transfer mode to GET (on by default)")), - ( - (clipp::option("--huLattice").set(config.stencilSemantic, Neon::set::StencilSemantic::streaming) % "Halo update with lattice semantic (on by default)") | - 
(clipp::option("--huGrid").set(config.stencilSemantic, Neon::set::StencilSemantic::standard) % "Halo update with grid semantic ")), - ( - (clipp::option("--benchmark").set(config.benchmark, true) % "Run benchmark mode") | - (clipp::option("--visual").set(config.benchmark, false) % "Run export partial data")), - - ( - clipp::option("--vti").set(config.vti, true) % "Standard OCC") + (clipp::required("--deviceType") & clipp::value("deviceType", config.deviceType) % "Device ids to use", + clipp::required("--deviceIds") & clipp::integers("gpus", config.devices) % "Device ids to use", + clipp::option("--grid") & clipp::value("grid", config.gridType) % "Could be dGrid, eGrid, bGrid", + clipp::option("--domain-size") & clipp::integer("domain_size", config.N) % "Voxels along each dimension of the cube domain", + clipp::option("--warmup-iter") & clipp::integer("warmup_iter", config.benchIniIter) % "Number of iteration for warm up. max_iter = warmup_iter + timed_iters", + clipp::option("--max-iter") & clipp::integer("max_iter", config.benchMaxIter) % "Maximum solver iterations", + clipp::option("--repetitions") & clipp::integer("repetitions", config.repetitions) % "Number of times the benchmark is run.", + clipp::option("--report-filename ") & clipp::value("keeper_filename", config.reportFile) % "Output perf keeper filename", + + clipp::option("--computeFP") & clipp::value("computeFP", config.computeType) % "Could be double or float", + clipp::option("--storageFP") & clipp::value("storageFP", config.storeType) % "Could be double or float", + + clipp::option("--curve") & clipp::value("curve", config.curve) % "Could be sweep (the default), morton, or hilber", + ( + (clipp::option("--sOCC").set(config.occ, Neon::skeleton::Occ::standard) % "Standard OCC") | + (clipp::option("--nOCC").set(config.occ, Neon::skeleton::Occ::none) % "No OCC (on by default)")), + ( + (clipp::option("--put").set(config.transferMode, Neon::set::TransferMode::put) % "Set transfer mode to PUT") | + 
(clipp::option("--get").set(config.transferMode, Neon::set::TransferMode::get) % "Set transfer mode to GET (on by default)")), + ( + (clipp::option("--huLattice").set(config.stencilSemantic, Neon::set::StencilSemantic::streaming) % "Halo update with lattice semantic (on by default)") | + (clipp::option("--huGrid").set(config.stencilSemantic, Neon::set::StencilSemantic::standard) % "Halo update with grid semantic ")), + ( + (clipp::option("--benchmark").set(config.benchmark, true) % "Run benchmark mode") | + (clipp::option("--visual").set(config.benchmark, false) % "Run export partial data")), + + ( + clipp::option("--vti").set(config.vti, true) % "Standard OCC") ); + if (!clipp::parse(argc, argv, cli)) { auto fmt = clipp::doc_formatting{}.doc_column(31); std::cout << make_man_page(cli, argv[0], fmt) << '\n'; return -1; } + if (config.curve == "sweep") + config.spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::sweep; + if (config.curve == "morton") + config.spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::morton; + if (config.curve == "hilbert") + config.spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::hilbert; + + if (config.curve != "sweep" && config.curve != "morton" && config.curve != "hilbert") { + auto fmt = clipp::doc_formatting{}.doc_column(31); + std::cout << config.curve << " is not a supported configuration" << std::endl; + std::cout << make_man_page(cli, argv[0], fmt) << '\n'; + return -1; + } + helpSetLbmParameters(); return 0; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Config.h b/benchmarks/lbm-lid-driven-cavity-flow/src/Config.h index af32972e..18695ce4 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/Config.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Config.h @@ -3,6 +3,7 @@ #include #include #include "Neon/core/tools/clipp.h" +#include "Neon/domain/tools/SpaceCurves.h" #include "Neon/skeleton/Skeleton.h" template @@ -16,28 +17,29 @@ struct LbmParameters struct Config { - double Re = 100.; // 
Reynolds number - double ulb = 0.04; // Velocity in lattice units - int N = 160; // Number of nodes in x-direction - bool benchmark = false; // Run in benchmark mode ? - double max_t = 10.0; // Non-benchmark mode: Total time in dim.less units - int outFrequency = 200; // Non-benchmark mode: Frequency in LU for output of terminal message and profiles (use 0 for no messages) - int dataFrequency = 0; // Non-benchmark mode: Frequency in LU of full data dump (use 0 for no data dump) - int benchIniIter = 1000; // Benchmark mode: Number of warmup iterations - int benchMaxIter = 2000; // Benchmark mode: Total number of iterations - int repetitions = 1; // Benchmark mode: number of time the test is run - std::string deviceType = "gpu"; - std::vector devices = std::vector(0); // Devices for the execution - std::string reportFile = "lbm-lid-driven-cavity-flow"; // Report file name - std::string gridType = "dGrid"; // Neon grid type - Neon::skeleton::Occ occ = Neon::skeleton::Occ::none; // Neon OCC type - Neon::set::TransferMode transferMode = Neon::set::TransferMode::get; // Neon transfer mode for halo update - Neon::set::StencilSemantic stencilSemantic = Neon::set::StencilSemantic::streaming; - bool vti = false; // Export vti file - std::string computeType = "double"; - std::string storeType = "double"; - - LbmParameters mLbmParameters; + double Re = 100.; // Reynolds number + double ulb = 0.04; // Velocity in lattice units + int N = 160; // Number of nodes in x-direction + bool benchmark = false; // Run in benchmark mode ? 
+ double max_t = 10.0; // Non-benchmark mode: Total time in dim.less units + int outFrequency = 200; // Non-benchmark mode: Frequency in LU for output of terminal message and profiles (use 0 for no messages) + int dataFrequency = 0; // Non-benchmark mode: Frequency in LU of full data dump (use 0 for no data dump) + int benchIniIter = 1000; // Benchmark mode: Number of warmup iterations + int benchMaxIter = 2000; // Benchmark mode: Total number of iterations + int repetitions = 1; // Benchmark mode: number of time the test is run + std::string deviceType = "gpu"; + std::vector devices = std::vector(0); // Devices for the execution + std::string reportFile = "lbm-lid-driven-cavity-flow"; // Report file name + std::string gridType = "dGrid"; // Neon grid type + Neon::skeleton::Occ occ = Neon::skeleton::Occ::none; // Neon OCC type + Neon::set::TransferMode transferMode = Neon::set::TransferMode::get; // Neon transfer mode for halo update + Neon::set::StencilSemantic stencilSemantic = Neon::set::StencilSemantic::streaming; + bool vti = false; // Export vti file + std::string computeType = "double"; + std::string storeType = "double"; + std::string curve = "sweep"; + Neon::domain::tool::spaceCurves::EncoderType spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::sweep; + LbmParameters mLbmParameters; auto toString() const -> std::string; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Report.cpp b/benchmarks/lbm-lid-driven-cavity-flow/src/Report.cpp index 2e88f907..546b03b7 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/Report.cpp +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Report.cpp @@ -29,6 +29,7 @@ Report::Report(const Config& c) mReport.addMember("computeType", c.computeType); mReport.addMember("storeType", c.storeType); + mReport.addMember("spaceCurve", Neon::domain::tool::spaceCurves::EncoderTypeUtil::toString(c.spaceCurve)); mReport.addMember("occ", Neon::skeleton::OccUtils::toString(c.occ)); diff --git 
a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index 6a14cd8f..757bcb28 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -62,7 +62,9 @@ auto run(Config& config, Grid grid( bk, {config.N, config.N, config.N}, [](const Neon::index_3d&) { return true; }, - Lattice::template getDirectionAsVector()); + Lattice::template getDirectionAsVector(), + 0.0, 1.0, + config.spaceCurve); PopulationField pop0 = grid.template newField("Population", Lattice::Q, Storage(0.0)); PopulationField pop1 = grid.template newField("Population", Lattice::Q, Storage(0.0)); diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h index bd28e8f5..ac98983c 100644 --- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h +++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h @@ -8,8 +8,8 @@ #include "Neon/domain/interface/Stencil.h" #include "Neon/domain/interface/common.h" #include "Neon/domain/patterns/PatternScalar.h" +#include "Neon/domain/tools/SpaceCurves.h" #include "Neon/domain/tools/SpanTable.h" - /** * template * GridTransformation { @@ -56,12 +56,13 @@ class tGrid : public Neon::domain::interface::GridBaseTemplate - tGrid(const Neon::Backend& backend /**< Target for computation */, - const Neon::int32_3d& dimension /**< Dimension of the bounding box containing the domain */, - const SparsityPattern& activeCellLambda /**< InOrOutLambda({x,y,z}->{true, false}) */, - const Neon::domain::Stencil& stencil /**< Stencil used by any computation on the grid */, - const Vec_3d& spacing = Vec_3d(1, 1, 1) /**< Spacing, i.e. 
size of a voxel */, - const Vec_3d& origin = Vec_3d(0, 0, 0) /**< Origin */); + tGrid(const Neon::Backend& backend /**< Target for computation */, + const Neon::int32_3d& dimension /**< Dimension of the bounding box containing the domain */, + const SparsityPattern& activeCellLambda /**< InOrOutLambda({x,y,z}->{true, false}) */, + const Neon::domain::Stencil& stencil /**< Stencil used by any computation on the grid */, + const Vec_3d& spacing = Vec_3d(1, 1, 1) /**< Spacing, i.e. size of a voxel */, + const Vec_3d& origin = Vec_3d(0, 0, 0) /**< Origin */, + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); tGrid(const tGrid& other); // copy constructor tGrid(tGrid&& other) noexcept; // move constructor diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h index 0a0249d7..8fd3f3ab 100644 --- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h +++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h @@ -32,12 +32,13 @@ tGrid::tGrid(FoundationGrid& foundationGrid) template template -tGrid::tGrid(const Neon::Backend& bk, - const Neon::int32_3d& dimension, - const SparsityPattern& activeCellLambda, - const Neon::domain::Stencil& stencil, - const Vec_3d& spacing, - const Vec_3d& origin) +tGrid::tGrid(const Neon::Backend& bk, + const Neon::int32_3d& dimension, + const SparsityPattern& activeCellLambda, + const Neon::domain::Stencil& stencil, + const Vec_3d& spacing, + const Vec_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType encoderType) { mData = std::make_shared(bk); mData->foundationGrid = FoundationGrid(bk, From 18ffd024e44d584c28c0c3528327d0f0e12b92ca Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 19 Jul 2023 19:50:50 -0400 Subject: [PATCH 41/94] WIP --- .../lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py | 2 +- 
.../include/Neon/domain/tools/gridTransformer/tGrid_ti.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py index fa263aa9..c560755f 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py @@ -46,7 +46,7 @@ def countAll(): continue if STORAGE_FP == 'float' and COMPUTE_FP == 'double': continue - + counter += 1 return counter diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h index 8fd3f3ab..bf994458 100644 --- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h +++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h @@ -46,7 +46,8 @@ tGrid::tGrid(const Neon::Backend& bk activeCellLambda, stencil, spacing, - origin); + origin, + encoderType); GridTransformation::initSpan(mData->foundationGrid, NEON_OUT mData->spanTable); tGrid::GridBase::init("tGrid", From 199c94601fab18fd77c57eabab85244d0afad6b5 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 25 Jul 2023 15:55:22 -0400 Subject: [PATCH 42/94] Extending grid report capabilities. 
--- .../lbm-lid-driven-cavity-flow/src/Metrics.h | 6 ++ .../lbm-lid-driven-cavity-flow/src/Repoert.h | 3 +- .../lbm-lid-driven-cavity-flow/src/Report.cpp | 5 ++ .../src/RunCavityTwoPop.cu | 86 ++++++++++--------- .../Neon/domain/details/bGrid/bGrid_imp.h | 53 +++++++----- .../Neon/domain/details/dGrid/dGrid_imp.h | 8 +- .../Neon/domain/details/eGrid/eGrid_imp.h | 8 +- .../Neon/domain/details/sGrid/sGrid_imp.h | 8 +- .../include/Neon/domain/interface/GridBase.h | 71 ++++++++------- .../include/Neon/domain/tools/Partitioner1D.h | 21 +++-- .../domain/tools/gridTransformer/tGrid_ti.h | 8 +- .../tools/partitioning/SpanClassifier.h | 4 +- .../src/domain/details/aGrid/aGrid.cpp | 4 +- .../src/domain/details/eGrid/eGrid.cpp | 16 ++-- .../src/domain/interface/GridBase.cpp | 76 ++++++++++++---- 15 files changed, 239 insertions(+), 138 deletions(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Metrics.h b/benchmarks/lbm-lid-driven-cavity-flow/src/Metrics.h index be94ab76..7e6697ef 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/Metrics.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Metrics.h @@ -23,6 +23,12 @@ void recordBackend(Neon::Backend& bk, report.recordBk(bk); } +void recordGrid(Neon::domain::interface::GridBase& g, + Report& report) +{ + report.recordGrid(g); +} + } // namespace diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Repoert.h b/benchmarks/lbm-lid-driven-cavity-flow/src/Repoert.h index 565a9108..4ca0827b 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/Repoert.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Repoert.h @@ -3,7 +3,7 @@ #include #include #include "Config.h" - +#include "Neon/domain/interface/GridBase.h" struct Report { Neon::Report mReport; @@ -36,4 +36,5 @@ struct Report auto save() -> void; void recordBk(Neon::Backend& backend); + void recordGrid(Neon::domain::interface::GridBase& g); }; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Report.cpp b/benchmarks/lbm-lid-driven-cavity-flow/src/Report.cpp 
index 546b03b7..049d1735 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/Report.cpp +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Report.cpp @@ -101,3 +101,8 @@ void Report::recordBk(Neon::Backend& backend) { backend.toReport(mReport); } + +void Report::recordGrid(Neon::domain::interface::GridBase& g) +{ + g.toReport(mReport, true); +} \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index 757bcb28..aa9bc461 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -51,10 +51,7 @@ auto run(Config& config, NEON_THROW(exce); }(); - if (!backendWasReported) { - metrics::recordBackend(bk, report); - backendWasReported = true; - } + Neon::double_3d ulid(1., 0., 0.); // Neon Grid and Fields initialization @@ -66,6 +63,12 @@ auto run(Config& config, 0.0, 1.0, config.spaceCurve); + if (!backendWasReported) { + metrics::recordBackend(bk, report); + metrics::recordGrid(grid, report); + backendWasReported = true; + } + PopulationField pop0 = grid.template newField("Population", Lattice::Q, Storage(0.0)); PopulationField pop1 = grid.template newField("Population", Lattice::Q, Storage(0.0)); @@ -281,7 +284,7 @@ auto run(Config& config, NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") } } - if (config.gridType == "bGrid") { + if (config.gridType == "bGrid" || config.gridType == "bGrid_8_8_8") { return details::runFilterStoreType(config, report); } if (config.gridType == "bGrid_4_4_4") { @@ -302,42 +305,42 @@ auto run(Config& config, NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") } } - if (config.gridType == "bGrid_32_8_4") { - if constexpr (!skipTest) { - using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; - using Grid = Neon::domain::details::bGrid::bGrid; - return details::runFilterStoreType(config, report); - } else { - NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") - } - } - if (config.gridType == "bGrid_32_8_4") { - if constexpr (!skipTest) { - using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; - using Grid = Neon::domain::details::bGrid::bGrid; - return details::runFilterStoreType(config, report); - } else { - NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") - } - } - if (config.gridType == "bGrid_32_2_8") { - if constexpr (!skipTest) { - using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 2, 8>; - using Grid = Neon::domain::details::bGrid::bGrid; - return details::runFilterStoreType(config, report); - } else { - NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") - } - } - if (config.gridType == "bGrid_32_8_2") { - if constexpr (!skipTest) { - using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 2>; - using Grid = Neon::domain::details::bGrid::bGrid; - return details::runFilterStoreType(config, report); - } else { - NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") - } - } +// if (config.gridType == "bGrid_32_8_4") { +// if constexpr (!skipTest) { +// using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; +// using Grid = Neon::domain::details::bGrid::bGrid; +// return details::runFilterStoreType(config, report); +// } else { +// NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") +// } +// } +// if (config.gridType == "bGrid_32_8_4") { +// if constexpr (!skipTest) { +// using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; +// using Grid = Neon::domain::details::bGrid::bGrid; +// return details::runFilterStoreType(config, report); +// } else { +// NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") +// } +// } +// if (config.gridType == "bGrid_32_2_8") { +// if constexpr (!skipTest) { +// using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 2, 8>; +// using Grid = Neon::domain::details::bGrid::bGrid; +// return details::runFilterStoreType(config, report); +// } else { +// NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") +// } +// } +// if (config.gridType == "bGrid_32_8_2") { +// if constexpr (!skipTest) { +// using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 2>; +// using Grid = Neon::domain::details::bGrid::bGrid; +// return details::runFilterStoreType(config, report); +// } else { +// NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") +// } +// } if (config.gridType == "dGridSoA") { if constexpr (!skipTest) { return details::runFilterStoreType(config, report); @@ -345,5 +348,6 @@ auto run(Config& config, NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") } } + NEON_THROW_UNSUPPORTED_OPERATION("Unknown grid type: " + config.gridType); } } // namespace CavityTwoPop diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h index e8dd29b1..a375c64d 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h @@ -5,12 +5,12 @@ namespace Neon::domain::details::bGrid { template template -bGrid::bGrid(const Neon::Backend& backend, - const Neon::int32_3d& domainSize, - const ActiveCellLambda activeCellLambda, - const Neon::domain::Stencil& stencil, - const double_3d& spacingData, - const double_3d& origin, +bGrid::bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const double_3d& spacingData, + const double_3d& origin, Neon::domain::tool::spaceCurves::EncoderType encoderType) : bGrid(backend, domainSize, activeCellLambda, stencil, 1, spacingData, origin, encoderType) { @@ -18,14 +18,14 @@ bGrid::bGrid(const Neon::Backend& backend, template template -bGrid::bGrid(const Neon::Backend& backend, - const Neon::int32_3d& domainSize, - const ActiveCellLambda activeCellLambda, - const Neon::domain::Stencil& stencil, - const int multiResDiscreteIdxSpacing, - const double_3d& spacingData, - const double_3d& origin, - Neon::domain::tool::spaceCurves::EncoderType encoderType ) +bGrid::bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const int multiResDiscreteIdxSpacing, + const double_3d& spacingData, + const double_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType encoderType) { @@ -38,18 +38,25 @@ bGrid::bGrid(const Neon::Backend& backend, SBlock::memBlockSizeY, SBlock::memBlockSizeZ); + 
std::stringstream gridName; + gridName << "bGrid_" << SBlock::memBlockSizeX << "_" + << SBlock::memBlockSizeY << "_" + << SBlock::memBlockSizeZ; { auto nElementsPerPartition = backend.devSet().template newDataSet(0); // We do an initialization with nElementsPerPartition to zero, // then we reset to the computed number. - bGrid::GridBase::init("bGrid", + + bGrid::GridBase::init(gridName.str(), backend, domainSize, stencil, nElementsPerPartition, defaultKernelBlockSize, multiResDiscreteIdxSpacing, - origin); + origin, + encoderType, + defaultKernelBlockSize); } { // Initialization of the partitioner @@ -111,8 +118,8 @@ bGrid::bGrid(const Neon::Backend& backend, for (int j = 0; j < SBlock::memBlockSize3D.template newType().y; j++) { for (int i = 0; i < SBlock::memBlockSize3D.template newType().x; i++) { auto globalPosition = blockOrigin + Neon::int32_3d(i * this->mData->mMultiResDiscreteIdxSpacing, - j * this->mData->mMultiResDiscreteIdxSpacing, - k * this->mData->mMultiResDiscreteIdxSpacing); + j * this->mData->mMultiResDiscreteIdxSpacing, + k * this->mData->mMultiResDiscreteIdxSpacing); bool const isInDomain = globalPosition < domainSize * this->mData->mMultiResDiscreteIdxSpacing; bool const isActive = activeCellLambda(globalPosition); if (isActive && isInDomain) { @@ -159,8 +166,8 @@ bGrid::bGrid(const Neon::Backend& backend, BlockIdx blockNghIdx = Span::getInvalidBlockId(); typename decltype(blockConnectivity)::Idx nghIdx; Neon::int8_3d stencilPoint(i - int8_t(1), - j - int8_t(1), - k - int8_t(1)); + j - int8_t(1), + k - int8_t(1)); bool isValid = blockConnectivity.getNghIndex(idx, stencilPoint, nghIdx); if (isValid) { blockNghIdx = static_cast(nghIdx.helpGet()); @@ -224,14 +231,16 @@ bGrid::bGrid(const Neon::Backend& backend, mData->stencilIdTo3dOffset.updateDeviceData(backend, Neon::Backend::mainStreamIdx); } // Init the base grid - bGrid::GridBase::init("bGrid", + bGrid::GridBase::init(gridName.str(), backend, domainSize, Neon::domain::Stencil(), 
mData->mNumActiveVoxel, SBlock::memBlockSize3D.template newType(), spacingData, - origin); + origin, + encoderType, + defaultKernelBlockSize); { // setting launchParameters mData->launchParametersTable.forEachSeq([&](Neon::DataView dw, Neon::set::LaunchParameters& bLaunchParameters) { diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h index 9b7a7ac6..a6fbf1aa 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h @@ -32,7 +32,9 @@ dGrid::dGrid(const Neon::Backend& backend, nElementsPerPartition, Neon::index_3d(256, 1, 1), spacing, - origin); + origin, + Neon::domain::tool::spaceCurves::EncoderType::sweep, + {0, 0, 0}); } const int32_t numDevices = getBackend().devSet().setCardinality(); @@ -186,7 +188,9 @@ dGrid::dGrid(const Neon::Backend& backend, nElementsPerPartition, defaultBlockSize, spacing, - origin); + origin, + Neon::domain::tool::spaceCurves::EncoderType::sweep, + {0, 0, 0}); } } diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/eGrid_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/eGrid_imp.h index a717510d..1e5c444b 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/eGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/eGrid_imp.h @@ -30,7 +30,9 @@ eGrid::eGrid(const Neon::Backend& backend, nElementsPerPartition, Neon::index_3d(256, 1, 1), spacing, - origin); + origin, + spaceFillingCode, + {1,1,1}); } @@ -126,7 +128,9 @@ eGrid::eGrid(const Neon::Backend& backend, nElementsPerPartition, defaultBlockSize, spacing, - origin); + origin, + spaceFillingCode, + {1,1,1}); } } diff --git a/libNeonDomain/include/Neon/domain/details/sGrid/sGrid_imp.h b/libNeonDomain/include/Neon/domain/details/sGrid/sGrid_imp.h index eed4c3bf..c76b2d42 100644 --- a/libNeonDomain/include/Neon/domain/details/sGrid/sGrid_imp.h +++ 
b/libNeonDomain/include/Neon/domain/details/sGrid/sGrid_imp.h @@ -41,7 +41,9 @@ sGrid::sGrid(const OuterGridT& outerGrid, nElementsPerPartition, defaultsBlockDim, outerGrid.getSpacing(), - outerGrid.getOrigin()); + outerGrid.getOrigin(), + outerGrid.getSpaceCurve(), + outerGrid.getMemoryBlock()); mStorage = std::make_shared(); mStorage->init(outerGrid); @@ -173,7 +175,9 @@ sGrid::sGrid(const OuterGridT& outerGrid, mStorage->getCount(Neon::DataView::STANDARD), defaultsBlockDim, outerGrid.getSpacing(), - outerGrid.getOrigin()); + outerGrid.getOrigin(), + outerGrid.getSpaceCurve(), + outerGrid.getMemoryBlock()); } template diff --git a/libNeonDomain/include/Neon/domain/interface/GridBase.h b/libNeonDomain/include/Neon/domain/interface/GridBase.h index daa5d697..04837435 100644 --- a/libNeonDomain/include/Neon/domain/interface/GridBase.h +++ b/libNeonDomain/include/Neon/domain/interface/GridBase.h @@ -9,8 +9,8 @@ #include "Neon/set/DevSet.h" #include "Neon/core/tools/io/ioToVti.h" +#include "Neon/domain/tools/SpaceCurves.h" #include "Stencil.h" - namespace Neon::domain::interface { /** @@ -66,13 +66,6 @@ class GridBase auto getNumActiveCellsPerPartition() const -> const Neon::set::DataSet&; - // /** - // * Return the number of cells stored per partition - // * @return - // */ - // auto getNumActiveCellsPerPartition() const - // -> const Neon::set::DataSet&; - /** * Creates a DataSet object compatible with the number of GPU used by the grid. 
*/ @@ -123,6 +116,8 @@ class GridBase auto getGridUID() const -> size_t; + + /** * Add the grid information in a Report object */ @@ -136,31 +131,40 @@ class GridBase auto getDefaultBlock() const -> const Neon::index_3d&; + auto getMemoryBlock() const + -> Neon::index_3d; + + auto getSpaceCurve() const + -> Neon::domain::tool::spaceCurves::EncoderType; protected: /** * Protected constructor */ - GridBase(const std::string& gridImplementationName, - const Neon::Backend& backend, - const Neon::index_3d& dim, - const Neon::domain::Stencil& stencil, - const Neon::set::DataSet& nPartitionElements /**< Number of element per partition */, - const Neon::index_3d& defaultBlockSize, - const Vec_3d& spacingData = Vec_3d(1, 1, 1) /*! Spacing, i.e. size of a voxel */, - const Vec_3d& origin = Vec_3d(0, 0, 0) /*! Origin */); + GridBase(const std::string& gridImplementationName, + const Neon::Backend& backend, + const Neon::index_3d& dim, + const Neon::domain::Stencil& stencil, + const Neon::set::DataSet& nPartitionElements /**< Number of element per partition */, + const Neon::index_3d& defaultBlockSize, + const Vec_3d& spacingData /*! Spacing, i.e. size of a voxel */, + const Vec_3d& origin /*! Origin */, + Neon::domain::tool::spaceCurves::EncoderType spaceCurve, + Neon::index_3d memoryBlock); /** * Protected initialization function used by derived classes to set some parameters. 
*/ - auto init(const std::string& gridImplementationName /**< Name of the implementation, for example dGrid eGrid etc */, - const Neon::Backend& backend /**< Backend used to create the grid */, - const Neon::index_3d& dimension /**< Dimension of the grid */, - const Neon::domain::Stencil& stencil /**< Union of all the stencil that will be used with the grid */, - const Neon::set::DataSet& nPartitionElements /**< Elements associated to each partition */, - const Neon::index_3d& defaultBlockSize /**< Default thread block size */, - const Vec_3d& spacingData /**< Grid spacing */, - const Vec_3d& origin /**< Position in space of the grid's origin */) -> void; + auto init(const std::string& gridImplementationName /**< Name of the implementation, for example dGrid eGrid etc */, + const Neon::Backend& backend /**< Backend used to create the grid */, + const Neon::index_3d& dimension /**< Dimension of the grid */, + const Neon::domain::Stencil& stencil /**< Union of all the stencil that will be used with the grid */, + const Neon::set::DataSet& nPartitionElements /**< Elements associated to each partition */, + const Neon::index_3d& defaultBlockSize /**< Default thread block size */, + const Vec_3d& spacingData /**< Grid spacing */, + const Vec_3d& origin /**< Position in space of the grid's origin */, + Neon::domain::tool::spaceCurves::EncoderType spaceCurve, + Neon::index_3d memoryBlock) -> void; /** * Protected method to set the default thread blocks size @@ -175,6 +179,7 @@ class GridBase -> Neon::set::LaunchParameters&; + private: struct Storage { @@ -187,14 +192,16 @@ class GridBase index_3d blockDim; }; - Neon::Backend backend /**< Backend used to create and run the grid. */; - Neon::index_3d dimension /**< Dimension of the grid */; - Neon::domain::Stencil stencil /**< Stencil used for the grid initialization */; - Neon::set::DataSet nPartitionElements /**< Number of elements per partition */; - Vec_3d spacing /**< Spacing, i.e. 
size of a voxel */; - Vec_3d origin /**< Position in space of the grid's origin */; - Defaults_t defaults; - std::string gridImplementationName; + Neon::Backend backend /**< Backend used to create and run the grid. */; + Neon::index_3d dimension /**< Dimension of the grid */; + Neon::domain::Stencil stencil /**< Stencil used for the grid initialization */; + Neon::set::DataSet nPartitionElements /**< Number of elements per partition */; + Vec_3d spacing /**< Spacing, i.e. size of a voxel */; + Vec_3d origin /**< Position in space of the grid's origin */; + Defaults_t defaults; + std::string gridImplementationName; + Neon::domain::tool::spaceCurves::EncoderType spaceCurve; + Neon::index_3d memoryBlock; }; std::shared_ptr mStorage; diff --git a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h index 87df30a0..6d110a1f 100644 --- a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h +++ b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h @@ -120,6 +120,7 @@ class Partitioner1D mData->mMultiResDiscreteIdxSpacing = multiResDiscreteIdxSpacing; mData->mStencil = stencil; mData->mDomainSize = domainSize; + mData->spaceCurve = spaceFillingType; // Block space interval (i.e. 
indexing space at the block granularity) @@ -185,6 +186,11 @@ class Partitioner1D return mData->block3DSpan; } + auto getSpaceCurve() const -> Neon::domain::tool::spaceCurves::EncoderType + { + return mData->spaceCurve; + } + auto getMemoryGrid() -> Neon::aGrid& { return mData->mTopologyWithGhost; @@ -445,13 +451,14 @@ class Partitioner1D class Data { public: - Neon::index_3d mDataBlockSize = 0; - int mMultiResDiscreteIdxSpacing = 0; - Neon::domain::Stencil mStencil; - Neon::index_3d mDomainSize; - Neon::int32_3d block3DSpan; - bool globalMappingInit = false; - Neon::aGrid::Field globalMapping; + Neon::index_3d mDataBlockSize = 0; + int mMultiResDiscreteIdxSpacing = 0; + Neon::domain::Stencil mStencil; + Neon::index_3d mDomainSize; + Neon::int32_3d block3DSpan; + bool globalMappingInit = false; + Neon::aGrid::Field globalMapping; + Neon::domain::tool::spaceCurves::EncoderType spaceCurve; bool getStencil3dTo1dOffsetInit = false; Neon::set::MemSet stencil3dTo1dOffset; diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h index bf994458..b01b8718 100644 --- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h +++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h @@ -27,7 +27,9 @@ tGrid::tGrid(FoundationGrid& foundationGrid) foundationGrid.getNumActiveCellsPerPartition(), foundationGrid.getDefaultBlock(), foundationGrid.getSpacing(), - foundationGrid.getOrigin()); + foundationGrid.getOrigin(), + foundationGrid.getSpaceCurve(), + foundationGrid.getMemoryBlock()); } template @@ -57,7 +59,9 @@ tGrid::tGrid(const Neon::Backend& bk mData->foundationGrid.getNumActiveCellsPerPartition(), mData->foundationGrid.getDefaultBlock(), mData->foundationGrid.getSpacing(), - mData->foundationGrid.getOrigin()); + mData->foundationGrid.getOrigin(), + encoderType, + mData->foundationGrid.getMemoryBlock()); } template diff --git 
a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h index 636320b9..7cf442c6 100644 --- a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h +++ b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h @@ -28,7 +28,7 @@ struct Hash // std::cout << std::endl // << " CODE "; // for (int i = 0; i < int(id1dTo3d.size()); i++) { - // std::cout << Neon::domain::tool::spaceCurves::Encoder::encode(encoderType, id3dTo1d.getBBox(), id1dTo3d[i]) << " "; + // std::cout << Neon::domain::tool::spaceCurves::Encoder::encode(spaceCurve, id3dTo1d.getBBox(), id1dTo3d[i]) << " "; // } // std::cout << std::endl; // std::cout << " BOX " << id3dTo1d.getBBox(); @@ -61,7 +61,7 @@ struct Hash // std::cout << std::endl // << " CODE "; // for (int i = 0; i < int(id1dTo3d.size()); i++) { -// std::cout << Neon::domain::tool::spaceCurves::Encoder::encode(encoderType, id3dTo1d.getBBox(), id1dTo3d[i]) << " "; +// std::cout << Neon::domain::tool::spaceCurves::Encoder::encode(spaceCurve, id3dTo1d.getBBox(), id1dTo3d[i]) << " "; // } // std::cout << std::endl; } diff --git a/libNeonDomain/src/domain/details/aGrid/aGrid.cpp b/libNeonDomain/src/domain/details/aGrid/aGrid.cpp index be36fd4c..87942976 100644 --- a/libNeonDomain/src/domain/details/aGrid/aGrid.cpp +++ b/libNeonDomain/src/domain/details/aGrid/aGrid.cpp @@ -61,7 +61,9 @@ auto aGrid::init(const Neon::Backend& backend, lenghts, blockDim, spacingData, - origin); + origin, + Neon::domain::tool::spaceCurves::EncoderType::sweep, + {0, 0, 0}); mStorage = std::make_shared(); diff --git a/libNeonDomain/src/domain/details/eGrid/eGrid.cpp b/libNeonDomain/src/domain/details/eGrid/eGrid.cpp index 164ae3b1..d4e12b7f 100644 --- a/libNeonDomain/src/domain/details/eGrid/eGrid.cpp +++ b/libNeonDomain/src/domain/details/eGrid/eGrid.cpp @@ -25,7 +25,9 @@ eGrid::eGrid(const Backend& backend, nElementsPerPartition, Neon::index_3d(256, 1, 
1), spacing, - origin); + origin, + partitioner.getSpaceCurve(), + {1,1,1}); } @@ -35,7 +37,7 @@ eGrid::eGrid(const Backend& backend, mData->mGlobalMappingAField = mData->partitioner1D.getGlobalMapping(); mData->mStencil3dTo1dOffset = mData->partitioner1D.getStencil3dTo1dOffset(); mData->memoryGrid = mData->partitioner1D.getMemoryGrid(); - //mData->partitioner1D.getDenseMeta(mData->denseMeta); + // mData->partitioner1D.getDenseMeta(mData->denseMeta); const int32_t numDevices = getBackend().devSet().setCardinality(); @@ -109,7 +111,9 @@ eGrid::eGrid(const Backend& backend, nElementsPerPartition, defaultBlockSize, spacing, - origin); + origin, + partitioner.getSpaceCurve(), + {1,1,1}); } } @@ -200,7 +204,7 @@ auto eGrid::convertToNghIdx(Neon::index_3d const& offset) auto eGrid::isInsideDomain(const index_3d& idx) const -> bool { - //auto const& metaInfo = mData->denseMeta.get(idx); + // auto const& metaInfo = mData->denseMeta.get(idx); auto const& metaInfo = mData->partitioner1D.getDenseMeta().get(idx); return metaInfo.isValid(); } @@ -225,7 +229,7 @@ auto eGrid::getProperties(const index_3d& idx) const -> GridBaseTemplate::CellPr if (this->getDevSet().setCardinality() == 1) { cellProperties.init(0, DataView::INTERNAL); } else { - //auto const& metaInfo = mData->denseMeta.get(idx); + // auto const& metaInfo = mData->denseMeta.get(idx); auto const& metaInfo = mData->partitioner1D.getDenseMeta().get(idx); cellProperties.init(metaInfo.setIdx, metaInfo.dw); } @@ -262,7 +266,7 @@ auto eGrid::helpGetSetIdxAndGridIdx(Neon::index_3d idx) const -> std::tupledenseMeta.get(idx); + // auto const& meta = mData->denseMeta.get(idx); auto const& meta = mData->partitioner1D.getDenseMeta().get(idx); if (meta.isValid()) { auto const& span = getSpan(Execution::host, meta.setIdx, Neon::DataView::STANDARD); diff --git a/libNeonDomain/src/domain/interface/GridBase.cpp b/libNeonDomain/src/domain/interface/GridBase.cpp index 81663239..3bfd8a21 100644 --- 
a/libNeonDomain/src/domain/interface/GridBase.cpp +++ b/libNeonDomain/src/domain/interface/GridBase.cpp @@ -3,14 +3,16 @@ namespace Neon::domain::interface { -auto GridBase::init(const std::string& gridImplementationName, - const Neon::Backend& backend, - const Neon::index_3d& dimension, - const Neon::domain::Stencil& stencil, - const Neon::set::DataSet& nPartitionElements, - const Neon::index_3d& defaultBlockSize, - const Vec_3d& spacingData, - const Vec_3d& origin) -> void +auto GridBase::init(const std::string& gridImplementationName, + const Neon::Backend& backend, + const Neon::index_3d& dimension, + const Neon::domain::Stencil& stencil, + const Neon::set::DataSet& nPartitionElements, + const Neon::index_3d& defaultBlockSize, + const Vec_3d& spacingData, + const Vec_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType spaceCurve, + Neon::index_3d memoryBlock) -> void { mStorage->backend = backend; mStorage->dimension = dimension; @@ -24,6 +26,8 @@ auto GridBase::init(const std::string& gridImplementationName, mStorage->defaults.launchParameters[DataViewUtil::toInt(dw)] = backend.devSet().newLaunchParameters(); } mStorage->defaults.blockDim = defaultBlockSize; + mStorage->spaceCurve = spaceCurve; + mStorage->memoryBlock = memoryBlock; } GridBase::GridBase() @@ -31,14 +35,16 @@ GridBase::GridBase() { } -GridBase::GridBase(const std::string& gridImplementationName, - const Neon::Backend& backend, - const Neon::index_3d& dimension, - const Neon::domain::Stencil& stencil, - const Neon::set::DataSet& nPartitionElements, - const Neon::index_3d& defaultBlockSize, - const Vec_3d& spacingData, - const Vec_3d& origin) +GridBase::GridBase(const std::string& gridImplementationName, + const Neon::Backend& backend, + const Neon::index_3d& dimension, + const Neon::domain::Stencil& stencil, + const Neon::set::DataSet& nPartitionElements, + const Neon::index_3d& defaultBlockSize, + const Vec_3d& spacingData, + const Vec_3d& origin, + 
Neon::domain::tool::spaceCurves::EncoderType spaceCurve, + Neon::index_3d memoryBlock) : mStorage(std::make_shared()) { init(gridImplementationName, @@ -48,7 +54,9 @@ GridBase::GridBase(const std::string& gridImplementationName, nPartitionElements, defaultBlockSize, spacingData, - origin); + origin, + spaceCurve, + memoryBlock); } auto GridBase::getDimension() const -> const Neon::index_3d& @@ -161,7 +169,8 @@ auto GridBase::toString() const -> std::string return tmp.str(); }() << "}" - << ", [Backend]:{" << getBackend().toString() << "}"; + << ", [Backend]:{" << getBackend().toString() << "}" + << ", [Memory]:{" << Neon::domain::tool::spaceCurves::EncoderTypeUtil::toString(mStorage->spaceCurve) << ", " << this->mStorage->memoryBlock << "}"; return s.str(); } @@ -232,10 +241,41 @@ auto GridBase::toReport(Neon::Report& report, }(), &subdoc); + report.addMember( + "MemoryBlock", + [&] { + std::stringstream list; + list << "["; + list << getMemoryBlock().x << " " + << getMemoryBlock().y << " " + << getMemoryBlock().z << "]"; + return list.str(); + }(), + &subdoc); + + report.addMember( + "SpaceCurve", + [&] { + std::stringstream list; + list << Neon::domain::tool::spaceCurves::EncoderTypeUtil::toString(mStorage->spaceCurve); + return list.str(); + }(), + &subdoc); + if (includeBackendInfo) getBackend().toReport(report, &subdoc); report.addSubdoc("Grid", subdoc); } +auto GridBase::getMemoryBlock() const -> Neon::index_3d +{ + return mStorage->memoryBlock; +} + +auto GridBase::getSpaceCurve() const -> Neon::domain::tool::spaceCurves::EncoderType +{ + return mStorage->spaceCurve; +} + } // namespace Neon::domain::interface \ No newline at end of file From 6ddabb3155caaef3d121979d8cff57dee6fdc545 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 25 Jul 2023 16:11:40 -0400 Subject: [PATCH 43/94] Fixes to python script. 
--- .../lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py index c560755f..2ce5dcd3 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py @@ -90,7 +90,9 @@ def countAll(): GRID + '_' + DOMAIN_SIZE + '-' + STORAGE_FP + '-' + COMPUTE_FP + '-' + - OCC) + OCC + '-' + + HU + '-' + + CURVE) parameters.append('--computeFP ' + COMPUTE_FP) parameters.append('--storageFP ' + STORAGE_FP) parameters.append('--curve ' + CURVE) From 7e158b662851bc0a7c21c7294995bc467394044a Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 28 Jul 2023 16:41:26 -0400 Subject: [PATCH 44/94] WIP --- .../lbm-lid-driven-cavity-flow/src/CellType.h | 15 + .../src/ContainerFactory.h | 19 ++ .../src/ContainersD3Q19.h | 288 ++++++++++++++---- .../src/ContainersD3Q27.h | 3 +- .../lbm-lid-driven-cavity-flow/src/D3Q19.h | 68 +++++ .../src/DeviceD3Q19.h | 221 ++++++++------ .../src/LbmSkeleton.h | 22 +- .../lbm-lid-driven-cavity-flow/src/Methods.h | 8 + .../src/RunCavityTwoPop.cu | 126 ++++---- 9 files changed, 547 insertions(+), 223 deletions(-) create mode 100644 benchmarks/lbm-lid-driven-cavity-flow/src/Methods.h diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/CellType.h b/benchmarks/lbm-lid-driven-cavity-flow/src/CellType.h index 7037b6ae..1ca70c6f 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/CellType.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/CellType.h @@ -22,13 +22,28 @@ struct CellType classification = c; wallNghBitflag = n; } + NEON_CUDA_HOST_DEVICE explicit CellType(Classification c) { classification = c; wallNghBitflag = 0; } + // Converting to int to exportVti + operator int() const { return int(classification); } + + template + static auto isWall(const 
uint32_t& wallNghBitFlag) + -> bool + { + return wallNghBitFlag & (uint32_t(1) << fwdRegIdx); + } + auto setWall(int fwdRegIdx) + -> void + { + wallNghBitflag = wallNghBitflag | ((uint32_t(1) << fwdRegIdx)); + } uint32_t wallNghBitflag; Classification classification; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h index 26f7a5a4..ce5f69a2 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h @@ -3,12 +3,31 @@ #include "Neon/Neon.h" #include "Neon/set/Containter.h" +namespace pull { template struct ContainerFactory { }; +} // namespace pull +namespace push { +template +struct ContainerFactory +{ +}; +} // namespace push + +namespace common { +template +struct ContainerFactory +{ +}; +} // namespace common #include "ContainersD3Q19.h" #include "ContainersD3Q27.h" \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h index d2ca08eb..fcbda83d 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h @@ -3,9 +3,11 @@ #include "CellType.h" #include "D3Q19.h" #include "DeviceD3Q19.h" +#include "Methods.h" #include "Neon/Neon.h" #include "Neon/set/Containter.h" +namespace pull { /** * Specialization for D3Q19 */ @@ -27,7 +29,77 @@ struct ContainerFactory; using U = typename Grid::template Field; - using Functions = DeviceD3Q19; + using PullFunctions = pull::DeviceD3Q19; + using CommonFunctions = common::DeviceD3Q19; + + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! 
Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "D3Q19_TwoPop_Pull", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popIn[Lattice::Q]; + PullFunctions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + + Compute rho; + std::array u{.0, .0, .0}; + CommonFunctions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + PullFunctions::collideBgkUnrolled(gidx, + popIn, + rho, u, + usqr, omega, + NEON_OUT fOut); + } + }; + }); + return container; + } + +}; +} // namespace pull +namespace push { +/** + * Specialization for D3Q19 + */ +template +struct ContainerFactory, + Grid_> +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + using PushFunctions = push::DeviceD3Q19; + using CommonFunctions = common::DeviceD3Q19; static auto iteration(Neon::set::StencilSemantic stencilSemantic, @@ -50,21 +122,23 @@ struct ContainerFactory u{.0, .0, .0}; - Functions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + CommonFunctions::macroscopic(popIn, + NEON_OUT rho, NEON_OUT u); Compute usqr = 1.5 * (u[0] * u[0] + u[1] * u[1] + u[2] * u[2]); - Functions::collideBgkUnrolled(gidx, - popIn, - rho, u, - usqr, omega, - 
NEON_OUT fOut); + CommonFunctions::collideBgkUnrolled(gidx, + rho, u, + usqr, omega, + NEON_IO popIn); + + PushFunctions::pushStream(gidx, cellInfo.wallNghBitflag, popIn, NEON_OUT fOut); } }; }); @@ -109,63 +183,152 @@ struct ContainerFactory +struct ContainerFactory, + Grid_> +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + using PullFunctions = pull::DeviceD3Q19; + using PushFunctions = push::DeviceD3Q19; + using CommonFunctions = common::DeviceD3Q19; + + template static auto - computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, - const CellTypeField& cellTypeField /*! Cell type field */, - Rho& rhoField /*! output Population field */, - U& uField /*! output Population field */) + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! 
Output Population field */) + -> Neon::set::Container + { + if constexpr (method == int(Method::push)) { + using Factory = push::ContainerFactory, + Grid_>; + return Factory::iteration(stencilSemantic, + fInField, + fOutField, + omega, + fOutField); + } + if constexpr (method == int(Method::pull)) { + using Factory = pull::ContainerFactory, + Grid_>; + return Factory::iteration(stencilSemantic, + fInField, + fOutField, + omega, + fOutField); + } + } + + + static auto + computeWallNghMask(const CellTypeField& infoInField, + CellTypeField& infoOutpeField) -> Neon::set::Container { - Neon::set::Container container = fInField.getGrid().newContainer( + Neon::set::Container container = infoInField.getGrid().newContainer( "LBM_iteration", [&](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, - Neon::Pattern::STENCIL); - auto& rhoXpu = L.load(rhoField); - auto& uXpu = L.load(uField); - - const auto& cellInfoPartition = L.load(cellTypeField); + auto& infoIn = L.load(infoInField, Neon::Pattern::STENCIL); + auto& infoOut = L.load(infoOutpeField); return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - CellType cellInfo = cellInfoPartition(gidx, 0); - Compute rho = 0; - std::array u{.0, .0, .0}; - Storage popIn[Lattice::Q]; - - if (cellInfo.classification == CellType::bulk) { + CellType cellType = infoIn(gidx, 0); + cellType.wallNghBitflag = 0; - Functions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); - Functions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + if (cellType.classification == CellType::bulk) { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { + if constexpr (GOMemoryId != Lattice::Memory::center) { + constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId]; + constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x; + constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y; + constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z; - } else { - if (cellInfo.classification == 
CellType::movingWall) { - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GORegisterId) { - if constexpr (GORegisterId == Lattice::Registers::center) { - popIn[Lattice::Registers::center] = fIn(gidx, Lattice::Memory::center); - } else { - popIn[GORegisterId] = fIn(gidx, Lattice::Memory::template mapFromRegisters()); + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); + if (nghCellType.classification != CellType::bulk) { + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOMemoryId)); } - }); + } + }); - rho = 1.0; - u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), - static_cast(popIn[1]) / static_cast(6. * 1. / 18.), - static_cast(popIn[2]) / static_cast(6. * 1. / 18.)}; - } + infoOut(gidx, 0) = cellType; } - - rhoXpu(gidx, 0) = static_cast(rho); - uXpu(gidx, 0) = static_cast(u[0]); - uXpu(gidx, 1) = static_cast(u[1]); - uXpu(gidx, 2) = static_cast(u[2]); }; }); return container; } + + static auto + computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! 
output Population field */) + + -> Neon::set::Container + { + Neon::set::Container container = + fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + Compute rho = 0; + std::array u{.0, .0, .0}; + + Storage popIn[Lattice::Q]; + CommonFunctions::localLoad(gidx, fIn, NEON_OUT popIn); + + if (cellInfo.classification == CellType::bulk) { + CommonFunctions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + } else { + if (cellInfo.classification == CellType::movingWall) { + rho = 1.0; + u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), + static_cast(popIn[1]) / static_cast(6. * 1. / 18.), + static_cast(popIn[2]) / static_cast(6. * 1. / 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } + static auto problemSetup(PopField& fInField /*! inpout population field */, PopField& fOutField, @@ -183,44 +346,47 @@ struct ContainerFactory([&](auto q) { - if (globlalIdx.y == domainDim.y - 1) { - val = -6. * Lattice::Memory::template getT() * ulb * - (Lattice::Memory::template getDirection().v[0] * ulid.v[0] + - Lattice::Memory::template getDirection().v[1] * ulid.v[1] + - Lattice::Memory::template getDirection().v[2] * ulid.v[2]); + if (globalIdx.y == domainDim.y - 1) { + popVal = -6. 
* Lattice::Memory::template getT() * ulb * + (Lattice::Memory::template getDirection().v[0] * ulid.v[0] + + Lattice::Memory::template getDirection().v[1] * ulid.v[1] + + Lattice::Memory::template getDirection().v[2] * ulid.v[2]); } else { - val = 0; + popVal = 0; } - fIn(gidx, q) = val; - fOut(gidx, q) = val; + fIn(gidx, q) = popVal; + fOut(gidx, q) = popVal; }); } else { flagVal.classification = CellType::bulk; - cellInfoPartition(gidx, 0) = flagVal; Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { fIn(gidx, q) = Lattice::Memory::template getT(); fOut(gidx, q) = Lattice::Memory::template getT(); }); } + cellInfoPartition(gidx, 0) = flagVal; }; }); return container; } }; +} // namespace common \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h index e41b9cfe..d5d024ea 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h @@ -5,7 +5,7 @@ #include "DeviceD3Q27.h" #include "Neon/Neon.h" #include "Neon/set/Containter.h" - +#if 0 /** * Specialization for D3Q27 */ @@ -224,3 +224,4 @@ struct ContainerFactory fwdRegIdxList{0, 1, 2, 3, 4, 5, 6, 7, 8}; + + template + static inline NEON_CUDA_HOST_DEVICE auto + getCk_u(std::array const& u) -> Compute + { + if constexpr (tegIdx == 0 || tegIdx == 9) { + return u[0]; + } + if constexpr (tegIdx == 1 || tegIdx == 10) { + return u[1]; + } + if constexpr (tegIdx == 2 || tegIdx == 11) { + return u[2]; + } + if constexpr (tegIdx == 3 || tegIdx == 12) { + return u[0] + u[1]; + } + if constexpr (tegIdx == 4 || tegIdx == 13) { + return u[0] - u[1]; + } + if constexpr (tegIdx == 5 || tegIdx == 14) { + return u[0] + u[2]; + } + if constexpr (tegIdx == 6 || tegIdx == 15) { + + return u[0] - u[2]; + } + if constexpr (tegIdx == 7 || tegIdx == 16) { + + return u[1] + u[2]; + } + if constexpr (tegIdx == 8 || tegIdx == 17) { + return u[1] - u[2]; + } 
+ } }; struct Memory @@ -125,6 +163,7 @@ struct D3Q19 static constexpr std::array toMemory{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; + template NEON_CUDA_HOST_DEVICE static constexpr auto mapToRegisters() -> int @@ -157,6 +196,35 @@ struct D3Q19 return Registers::t[goInRegisterSpace]; } + template + struct MemMapper + { + constexpr static int fwMemIdx = fwMemIdx_; + constexpr static int fwX = Memory::stencil[fwMemIdx].x; + constexpr static int fwY = Memory::stencil[fwMemIdx].y; + constexpr static int fwZ = Memory::stencil[fwMemIdx].z; + + constexpr static int bkMemIdx = Memory::opposite[fwMemIdx]; + constexpr static int bkX = Memory::stencil[bkMemIdx].x; + constexpr static int bkY = Memory::stencil[bkMemIdx].y; + constexpr static int bkZ = Memory::stencil[bkMemIdx].z; + + constexpr static int fwRegIdx = Memory::template mapToRegisters(); + constexpr static int centerRegIdx = Registers::center; + constexpr static int centerMemIdx = Memory::center; + }; + + template + struct RegMapper + { + constexpr static int fwRegIdx = fwRegIdx_; + constexpr static int bkRegIdx = Registers::opposite[fwRegIdx]; + constexpr static int fwMemIdx = Registers::template mapToMemory(); + constexpr static int bkMemIdx = Registers::template mapToMemory(); + constexpr static int centerRegIdx = Registers::center; + constexpr static int centerMemIdx = Memory::center; + }; + static constexpr std::array t{ 1. / 18. /*! 0 */, 1. / 18. /*! 
1 */, diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h index 2ad6a62a..fff6f2b3 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h @@ -4,7 +4,7 @@ #include "Neon/Neon.h" #include "Neon/set/Containter.h" - +namespace pull { template struct DeviceD3Q19 { @@ -28,28 +28,102 @@ struct DeviceD3Q19 typename PopField::Partition const& fin, NEON_OUT Storage popIn[Lattice::Q]) { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto fwMemIdx) { + using M = typename Lattice::template MappersIdxSetWithFwdMem; - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { - if constexpr (GOMemoryId == Lattice::Memory::center) { - popIn[Lattice::Registers::center] = fin(gidx, Lattice::Memory::center); + if constexpr (fwMemIdx == Lattice::Memory::center) { + popIn[M::centerRegIdx] = fin(gidx, M::centerMemIdx); } else { - constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId]; - constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x; - constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y; - constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z; - constexpr int GORegistersId = Lattice::Memory::template mapToRegisters(); - - if (wallBitFlag & (uint32_t(1) << GOMemoryId)) { - popIn[GORegistersId] = - fin(gidx, BKMemoryId) + - fin.template getNghData(gidx, BKMemoryId)(); + if (CellType::isWall()) { + popIn[M::fwRegIdx] = fin(gidx, M::bkMemIdx) + + fin.template getNghData(gidx, M::bkMemIdx)(); } else { - popIn[GORegistersId] = - fin.template getNghData(gidx, GOMemoryId)(); + popIn[M::fwRegIdx] = fin.template getNghData(gidx, fwMemIdx)(); } } }); } +}; + +#undef CAST_TO_COMPUTE +} // namespace pull + +namespace push { +template +struct DeviceD3Q19 +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = 
Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + + static inline NEON_CUDA_HOST_DEVICE auto + pushStream(Idx const& gidx, + const uint32_t& wallNghBitFlag, + NEON_OUT Storage pOut[Lattice::Q], + NEON_OUT typename PopField::Partition const& fOut) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto fwMemIdx_) { + using M = typename Lattice::template MappersIdxSetWithFwdMem; + + if constexpr (M::fwMemIdx == M::centerMemIdx) { + fOut(gidx, M::fwMemIdx) = pOut[M::fwRegIdx]; + } else { + if (CellType::isWall()) { + // fout(i, opp[k]) = + // pop_out + + // f(nb, k); + fOut(gidx, M::bkMemIdx) = + pOut[M::fwdRegIdx] + + fOut.template getNghData(gidx, M::fwMemIdx)(); + } else { + // fout(nb, k) = pop_out; + fOut.writeNgh(gidx, M::fwMemIdx, pOut[M::fwdRegIdx]); + } + } + }); + } + + static inline NEON_CUDA_HOST_DEVICE auto + localLoad(Idx const& gidx, + NEON_IN typename PopField::Partition const& fOut, + Storage NEON_RESTRICT pOut[Lattice::Q]) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto fwMemIdx_) { + using M = typename Lattice::template MappersIdxSetWithFwdMem; + pOut[M::fwdRegIdx] = fOut(gidx, M::fwMemIdx); + }); + } +}; +} // namespace push + + +namespace common { +template +struct DeviceD3Q19 +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + static inline NEON_CUDA_HOST_DEVICE auto macroscopic(const Storage pop[Lattice::Q], @@ -79,21 +153,14 @@ struct DeviceD3Q19 static inline NEON_CUDA_HOST_DEVICE auto 
collideBgkUnrolled(Idx const& i /*! Compute iterator */, - const Storage pop[Lattice::Q], Compute const& rho /*! Density */, std::array const& u /*! Velocity */, Compute const& usqr /*! Usqr */, Compute const& omega /*! Omega */, - typename PopField::Partition& fOut /*! Population */) + NEON_IO Storage pop[Lattice::Q]) -> void { - const Compute ck_u03 = u[0] + u[1]; - const Compute ck_u04 = u[0] - u[1]; - const Compute ck_u05 = u[0] + u[2]; - const Compute ck_u06 = u[0] - u[2]; - const Compute ck_u07 = u[1] + u[2]; - const Compute ck_u08 = u[1] - u[2]; constexpr Compute c1over18 = 1. / 18.; constexpr Compute c1over36 = 1. / 36.; @@ -102,72 +169,46 @@ struct DeviceD3Q19 constexpr Compute c1 = 1.; constexpr Compute c6 = 6.; - const Compute eq_00 = rho * c1over18 * (c1 - c6 * u[0] + c4dot5 * u[0] * u[0] - usqr); - const Compute eq_01 = rho * c1over18 * (c1 - c6 * u[1] + c4dot5 * u[1] * u[1] - usqr); - const Compute eq_02 = rho * c1over18 * (c1 - c6 * u[2] + c4dot5 * u[2] * u[2] - usqr); - const Compute eq_03 = rho * c1over36 * (c1 - c6 * ck_u03 + c4dot5 * ck_u03 * ck_u03 - usqr); - const Compute eq_04 = rho * c1over36 * (c1 - c6 * ck_u04 + c4dot5 * ck_u04 * ck_u04 - usqr); - const Compute eq_05 = rho * c1over36 * (c1 - c6 * ck_u05 + c4dot5 * ck_u05 * ck_u05 - usqr); - const Compute eq_06 = rho * c1over36 * (c1 - c6 * ck_u06 + c4dot5 * ck_u06 * ck_u06 - usqr); - const Compute eq_07 = rho * c1over36 * (c1 - c6 * ck_u07 + c4dot5 * ck_u07 * ck_u07 - usqr); - const Compute eq_08 = rho * c1over36 * (c1 - c6 * ck_u08 + c4dot5 * ck_u08 * ck_u08 - usqr); - - const Compute eqopp_00 = eq_00 + rho * c1over18 * c6 * u[0]; - const Compute eqopp_01 = eq_01 + rho * c1over18 * c6 * u[1]; - const Compute eqopp_02 = eq_02 + rho * c1over18 * c6 * u[2]; - const Compute eqopp_03 = eq_03 + rho * c1over36 * c6 * ck_u03; - const Compute eqopp_04 = eq_04 + rho * c1over36 * c6 * ck_u04; - const Compute eqopp_05 = eq_05 + rho * c1over36 * c6 * ck_u05; - const Compute eqopp_06 = eq_06 + rho * 
c1over36 * c6 * ck_u06; - const Compute eqopp_07 = eq_07 + rho * c1over36 * c6 * ck_u07; - const Compute eqopp_08 = eq_08 + rho * c1over36 * c6 * ck_u08; - - const Compute pop_out_00 = (c1 - omega) * static_cast(pop[0]) + omega * eq_00; - const Compute pop_out_01 = (c1 - omega) * static_cast(pop[1]) + omega * eq_01; - const Compute pop_out_02 = (c1 - omega) * static_cast(pop[2]) + omega * eq_02; - const Compute pop_out_03 = (c1 - omega) * static_cast(pop[3]) + omega * eq_03; - const Compute pop_out_04 = (c1 - omega) * static_cast(pop[4]) + omega * eq_04; - const Compute pop_out_05 = (c1 - omega) * static_cast(pop[5]) + omega * eq_05; - const Compute pop_out_06 = (c1 - omega) * static_cast(pop[6]) + omega * eq_06; - const Compute pop_out_07 = (c1 - omega) * static_cast(pop[7]) + omega * eq_07; - const Compute pop_out_08 = (c1 - omega) * static_cast(pop[8]) + omega * eq_08; - - const Compute pop_out_opp_00 = (c1 - omega) * static_cast(pop[10]) + omega * eqopp_00; - const Compute pop_out_opp_01 = (c1 - omega) * static_cast(pop[11]) + omega * eqopp_01; - const Compute pop_out_opp_02 = (c1 - omega) * static_cast(pop[12]) + omega * eqopp_02; - const Compute pop_out_opp_03 = (c1 - omega) * static_cast(pop[13]) + omega * eqopp_03; - const Compute pop_out_opp_04 = (c1 - omega) * static_cast(pop[14]) + omega * eqopp_04; - const Compute pop_out_opp_05 = (c1 - omega) * static_cast(pop[15]) + omega * eqopp_05; - const Compute pop_out_opp_06 = (c1 - omega) * static_cast(pop[16]) + omega * eqopp_06; - const Compute pop_out_opp_07 = (c1 - omega) * static_cast(pop[17]) + omega * eqopp_07; - const Compute pop_out_opp_08 = (c1 - omega) * static_cast(pop[18]) + omega * eqopp_08; - - -#define COMPUTE_GO_AND_BACK(GOid, BKid) \ - { \ - fOut(i, Lattice::Memory::template mapFromRegisters()) = static_cast(pop_out_0##GOid); \ - fOut(i, Lattice::Memory::template mapFromRegisters()) = static_cast(pop_out_opp_0##GOid); \ - } - - COMPUTE_GO_AND_BACK(0, 10) - COMPUTE_GO_AND_BACK(1, 11) - 
COMPUTE_GO_AND_BACK(2, 12) - COMPUTE_GO_AND_BACK(3, 13) - COMPUTE_GO_AND_BACK(4, 14) - COMPUTE_GO_AND_BACK(5, 15) - COMPUTE_GO_AND_BACK(6, 16) - COMPUTE_GO_AND_BACK(7, 17) - COMPUTE_GO_AND_BACK(8, 18) - -#undef COMPUTE_GO_AND_BACK - - { - const Compute eq_09 = rho * (c1 / c3) * (c1 - usqr); - const Compute pop_out_09 = (c1 - omega) * static_cast(pop[Lattice::Registers::center]) + - omega * eq_09; - fOut(i, Lattice::Memory::center) = static_cast(pop_out_09); + constexpr int regCenter = Lattice::Registers::center; + constexpr int regFir = Lattice::Registers::center; + + Neon::ConstexprFor<0, Lattice::Registers::fwdRegIdxListLen, 1>( + [&](auto fwdRegIdxListIdx) { + using M = typename Lattice::template RegMapper; + using T = typename Lattice::Registers; + + Compute eqFw; + Compute eqBk; + + const Compute ck_u = T::template getCk_u(u); + // double eq = rho * t[k] * + // (1. + + // 3. * ck_u + + // 4.5 * ck_u * ck_u - + // usqr); + eqFw = rho * T::t[M::fwRegIdx] * + (c1 + + c3 * ck_u + + c4dot5 * ck_u * ck_u - + usqr); + + // double eqopp = eq - 6.* rho * t[k] * ck_u; + eqBk = eqFw - + c6 * rho * c1over36 * T::t[M::fwRegIdx] * ck_u; + + // pop_out = (1. - omega) * fin(i, k) + omega * eq; + pop[M::fwRegIdx] = (c1 - omega) * static_cast(pop[M::fwRegIdx]) + omega * eqFw; + // pop_out_opp = (1. - omega) * fin(i, opp[k]) + omega * eqopp; + pop[M::bkRegIdx] = (c1 - omega) * static_cast(pop[M::bkRegIdx]) + omega * eqBk; + }); + { // Center; + using T = typename Lattice::Registers; + using M = typename Lattice::template RegMapper; + // eq = rho * t[k] * (1. - usqr); + const Compute eqCenter = rho * T::t[M::fwRegIdx] * (c1 - usqr); + // fout(i, k) = (1. 
- omega) * fin(i, k) + omega * eq; + pop[Lattice::Registers::center] = (c1 - omega) * static_cast(pop[M::fwRegIdx]) + omega * eqCenter; } } }; - -#undef CAST_TO_COMPUTE \ No newline at end of file +} // namespace common \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h index 3408bee4..22ae8177 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h @@ -7,7 +7,8 @@ #include "Neon/set/Containter.h" #include "Neon/skeleton/Skeleton.h" -template struct LbmSkeleton @@ -15,8 +16,11 @@ struct LbmSkeleton }; -template -struct LbmSkeleton +struct LbmSkeleton, Grid_> { @@ -33,7 +37,7 @@ struct LbmSkeleton; using U = typename Grid::template Field; - using ContainerFactory = ContainerFactory; + using ContainerFactory = common::ContainerFactory; LbmSkeleton(Neon::set::StencilSemantic stencilSemantic, Neon::skeleton::Occ occ, @@ -97,11 +101,11 @@ struct LbmSkeleton ops; lbmTwoPop[target] = Neon::skeleton::Skeleton(inField.getBackend()); Neon::skeleton::Options opt(occ, transfer); - ops.push_back(ContainerFactory::iteration(stencilSemantic, - inField, - cellTypeField, - omega, - outField)); + ops.push_back(ContainerFactory::template iteration(stencilSemantic, + inField, + cellTypeField, + omega, + outField)); std::stringstream appName; appName << "LBM_iteration_" << std::to_string(target); lbmTwoPop[target].sequence(ops, appName.str(), opt); diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Methods.h b/benchmarks/lbm-lid-driven-cavity-flow/src/Methods.h new file mode 100644 index 00000000..4d3bf178 --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Methods.h @@ -0,0 +1,8 @@ +#pragma once + +enum class Method +{ + push, + pull, + aa +}; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index 
aa9bc461..146f108f 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -15,7 +15,8 @@ namespace CavityTwoPop { int backendWasReported = false; namespace details { -template auto run(Config& config, @@ -52,7 +53,6 @@ auto run(Config& config, }(); - Neon::double_3d ulid(1., 0., 0.); // Neon Grid and Fields initialization auto [start, clock_iter] = metrics::restartClock(bk, true); @@ -60,7 +60,7 @@ auto run(Config& config, bk, {config.N, config.N, config.N}, [](const Neon::index_3d&) { return true; }, Lattice::template getDirectionAsVector(), - 0.0, 1.0, + 1.0, 0.0, config.spaceCurve); if (!backendWasReported) { @@ -95,7 +95,7 @@ auto run(Config& config, lbmParameters.omega); auto exportRhoAndU = [&bk, &rho, &u, &iteration, &flag, &grid, &ulid](int iterationId) { - if ((iterationId) % 100 == 0) { + if ((iterationId) % 1 == 0) { auto& f = iteration.getInput(); { bk.syncAll(); @@ -120,7 +120,8 @@ auto run(Config& config, u.ioToVtk("u_" + iterIdStr, "u", false); rho.ioToVtk("rho_" + iterIdStr, "rho", false); // iteration.getInput().ioToVtk("pop_" + iterIdStr, "u", false); - // flag.ioToVtk("flag_" + iterIdStr, "u", false); + flag.template ioToVtk("flag_" + iterIdStr, "flag", false); + flag.template ioToVtk("flag_" + iterIdStr, "flag", false); std::vector> xPosVal; std::vector> yPosVal; @@ -245,9 +246,9 @@ auto runFilterComputeType(Config& config, Report& report) -> void if (config.computeType == "double") { return run(config, report); } - if (config.computeType == "float") { - return run(config, report); - } +// if (config.computeType == "float") { +// return run(config, report); +// } NEON_DEV_UNDER_CONSTRUCTION(""); } @@ -259,9 +260,10 @@ auto runFilterStoreType(Config& config, if (config.storeType == "double") { return runFilterComputeType(config, report); } - if (config.storeType == "float") { - return runFilterComputeType(config, report); - } +// if (config.storeType == 
"float") { +// return runFilterComputeType(config, report); +// } + NEON_DEV_UNDER_CONSTRUCTION(""); } } // namespace details @@ -277,77 +279,77 @@ auto run(Config& config, if (config.gridType == "dGrid") { return details::runFilterStoreType(config, report); } - if (config.gridType == "eGrid") { - if constexpr (!skipTest) { - return details::runFilterStoreType(config, report); - } else { - NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") - } - } - if (config.gridType == "bGrid" || config.gridType == "bGrid_8_8_8") { - return details::runFilterStoreType(config, report); - } - if (config.gridType == "bGrid_4_4_4") { - if constexpr (!skipTest) { - using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>; - using Grid = Neon::domain::details::bGrid::bGrid; - return details::runFilterStoreType(config, report); - } else { - NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") - } - } - if (config.gridType == "bGrid_2_2_2") { - if constexpr (!skipTest) { - using Sblock = Neon::domain::details::bGrid::StaticBlock<2, 2, 2>; - using Grid = Neon::domain::details::bGrid::bGrid; - return details::runFilterStoreType(config, report); - } else { - NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") - } - } -// if (config.gridType == "bGrid_32_8_4") { +// if (config.gridType == "eGrid") { // if constexpr (!skipTest) { -// using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; -// using Grid = Neon::domain::details::bGrid::bGrid; -// return details::runFilterStoreType(config, report); +// return details::runFilterStoreType(config, report); // } else { // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") // } // } -// if (config.gridType == "bGrid_32_8_4") { -// if constexpr (!skipTest) { -// using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; -// using Grid = Neon::domain::details::bGrid::bGrid; -// return details::runFilterStoreType(config, report); -// } else { -// NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") -// } +// if (config.gridType == "bGrid" || config.gridType == "bGrid_8_8_8") { +// return details::runFilterStoreType(config, report); // } -// if (config.gridType == "bGrid_32_2_8") { +// if (config.gridType == "bGrid_4_4_4") { // if constexpr (!skipTest) { -// using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 2, 8>; +// using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>; // using Grid = Neon::domain::details::bGrid::bGrid; // return details::runFilterStoreType(config, report); // } else { // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") // } // } -// if (config.gridType == "bGrid_32_8_2") { +// if (config.gridType == "bGrid_2_2_2") { // if constexpr (!skipTest) { -// using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 2>; +// using Sblock = Neon::domain::details::bGrid::StaticBlock<2, 2, 2>; // using Grid = Neon::domain::details::bGrid::bGrid; // return details::runFilterStoreType(config, report); // } else { // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") // } // } - if (config.gridType == "dGridSoA") { - if constexpr (!skipTest) { - return details::runFilterStoreType(config, report); - } else { - NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") - } - } + // if (config.gridType == "bGrid_32_8_4") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid_32_8_4") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid_32_2_8") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 2, 8>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid_32_8_2") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 2>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "dGridSoA") { + // if constexpr (!skipTest) { + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } NEON_THROW_UNSUPPORTED_OPERATION("Unknown grid type: " + config.gridType); } } // namespace CavityTwoPop From a43ec4ef77a22501ce8b3aef68cc8357f2cb1d5c Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 1 Aug 2023 14:33:48 -0400 Subject: [PATCH 45/94] WIP: new lbm benchmark --- benchmarks/CMakeLists.txt | 3 +- benchmarks/lbm/CMakeLists.txt | 33 ++ benchmarks/lbm/lbm.py | 118 ++++++ benchmarks/lbm/lbm.sh | 30 ++ benchmarks/lbm/src/CellType.h | 56 +++ benchmarks/lbm/src/Config.cpp | 127 +++++++ benchmarks/lbm/src/Config.h | 66 ++++ benchmarks/lbm/src/ContainerFactory.h | 34 ++ benchmarks/lbm/src/ContainersD3Q19.h | 429 +++++++++++++++++++++ benchmarks/lbm/src/ContainersD3Q27.h | 227 +++++++++++ benchmarks/lbm/src/D3Q19.h | 258 +++++++++++++ benchmarks/lbm/src/D3Q27.h | 200 ++++++++++ benchmarks/lbm/src/DeviceD3Q19.h | 224 +++++++++++ benchmarks/lbm/src/DeviceD3Q27.h | 217 +++++++++++ benchmarks/lbm/src/Lbm.h | 324 ++++++++++++++++ benchmarks/lbm/src/LbmSkeleton.h | 117 ++++++ benchmarks/lbm/src/LbmToolsTemplateOnly.h | 440 ++++++++++++++++++++++ benchmarks/lbm/src/Methods.h | 45 +++ benchmarks/lbm/src/Metrics.h | 88 +++++ benchmarks/lbm/src/Precision.h | 13 + benchmarks/lbm/src/Repoert.h | 40 ++ benchmarks/lbm/src/Report.cpp | 108 ++++++ benchmarks/lbm/src/RunCavityTwoPop.cu | 203 ++++++++++ benchmarks/lbm/src/RunCavityTwoPop.h | 12 + benchmarks/lbm/src/app.cpp | 47 +++ 25 files changed, 3458 insertions(+), 1 deletion(-) create mode 100644 benchmarks/lbm/CMakeLists.txt create mode 100644 benchmarks/lbm/lbm.py create mode 100644 benchmarks/lbm/lbm.sh create mode 100644 benchmarks/lbm/src/CellType.h create mode 100644 benchmarks/lbm/src/Config.cpp create mode 100644 benchmarks/lbm/src/Config.h create mode 100644 benchmarks/lbm/src/ContainerFactory.h create mode 100644 benchmarks/lbm/src/ContainersD3Q19.h create mode 100644 benchmarks/lbm/src/ContainersD3Q27.h create mode 100644 
benchmarks/lbm/src/D3Q19.h create mode 100644 benchmarks/lbm/src/D3Q27.h create mode 100644 benchmarks/lbm/src/DeviceD3Q19.h create mode 100644 benchmarks/lbm/src/DeviceD3Q27.h create mode 100644 benchmarks/lbm/src/Lbm.h create mode 100644 benchmarks/lbm/src/LbmSkeleton.h create mode 100644 benchmarks/lbm/src/LbmToolsTemplateOnly.h create mode 100644 benchmarks/lbm/src/Methods.h create mode 100644 benchmarks/lbm/src/Metrics.h create mode 100644 benchmarks/lbm/src/Precision.h create mode 100644 benchmarks/lbm/src/Repoert.h create mode 100644 benchmarks/lbm/src/Report.cpp create mode 100644 benchmarks/lbm/src/RunCavityTwoPop.cu create mode 100644 benchmarks/lbm/src/RunCavityTwoPop.h create mode 100644 benchmarks/lbm/src/app.cpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 10c30fea..efb267c6 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,4 +1,5 @@ cmake_minimum_required(VERSION 3.19 FATAL_ERROR) -add_subdirectory("lbm-lid-driven-cavity-flow") +add_subdirectory(lbm) +# add_subdirectory("lbm-lid-driven-cavity-flow") # add_subdirectory("lbm-flow-over-sphere") diff --git a/benchmarks/lbm/CMakeLists.txt b/benchmarks/lbm/CMakeLists.txt new file mode 100644 index 00000000..7f0c1415 --- /dev/null +++ b/benchmarks/lbm/CMakeLists.txt @@ -0,0 +1,33 @@ +cmake_minimum_required(VERSION 3.19 FATAL_ERROR) + +SET(APP "lbm") + +file(GLOB_RECURSE SrcFiles src/*.*) + +add_executable(${APP} ${SrcFiles}) + +target_link_libraries(${APP} + PUBLIC libNeonDomain + PUBLIC libNeonSkeleton) + +set_target_properties(${APP} PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +target_compile_options(${APP} INTERFACE + $<$:${NeonCXXFlags}> + $<$:${NeonCUDAFlags}> + ) + +add_custom_command( + TARGET ${APP} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_SOURCE_DIR}/${APP}.sh + ${CMAKE_BINARY_DIR}/bin/${APP}.sh) + +add_custom_command( + TARGET ${APP} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + 
${CMAKE_CURRENT_SOURCE_DIR}/${APP}.py + ${CMAKE_BINARY_DIR}/bin/${APP}.py +) \ No newline at end of file diff --git a/benchmarks/lbm/lbm.py b/benchmarks/lbm/lbm.py new file mode 100644 index 00000000..2ce5dcd3 --- /dev/null +++ b/benchmarks/lbm/lbm.py @@ -0,0 +1,118 @@ +DOMAIN_SIZE_LIST = "64 128 192 256 320 384 448 512".split() +DEVICE_ID_LIST = "0 1 2 3 4 5 6 7".split() +DEVICE_TYPE_LIST = 'cpu gpu'.split() +GRID_LIST = "dGrid bGrid eGrid".split() +STORAGE_FP_LIST = "double float".split() +COMPUTE_FP_LIST = "double float".split() +OCC_LIST = "nOCC sOCC".split() +HU_LIST = "huGrid huLattice".split() +CURVE_LIST = "sweep morton hilbert".split() +WARM_UP_ITER = 10 +MAX_ITER = 10000 +REPETITIONS = 5 + +import subprocess +import sys + + +def printProgressBar(value, label): + n_bar = 40 # size of progress bar + max = 100 + j = value / max + sys.stdout.write('\r') + bar = 'â–ˆ' * int(n_bar * j) + bar = bar + '-' * int(n_bar * (1 - j)) + + sys.stdout.write(f"{label.ljust(10)} | [{bar:{n_bar}s}] {int(100 * j)}% ") + sys.stdout.flush() + + +def countAll(): + counter = 0 + for DEVICE_TYPE in DEVICE_TYPE_LIST: + DEVICE_SET_LIST = [DEVICE_ID_LIST[0]] + if DEVICE_TYPE == 'gpu': + for DEVICE in DEVICE_ID_LIST[1:]: + DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) + for OCC in OCC_LIST: + for DOMAIN_SIZE in DOMAIN_SIZE_LIST: + for STORAGE_FP in STORAGE_FP_LIST: + for COMPUTE_FP in COMPUTE_FP_LIST: + for DEVICE_SET in DEVICE_SET_LIST: + for GRID in GRID_LIST: + for HU in HU_LIST: + for CURVE in CURVE_LIST: + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + if STORAGE_FP == 'float' and COMPUTE_FP == 'double': + continue + + counter += 1 + return counter + + +SAMPLES = countAll() +counter = 0 +command = './lbm-lid-driven-cavity-flow' +# command = 'echo' +with open(command + '.log', 'w') as fp: + for DEVICE_TYPE in DEVICE_TYPE_LIST: + DEVICE_SET_LIST = [DEVICE_ID_LIST[0]] + if DEVICE_TYPE == 'gpu': + for DEVICE in DEVICE_ID_LIST[1:]: + 
DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) + for DEVICE_SET in DEVICE_SET_LIST: + for OCC in OCC_LIST: + for DOMAIN_SIZE in DOMAIN_SIZE_LIST: + for STORAGE_FP in STORAGE_FP_LIST: + for COMPUTE_FP in COMPUTE_FP_LIST: + for GRID in GRID_LIST: + for HU in HU_LIST: + for CURVE in CURVE_LIST: + + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + if STORAGE_FP == 'float' and COMPUTE_FP == 'double': + continue + + parameters = [] + parameters.append('--deviceType ' + DEVICE_TYPE) + parameters.append('--deviceIds ' + DEVICE_SET) + parameters.append('--grid ' + GRID) + parameters.append('--domain-size ' + DOMAIN_SIZE) + parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) + parameters.append('--repetitions ' + str(REPETITIONS)) + parameters.append('--max-iter ' + str(MAX_ITER)) + parameters.append( + '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + + DEVICE_TYPE + '_' + + DEVICE_SET.replace(' ', '_') + '-' + + GRID + '_' + + DOMAIN_SIZE + '-' + + STORAGE_FP + '-' + COMPUTE_FP + '-' + + OCC + '-' + + HU + '-' + + CURVE) + parameters.append('--computeFP ' + COMPUTE_FP) + parameters.append('--storageFP ' + STORAGE_FP) + parameters.append('--curve ' + CURVE) + + parameters.append('--benchmark') + parameters.append('--' + OCC) + parameters.append('--' + HU) + + commandList = [] + commandList.append(command) + for el in parameters: + for s in el.split(): + commandList.append(s) + + fp.write("\n-------------------------------------------\n") + fp.write(' '.join(commandList)) + fp.write("\n-------------------------------------------\n") + fp.flush() + print(' '.join(commandList)) + subprocess.run(commandList, text=True, stdout=fp) + + counter += 1 + printProgressBar(counter * 100.0 / SAMPLES, 'Progress') diff --git a/benchmarks/lbm/lbm.sh b/benchmarks/lbm/lbm.sh new file mode 100644 index 00000000..7cc5108c --- /dev/null +++ b/benchmarks/lbm/lbm.sh @@ -0,0 +1,30 @@ +set -x + +DOMAIN_SIZE_LIST="64 128 192 256 320 384 448 512" 
+GRID_LIST="dGrid bGrid eGrid" +STORAGE_FP_LIST="double float" +COMPUTE_FP_LIST="double float" +OCC="nOCC" + +for DOMAIN_SIZE in ${DOMAIN_SIZE_LIST}; do + for STORAGE_FP in ${STORAGE_FP_LIST}; do + for COMPUTE_FP in ${COMPUTE_FP_LIST}; do + for GRID in ${GRID_LIST}; do + + if [ "${STORAGE_FP}_${COMPUTE_FP}" = "double_float" ]; then + continue + fi + + echo ./lbm-lid-driven-cavity-flow \ + --deviceType gpu --deviceIds 0 \ + --grid "${GRID}" \ + --domain-size "${DOMAIN_SIZE}" \ + --warmup-iter 10 --max-iter 100 --repetitions 5 \ + --report-filename "lbm-lid-driven-cavity-flow_${DOMAIN_SIZE}_${GRID}_STORAGE_${STORAGE_FP}_COMPUTE_${COMPUTE_FP}" \ + --computeFP "${COMPUTE_FP}" \ + --storageFP "${STORAGE_FP}" \ + --${OCC} --benchmark + done + done + done +done diff --git a/benchmarks/lbm/src/CellType.h b/benchmarks/lbm/src/CellType.h new file mode 100644 index 00000000..1ca70c6f --- /dev/null +++ b/benchmarks/lbm/src/CellType.h @@ -0,0 +1,56 @@ +#pragma once + +struct CellType +{ + enum Classification : int + { + bounceBack, + movingWall, + bulk, + undefined + }; + + NEON_CUDA_HOST_DEVICE CellType(int dummy = 0) + { + (void)dummy; + classification = bulk; + wallNghBitflag = 0; + } + + NEON_CUDA_HOST_DEVICE explicit CellType(Classification c, uint32_t n) + { + classification = c; + wallNghBitflag = n; + } + + NEON_CUDA_HOST_DEVICE explicit CellType(Classification c) + { + classification = c; + wallNghBitflag = 0; + } + + // Converting to int to exportVti + operator int() const { return int(classification); } + + template + static auto isWall(const uint32_t& wallNghBitFlag) + -> bool + { + return wallNghBitFlag & (uint32_t(1) << fwdRegIdx); + } + + auto setWall(int fwdRegIdx) + -> void + { + wallNghBitflag = wallNghBitflag | ((uint32_t(1) << fwdRegIdx)); + } + + uint32_t wallNghBitflag; + Classification classification; +}; + +std::ostream& operator<<(std::ostream& os, const CellType& dt) +{ + os << static_cast(dt.classification); + return os; +} \ No newline at end of file 
diff --git a/benchmarks/lbm/src/Config.cpp b/benchmarks/lbm/src/Config.cpp new file mode 100644 index 00000000..115125bd --- /dev/null +++ b/benchmarks/lbm/src/Config.cpp @@ -0,0 +1,127 @@ +#include "Config.h" +#include +#include + +auto Config::toString() const -> std::string +{ + std::stringstream s; + const Config& c = *this; + + auto vecToSting = [](const std::vector& v) { + std::stringstream s; + bool firstTime = true; + for (auto e : v) { + if (firstTime) { + firstTime = false; + } else { + s << " "; + } + s << std::to_string(e); + } + return s.str(); + }; + + s << ".................. Re " << c.Re << std::endl; + s << "................. ulb " << c.ulb << std::endl; + s << "................... N " << c.N << std::endl; + s << "........... benchmark " << c.benchmark << std::endl; + s << "............... max_t " << c.max_t << std::endl; + s << "........ outFrequency " << c.outFrequency << std::endl; + s << "....... dataFrequency " << c.dataFrequency << std::endl; + s << "................. vti " << c.vti << std::endl; + + s << "........ benchIniIter " << c.benchIniIter << std::endl; + s << "........ benchMaxIter " << c.benchMaxIter << std::endl; + + s << ".......... deviceType " << c.deviceType << std::endl; + s << ".......... numDevices " << c.devices.size() << std::endl; + s << "............. devices " << vecToSting(c.devices) << std::endl; + s << ".......... reportFile " << c.reportFile << std::endl; + s << "............ gridType " << c.gridType << std::endl; + + s << "......... computeType " << c.computeType << std::endl; + s << "........... storeType " << c.storeType << std::endl; + s << "............... curve " << c.curve << std::endl; + + s << ". ............... occ " << Neon::skeleton::OccUtils::toString(c.occ) << std::endl; + s << "....... transfer Mode " << Neon::set::TransferModeUtils::toString(c.transferMode) << std::endl; + s << "... transfer Semantic " << Neon::set::StencilSemanticUtils::toString(c.stencilSemantic) << std::endl; + + s << ". 
............... nu " << mLbmParameters.nu << std::endl; + s << ".............. omega " << mLbmParameters.omega << std::endl; + s << "................. dx " << mLbmParameters.dx << std::endl; + s << "................. dt " << mLbmParameters.dt << std::endl; + + return s.str(); +} + +auto Config::parseArgs(const int argc, char* argv[]) + -> int +{ /* Builds the clipp command line, parses argv into this Config, maps the curve name to its encoder and derives the LBM parameters. Returns 0 on success, or -1 after printing the man page when parsing fails or the curve name is unsupported. */ + auto& config = *this; + + auto cli = + (clipp::required("--deviceType") & clipp::value("deviceType", config.deviceType) % "Device type: cpu or gpu", + clipp::required("--deviceIds") & clipp::integers("gpus", config.devices) % "Device ids to use", + clipp::option("--grid") & clipp::value("grid", config.gridType) % "Could be dGrid, eGrid, bGrid", + clipp::option("--domain-size") & clipp::integer("domain_size", config.N) % "Voxels along each dimension of the cube domain", + clipp::option("--warmup-iter") & clipp::integer("warmup_iter", config.benchIniIter) % "Number of iteration for warm up. max_iter = warmup_iter + timed_iters", + clipp::option("--max-iter") & clipp::integer("max_iter", config.benchMaxIter) % "Maximum solver iterations", + clipp::option("--repetitions") & clipp::integer("repetitions", config.repetitions) % "Number of times the benchmark is run.", + clipp::option("--report-filename") & clipp::value("keeper_filename", config.reportFile) % "Output perf keeper filename", + + clipp::option("--computeFP") & clipp::value("computeFP", config.computeType) % "Could be double or float", + clipp::option("--storageFP") & clipp::value("storageFP", config.storeType) % "Could be double or float", + + clipp::option("--curve") & clipp::value("curve", config.curve) % "Could be sweep (the default), morton, or hilbert", + ( + (clipp::option("--sOCC").set(config.occ, Neon::skeleton::Occ::standard) % "Standard OCC") | + (clipp::option("--nOCC").set(config.occ, Neon::skeleton::Occ::none) % "No OCC (on by default)")), + ( + (clipp::option("--put").set(config.transferMode, Neon::set::TransferMode::put) % "Set transfer mode to PUT") | +
(clipp::option("--get").set(config.transferMode, Neon::set::TransferMode::get) % "Set transfer mode to GET (on by default)")), + ( + (clipp::option("--huLattice").set(config.stencilSemantic, Neon::set::StencilSemantic::streaming) % "Halo update with lattice semantic (on by default)") | + (clipp::option("--huGrid").set(config.stencilSemantic, Neon::set::StencilSemantic::standard) % "Halo update with grid semantic ")), + ( + (clipp::option("--benchmark").set(config.benchmark, true) % "Run benchmark mode") | + (clipp::option("--visual").set(config.benchmark, false) % "Run export partial data")), + + ( + clipp::option("--vti").set(config.vti, true) % "Export vti files") + + ); + + + if (!clipp::parse(argc, argv, cli)) { + auto fmt = clipp::doc_formatting{}.doc_column(31); + std::cout << make_man_page(cli, argv[0], fmt) << '\n'; + return -1; + } + + if (config.curve == "sweep") + config.spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::sweep; + if (config.curve == "morton") + config.spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::morton; + if (config.curve == "hilbert") + config.spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::hilbert; + + if (config.curve != "sweep" && config.curve != "morton" && config.curve != "hilbert") { + auto fmt = clipp::doc_formatting{}.doc_column(31); + std::cout << config.curve << " is not a supported configuration" << std::endl; + std::cout << make_man_page(cli, argv[0], fmt) << '\n'; + return -1; + } + + helpSetLbmParameters(); + + return 0; +} + +auto Config::helpSetLbmParameters() -> void +{ + mLbmParameters.nu = ulb * static_cast(N - 2) / Re; + mLbmParameters.omega = 1. / (3. * mLbmParameters.nu + 0.5); + mLbmParameters.dx = 1.
/ static_cast(N - 2); + mLbmParameters.dt = mLbmParameters.dx * ulb; +} diff --git a/benchmarks/lbm/src/Config.h b/benchmarks/lbm/src/Config.h new file mode 100644 index 00000000..18695ce4 --- /dev/null +++ b/benchmarks/lbm/src/Config.h @@ -0,0 +1,66 @@ +#pragma once + +#include +#include +#include "Neon/core/tools/clipp.h" +#include "Neon/domain/tools/SpaceCurves.h" +#include "Neon/skeleton/Skeleton.h" + +template +struct LbmParameters +{ + ComputeType nu = 0; + ComputeType omega = 0; + ComputeType dx = 0; + ComputeType dt = 0; +}; + +struct Config +{ + double Re = 100.; // Reynolds number + double ulb = 0.04; // Velocity in lattice units + int N = 160; // Number of nodes in x-direction + bool benchmark = false; // Run in benchmark mode ? + double max_t = 10.0; // Non-benchmark mode: Total time in dim.less units + int outFrequency = 200; // Non-benchmark mode: Frequency in LU for output of terminal message and profiles (use 0 for no messages) + int dataFrequency = 0; // Non-benchmark mode: Frequency in LU of full data dump (use 0 for no data dump) + int benchIniIter = 1000; // Benchmark mode: Number of warmup iterations + int benchMaxIter = 2000; // Benchmark mode: Total number of iterations + int repetitions = 1; // Benchmark mode: number of time the test is run + std::string deviceType = "gpu"; + std::vector devices = std::vector(0); // Devices for the execution + std::string reportFile = "lbm-lid-driven-cavity-flow"; // Report file name + std::string gridType = "dGrid"; // Neon grid type + Neon::skeleton::Occ occ = Neon::skeleton::Occ::none; // Neon OCC type + Neon::set::TransferMode transferMode = Neon::set::TransferMode::get; // Neon transfer mode for halo update + Neon::set::StencilSemantic stencilSemantic = Neon::set::StencilSemantic::streaming; + bool vti = false; // Export vti file + std::string computeType = "double"; + std::string storeType = "double"; + std::string curve = "sweep"; + Neon::domain::tool::spaceCurves::EncoderType spaceCurve = 
Neon::domain::tool::spaceCurves::EncoderType::sweep; + LbmParameters mLbmParameters; + + auto toString() + const -> std::string; + + auto parseArgs(int argc, char* argv[]) + -> int; + + template + auto getLbmParameters() + -> LbmParameters + { + LbmParameters output; + output.nu = static_cast(mLbmParameters.nu); + output.omega = static_cast(mLbmParameters.omega); + output.dx = static_cast(mLbmParameters.dx); + output.dt = static_cast(mLbmParameters.dt); + + return output; + } + + private: + auto helpSetLbmParameters() + -> void; +}; diff --git a/benchmarks/lbm/src/ContainerFactory.h b/benchmarks/lbm/src/ContainerFactory.h new file mode 100644 index 00000000..980c67ae --- /dev/null +++ b/benchmarks/lbm/src/ContainerFactory.h @@ -0,0 +1,34 @@ +#pragma once +#include "CellType.h" +#include "D3Q19.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + +namespace pull { +template +struct ContainerFactory +{ +}; +} // namespace pull + +namespace push { +template +struct ContainerFactory +{ +}; +} // namespace push + +namespace common { +template +struct ContainerFactory +{ +}; +} // namespace common +#include "ContainersD3Q19.h" +#include "ContainersD3Q27.h" \ No newline at end of file diff --git a/benchmarks/lbm/src/ContainersD3Q19.h b/benchmarks/lbm/src/ContainersD3Q19.h new file mode 100644 index 00000000..38c30170 --- /dev/null +++ b/benchmarks/lbm/src/ContainersD3Q19.h @@ -0,0 +1,429 @@ +#pragma once + +#include "./Methods.h" +#include "CellType.h" +#include "D3Q19.h" +#include "DeviceD3Q19.h" +#include "Methods.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + +namespace pull { +/** + * Specialization for D3Q19 + */ +template +struct ContainerFactory, + Grid_> +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + 
using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + using PullFunctions = pull::DeviceD3Q19; + using CommonFunctions = common::DeviceD3Q19; + + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "D3Q19_TwoPop_Pull", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popIn[Lattice::Q]; + PullFunctions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + + Compute rho; + std::array u{.0, .0, .0}; + CommonFunctions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + PullFunctions::collideBgkUnrolled(gidx, + popIn, + rho, u, + usqr, omega, + NEON_OUT fOut); + } + }; + }); + return container; + } +}; +} // namespace pull +namespace push { +/** + * Specialization for D3Q19 + */ +template +struct ContainerFactory, + Grid_> +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + 
using PushFunctions = push::DeviceD3Q19; + using CommonFunctions = common::DeviceD3Q19; + + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "D3Q19_TwoPop", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popIn[Lattice::Q]; + CommonFunctions::localLoad(gidx, fIn, NEON_OUT popIn); + + Compute rho; + std::array u{.0, .0, .0}; + CommonFunctions::macroscopic(popIn, + NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + CommonFunctions::collideBgkUnrolled(gidx, + rho, u, + usqr, omega, + NEON_IO popIn); + + PushFunctions::pushStream(gidx, cellInfo.wallNghBitflag, popIn, NEON_OUT fOut); + } + }; + }); + return container; + } +}; +} // namespace push +namespace common { +/** + * Specialization for D3Q19 + */ +template +struct ContainerFactory, + Grid_> +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + using PullFunctions = pull::DeviceD3Q19; + using PushFunctions = 
push::DeviceD3Q19; + using CommonFunctions = common::DeviceD3Q19; + + template + static auto + iteration([[maybe_unused]] Neon::set::StencilSemantic stencilSemantic, + [[maybe_unused]] const PopField fInField /*! Input population field */, + [[maybe_unused]] const CellTypeField& cellTypeField /*! Cell type field */, + [[maybe_unused]] const Compute omega /*! LBM omega parameter */, + [[maybe_unused]] PopField fOutField /*! Output Population field */) + -> Neon::set::Container + { + if constexpr (method_ == lbm::Method::push) { + using Factory = push::ContainerFactory, + Grid_>; + return Factory::iteration(stencilSemantic, + fInField, + fOutField, + omega, + fOutField); + } + if constexpr (method_ == lbm::Method::pull) { + using Factory = pull::ContainerFactory, + Grid_>; + return Factory::iteration(stencilSemantic, + fInField, + fOutField, + omega, + fOutField); + } + NEON_DEV_UNDER_CONSTRUCTION(""); + } + + + static auto + computeWallNghMask(const CellTypeField& infoInField, + CellTypeField& infoOutpeField) + + -> Neon::set::Container + { + Neon::set::Container container = infoInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& infoIn = L.load(infoInField, Neon::Pattern::STENCIL); + auto& infoOut = L.load(infoOutpeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellType = infoIn(gidx, 0); + cellType.wallNghBitflag = 0; + + if (cellType.classification == CellType::bulk) { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto fwdRegIdx) { + using M = typename Lattice::template RegisterMapper; + if constexpr (M::centerMemIdx != M::fwdMemIdx) { + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); + if (nghCellType.classification != CellType::bulk) { + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << M::fwdMemIdx)); + } + } + }); + infoOut(gidx, 0) = cellType; + } + }; + }); + return container; + } + + + template + static auto 
+ userSettingBc(UserLambda userLambda, + PopField& pField, + CellTypeField& cellTypeField /*! Cell type field */) + -> Neon::set::Container + { + Neon::set::Container container = pField.getGrid().newContainer( + "UserSettingBc", + [&](Neon::set::Loader& L) -> auto { + auto& p = L.load(pField, Neon::Pattern::MAP); + auto& flag = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + const auto globalIdx = p.getGlobalIndex(gidx); + Storage pValues[Lattice::Q]; + CellType::Classification cellClass; + userLambda(globalIdx, pValues, cellClass); + + CellType flagVal(cellClass); + flag(gidx, 0) = flagVal; + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + p(gidx, M::fwdMemIdx) = pValues[M::fwdRegIdx]; + }); + }; + }); + return container; + } + + static auto + copyPopulation(PopField& fInField, + PopField& foutField) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto const& pIn = L.load(fInField, Neon::Pattern::MAP); + auto& pOut = L.load(foutField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + pOut(gidx, q) = pIn(gidx, q); + }); + }; + }); + return container; + } + + static auto + computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! 
output Population field */) + + -> Neon::set::Container + { + Neon::set::Container container = + fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + Compute rho = 0; + std::array u{.0, .0, .0}; + + Storage popIn[Lattice::Q]; + CommonFunctions::localLoad(gidx, fIn, NEON_OUT popIn); + + if (cellInfo.classification == CellType::bulk) { + CommonFunctions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + } else { + if (cellInfo.classification == CellType::movingWall) { + rho = 1.0; + u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), + static_cast(popIn[1]) / static_cast(6. * 1. / 18.), + static_cast(popIn[2]) / static_cast(6. * 1. / 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } + + static auto + problemSetup(PopField& fInField /*! 
inpout population field */, + PopField& fOutField, + CellTypeField& cellTypeField, + Neon::double_3d ulid, + double ulb) + + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&, ulid, ulb](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, Neon::Pattern::MAP); + auto& fOut = L.load(fOutField, Neon::Pattern::MAP); + auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + const auto globalIdx = fIn.getGlobalIndex(gidx); + const auto domainDim = fIn.getDomainSize(); + + CellType flagVal; + flagVal.classification = CellType::bulk; + flagVal.wallNghBitflag = 0; + + typename Lattice::Precision::Storage popVal = 0; + + if (globalIdx.x == 0 || globalIdx.x == domainDim.x - 1 || + globalIdx.y == 0 || globalIdx.y == domainDim.y - 1 || + globalIdx.z == 0 || globalIdx.z == domainDim.z - 1) { + flagVal.classification = CellType::bounceBack; + + if (globalIdx.y == domainDim.y - 1) { + flagVal.classification = CellType::movingWall; + } + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + if (globalIdx.y == domainDim.y - 1) { + popVal = -6. 
* Lattice::Memory::template getT() * ulb * + (Lattice::Memory::template getDirection().v[0] * ulid.v[0] + + Lattice::Memory::template getDirection().v[1] * ulid.v[1] + + Lattice::Memory::template getDirection().v[2] * ulid.v[2]); + } else { + popVal = 0; + } + fIn(gidx, q) = popVal; + fOut(gidx, q) = popVal; + }); + } else { + flagVal.classification = CellType::bulk; + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + fIn(gidx, q) = Lattice::Memory::template getT(); + fOut(gidx, q) = Lattice::Memory::template getT(); + }); + } + cellInfoPartition(gidx, 0) = flagVal; + }; + }); + return container; + } + + static auto + setToEquilibrium(PopField& fOutField, + CellTypeField& cellTypeField) + -> Neon::set::Container + { + Neon::set::Container container = fOutField.getGrid().newContainer( + "LBM_setToEquilibrium", + [&](Neon::set::Loader& L) -> auto { + auto& fOut = L.load(fOutField, Neon::Pattern::MAP); + auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + { // All pints are pre-set to bulk + CellType flagVal; + flagVal.classification = CellType::bulk; + cellInfoPartition(gidx, 0) = flagVal; + } + + { // All cells are pre-set to Equilibrium + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + fOut(gidx, M::fwdMemIdx) = Lattice::Registers::template getT(); + }); + } + }; + }); + return container; + } +}; +} // namespace common \ No newline at end of file diff --git a/benchmarks/lbm/src/ContainersD3Q27.h b/benchmarks/lbm/src/ContainersD3Q27.h new file mode 100644 index 00000000..d5d024ea --- /dev/null +++ b/benchmarks/lbm/src/ContainersD3Q27.h @@ -0,0 +1,227 @@ +#pragma once + +#include "CellType.h" +#include "D3Q27.h" +#include "DeviceD3Q27.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" +#if 0 +/** + * Specialization for D3Q27 + */ +template +struct ContainerFactory, + Grid_> +{ + using Lattice = 
D3Q27; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + using Functions = DeviceD3Q19; + + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "D3Q19_TwoPop", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popIn[Lattice::Q]; + Functions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + + Compute rho; + std::array u{.0, .0, .0}; + Functions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + Functions::collideBgkUnrolled(gidx, + popIn, + rho, u, + usqr, omega, + NEON_OUT fOut); + } + }; + }); + return container; + } + + + static auto + computeWallNghMask(const CellTypeField& infoInField, + CellTypeField& infoOutpeField) + + -> Neon::set::Container + { + Neon::set::Container container = infoInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& infoIn = L.load(infoInField, Neon::Pattern::STENCIL); + auto& infoOut = 
L.load(infoOutpeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellType = infoIn(gidx, 0); + cellType.wallNghBitflag = 0; + + if (cellType.classification == CellType::bulk) { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { + if constexpr (GOMemoryId != Lattice::Memory::center) { + constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId]; + constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x; + constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y; + constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z; + + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); + if (nghCellType.classification != CellType::bulk) { + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOMemoryId)); + } + } + }); + + infoOut(gidx, 0) = cellType; + } + }; + }); + return container; + } + + + static auto + computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! 
output Population field */) + + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + Compute rho = 0; + std::array u{.0, .0, .0}; + Storage popIn[Lattice::Q]; + + if (cellInfo.classification == CellType::bulk) { + + Functions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + Functions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + } else { + if (cellInfo.classification == CellType::movingWall) { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GORegisterId) { + if constexpr (GORegisterId == Lattice::Registers::center) { + popIn[Lattice::Registers::center] = fIn(gidx, Lattice::Memory::center); + } else { + popIn[GORegisterId] = fIn(gidx, Lattice::Memory::template mapFromRegisters()); + } + }); + + rho = 1.0; + u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), + static_cast(popIn[1]) / static_cast(6. * 1. / 18.), + static_cast(popIn[2]) / static_cast(6. * 1. / 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } + + static auto + problemSetup(PopField& fInField /*! 
inpout population field */, + PopField& fOutField, + CellTypeField& cellTypeField, + Neon::double_3d ulid, + double ulb) + + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&, ulid, ulb](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, Neon::Pattern::MAP); + auto& fOut = L.load(fOutField, Neon::Pattern::MAP); + auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + const auto globlalIdx = fIn.getGlobalIndex(gidx); + const auto domainDim = fIn.getDomainSize(); + CellType flagVal; + flagVal.classification = CellType::bulk; + flagVal.wallNghBitflag = 0; + typename Lattice::Precision::Storage val = 0; + + if (globlalIdx.x == 0 || globlalIdx.x == domainDim.x - 1 || + globlalIdx.y == 0 || globlalIdx.y == domainDim.y - 1 || + globlalIdx.z == 0 || globlalIdx.z == domainDim.z - 1) { + flagVal.classification = CellType::bounceBack; + + if (globlalIdx.y == domainDim.y - 1) { + flagVal.classification = CellType::movingWall; + } + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + if (globlalIdx.y == domainDim.y - 1) { + val = -6. 
* Lattice::Memory::template getT() * ulb * + (Lattice::Memory::template getDirection().v[0] * ulid.v[0] + + Lattice::Memory::template getDirection().v[1] * ulid.v[1] + + Lattice::Memory::template getDirection().v[2] * ulid.v[2]); + } else { + val = 0; + } + fIn(gidx, q) = val; + fOut(gidx, q) = val; + }); + } else { + flagVal.classification = CellType::bulk; + cellInfoPartition(gidx, 0) = flagVal; + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + fIn(gidx, q) = Lattice::Memory::template getT(); + fOut(gidx, q) = Lattice::Memory::template getT(); + }); + } + }; + }); + return container; + } +}; +#endif \ No newline at end of file diff --git a/benchmarks/lbm/src/D3Q19.h b/benchmarks/lbm/src/D3Q19.h new file mode 100644 index 00000000..8659fc91 --- /dev/null +++ b/benchmarks/lbm/src/D3Q19.h @@ -0,0 +1,258 @@ +#pragma once + +#include "Neon/Neon.h" +#include "Neon/set/Backend.h" +#include "Neon/set/memory/memSet.h" +#include "Precision.h" + + +/** In each lattice we define two indexing schema + * - the first one works at the register level. For this one we relay on the same sequence of directions defined by the STLBM code. + * - the second one works at the memory (RAM) level and it defines how lattice direction are stored in Neon fields. + * + * We keep this two aspect separate to experiment on different order of directions in memory as the order will impact the number of halo update transitions. 
+ * + */ +template +struct D3Q19 +{ + public: + D3Q19() = delete; + + static constexpr int Q = 19; /** number of directions */ + static constexpr int D = 3; /** Space dimension */ + using Precision = Precision_; + using Self = D3Q19; + + static constexpr int RegisterMapping = 1; + static constexpr int MemoryMapping = 2; + + struct Registers + { + using Self = D3Q19::Registers; + static constexpr std::array stencil{ + Neon::index_3d(-1, 0, 0), + Neon::index_3d(0, -1, 0), + Neon::index_3d(0, 0, -1), + Neon::index_3d(-1, -1, 0), + Neon::index_3d(-1, 1, 0), + Neon::index_3d(-1, 0, -1), + Neon::index_3d(-1, 0, 1), + Neon::index_3d(0, -1, -1), + Neon::index_3d(0, -1, 1), + Neon::index_3d(0, 0, 0), + Neon::index_3d(1, 0, 0), + Neon::index_3d(0, 1, 0), + Neon::index_3d(0, 0, 1), + Neon::index_3d(1, 1, 0), + Neon::index_3d(1, -1, 0), + Neon::index_3d(1, 0, 1), + Neon::index_3d(1, 0, -1), + Neon::index_3d(0, 1, 1), + Neon::index_3d(0, 1, -1)}; + + static constexpr int center = 9; /** Position of direction {0,0,0} */ + + template + static constexpr auto getOpposite() + -> int + { + auto opposite3d = stencil[go] * -1; + for (int i = 0; i < Q; ++i) { + if (stencil[i] == opposite3d) { + return i; + } + } + } + + static constexpr std::array opposite{ + 10, 11, 12, 13, 14, 15, 16, 17, 18, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8}; + + static constexpr std::array t{ + 1. / 18. /*! 0 */, + 1. / 18. /*! 1 */, + 1. / 18. /*! 2 */, + 1. / 36. /*! 3 */, + 1. / 36. /*! 4 */, + 1. / 36. /*! 5 */, + 1. / 36. /*! 6 */, + 1. / 36. /*! 7 */, + 1. / 36. /*! 8 */, + 1. / 3. /*! 9 */, + 1. / 18. /*! 10 */, + 1. / 18. /*! 11 */, + 1. / 18. /*! 12 */, + 1. / 36. /*! 13 */, + 1. / 36. /*! 14 */, + 1. / 36. /*! 15 */, + 1. / 36. /*! 16 */, + 1. / 36. /*! 17 */, + 1. / 36. /*! 
18 */ + }; + + template + static constexpr auto getT() -> const typename Precision::Storage + { + return t[q]; + } + + template + static constexpr auto getDirection() -> const typename Neon::index_3d + { + return stencil[q]; + } + + static constexpr int fwdRegIdxListLen = (Q - 1) / 2; + static constexpr std::array fwdRegIdxList{0, 1, 2, 3, 4, 5, 6, 7, 8}; + + template + static inline NEON_CUDA_HOST_DEVICE auto + getCk_u(std::array const& u) -> Compute + { + if constexpr (tegIdx == 0 || tegIdx == 9) { + return u[0]; + } + if constexpr (tegIdx == 1 || tegIdx == 10) { + return u[1]; + } + if constexpr (tegIdx == 2 || tegIdx == 11) { + return u[2]; + } + if constexpr (tegIdx == 3 || tegIdx == 12) { + return u[0] + u[1]; + } + if constexpr (tegIdx == 4 || tegIdx == 13) { + return u[0] - u[1]; + } + if constexpr (tegIdx == 5 || tegIdx == 14) { + return u[0] + u[2]; + } + if constexpr (tegIdx == 6 || tegIdx == 15) { + + return u[0] - u[2]; + } + if constexpr (tegIdx == 7 || tegIdx == 16) { + + return u[1] + u[2]; + } + if constexpr (tegIdx == 8 || tegIdx == 17) { + return u[1] - u[2]; + } + } + }; + + struct Memory + { + using Self = D3Q19::Memory; + + static constexpr std::array stencil{ + Neon::index_3d(-1, 0, 0), + Neon::index_3d(0, -1, 0), + Neon::index_3d(0, 0, -1), + Neon::index_3d(-1, -1, 0), + Neon::index_3d(-1, 1, 0), + Neon::index_3d(-1, 0, -1), + Neon::index_3d(-1, 0, 1), + Neon::index_3d(0, -1, -1), + Neon::index_3d(0, -1, 1), + Neon::index_3d(0, 0, 0), + Neon::index_3d(1, 0, 0), + Neon::index_3d(0, 1, 0), + Neon::index_3d(0, 0, 1), + Neon::index_3d(1, 1, 0), + Neon::index_3d(1, -1, 0), + Neon::index_3d(1, 0, 1), + Neon::index_3d(1, 0, -1), + Neon::index_3d(0, 1, 1), + Neon::index_3d(0, 1, -1)}; + + + static constexpr int center = 9; /** Position of direction {0,0,0} */ + + static constexpr std::array memoryToRegister{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; + + static constexpr std::array registerToMemory{ + 0, 1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; + + + template + NEON_CUDA_HOST_DEVICE static constexpr auto mapToRegisters() + -> int + { + return memoryToRegister[go]; + } + + template + NEON_CUDA_HOST_DEVICE static constexpr auto mapToMemory() + -> int + { + return registerToMemory[go]; + } + + template + NEON_CUDA_HOST_DEVICE static constexpr auto getOpposite() + -> int + { + return opposite[go]; + } + + static constexpr std::array opposite{ + 10, 11, 12, 13, 14, 15, 16, 17, 18, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8}; + }; + +// template +// struct MemMapper +// { +// constexpr static int fwdMemIdx = fwMemIdx_; +// constexpr static int fwdX = Memory::stencil[fwdMemIdx].x; +// constexpr static int fwdY = Memory::stencil[fwdMemIdx].y; +// constexpr static int fwdZ = Memory::stencil[fwdMemIdx].z; +// +// constexpr static int bkwMemIdx = Memory::opposite[fwdMemIdx]; +// constexpr static int bkwX = Memory::stencil[bkwMemIdx].x; +// constexpr static int bkwY = Memory::stencil[bkwMemIdx].y; +// constexpr static int bkwZ = Memory::stencil[bkwMemIdx].z; +// +// constexpr static int fwdRegIdx = Memory::template mapToRegisters(); +// constexpr static int centerRegIdx = Registers::center; +// constexpr static int centerMemIdx = Memory::center; +// }; + + template + struct RegisterMapper + { + constexpr static int fwdRegIdx = fwdRegIdx_; + constexpr static int bkwRegIdx = Registers::opposite[fwdRegIdx]; + constexpr static int fwdMemIdx = Memory::template mapToMemory(); + constexpr static int bkwMemIdx = Memory::template mapToMemory(); + constexpr static int centerRegIdx = Registers::center; + constexpr static int centerMemIdx = Memory::center; + + constexpr static int fwdX = Memory::stencil[fwdMemIdx].x; + constexpr static int fwdY = Memory::stencil[fwdMemIdx].y; + constexpr static int fwdZ = Memory::stencil[fwdMemIdx].z; + + constexpr static int bkwX = Memory::stencil[bkwMemIdx].x; + constexpr static int bkwY = Memory::stencil[bkwMemIdx].y; + constexpr static int bkwZ = 
Memory::stencil[bkwMemIdx].z; + }; + + public: + template + static auto getDirectionAsVector() + -> std::vector + { + std::vector vec; + if constexpr (mappingType == RegisterMapping) { + for (auto const& a : Registers::stencil) { + vec.push_back(a); + } + } else if constexpr (mappingType == MemoryMapping) { + for (auto const& a : Memory::stencil) { + vec.push_back(a); + } + } + return vec; + } +}; diff --git a/benchmarks/lbm/src/D3Q27.h b/benchmarks/lbm/src/D3Q27.h new file mode 100644 index 00000000..9f2c7f95 --- /dev/null +++ b/benchmarks/lbm/src/D3Q27.h @@ -0,0 +1,200 @@ +#pragma once + +#include "Neon/Neon.h" +#include "Neon/set/Backend.h" +#include "Neon/set/memory/memSet.h" +#include "Precision.h" + + +/** In each lattice we define two indexing schema + * - the first one works at the register level. For this one we relay on the same sequence of directions defined by the STLBM code. + * - the second one works at the memory (RAM) level and it defines how lattice direction are stored in Neon fields. + * + * We keep this two aspect separate to experiment on different order of directions in memory as the order will impact the number of halo update transitions. 
+ * + */ +template +struct D3Q27 +{ + public: + D3Q27() = delete; + + static constexpr int Q = 27; /** number of directions */ + static constexpr int D = 3; /** Space dimension */ + using Precision = Precision_; + using Self = D3Q27; + + static constexpr int RegisterMapping = 1; + static constexpr int MemoryMapping = 2; + + struct Registers + { + using Self = D3Q27::Registers; + static constexpr std::array stencil{ + Neon::index_3d(-1, 0, 0), + Neon::index_3d(0, -1, 0), + Neon::index_3d(0, 0, -1), + Neon::index_3d(-1, -1, 0), + Neon::index_3d(-1, 1, 0), + Neon::index_3d(-1, 0, -1), + Neon::index_3d(-1, 0, 1), + Neon::index_3d(0, -1, -1), + Neon::index_3d(0, -1, 1), + Neon::index_3d(-1, -1, -1), + Neon::index_3d(-1, -1, 1), + Neon::index_3d(-1, 1, -1), + Neon::index_3d(-1, 1, 1), + Neon::index_3d(0, 0, 0), + Neon::index_3d(1, 0, 0), + Neon::index_3d(0, 1, 0), + Neon::index_3d(0, 0, 1), + Neon::index_3d(1, 1, 0), + Neon::index_3d(1, -1, 0), + Neon::index_3d(1, 0, 1), + Neon::index_3d(1, 0, -1), + Neon::index_3d(0, 1, 1), + Neon::index_3d(0, 1, -1), + Neon::index_3d(1, 1, 1), + Neon::index_3d(1, 1, -1), + Neon::index_3d(1, -1, 1), + Neon::index_3d(1, -1, -1)}; + + static constexpr int center = 13; /** Position of direction {0,0,0} */ + + template + static constexpr auto getOpposite() + -> int + { + auto opposite3d = stencil[go] * -1; + for (int i = 0; i < Q; ++i) { + if (stencil[i] == opposite3d) { + return i; + } + } + } + + static constexpr std::array opposite{ + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 13, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 + }; + + static constexpr std::array t{ + 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., + 1. / 216., 1. / 216., 1. / 216., 1. / 216., + 8. / 27., + 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., + 1. / 216., 1. / 216., 1. / 216., 1. 
/ 216.}; + }; + + struct Memory + { + using Self = D3Q27::Memory; + static constexpr std::array stencil{ + Neon::index_3d(-1, 0, 0), + Neon::index_3d(0, -1, 0), + Neon::index_3d(0, 0, -1), + Neon::index_3d(-1, -1, 0), + Neon::index_3d(-1, 1, 0), + Neon::index_3d(-1, 0, -1), + Neon::index_3d(-1, 0, 1), + Neon::index_3d(0, -1, -1), + Neon::index_3d(0, -1, 1), + Neon::index_3d(-1, -1, -1), + Neon::index_3d(-1, -1, 1), + Neon::index_3d(-1, 1, -1), + Neon::index_3d(-1, 1, 1), + Neon::index_3d(0, 0, 0), + Neon::index_3d(1, 0, 0), + Neon::index_3d(0, 1, 0), + Neon::index_3d(0, 0, 1), + Neon::index_3d(1, 1, 0), + Neon::index_3d(1, -1, 0), + Neon::index_3d(1, 0, 1), + Neon::index_3d(1, 0, -1), + Neon::index_3d(0, 1, 1), + Neon::index_3d(0, 1, -1), + Neon::index_3d(1, 1, 1), + Neon::index_3d(1, 1, -1), + Neon::index_3d(1, -1, 1), + Neon::index_3d(1, -1, -1)}; + + + static constexpr int center = 13; /** Position of direction {0,0,0} */ + + template + static constexpr auto mapToRegisters() + -> int + { + auto direction = stencil[go]; + for (int i = 0; i < Q; ++i) { + if (Registers::stencil[i] == direction) { + return i; + } + } + } + + template + static constexpr auto mapFromRegisters() + -> int + { + auto direction = Registers::stencil[go]; + for (int i = 0; i < Q; ++i) { + if (Self::stencil[i] == direction) { + return i; + } + } + } + + template + static constexpr auto getOpposite() + -> int + { + auto opposite3d = stencil[go] * -1; + for (int i = 0; i < Q; ++i) { + if (stencil[i] == opposite3d) { + return i; + } + } + } + + static constexpr std::array opposite{ + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 13, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 + }; + + template + static constexpr auto helpGetValueforT() + -> typename Precision::Storage + { + auto goInRegisterSpace = Self::template mapToRegisters(); + return Registers::t[goInRegisterSpace]; + } + + static constexpr std::array t{ + 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. 
/ 54., 1. / 54., + 1. / 216., 1. / 216., 1. / 216., 1. / 216., + 8. / 27., + 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., + 1. / 216., 1. / 216., 1. / 216., 1. / 216.}; + }; + + public: + template + static auto getDirectionAsVector() + -> std::vector + { + std::vector vec; + if constexpr (mappingType == RegisterMapping) { + for (auto const& a : Registers::stencil) { + vec.push_back(a); + } + } else if constexpr (mappingType == MemoryMapping) { + for (auto const& a : Memory::stencil) { + vec.push_back(a); + } + } + return vec; + } +}; diff --git a/benchmarks/lbm/src/DeviceD3Q19.h b/benchmarks/lbm/src/DeviceD3Q19.h new file mode 100644 index 00000000..fd2f8d08 --- /dev/null +++ b/benchmarks/lbm/src/DeviceD3Q19.h @@ -0,0 +1,224 @@ +#pragma once +#include "CellType.h" +#include "D3Q19.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + +namespace pull { +template +struct DeviceD3Q19 +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + + static inline NEON_CUDA_HOST_DEVICE auto + pullStream(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopField::Partition const& fin, + NEON_OUT Storage popIn[Lattice::Q]) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto fwMemIdx) { + using M = typename Lattice::template MappersIdxSetWithFwdMem; + + if constexpr (fwMemIdx == Lattice::Memory::center) { + popIn[M::centerRegIdx] = fin(gidx, M::centerMemIdx); + } else { + if (CellType::isWall()) { + popIn[M::fwdRegIdx] = fin(gidx, M::bkMemIdx) + + fin.template getNghData(gidx, M::bkMemIdx)(); + } else { + popIn[M::fwdRegIdx] = fin.template getNghData(gidx, 
fwMemIdx)(); + } + } + }); + } +}; + +#undef CAST_TO_COMPUTE +} // namespace pull + +namespace push { +template +struct DeviceD3Q19 +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + + /** Push-style streaming of the register-space populations pOut into fOut. The center population is stored in the local cell; for every other direction the value is either combined with a neighbour read and stored locally at the opposite (bkwMemIdx) slot when CellType::isWall() reports a wall, or written to the neighbour cell through writeNgh. Mapper member names follow Lattice::RegisterMapper (fwdMemIdx / bkwMemIdx). */ + static inline NEON_CUDA_HOST_DEVICE auto + pushStream(Idx const& gidx, + const uint32_t& wallNghBitFlag, + NEON_OUT Storage pOut[Lattice::Q], + NEON_OUT typename PopField::Partition const& fOut) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + + if constexpr (M::fwdMemIdx == M::centerMemIdx) { + fOut(gidx, M::fwdMemIdx) = pOut[M::fwdRegIdx]; + } else { + if (CellType::isWall()) { + // fout(i, opp[k]) = + // pop_out + + // f(nb, k); + fOut(gidx, M::bkwMemIdx) = + pOut[M::fwdRegIdx] + + fOut.template getNghData(gidx, M::fwdMemIdx)(); + } else { + // fout(nb, k) = pop_out; + fOut.writeNgh(gidx, M::fwdMemIdx, pOut[M::fwdRegIdx]); + } + } + }); + } +}; +} // namespace push + + +namespace common { +template +struct DeviceD3Q19 +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + + static inline NEON_CUDA_HOST_DEVICE auto + macroscopic(const Storage pop[Lattice::Q], + NEON_OUT Compute& rho, + NEON_OUT std::array& u) + -> void + { + +#define POP(IDX) static_cast(pop[IDX]) + const Compute X_M1 = POP(0) + POP(3) + 
POP(4) + POP(5) + POP(6); + const Compute X_P1 = POP(10) + POP(13) + POP(14) + POP(15) + POP(16); + const Compute X_0 = POP(9) + POP(1) + POP(2) + POP(7) + POP(8) + POP(11) + POP(12) + POP(17) + POP(18); + + const Compute Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(14); + const Compute Y_P1 = POP(4) + POP(11) + POP(13) + POP(17) + POP(18); + + const Compute Z_M1 = POP(2) + POP(5) + POP(7) + POP(16) + POP(18); + const Compute Z_P1 = POP(6) + POP(8) + POP(12) + POP(15) + POP(17); +#undef POP + + rho = X_M1 + X_P1 + X_0; + u[0] = (X_P1 - X_M1) / rho; + u[1] = (Y_P1 - Y_M1) / rho; + u[2] = (Z_P1 - Z_M1) / rho; + } + + + static inline NEON_CUDA_HOST_DEVICE auto + collideBgkUnrolled(Idx const& i /*! Compute iterator */, + Compute const& rho /*! Density */, + std::array const& u /*! Velocity */, + Compute const& usqr /*! Usqr */, + Compute const& omega /*! Omega */, + NEON_IO Storage pop[Lattice::Q]) + + -> void + { + + constexpr Compute c1over18 = 1. / 18.; + constexpr Compute c1over36 = 1. / 36.; + constexpr Compute c4dot5 = 4.5; + constexpr Compute c3 = 3.; + constexpr Compute c1 = 1.; + constexpr Compute c6 = 6.; + + constexpr int regCenter = Lattice::Registers::center; + constexpr int regFir = Lattice::Registers::center; + + Neon::ConstexprFor<0, Lattice::Registers::fwdRegIdxListLen, 1>( + [&](auto fwdRegIdxListIdx) { + using M = typename Lattice::template RegisterMapper; + using T = typename Lattice::Registers; + + Compute eqFw; + Compute eqBk; + + const Compute ck_u = T::template getCk_u(u); + // double eq = rho * t[k] * + // (1. + + // 3. * ck_u + + // 4.5 * ck_u * ck_u - + // usqr); + eqFw = rho * T::t[M::fwdRegIdx] * + (c1 + + c3 * ck_u + + c4dot5 * ck_u * ck_u - + usqr); + + // double eqopp = eq - 6.* rho * t[k] * ck_u; + eqBk = eqFw - + c6 * rho * T::t[M::fwdRegIdx] * ck_u; /* the weight enters once, through t[k], exactly as in eqFw */ + + // pop_out = (1. - omega) * fin(i, k) + omega * eq; + pop[M::fwdRegIdx] = (c1 - omega) * static_cast(pop[M::fwdRegIdx]) + omega * eqFw; + // pop_out_opp = (1. 
- omega) * fin(i, opp[k]) + omega * eqopp; + pop[M::bkwRegIdx] = (c1 - omega) * static_cast(pop[M::bkwRegIdx]) + omega * eqBk; + }); + { // Center; + using T = typename Lattice::Registers; + using M = typename Lattice::template RegisterMapper; + // eq = rho * t[k] * (1. - usqr); + const Compute eqCenter = rho * T::t[M::fwdRegIdx] * (c1 - usqr); + // fout(i, k) = (1. - omega) * fin(i, k) + omega * eq; + pop[Lattice::Registers::center] = (c1 - omega) * static_cast(pop[M::fwdRegIdx]) + omega * eqCenter; + } + } + static inline NEON_CUDA_HOST_DEVICE auto + localLoad(Idx const& gidx, + NEON_IN typename PopField::Partition const& fOut, + Storage NEON_RESTRICT pOut[Lattice::Q]) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + pOut[M::fwdRegIdx] = fOut(gidx, M::fwdMemIdx); + }); + } + + static inline NEON_CUDA_HOST_DEVICE auto + localStore(Idx const& gidx, + Storage NEON_RESTRICT pOut[Lattice::Q], + NEON_IN typename PopField::Partition& fOut) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + fOut(gidx, M::fwdMemIdx) = pOut[M::fwdRegIdx]; + }); + } +}; +} // namespace common \ No newline at end of file diff --git a/benchmarks/lbm/src/DeviceD3Q27.h b/benchmarks/lbm/src/DeviceD3Q27.h new file mode 100644 index 00000000..f977492b --- /dev/null +++ b/benchmarks/lbm/src/DeviceD3Q27.h @@ -0,0 +1,217 @@ +#pragma once +#include "CellType.h" +#include "D3Q27.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + + +template +struct DeviceD3Q27 +{ + using Lattice = D3Q27; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + 
+ + static inline NEON_CUDA_HOST_DEVICE auto + pullStream(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopField::Partition const& fin, + NEON_OUT Storage popIn[Lattice::Q]) + { + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { + if constexpr (GOMemoryId == Lattice::Memory::center) { + popIn[Lattice::Registers::center] = fin(gidx, Lattice::Memory::center); + } else { + constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId]; + constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x; + constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y; + constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z; + constexpr int GORegistersId = Lattice::Memory::template mapToRegisters(); + + if (wallBitFlag & (uint32_t(1) << GOMemoryId)) { + popIn[GORegistersId] = + fin(gidx, BKMemoryId) + + fin.template getNghData(gidx, BKMemoryId)(); + } else { + popIn[GORegistersId] = + fin.template getNghData(gidx, GOMemoryId)(); + } + } + }); + } + + static inline NEON_CUDA_HOST_DEVICE auto + macroscopic(const Storage pop[Lattice::Q], + NEON_OUT Compute& rho, + NEON_OUT std::array& u) + -> void + { + +#define POP(IDX) static_cast(pop[IDX]) + const Compute X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6) + POP(9) + POP(10) + POP(11) + POP(12); + const Compute X_P1 = POP(14) + POP(17) + POP(18) + POP(19) + POP(20) + POP(23) + POP(24) + POP(25) + POP(26); + const Compute X_0 = POP(1) + POP(2) + POP(7) + POP(8) + POP(13) + POP(15) + POP(16) + POP(21) + POP(22); + + const Compute Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(9) + POP(10) + POP(18) + POP(25) + POP(26); + const Compute Y_P1 = POP(15) + POP(17) + POP(21) + POP(22) + POP(23) + POP(24) + POP(4) + POP(11) + POP(12); + + const Compute Z_M1 = POP(2) + POP(5) + POP(7) + POP(9) + POP(11) + POP(20) + POP(22) + POP(24) + POP(26); + const Compute Z_P1 = POP(16) + POP(19) + POP(21) + POP(23) + POP(25) + POP(6) + POP(8) + POP(10) + POP(12); +#undef POP + + rho = X_M1 + X_P1 + X_0; + u[0] = (X_P1 - 
X_M1) / rho; + u[1] = (Y_P1 - Y_M1) / rho; + u[2] = (Z_P1 - Z_M1) / rho; /* the x, y and z results are stored in u[0], u[1] and u[2] respectively */ + } + + + static inline NEON_CUDA_HOST_DEVICE auto + collideBgkUnrolled(Idx const& i /*! Compute iterator */, + const Storage pop[Lattice::Q], + Compute const& rho /*! Density */, + std::array const& u /*! Velocity */, + Compute const& usqr /*! Usqr */, + Compute const& omega /*! Omega */, + typename PopField::Partition& fOut /*! Population */) + + -> void + { + const Compute cku1 = u[0] + u[1]; + const Compute cku2 = -u[0] + u[1]; + const Compute cku3 = u[0] + u[2]; + const Compute cku4 = -u[0] + u[2]; + const Compute cku5 = u[1] + u[2]; + const Compute cku6 = -u[1] + u[2]; + const Compute cku7 = u[0] + u[1] + u[2]; + const Compute cku8 = -u[0] + u[1] + u[2]; + const Compute cku9 = u[0] - u[1] + u[2]; + const Compute cku0 = u[0] + u[1] - u[2]; + + std::array feqRM; + + constexpr int F000 = 13; + constexpr int FM00 = 0; + constexpr int F0M0 = 1; + constexpr int F00M = 2; + constexpr int FMM0 = 3; + constexpr int FMP0 = 4; + constexpr int FM0M = 5; + constexpr int FM0P = 6; + constexpr int F0MM = 7; + constexpr int F0MP = 8; + constexpr int FMMM = 9; + constexpr int FMMP = 10; + constexpr int FMPM = 11; + constexpr int FMPP = 12; + constexpr int FP00 = 14; + constexpr int F0P0 = 15; + constexpr int F00P = 16; + constexpr int FPP0 = 17; + constexpr int FPM0 = 18; + constexpr int FP0P = 19; + constexpr int FP0M = 20; + constexpr int F0PP = 21; + constexpr int F0PM = 22; + constexpr int FPPP = 23; + constexpr int FPPM = 24; + constexpr int FPMP = 25; + constexpr int FPMM = 26; + + constexpr Compute c1over18 = 1. / 18.; + constexpr Compute c1over36 = 1. 
/ 36.; + constexpr Compute c4dot5 = 4.5; + constexpr Compute c3 = 3.; + constexpr Compute c1 = 1.; + constexpr Compute c6 = 6.; + + feqRM[F000] = rho * Lattice::Registers::t[F000] * (c1- usqr); + + feqRM[FM00] = rho * Lattice::Registers::t[FM00] * (c1- c3* u[0] + c4dot5* u[0] * u[0] - usqr); + feqRM[FP00] = rho * Lattice::Registers::t[FP00] * (c6 * u[0]) + feqRM[FM00]; + + feqRM[F0M0] = rho * Lattice::Registers::t[F0M0] * (c1- c3* u[1] + c4dot5* u[1] * u[1] - usqr); + feqRM[F0P0] = rho * Lattice::Registers::t[F0P0] * (c6 * u[1]) + feqRM[F0M0]; + + feqRM[F00M] = rho * Lattice::Registers::t[F00M] * (c1- c3* u[2] + c4dot5* u[2] * u[2] - usqr); + feqRM[F00P] = rho * Lattice::Registers::t[F00P] * (c6 * u[2]) + feqRM[F00M]; + + feqRM[FMM0] = rho * Lattice::Registers::t[FMM0] * (c1- c3* cku1 + c4dot5* cku1 * cku1 - usqr); + feqRM[FPP0] = rho * Lattice::Registers::t[FPP0] * (c6 * cku1) + feqRM[FMM0]; + feqRM[FPM0] = rho * Lattice::Registers::t[FPM0] * (c1- c3* cku2 + c4dot5* cku2 * cku2 - usqr); + feqRM[FMP0] = rho * Lattice::Registers::t[FMP0] * (c6 * cku2) + feqRM[FPM0]; + + feqRM[FM0M] = rho * Lattice::Registers::t[FM0M] * (c1- c3* cku3 + c4dot5* cku3 * cku3 - usqr); + feqRM[FP0P] = rho * Lattice::Registers::t[FP0P] * (c6 * cku3) + feqRM[FM0M]; + feqRM[FP0M] = rho * Lattice::Registers::t[FP0M] * (c1- c3* cku4 + c4dot5* cku4 * cku4 - usqr); + feqRM[FM0P] = rho * Lattice::Registers::t[FM0P] * (c6 * cku4) + feqRM[FP0M]; + + feqRM[F0MM] = rho * Lattice::Registers::t[F0MM] * (c1- c3* cku5 + c4dot5* cku5 * cku5 - usqr); + feqRM[F0PP] = rho * Lattice::Registers::t[F0PP] * (c6 * cku5) + feqRM[F0MM]; + feqRM[F0PM] = rho * Lattice::Registers::t[F0PM] * (c1- c3* cku6 + c4dot5* cku6 * cku6 - usqr); + feqRM[F0MP] = rho * Lattice::Registers::t[F0MP] * (c6 * cku6) + feqRM[F0PM]; + + feqRM[FMMM] = rho * Lattice::Registers::t[FMMM] * (c1- c3* cku7 + c4dot5* cku7 * cku7 - usqr); + feqRM[FPPP] = rho * Lattice::Registers::t[FPPP] * (c6 * cku7) + feqRM[FMMM]; + feqRM[FPMM] = rho * 
Lattice::Registers::t[FPMM] * (c1- c3* cku8 + c4dot5* cku8 * cku8 - usqr); + feqRM[FMPP] = rho * Lattice::Registers::t[FMPP] * (c6 * cku8) + feqRM[FPMM]; + feqRM[FMPM] = rho * Lattice::Registers::t[FMPM] * (c1- c3* cku9 + c4dot5* cku9 * cku9 - usqr); + feqRM[FPMP] = rho * Lattice::Registers::t[FPMP] * (c6 * cku9) + feqRM[FMPM]; + feqRM[FMMP] = rho * Lattice::Registers::t[FMMP] * (c1- c3* cku0 + c4dot5* cku0 * cku0 - usqr); + feqRM[FPPM] = rho * Lattice::Registers::t[FPPM] * (c6 * cku0) + feqRM[FMMP]; + + // BGK Collision based on the second-order equilibrium + std::array foutRM; + + foutRM[F000] = (c1- omega) * static_cast(pop[F000]) + omega * feqRM[F000]; + + foutRM[FP00] = (c1- omega) * static_cast(pop[FP00]) + omega * feqRM[FP00]; + foutRM[FM00] = (c1- omega) * static_cast(pop[FM00]) + omega * feqRM[FM00]; + + foutRM[F0P0] = (c1- omega) * static_cast(pop[F0P0]) + omega * feqRM[F0P0]; + foutRM[F0M0] = (c1- omega) * static_cast(pop[F0M0]) + omega * feqRM[F0M0]; + + foutRM[F00P] = (c1- omega) * static_cast(pop[F00P]) + omega * feqRM[F00P]; + foutRM[F00M] = (c1- omega) * static_cast(pop[F00M]) + omega * feqRM[F00M]; + + foutRM[FPP0] = (c1- omega) * static_cast(pop[FPP0]) + omega * feqRM[FPP0]; + foutRM[FMP0] = (c1- omega) * static_cast(pop[FMP0]) + omega * feqRM[FMP0]; + foutRM[FPM0] = (c1- omega) * static_cast(pop[FPM0]) + omega * feqRM[FPM0]; + foutRM[FMM0] = (c1- omega) * static_cast(pop[FMM0]) + omega * feqRM[FMM0]; + + foutRM[FP0P] = (c1- omega) * static_cast(pop[FP0P]) + omega * feqRM[FP0P]; + foutRM[FM0P] = (c1- omega) * static_cast(pop[FM0P]) + omega * feqRM[FM0P]; + foutRM[FP0M] = (c1- omega) * static_cast(pop[FP0M]) + omega * feqRM[FP0M]; + foutRM[FM0M] = (c1- omega) * static_cast(pop[FM0M]) + omega * feqRM[FM0M]; + + foutRM[F0PP] = (c1- omega) * static_cast(pop[F0PP]) + omega * feqRM[F0PP]; + foutRM[F0MP] = (c1- omega) * static_cast(pop[F0MP]) + omega * feqRM[F0MP]; + foutRM[F0PM] = (c1- omega) * static_cast(pop[F0PM]) + omega * feqRM[F0PM]; + 
foutRM[F0MM] = (c1- omega) * static_cast(pop[F0MM]) + omega * feqRM[F0MM]; + + foutRM[FPPP] = (c1- omega) * static_cast(pop[FPPP]) + omega * feqRM[FPPP]; + foutRM[FMPP] = (c1- omega) * static_cast(pop[FMPP]) + omega * feqRM[FMPP]; + foutRM[FPMP] = (c1- omega) * static_cast(pop[FPMP]) + omega * feqRM[FPMP]; + foutRM[FPPM] = (c1- omega) * static_cast(pop[FPPM]) + omega * feqRM[FPPM]; + foutRM[FMMP] = (c1- omega) * static_cast(pop[FMMP]) + omega * feqRM[FMMP]; + foutRM[FMPM] = (c1- omega) * static_cast(pop[FMPM]) + omega * feqRM[FMPM]; + foutRM[FPMM] = (c1- omega) * static_cast(pop[FPMM]) + omega * feqRM[FPMM]; + foutRM[FMMM] = (c1- omega) * static_cast(pop[FMMM]) + omega * feqRM[FMMM]; + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { + fOut(i, GOMemoryId) = static_cast(foutRM[Lattice::Memory::template mapToRegisters()]); + }); + } +}; + diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h new file mode 100644 index 00000000..a7223d5b --- /dev/null +++ b/benchmarks/lbm/src/Lbm.h @@ -0,0 +1,324 @@ +#include "./Config.h" +#include "./Methods.h" +#include "./Metrics.h" +#include "./Repoert.h" +#include "CellType.h" +#include "ContainerFactory.h" +#include "ContainersD3Q19.h" +#include "D3Q19.h" +#include "Methods.h" +#include "Neon/Neon.h" +#include "Neon/set/Backend.h" +#include "Neon/set/Containter.h" +#include "Neon/skeleton/Skeleton.h" + +int backendWasReported = false; + +template +struct Lbm +{ + using Grid = Grid_; + using Lattice = Lattice_; + using Precision = Precision_; + + using PField = typename Grid::template Field; + using CField = typename Grid::template Field; + using RhoField = typename Grid::template Field; + using UField = typename Grid::template Field; + + using CommonContainerFactory = common::ContainerFactory; + + template + Lbm(Config& config, + Report& report, + Neon::index_3d const& dim, + Lambda activeMask) + { + using Idx = typename PField::Idx; + reportPtr = &report; + + // Setting the backend + Neon::Backend bk = 
[&] { + if (config.deviceType == "cpu") { + Neon::Backend bk(config.devices, Neon::Runtime::openmp); + return bk; + } + if (config.deviceType == "gpu") { + Neon::Backend bk(config.devices, Neon::Runtime::stream); + return bk; + } + Neon::NeonException exce("run"); + exce << config.deviceType << " is not a supported option as device type"; + NEON_THROW(exce); + }(); + + // Setting the grid + grid = Grid( + bk, {config.N, config.N, config.N}, + [&](const Neon::index_3d& p) { return activeMask(p); }, + Lattice::template getDirectionAsVector(), + 1.0, 0.0, + config.spaceCurve); + + // Allocating Populations + for (int i = 0; i < lbm::MethodUtils::getNumberOfPFields(); i++) { + std::stringstream name; + name << "PopField_0" << i; + using Storage = typename Precision::Storage; + auto field = grid.template newField(name.str(), + Lattice::Q, + Storage(0.0)); + pFieldList.push_back(field); + } + + // Allocating cell type field + CellType defaultCelltype; + cellFlagField = grid.template newField("cellFlags", 1, defaultCelltype); + + // Allocating rho and u + if (config.vti) { + std::cout << "Allocating rho and u" << std::endl; + using Storage = typename Precision::Storage; + rho = grid.template newField("rho", 1, Storage(0.0)); + u = grid.template newField("u", 3, Storage(0.0)); + } + + { // Setting Equilibrium all population field + for (auto& pField : pFieldList) { + // Set all to eq + CommonContainerFactory::setToEquilibrium(pField, cellFlagField).run(Neon::Backend::mainStreamIdx); + } + } + } + + // Lambda = void(*)(Neon::Index3d) -> std::tuple> + template + auto setBC(Lambda bcSetFunction) -> void + { + grid.getBackend().sync(Neon::Backend::mainStreamIdx); + // Compute ngh mask + CommonContainerFactory::userSettingBc(bcSetFunction, + pFieldList[0], + cellFlagField) + .run(Neon::Backend::mainStreamIdx); + + for (int i = 1; i < pFieldList.size(); i++) { + CommonContainerFactory::copyPopulation(pFieldList[0], + pFieldList[i]) + .run(Neon::Backend::mainStreamIdx); + } + 
cellFlagField.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::get, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); + grid.getBackend().sync(Neon::Backend::mainStreamIdx); + CommonContainerFactory::computeWallNghMask(cellFlagField, + cellFlagField) + .run(Neon::Backend::mainStreamIdx); + } + + auto helpPrep() -> void + { + grid.getBackend().sync(Neon::Backend::mainStreamIdx); + // One collide if 2Pop - pull + // One iteration if 2Pop = push + if constexpr (int(lbm::Method::pull) == method) { + NEON_DEV_UNDER_CONSTRUCTION(""); + return; + } + if constexpr (int(lbm::Method::push) == method) { + auto lbmParameters = configurations.getLbmParameters(); + skeleton.resize(2); + { + iteration = 0; + auto even = common::ContainerFactory::template iteration( + configurations.stencilSemantic, + pFieldList.at(helpGetInputIdx()), + cellFlagField, + lbmParameters.omega, + pFieldList.at(helpGetOutputIdx())); + + std::vector ops; + skeleton.at(iteration) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); + Neon::skeleton::Options opt(configurations.occ, configurations.transferMode); + ops.push_back(even); + std::stringstream appName; + appName << "LBM_push_even"; + skeleton.at(iteration).sequence(ops, appName.str(), opt); + } + { + iteration = 1; + auto odd = CommonContainerFactory::template iteration( + configurations.stencilSemantic, + pFieldList.at(helpGetInputIdx()), + cellFlagField, + lbmParameters.omega, + pFieldList.at(helpGetOutputIdx())); + + std::vector ops; + skeleton.at(iteration) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); + Neon::skeleton::Options opt(configurations.occ, configurations.transferMode); + ops.push_back(odd); + std::stringstream appName; + appName << "LBM_push_odd"; + skeleton.at(iteration).sequence(ops, appName.str(), opt); + } + + { + iteration = 1; + skeleton.at(helpGetSkeletonIdx()).run(); + iteration = 0; + } + return; + } + if constexpr (int(lbm::Method::aa) == method) { + 
NEON_DEV_UNDER_CONSTRUCTION(""); + return; + } + } + + auto iterate() -> void + { + helpPrep(); + // Iteration keep track of all iterations + // clock_iter keeps tracks of the iteration done after the last clock reset + + auto& bk = grid.getBackend(); + auto [start, clock_iter] = metrics::restartClock(bk, true); + int time_iter = 0; + // Reset the clock, to be used when a benchmark simulation is executed. + tie(start, clock_iter) = metrics::restartClock(bk, true); + + for (time_iter = 0; time_iter < configurations.benchMaxIter; ++time_iter) { + if (!configurations.benchmark) { + bk.syncAll(); + helpExportVti(); + } + + if (configurations.benchmark && time_iter == configurations.benchIniIter) { + std::cout << "Warm up completed (" << time_iter << " iterations ).\n" + << "Starting benchmark step (" + << configurations.benchMaxIter - configurations.benchIniIter << " iterations)." + << std::endl; + tie(start, clock_iter) = metrics::restartClock(bk, false); + } + + skeleton[helpGetSkeletonIdx()].run(); + + ++clock_iter; + ++iteration; + } + std::cout << "Iterations completed" << std::endl; + metrics::recordMetrics(bk, configurations, *reportPtr, start, clock_iter); + } + + auto helpIterateOnce() -> void + { + if (int(lbm::Method::pull) == method) { + NEON_DEV_UNDER_CONSTRUCTION(""); + return; + } + if (int(lbm::Method::push) == method) { + skeleton.at(helpGetSkeletonIdx()).run(Neon::Backend::mainStreamIdx); + return; + } + if (int(lbm::Method::aa) == method) { + NEON_DEV_UNDER_CONSTRUCTION(""); + return; + } + } + + auto helpExportVti() -> void + { + grid.getBackend().syncAll(); + auto& pop = pFieldList.at(helpGetOutputIdx()); + auto computeRhoAndU = CommonContainerFactory::computeRhoAndU(pop, cellFlagField, rho, u); + computeRhoAndU.run(Neon::Backend::mainStreamIdx); + u.updateHostData(Neon::Backend::mainStreamIdx); + rho.updateHostData(Neon::Backend::mainStreamIdx); + grid.getBackend().sync(Neon::Backend::mainStreamIdx); + + size_t numDigits = 5; + std::string 
iterIdStr = std::to_string(iteration); + iterIdStr = std::string(numDigits - std::min(numDigits, iterIdStr.length()), '0') + iterIdStr; + + u.ioToVtk("u_" + iterIdStr, "u", false); + rho.ioToVtk("rho_" + iterIdStr, "rho", false); + cellFlagField.template ioToVtk("cellFlagField_" + iterIdStr, "flag", false); + +#if 0 + std::vector> xPosVal; + std::vector> yPosVal; + const double scale = 1.0 / ulid.v[0]; + + const Neon::index_3d grid_dim = grid.getDimension(); + u.forEachActiveCell([&](const Neon::index_3d& id, const int& card, auto& val) { + if (id.x == grid_dim.x / 2 && id.z == grid_dim.z / 2) { + if (card == 0) { + yPosVal.push_back({static_cast(id.v[1]) / static_cast(grid_dim.y), val * scale}); + } + } + + if (id.y == grid_dim.y / 2 && id.z == grid_dim.z / 2) { + if (card == 1) { + xPosVal.push_back({static_cast(id.v[0]) / static_cast(grid_dim.x), val * scale}); + } + } + }, + Neon::computeMode_t::seq); + + // sort the position so the linear interpolation works + std::sort(xPosVal.begin(), xPosVal.end(), [=](std::pair& a, std::pair& b) { + return a.first < b.first; + }); + + std::sort(yPosVal.begin(), yPosVal.end(), [=](std::pair& a, std::pair& b) { + return a.first < b.first; + }); + + auto writeToFile = [](const std::vector>& posVal, std::string filename) { + std::ofstream file; + file.open(filename); + for (auto v : posVal) { + file << v.first << " " << v.second << "\n"; + } + file.close(); + }; + writeToFile(yPosVal, "NeonUniformLBM_" + iterIdStr + "_Y.dat"); + writeToFile(xPosVal, "NeonUniformLBM_" + iterIdStr + "_X.dat"); +#endif + } + + auto helpUpdateIterationCount() -> void + { + iteration++; + } + + auto helpGetInputIdx() -> int + { + return iteration % 2; + } + auto helpGetOutputIdx() -> int + { + return (iteration + 1) % 2; + } + auto helpGetSkeletonIdx() -> int + { + return iteration % 2; + } + + Config configurations; + int iteration = 0; + bool prepDone = false; + Grid grid; + std::vector pFieldList; + CField cellFlagField; + RhoField rho; + UField 
u; + std::vector skeleton; + Report* reportPtr; +}; diff --git a/benchmarks/lbm/src/LbmSkeleton.h b/benchmarks/lbm/src/LbmSkeleton.h new file mode 100644 index 00000000..22ae8177 --- /dev/null +++ b/benchmarks/lbm/src/LbmSkeleton.h @@ -0,0 +1,117 @@ +#include "CellType.h" +#include "ContainerFactory.h" +#include "ContainersD3Q19.h" +#include "D3Q19.h" +#include "Neon/Neon.h" +#include "Neon/set/Backend.h" +#include "Neon/set/Containter.h" +#include "Neon/skeleton/Skeleton.h" + +template +struct LbmSkeleton +{ +}; + + +template +struct LbmSkeleton, + Grid_> +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + using ContainerFactory = common::ContainerFactory; + + LbmSkeleton(Neon::set::StencilSemantic stencilSemantic, + Neon::skeleton::Occ occ, + Neon::set::TransferMode transfer, + PopField& fIn /*! inpout population field */, + PopField& fOut, + CellTypeField& cellTypeField /*! Cell type field */, + Compute omega /*! LBM omega parameter */) + { + pop[0] = fIn; + pop[1] = fOut; + + setupSkeletons(0, stencilSemantic, occ, transfer, pop[0], pop[1], cellTypeField, omega); + setupSkeletons(1, stencilSemantic, occ, transfer, pop[1], pop[0], cellTypeField, omega); + + parity = 0; + } + + auto getInput() + -> PopField& + { + return pop[parity]; + } + + auto getOutput() + -> PopField& + { + int other = parity == 0 ? 1 : 0; + return pop[other]; + } + + auto run() + -> void + { + lbmTwoPop[parity].run(); + updateParity(); + } + + auto sync() + -> void + { + pop[0].getBackend().syncAll(); + } + + private: + auto updateParity() + -> void + { + parity = parity == 0 ? 
1 : 0; + } + + auto setupSkeletons(int target, + Neon::set::StencilSemantic stencilSemantic, + Neon::skeleton::Occ occ, + Neon::set::TransferMode transfer, + PopField& inField /*! inpout population field */, + PopField& outField, + CellTypeField& cellTypeField /*! Cell type field */, + Compute omega /*! LBM omega parameter */) + { + std::vector ops; + lbmTwoPop[target] = Neon::skeleton::Skeleton(inField.getBackend()); + Neon::skeleton::Options opt(occ, transfer); + ops.push_back(ContainerFactory::template iteration(stencilSemantic, + inField, + cellTypeField, + omega, + outField)); + std::stringstream appName; + appName << "LBM_iteration_" << std::to_string(target); + lbmTwoPop[target].sequence(ops, appName.str(), opt); + } + + Neon::skeleton::Skeleton lbmTwoPop[2]; + PopField pop[2]; + int parity; +}; \ No newline at end of file diff --git a/benchmarks/lbm/src/LbmToolsTemplateOnly.h b/benchmarks/lbm/src/LbmToolsTemplateOnly.h new file mode 100644 index 00000000..489b3782 --- /dev/null +++ b/benchmarks/lbm/src/LbmToolsTemplateOnly.h @@ -0,0 +1,440 @@ +#include "CellType.h" +#include "D3Q19.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + +#define COMPUTE_CAST(VAR) static_cast((VAR)) + +template +struct LbmContainersTemplateOnly +{ +}; + +/** + * Specialization for Lattice + * @tparam PopulationField + * @tparam LbmComputeType + */ +template +struct LbmContainersTemplateOnly, + PopulationField, + LbmComputeType> +{ + using LbmStoreType = typename PopulationField::Type; + using CellTypeField = typename PopulationField::Grid::template Field; + using Lattice = D3Q19; + using Idx = typename PopulationField::Idx; + using Grid = typename PopulationField::Grid; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + +#define LOADPOP(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ + { \ + { /*GO*/ \ + if (wallBitFlag & (uint32_t(1) << GOid)) { \ + /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << 
BKid << std::endl; */ \ + popIn[GOid] = fin.template read(gidx); \ + } else { \ + popIn[GOid] = fin.template nghVal(gidx).value; \ + } \ + } \ + { /*BK*/ \ + if (wallBitFlag & (uint32_t(1) << BKid)) { \ + popIn[BKid] = fin.template read(gidx); \ + } else { \ + popIn[BKid] = fin.template nghVal(gidx).value; \ + } \ + } \ + } + static inline NEON_CUDA_HOST_DEVICE auto + loadPopulation(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopulationField::Partition const& fin, + NEON_OUT LbmStoreType popIn[19]) + { + // #pragma omp critical + // { + + LOADPOP(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10); + LOADPOP(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11); + LOADPOP(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12); + LOADPOP(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13); + LOADPOP(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14); + LOADPOP(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15); + LOADPOP(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16); + LOADPOP(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17); + LOADPOP(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18); + // } + // Treat the case of the center (c[k] = {0, 0, 0,}). 
+ { + popIn[Lattice::centerDirection] = fin(gidx, Lattice::centerDirection); /* the center population is read at the cell's own index, gidx */ + } + } +#undef LOADPOP + +#define PULL_STREAM(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ + { \ + { /*GO*/ \ + if (wallBitFlag & (uint32_t(1) << GOid)) { \ + /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \ + popIn[GOid] = fin(gidx, BKid) + \ + fin.template getNghData(gidx, BKid)(); \ + } else { \ + popIn[GOid] = fin.template getNghData(gidx, GOid)(); \ + } \ + } \ + { /*BK*/ \ + if (wallBitFlag & (uint32_t(1) << BKid)) { \ + popIn[BKid] = fin(gidx, GOid) + fin.template getNghData(gidx, GOid)(); \ + } else { \ + popIn[BKid] = fin.template getNghData(gidx, BKid)(); \ + } \ + } \ + } + + static inline NEON_CUDA_HOST_DEVICE auto + pullStream(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopulationField::Partition const& fin, + NEON_OUT LbmStoreType popIn[19]) + { + // #pragma omp critical + // { +#if 0 + using TopologyByDirection = std::tuple; + constexpr std::array stencil{ + std::make_tuple(Neon::int32_3d(-1, 0, 0), /* GOid */ 0, /* --- */ Neon::int32_3d(1, 0, 0), /* BKid */ 10), + std::make_tuple(Neon::int32_3d(0, -1, 0), /* GOid */ 1, /* --- */ Neon::int32_3d(0, 1, 0), /* BKid */ 11), + std::make_tuple(Neon::int32_3d(0, 0, -1), /* GOid */ 2, /* --- */ Neon::int32_3d(0, 0, 1), /* BKid */ 12), + std::make_tuple(Neon::int32_3d(-1, -1, 0), /* GOid */ 3, /* --- */ Neon::int32_3d(1, 1, 0), /* BKid */ 13), + std::make_tuple(Neon::int32_3d(-1, 1, 0), /* GOid */ 4, /* --- */ Neon::int32_3d(1, -1, 0), /* BKid */ 14), + std::make_tuple(Neon::int32_3d(-1, 0, -1), /* GOid */ 5, /* --- */ Neon::int32_3d(1, 0, 1), /* BKid */ 15), + std::make_tuple(Neon::int32_3d(-1, 0, 1), /* GOid */ 6, /* --- */ Neon::int32_3d(1, 0, -1), /* BKid */ 16), + std::make_tuple(Neon::int32_3d(0, -1, -1), /* GOid */ 7, /* --- */ Neon::int32_3d(0, 1, 1), /* BKid */ 17), + std::make_tuple(Neon::int32_3d(0, -1, 1), /* GOid */ 8, /* --- */ Neon::int32_3d(0, 1, 
-1), /* BKid */ 18)}; + + + auto pullStream = [&]() { + static_assert(stencilIdx < 9); + constexpr int GOid = std::get<1>(stencil[stencilIdx]); + constexpr int BKid = std::get<3>(stencil[stencilIdx]); + constexpr Neon::int32_3d GoOffset = std::get<0>(stencil[stencilIdx]); + constexpr Neon::int32_3d BkOffset = std::get<2>(stencil[stencilIdx]); + { + if (wallBitFlag & (uint32_t(1) << GOid)) { + popIn[GOid] = fin(gidx, BKid) + + fin.template getNghData(gidx, BKid)(); + } else { + popIn[GOid] = fin.template getNghData(gidx, GOid)(); + } + } + { /*BK*/ + if (wallBitFlag & (uint32_t(1) << BKid)) { + popIn[BKid] = fin(gidx, GOid) + + fin.template getNghData(gidx, GOid)(); + } else { + popIn[BKid] = fin.template getNghData(gidx, BKid)(); + } + } + }; + pullStream.template operator()<0>(); + pullStream.template operator()<1>(); + pullStream.template operator()<2>(); + pullStream.template operator()<3>(); + pullStream.template operator()<4>(); + pullStream.template operator()<5>(); + pullStream.template operator()<6>(); + pullStream.template operator()<7>(); + pullStream.template operator()<8>(); +#endif + PULL_STREAM(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10); + PULL_STREAM(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11); + PULL_STREAM(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12); + PULL_STREAM(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13); + PULL_STREAM(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14); + PULL_STREAM(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15); + PULL_STREAM(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16); + PULL_STREAM(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17); + PULL_STREAM(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18); + + // } + // Treat the case of the center (c[k] = {0, 0, 0,}). 
+ { + popIn[Lattice::centerDirection] = fin(gidx, Lattice::centerDirection); + } + } +#undef PULL_STREAM + + static inline NEON_CUDA_HOST_DEVICE auto + macroscopic(const LbmStoreType pop[Lattice::Q], + NEON_OUT LbmComputeType& rho, + NEON_OUT std::array& u) + -> void + { +#define POP(IDX) static_cast(pop[IDX]) + + const LbmComputeType X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6); + const LbmComputeType X_P1 = POP(10) + POP(13) + POP(14) + POP(15) + POP(16); + const LbmComputeType X_0 = POP(9) + POP(1) + POP(2) + POP(7) + POP(8) + POP(11) + POP(12) + POP(17) + POP(18); + + const LbmComputeType Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(14); + const LbmComputeType Y_P1 = POP(4) + POP(11) + POP(13) + POP(17) + POP(18); + + const LbmComputeType Z_M1 = POP(2) + POP(5) + POP(7) + POP(16) + POP(18); + const LbmComputeType Z_P1 = POP(6) + POP(8) + POP(12) + POP(15) + POP(17); + +#undef POP + + rho = X_M1 + X_P1 + X_0; + u[0] = (X_P1 - X_M1) / rho; + u[1] = (Y_P1 - Y_M1) / rho; + u[2] = (Z_P1 - Z_M1) / rho; + } + + + static inline NEON_CUDA_HOST_DEVICE auto + collideBgkUnrolled(Idx const& i /*! LbmComputeType iterator */, + const LbmStoreType pop[Lattice::Q], + LbmComputeType const& rho /*! Density */, + std::array const& u /*! Velocity */, + LbmComputeType const& usqr /*! Usqr */, + LbmComputeType const& omega /*! Omega */, + typename PopulationField::Partition& fOut /*! Population */) + + -> void + { + const LbmComputeType ck_u03 = u[0] + u[1]; + const LbmComputeType ck_u04 = u[0] - u[1]; + const LbmComputeType ck_u05 = u[0] + u[2]; + const LbmComputeType ck_u06 = u[0] - u[2]; + const LbmComputeType ck_u07 = u[1] + u[2]; + const LbmComputeType ck_u08 = u[1] - u[2]; + + const LbmComputeType eq_00 = rho * (1. / 18.) * (1. - 3. * u[0] + 4.5 * u[0] * u[0] - usqr); + const LbmComputeType eq_01 = rho * (1. / 18.) * (1. - 3. * u[1] + 4.5 * u[1] * u[1] - usqr); + const LbmComputeType eq_02 = rho * (1. / 18.) * (1. - 3. 
* u[2] + 4.5 * u[2] * u[2] - usqr); + const LbmComputeType eq_03 = rho * (1. / 36.) * (1. - 3. * ck_u03 + 4.5 * ck_u03 * ck_u03 - usqr); + const LbmComputeType eq_04 = rho * (1. / 36.) * (1. - 3. * ck_u04 + 4.5 * ck_u04 * ck_u04 - usqr); + const LbmComputeType eq_05 = rho * (1. / 36.) * (1. - 3. * ck_u05 + 4.5 * ck_u05 * ck_u05 - usqr); + const LbmComputeType eq_06 = rho * (1. / 36.) * (1. - 3. * ck_u06 + 4.5 * ck_u06 * ck_u06 - usqr); + const LbmComputeType eq_07 = rho * (1. / 36.) * (1. - 3. * ck_u07 + 4.5 * ck_u07 * ck_u07 - usqr); + const LbmComputeType eq_08 = rho * (1. / 36.) * (1. - 3. * ck_u08 + 4.5 * ck_u08 * ck_u08 - usqr); + + const LbmComputeType eqopp_00 = eq_00 + rho * (1. / 18.) * 6. * u[0]; + const LbmComputeType eqopp_01 = eq_01 + rho * (1. / 18.) * 6. * u[1]; + const LbmComputeType eqopp_02 = eq_02 + rho * (1. / 18.) * 6. * u[2]; + const LbmComputeType eqopp_03 = eq_03 + rho * (1. / 36.) * 6. * ck_u03; + const LbmComputeType eqopp_04 = eq_04 + rho * (1. / 36.) * 6. * ck_u04; + const LbmComputeType eqopp_05 = eq_05 + rho * (1. / 36.) * 6. * ck_u05; + const LbmComputeType eqopp_06 = eq_06 + rho * (1. / 36.) * 6. * ck_u06; + const LbmComputeType eqopp_07 = eq_07 + rho * (1. / 36.) * 6. * ck_u07; + const LbmComputeType eqopp_08 = eq_08 + rho * (1. / 36.) * 6. * ck_u08; + + const LbmComputeType pop_out_00 = (1. - omega) * static_cast(pop[0]) + omega * eq_00; + const LbmComputeType pop_out_01 = (1. - omega) * static_cast(pop[1]) + omega * eq_01; + const LbmComputeType pop_out_02 = (1. - omega) * static_cast(pop[2]) + omega * eq_02; + const LbmComputeType pop_out_03 = (1. - omega) * static_cast(pop[3]) + omega * eq_03; + const LbmComputeType pop_out_04 = (1. - omega) * static_cast(pop[4]) + omega * eq_04; + const LbmComputeType pop_out_05 = (1. - omega) * static_cast(pop[5]) + omega * eq_05; + const LbmComputeType pop_out_06 = (1. - omega) * static_cast(pop[6]) + omega * eq_06; + const LbmComputeType pop_out_07 = (1. 
- omega) * static_cast(pop[7]) + omega * eq_07; + const LbmComputeType pop_out_08 = (1. - omega) * static_cast(pop[8]) + omega * eq_08; + + const LbmComputeType pop_out_opp_00 = (1. - omega) * static_cast(pop[10]) + omega * eqopp_00; + const LbmComputeType pop_out_opp_01 = (1. - omega) * static_cast(pop[11]) + omega * eqopp_01; + const LbmComputeType pop_out_opp_02 = (1. - omega) * static_cast(pop[12]) + omega * eqopp_02; + const LbmComputeType pop_out_opp_03 = (1. - omega) * static_cast(pop[13]) + omega * eqopp_03; + const LbmComputeType pop_out_opp_04 = (1. - omega) * static_cast(pop[14]) + omega * eqopp_04; + const LbmComputeType pop_out_opp_05 = (1. - omega) * static_cast(pop[15]) + omega * eqopp_05; + const LbmComputeType pop_out_opp_06 = (1. - omega) * static_cast(pop[16]) + omega * eqopp_06; + const LbmComputeType pop_out_opp_07 = (1. - omega) * static_cast(pop[17]) + omega * eqopp_07; + const LbmComputeType pop_out_opp_08 = (1. - omega) * static_cast(pop[18]) + omega * eqopp_08; + + +#define COMPUTE_GO_AND_BACK(GOid, BKid) \ + { \ + fOut(i, GOid) = static_cast(pop_out_0##GOid); \ + fOut(i, BKid) = static_cast(pop_out_opp_0##GOid); \ + } + + COMPUTE_GO_AND_BACK(0, 10) + COMPUTE_GO_AND_BACK(1, 11) + COMPUTE_GO_AND_BACK(2, 12) + COMPUTE_GO_AND_BACK(3, 13) + COMPUTE_GO_AND_BACK(4, 14) + COMPUTE_GO_AND_BACK(5, 15) + COMPUTE_GO_AND_BACK(6, 16) + COMPUTE_GO_AND_BACK(7, 17) + COMPUTE_GO_AND_BACK(8, 18) + +#undef COMPUTE_GO_AND_BACK + + { + const LbmComputeType eq_09 = rho * (1. / 3.) * (1. - usqr); + const LbmComputeType pop_out_09 = (1. - omega) * + static_cast(pop[Lattice::centerDirection]) + + omega * eq_09; + fOut(i, Lattice::centerDirection) = static_cast(pop_out_09); + } + } + + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopulationField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const LbmComputeType omega /*! LBM omega parameter */, + PopulationField& fOutField /*! 
output Population field */) + -> Neon::set::Container + { + + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + LbmStoreType popIn[Lattice::Q]; + pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + + LbmComputeType rho; + std::array u{.0, .0, .0}; + macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + LbmComputeType usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + collideBgkUnrolled(gidx, + popIn, + rho, u, + usqr, omega, + NEON_OUT fOut); + } + }; + }); + return container; + } + +#define COMPUTE_MASK_WALL(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ + { \ + { /*GO*/ \ + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); \ + if (nghCellType.classification != CellType::bulk) { \ + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOid)); \ + } \ + } \ + { /*BK*/ \ + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); \ + if (nghCellType.classification != CellType::bulk) { \ + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << BKid)); \ + } \ + } \ + } + + static auto + computeWallNghMask(const CellTypeField& infoInField, + CellTypeField& infoOutpeField) + + -> Neon::set::Container + { + Neon::set::Container container = infoInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& infoIn = L.load(infoInField, + Neon::Pattern::STENCIL); + auto& infoOut = L.load(infoOutpeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + 
CellType cellType = infoIn(gidx, 0); + cellType.wallNghBitflag = 0; + + if (cellType.classification == CellType::bulk) { + COMPUTE_MASK_WALL(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10) + COMPUTE_MASK_WALL(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11) + COMPUTE_MASK_WALL(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12) + COMPUTE_MASK_WALL(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13) + COMPUTE_MASK_WALL(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14) + COMPUTE_MASK_WALL(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15) + COMPUTE_MASK_WALL(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16) + COMPUTE_MASK_WALL(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17) + COMPUTE_MASK_WALL(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18) + + infoOut(gidx, 0) = cellType; + } + }; + }); + return container; + } +#undef COMPUTE_MASK_WALL + +#define BC_LOAD(GOID, DKID) \ + popIn[GOID] = fIn(gidx, GOID); \ + popIn[DKID] = fIn(gidx, DKID); + + static auto + computeRhoAndU([[maybe_unused]] const PopulationField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! 
output Population field */) + + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + LbmComputeType rho = 0; + std::array u{.0, .0, .0}; + LbmStoreType popIn[Lattice::Q]; + + if (cellInfo.classification == CellType::bulk) { + pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + } else { + if (cellInfo.classification == CellType::movingWall) { + BC_LOAD(0, 10) + BC_LOAD(1, 11) + BC_LOAD(2, 12) + BC_LOAD(3, 13) + BC_LOAD(4, 14) + BC_LOAD(5, 15) + BC_LOAD(6, 16) + BC_LOAD(7, 17) + BC_LOAD(8, 18) + popIn[9] = fIn(gidx, 9); + + rho = 1.0; + u = std::array{COMPUTE_CAST(popIn[0]) / COMPUTE_CAST(6. * 1. / 18.), + COMPUTE_CAST(popIn[1]) / COMPUTE_CAST(6. * 1. / 18.), + COMPUTE_CAST(popIn[2]) / COMPUTE_CAST(6. * 1. 
/ 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } +}; + +#undef COMPUTE_CAST \ No newline at end of file diff --git a/benchmarks/lbm/src/Methods.h b/benchmarks/lbm/src/Methods.h new file mode 100644 index 00000000..c1842652 --- /dev/null +++ b/benchmarks/lbm/src/Methods.h @@ -0,0 +1,45 @@ +#pragma once +#include "Neon/core/core.h" + +namespace lbm { +enum class Method +{ + push = 0, + pull = 1, + aa = 2 +}; + +struct MethodUtils +{ + template + static auto getNumberOfPFields() -> int + { + Method m = formInt(method); + switch (m) { + case Method::pull: + return 2; + case Method::push: + return 2; + case Method::aa: + return 1 ; + } + std::stringstream msg; + msg << "The following LBM method is not recognized" << method << std::endl; + NEON_THROW_UNSUPPORTED_OPERATION(msg.str()); + } + + static auto formInt(int method) -> Method + { + if (method == int(Method::pull)) + return Method::pull; + if (method == int(Method::push)) + return Method::push; + if (method == int(Method::aa)) + return Method::aa; + + std::stringstream msg; + msg << "The following LBM method is not recognized" << method << std::endl; + NEON_THROW_UNSUPPORTED_OPERATION(msg.str()); + } +}; +} \ No newline at end of file diff --git a/benchmarks/lbm/src/Metrics.h b/benchmarks/lbm/src/Metrics.h new file mode 100644 index 00000000..10356a4a --- /dev/null +++ b/benchmarks/lbm/src/Metrics.h @@ -0,0 +1,88 @@ +#pragma once +#include +#include "Config.h" +#include "Neon/Neon.h" +#include "Neon/set/Backend.h" +#include "Repoert.h" + +namespace metrics { +// Return a new clock for the current time, for benchmarking. 
+namespace { + +auto restartClock(Neon::Backend& bk, bool sync = true) +{ + if (sync) { + bk.syncAll(); + } + return make_pair(std::chrono::high_resolution_clock::now(), 0); +} + +void recordBackend(Neon::Backend& bk, + Report& report) +{ + report.recordBk(bk); +} + +void recordGrid(Neon::domain::interface::GridBase& g, + Report& report) +{ + report.recordGrid(g); +} + +} // namespace + + +// Compute the time elapsed since a starting point, and the corresponding +// benchmarks of the code in Mega Lattice site updates per second (MLups). +template +void recordMetrics(Neon::Backend& bk, + const Config& config, + Report& report, + TimePoint start, + int clock_iter) +{ + bk.syncAll(); + size_t nElements = config.N * config.N * config.N; + auto stop = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(stop - start); + double mlups = static_cast(nElements * clock_iter) / duration.count(); + + report.recordLoopTime(duration.count(), "microseconds"); + report.recordMLUPS(mlups); + + std::cout << "Metrics: " << std::endl; + std::cout << "-- time: " << std::setprecision(4) << duration.count() << " microseconds" << std::endl; + std::cout << "-- MLUPS: " << std::setprecision(4) << mlups << " MLUPS" << std::endl; +} + +template +void recordGridInitMetrics(Neon::Backend& bk, + Report& report, + TimePoint start) +{ + bk.syncAll(); + auto stop = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(stop - start); + + report.recordNeonGridInitTime(duration.count(), "microseconds"); + + std::cout << "Metrics: " << std::endl; + std::cout << "- Grid Init: " << std::setprecision(4) << duration.count() << " microseconds" << std::endl; +} + + +template +void recordProblemSetupMetrics(Neon::Backend& bk, + Report& report, + TimePoint start) +{ + bk.syncAll(); + auto stop = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(stop - start); + + 
report.recordProblemSetupTime(duration.count(), "microseconds"); + + std::cout << "Metrics: " << std::endl; + std::cout << " Problem Setup: " << std::setprecision(4) << duration.count() << " microseconds" << std::endl; +} +} // namespace metrics \ No newline at end of file diff --git a/benchmarks/lbm/src/Precision.h b/benchmarks/lbm/src/Precision.h new file mode 100644 index 00000000..a45ff69e --- /dev/null +++ b/benchmarks/lbm/src/Precision.h @@ -0,0 +1,13 @@ +#pragma once + +#include "Neon/Neon.h" +#include "Neon/set/Backend.h" +#include "Neon/set/memory/memSet.h" + +template +struct Precision +{ + using Storage = StorageFP; + using Compute = ComputeFP; +}; diff --git a/benchmarks/lbm/src/Repoert.h b/benchmarks/lbm/src/Repoert.h new file mode 100644 index 00000000..4ca0827b --- /dev/null +++ b/benchmarks/lbm/src/Repoert.h @@ -0,0 +1,40 @@ +#pragma once + +#include +#include +#include "Config.h" +#include "Neon/domain/interface/GridBase.h" +struct Report +{ + Neon::Report mReport; + std::string mFname; + + std::vector mMLUPS; + std::vector mLoopTime; + std::vector mNeonGridInitTime; + std::vector mProblemSetupTime; + + std::string mtimeUnit = ""; + + explicit Report(const Config& c); + + auto recordMLUPS(double mlups) + -> void; + + auto recordLoopTime(double time, + const std::string& unit) + -> void; + + auto recordNeonGridInitTime(double time, + const std::string& unit) + -> void; + + auto recordProblemSetupTime(double time, + const std::string& unit) + -> void; + + auto save() + -> void; + void recordBk(Neon::Backend& backend); + void recordGrid(Neon::domain::interface::GridBase& g); +}; diff --git a/benchmarks/lbm/src/Report.cpp b/benchmarks/lbm/src/Report.cpp new file mode 100644 index 00000000..049d1735 --- /dev/null +++ b/benchmarks/lbm/src/Report.cpp @@ -0,0 +1,108 @@ +#include +#include +#include "Repoert.h" + +Report::Report(const Config& c) + : mReport("lbm-lid-driven-cavity-flow") +{ + mFname = c.reportFile; + + mReport.addMember("Re", c.Re); + 
mReport.addMember("ulb", c.ulb); + mReport.addMember("N", c.N); + mReport.addMember("benchmark", c.benchmark); + mReport.addMember("max_t", c.max_t); + mReport.addMember("outFrequency", c.outFrequency); + mReport.addMember("dataFrequency", c.dataFrequency); + mReport.addMember("repetitions", c.repetitions); + mReport.addMember("vti", c.vti); + + + mReport.addMember("benchIniIter", c.benchIniIter); + mReport.addMember("benchMaxIter", c.benchMaxIter); + + mReport.addMember("deviceType", c.deviceType); + mReport.addMember("numDevices", c.devices.size()); + mReport.addMember("devices", c.devices); + mReport.addMember("reportFile", c.reportFile); + mReport.addMember("gridType", c.gridType); + + mReport.addMember("computeType", c.computeType); + mReport.addMember("storeType", c.storeType); + mReport.addMember("spaceCurve", Neon::domain::tool::spaceCurves::EncoderTypeUtil::toString(c.spaceCurve)); + + + mReport.addMember("occ", Neon::skeleton::OccUtils::toString(c.occ)); + mReport.addMember("transferMode", Neon::set::TransferModeUtils::toString(c.transferMode)); + mReport.addMember("transferSemantic", Neon::set::StencilSemanticUtils::toString(c.stencilSemantic)); + + mReport.addMember("nu", c.mLbmParameters.nu); + mReport.addMember("omega", c.mLbmParameters.omega); + mReport.addMember("dx", c.mLbmParameters.dx); + mReport.addMember("dt", c.mLbmParameters.dt); +} + +auto Report:: + recordMLUPS(double mlups) + -> void +{ + mMLUPS.push_back(mlups); +} + +auto Report:: + recordLoopTime(double time, + const std::string& unit) + -> void +{ + if (mtimeUnit.length() == 0) { + mtimeUnit = unit; + } + if (unit.length() != mtimeUnit.length()) { + NEON_THROW_UNSUPPORTED_OPERATION("Time unit inconsistency"); + } + mLoopTime.push_back(time); +} + +auto Report::recordNeonGridInitTime(double time, const std::string& unit) -> void +{ + if (mtimeUnit.length() == 0) { + mtimeUnit = unit; + } + if (unit.length() != mtimeUnit.length()) { + NEON_THROW_UNSUPPORTED_OPERATION("Time unit 
inconsistency"); + } + mNeonGridInitTime.push_back(time); +} + +auto Report::recordProblemSetupTime(double time, const std::string& unit) -> void +{ + if (mtimeUnit.length() == 0) { + mtimeUnit = unit; + } + if (unit.length() != mtimeUnit.length()) { + NEON_THROW_UNSUPPORTED_OPERATION("Time unit inconsistency"); + } + mProblemSetupTime.push_back(time); +} + +auto Report:: + save() + -> void +{ + mReport.addMember("MLUPS", mMLUPS); + mReport.addMember(std::string("Loop Time (") + mtimeUnit + ")", mLoopTime); + mReport.addMember(std::string("Problem Setup Time (") + mtimeUnit + ")", mProblemSetupTime); + mReport.addMember(std::string("Neon Grid Init Time (") + mtimeUnit + ")", mNeonGridInitTime); + + mReport.write(mFname, true); +} + +void Report::recordBk(Neon::Backend& backend) +{ + backend.toReport(mReport); +} + +void Report::recordGrid(Neon::domain::interface::GridBase& g) +{ + g.toReport(mReport, true); +} \ No newline at end of file diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cu new file mode 100644 index 00000000..3119439c --- /dev/null +++ b/benchmarks/lbm/src/RunCavityTwoPop.cu @@ -0,0 +1,203 @@ +#include "Config.h" +#include "D3Q19.h" +#include "Neon/domain/bGrid.h" +#include "Neon/domain/dGrid.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" +#include "Neon/domain/eGrid.h" + +#include "./Lbm.h" +#include "CellType.h" +#include "LbmSkeleton.h" +#include "Metrics.h" +#include "Repoert.h" +namespace CavityTwoPop { + +int backendWasReported = false; + +namespace details { +template +auto run(Config& config, + Report& report) -> void +{ + using Storage = Storage_; + using Compute = Compute_; + using Precision = Precision; + using Lattice = D3Q19; + using PopulationField = typename Grid::template Field; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using RhoField = typename Grid::template Field; + using UField 
= typename Grid::template Field; + + Neon::double_3d ulid(1., 0., 0.); + // Neon Grid and Fields initialization + Neon::index_3d dim(config.N, config.N, config.N); + + Lbm lbm(config, + report, + dim, + [](Neon::index_3d const&) { return true; }); + auto ulb = config.ulb; + auto domainDim = dim; + lbm.setBC([=] NEON_CUDA_HOST_DEVICE(Neon::index_3d const& globalIdx, + NEON_OUT Storage p[Lattice::Q], + NEON_OUT CellType::Classification& cellClass) { + typename Lattice::Precision::Storage popVal = 0; + + if (globalIdx.x == 0 || globalIdx.x == domainDim.x - 1 || + globalIdx.y == 0 || globalIdx.y == domainDim.y - 1 || + globalIdx.z == 0 || globalIdx.z == domainDim.z - 1) { + cellClass = CellType::bounceBack; + + if (globalIdx.y == domainDim.y - 1) { + cellClass = CellType::movingWall; + } + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + if (globalIdx.y == domainDim.y - 1) { + popVal = -6. * Lattice::Registers::template getT() * ulb * + (Lattice::Registers::template getDirection().v[0] * ulid.v[0] + + Lattice::Registers::template getDirection().v[1] * ulid.v[1] + + Lattice::Registers::template getDirection().v[2] * ulid.v[2]); + } else { + popVal = 0; + } + p[q] = popVal; + }); + } else { + cellClass = CellType::bulk; + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + p[q] = Lattice::Registers::template getT(); + }); + } + }); + lbm.iterate(); +} + +template +auto runFilterMethod(Config& config, Report& report) -> void +{ + return run(config, report); +} + +template +auto runFilterComputeType(Config& config, Report& report) -> void +{ + if (config.computeType == "double") { + return runFilterMethod(config, report); + } + // if (config.computeType == "float") { + // return run(config, report); + // } + NEON_DEV_UNDER_CONSTRUCTION(""); +} + +template +auto runFilterStoreType(Config& config, + Report& report) + -> void +{ + if (config.storeType == 
"double") { + return runFilterComputeType(config, report); + } + // if (config.storeType == "float") { + // return runFilterComputeType(config, report); + // } + NEON_DEV_UNDER_CONSTRUCTION(""); +} +} // namespace details + +#ifdef NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS +constexpr bool skipTest = false; +#else +constexpr bool skipTest = false; +#endif + +auto run(Config& config, + Report& report) -> void +{ + if (config.gridType == "dGrid") { + return details::runFilterStoreType(config, report); + } + // if (config.gridType == "eGrid") { + // if constexpr (!skipTest) { + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid" || config.gridType == "bGrid_8_8_8") { + // return details::runFilterStoreType(config, report); + // } + // if (config.gridType == "bGrid_4_4_4") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid_2_2_2") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<2, 2, 2>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid_32_8_4") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid_32_8_4") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid_32_2_8") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 2, 8>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid_32_8_2") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 2>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "dGridSoA") { + // if constexpr (!skipTest) { + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + NEON_THROW_UNSUPPORTED_OPERATION("Unknown grid type: " + config.gridType); +} +} // namespace CavityTwoPop diff --git a/benchmarks/lbm/src/RunCavityTwoPop.h b/benchmarks/lbm/src/RunCavityTwoPop.h new file mode 100644 index 00000000..d30f722e --- /dev/null +++ b/benchmarks/lbm/src/RunCavityTwoPop.h @@ -0,0 +1,12 @@ +#include "Config.h" +#include "D3Q19.h" +#include "Neon/domain/dGrid.h" + +#include "Metrics.h" +#include "Repoert.h" + +namespace CavityTwoPop { + +auto run(Config& config, + Report& report) -> void; +} // namespace CavityTwoPop \ No newline at end of file diff --git a/benchmarks/lbm/src/app.cpp b/benchmarks/lbm/src/app.cpp new file mode 100644 index 00000000..6ed4b7d6 --- /dev/null +++ b/benchmarks/lbm/src/app.cpp @@ -0,0 +1,47 @@ + +#include "Config.h" +#include "Repoert.h" +#include "RunCavityTwoPop.h" + +#include "Neon/core/tools/clipp.h" +#include "Neon/domain/dGrid.h" +#include "Neon/Neon.h" + +int main(int argc, char** argv) +{ + Config config; + Neon::init(); + + config.Re = 100.; // Reynolds number + config.ulb = 0.04; // Velocity in lattice units + config.N = 160; // Number of nodes in x-direction + config.benchmark = true; // Run in benchmark mode ? 
+ config.max_t = 10.0; // Non-benchmark mode: Total time in dim.less units + // config.out_freq = 20000000; // Non-benchmark mode: Frequency in LU for output of terminal message and profiles (use 0 for no messages) + // config.data_freq = 20000000; // Non-benchmark mode: Frequency in LU of full data dump (use 0 for no data dump) + // config.bench_ini_iter = 0; // Benchmark mode: Number of warmup iterations + // config.bench_max_iter = 10000; // Benchmark mode: Total number of iterations + // config.perKeeperFile = "perf"; + // config.devices = {0}; + // config.gridType = "dGrid"; + // config.occ = Neon::skeleton::Options_t::Occ::none + + + if (config.parseArgs(argc, argv) != 0) { + return -1; + } + + std::cout << "--------------- Parameters ---------------\n"; + std::cout << config.toString(); + std::cout << "-------------------------------------------\n"; + + Report report(config); + + for(int i=0; i Date: Tue, 1 Aug 2023 17:22:13 -0400 Subject: [PATCH 46/94] WIP: new lbm benchmark --- benchmarks/lbm/src/ContainersD3Q19.h | 4 +-- benchmarks/lbm/src/DeviceD3Q19.h | 16 +++++----- benchmarks/lbm/src/Lbm.h | 32 +++++++++++-------- benchmarks/lbm/src/Methods.h | 26 +++++++++++---- benchmarks/lbm/src/RunCavityTwoPop.cu | 4 +-- .../Neon/domain/details/dGrid/dPartition.h | 18 +++++++++++ 6 files changed, 68 insertions(+), 32 deletions(-) diff --git a/benchmarks/lbm/src/ContainersD3Q19.h b/benchmarks/lbm/src/ContainersD3Q19.h index 38c30170..dedb09cb 100644 --- a/benchmarks/lbm/src/ContainersD3Q19.h +++ b/benchmarks/lbm/src/ContainersD3Q19.h @@ -187,7 +187,7 @@ struct ContainerFactory; return Factory::iteration(stencilSemantic, fInField, - fOutField, + cellTypeField, omega, fOutField); } @@ -197,7 +197,7 @@ struct ContainerFactory; return Factory::iteration(stencilSemantic, fInField, - fOutField, + cellTypeField, omega, fOutField); } diff --git a/benchmarks/lbm/src/DeviceD3Q19.h b/benchmarks/lbm/src/DeviceD3Q19.h index fd2f8d08..8bb38acc 100644 --- 
a/benchmarks/lbm/src/DeviceD3Q19.h +++ b/benchmarks/lbm/src/DeviceD3Q19.h @@ -67,10 +67,10 @@ struct DeviceD3Q19 static inline NEON_CUDA_HOST_DEVICE auto - pushStream(Idx const& gidx, - const uint32_t& wallNghBitFlag, - NEON_OUT Storage pOut[Lattice::Q], - NEON_OUT typename PopField::Partition const& fOut) + pushStream(Idx const& gidx, + const uint32_t& wallNghBitFlag, + NEON_OUT Storage pOut[Lattice::Q], + NEON_OUT typename PopField::Partition& fOut) { Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { using M = typename Lattice::template RegisterMapper; @@ -78,16 +78,16 @@ struct DeviceD3Q19 if constexpr (M::fwdMemIdx == M::centerMemIdx) { fOut(gidx, M::fwdMemIdx) = pOut[M::fwdRegIdx]; } else { - if (CellType::isWall()) { + if (CellType::isWall(wallNghBitFlag)) { // fout(i, opp[k]) = // pop_out + // f(nb, k); - fOut(gidx, M::bkMemIdx) = + fOut(gidx, M::bkwMemIdx) = pOut[M::fwdRegIdx] + - fOut.template getNghData(gidx, M::fwdMemIdx)(); + fOut.template getNghData(gidx, M::fwdMemIdx)(); } else { // fout(nb, k) = pop_out; - fOut.writeNgh(gidx, M::fwdMemIdx, pOut[M::fwdRegIdx]); + fOut.template writeNghData(gidx, M::fwdMemIdx, pOut[M::fwdRegIdx]); } } }); diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h index a7223d5b..c8eaca05 100644 --- a/benchmarks/lbm/src/Lbm.h +++ b/benchmarks/lbm/src/Lbm.h @@ -15,7 +15,7 @@ int backendWasReported = false; template struct Lbm @@ -126,15 +126,17 @@ struct Lbm grid.getBackend().sync(Neon::Backend::mainStreamIdx); // One collide if 2Pop - pull // One iteration if 2Pop = push - if constexpr (int(lbm::Method::pull) == method) { + if constexpr (lbm::Method::pull == method) { NEON_DEV_UNDER_CONSTRUCTION(""); return; } - if constexpr (int(lbm::Method::push) == method) { - auto lbmParameters = configurations.getLbmParameters(); - skeleton.resize(2); + if constexpr (lbm::Method::push == method) { + using Compute = typename Precision::Compute; + auto lbmParameters = configurations.template getLbmParameters(); + skeleton = 
std::vector(2); { iteration = 0; + int skIdx = helpGetSkeletonIdx(); auto even = common::ContainerFactory::template iteration( configurations.stencilSemantic, pFieldList.at(helpGetInputIdx()), @@ -143,15 +145,16 @@ struct Lbm pFieldList.at(helpGetOutputIdx())); std::vector ops; - skeleton.at(iteration) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); + skeleton.at(skIdx) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); Neon::skeleton::Options opt(configurations.occ, configurations.transferMode); ops.push_back(even); std::stringstream appName; appName << "LBM_push_even"; - skeleton.at(iteration).sequence(ops, appName.str(), opt); + skeleton.at(skIdx).sequence(ops, appName.str(), opt); } { iteration = 1; + int skIdx = helpGetSkeletonIdx(); auto odd = CommonContainerFactory::template iteration( configurations.stencilSemantic, pFieldList.at(helpGetInputIdx()), @@ -160,22 +163,23 @@ struct Lbm pFieldList.at(helpGetOutputIdx())); std::vector ops; - skeleton.at(iteration) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); + skeleton.at(skIdx) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); Neon::skeleton::Options opt(configurations.occ, configurations.transferMode); ops.push_back(odd); std::stringstream appName; appName << "LBM_push_odd"; - skeleton.at(iteration).sequence(ops, appName.str(), opt); + skeleton.at(skIdx).sequence(ops, appName.str(), opt); } { iteration = 1; - skeleton.at(helpGetSkeletonIdx()).run(); + int skIdx = helpGetSkeletonIdx(); + skeleton.at(skIdx).run(); iteration = 0; } return; } - if constexpr (int(lbm::Method::aa) == method) { + if constexpr (lbm::Method::aa == method) { NEON_DEV_UNDER_CONSTRUCTION(""); return; } @@ -218,15 +222,15 @@ struct Lbm auto helpIterateOnce() -> void { - if (int(lbm::Method::pull) == method) { + if (lbm::Method::pull == method) { NEON_DEV_UNDER_CONSTRUCTION(""); return; } - if (int(lbm::Method::push) == method) { + if (lbm::Method::push == method) { 
skeleton.at(helpGetSkeletonIdx()).run(Neon::Backend::mainStreamIdx); return; } - if (int(lbm::Method::aa) == method) { + if (lbm::Method::aa == method) { NEON_DEV_UNDER_CONSTRUCTION(""); return; } diff --git a/benchmarks/lbm/src/Methods.h b/benchmarks/lbm/src/Methods.h index c1842652..11a1da6f 100644 --- a/benchmarks/lbm/src/Methods.h +++ b/benchmarks/lbm/src/Methods.h @@ -11,20 +11,34 @@ enum class Method struct MethodUtils { - template + template static auto getNumberOfPFields() -> int { - Method m = formInt(method); - switch (m) { + switch (method) { case Method::pull: return 2; case Method::push: return 2; case Method::aa: - return 1 ; + return 1; } std::stringstream msg; - msg << "The following LBM method is not recognized" << method << std::endl; + msg << "The following LBM method is not recognized" << lbm::MethodUtils::toString(method) << std::endl; + NEON_THROW_UNSUPPORTED_OPERATION(msg.str()); + } + + static auto toString(lbm::Method method) -> std::string + { + switch (method) { + case Method::pull: + return "pull"; + case Method::push: + return "push"; + case Method::aa: + return "aa"; + } + std::stringstream msg; + msg << "The following LBM method is not recognized" << lbm::MethodUtils::toString(method) << std::endl; NEON_THROW_UNSUPPORTED_OPERATION(msg.str()); } @@ -42,4 +56,4 @@ struct MethodUtils NEON_THROW_UNSUPPORTED_OPERATION(msg.str()); } }; -} \ No newline at end of file +} // namespace lbm \ No newline at end of file diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cu index 3119439c..85809c13 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm/src/RunCavityTwoPop.cu @@ -15,7 +15,7 @@ namespace CavityTwoPop { int backendWasReported = false; namespace details { -template @@ -85,7 +85,7 @@ auto run(Config& config, template auto runFilterMethod(Config& config, Report& report) -> void { - return run(config, report); + return run(config, report); } template diff --git 
a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h index 41c64e8b..6dfd4b8f 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h @@ -188,6 +188,24 @@ class dPartition return NghData(val, isValidNeighbour); } + template + NEON_CUDA_HOST_DEVICE inline auto + writeNghData(const Idx& gidx, + int card, + T value) + -> bool + { + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); + T val; + if (isValidNeighbour) { + operator()(gidxNgh, card) = value; + } + return isValidNeighbour; + } + template NEON_CUDA_HOST_DEVICE inline auto getNghData(const Idx& gidx, From de704d3be9d87c8bac29c0c6e25fe9eff65141b3 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 2 Aug 2023 15:18:22 -0400 Subject: [PATCH 47/94] WIP --- benchmarks/lbm/src/ContainersD3Q19.h | 7 +++--- benchmarks/lbm/src/DeviceD3Q19.h | 9 ++++---- benchmarks/lbm/src/Lbm.h | 6 ++--- ...RunCavityTwoPop.cu => RunCavityTwoPop.cpp} | 22 +++++++++---------- .../Neon/domain/details/dGrid/dPartition.h | 7 +++--- .../include/Neon/domain/interface/NghData.h | 2 +- 6 files changed, 23 insertions(+), 30 deletions(-) rename benchmarks/lbm/src/{RunCavityTwoPop.cu => RunCavityTwoPop.cpp} (90%) diff --git a/benchmarks/lbm/src/ContainersD3Q19.h b/benchmarks/lbm/src/ContainersD3Q19.h index dedb09cb..806ed1fe 100644 --- a/benchmarks/lbm/src/ContainersD3Q19.h +++ b/benchmarks/lbm/src/ContainersD3Q19.h @@ -111,10 +111,10 @@ struct ContainerFactory auto { + [=](Neon::set::Loader& L) -> auto { auto& fIn = L.load(fInField, Neon::Pattern::STENCIL, stencilSemantic); - auto& fOut = L.load(fOutField); + auto fOut = L.load(fOutField); const auto& cellInfoPartition = L.load(cellTypeField); return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { @@ -133,8 +133,7 @@ struct ContainerFactory const& u /*! 
Velocity */, Compute const& usqr /*! Usqr */, Compute const& omega /*! Omega */, @@ -151,15 +150,15 @@ struct DeviceD3Q19 -> void { - constexpr Compute c1over18 = 1. / 18.; + // constexpr Compute c1over18 = 1. / 18.; constexpr Compute c1over36 = 1. / 36.; constexpr Compute c4dot5 = 4.5; constexpr Compute c3 = 3.; constexpr Compute c1 = 1.; constexpr Compute c6 = 6.; - constexpr int regCenter = Lattice::Registers::center; - constexpr int regFir = Lattice::Registers::center; + // constexpr int regCenter = Lattice::Registers::center; + // constexpr int regFir = Lattice::Registers::center; Neon::ConstexprFor<0, Lattice::Registers::fwdRegIdxListLen, 1>( [&](auto fwdRegIdxListIdx) { diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h index c8eaca05..886c1c6a 100644 --- a/benchmarks/lbm/src/Lbm.h +++ b/benchmarks/lbm/src/Lbm.h @@ -34,10 +34,8 @@ struct Lbm template Lbm(Config& config, Report& report, - Neon::index_3d const& dim, Lambda activeMask) { - using Idx = typename PField::Idx; reportPtr = &report; // Setting the backend @@ -106,7 +104,7 @@ struct Lbm cellFlagField) .run(Neon::Backend::mainStreamIdx); - for (int i = 1; i < pFieldList.size(); i++) { + for (int i = 1; i < int(pFieldList.size()); i++) { CommonContainerFactory::copyPopulation(pFieldList[0], pFieldList[i]) .run(Neon::Backend::mainStreamIdx); @@ -198,7 +196,7 @@ struct Lbm tie(start, clock_iter) = metrics::restartClock(bk, true); for (time_iter = 0; time_iter < configurations.benchMaxIter; ++time_iter) { - if (!configurations.benchmark) { + if (true) { bk.syncAll(); helpExportVti(); } diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cpp similarity index 90% rename from benchmarks/lbm/src/RunCavityTwoPop.cu rename to benchmarks/lbm/src/RunCavityTwoPop.cpp index 85809c13..9eb22e53 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm/src/RunCavityTwoPop.cpp @@ -26,28 +26,26 @@ auto run(Config& config, using Compute = Compute_; using 
Precision = Precision; using Lattice = D3Q19; - using PopulationField = typename Grid::template Field; + // using PopulationField = typename Grid::template Field; - using PopField = typename Grid::template Field; - using CellTypeField = typename Grid::template Field; + // using PopField = typename Grid::template Field; + // using CellTypeField = typename Grid::template Field; - using Idx = typename PopField::Idx; - using RhoField = typename Grid::template Field; - using UField = typename Grid::template Field; + // using Idx = typename PopField::Idx; + // using RhoField = typename Grid::template Field; + // using UField = typename Grid::template Field; Neon::double_3d ulid(1., 0., 0.); // Neon Grid and Fields initialization - Neon::index_3d dim(config.N, config.N, config.N); + Neon::index_3d domainDim(config.N, config.N, config.N); Lbm lbm(config, report, - dim, [](Neon::index_3d const&) { return true; }); - auto ulb = config.ulb; - auto domainDim = dim; + auto ulb = config.ulb; lbm.setBC([=] NEON_CUDA_HOST_DEVICE(Neon::index_3d const& globalIdx, - NEON_OUT Storage p[Lattice::Q], - NEON_OUT CellType::Classification& cellClass) { + NEON_OUT Storage p[Lattice::Q], + NEON_OUT CellType::Classification& cellClass) { typename Lattice::Precision::Storage popVal = 0; if (globalIdx.x == 0 || globalIdx.x == domainDim.x - 1 || diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h index 6dfd4b8f..c1e17b0b 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h @@ -181,11 +181,11 @@ class dPartition { Idx gidxNgh; const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); - T val; if (isValidNeighbour) { - val = operator()(gidxNgh, card); + T val = operator()(gidxNgh, card); + return NghData(val, isValidNeighbour); } - return NghData(val, isValidNeighbour); + return NghData(); } template (gidx, gidxNgh); - T val; 
if (isValidNeighbour) { operator()(gidxNgh, card) = value; } diff --git a/libNeonDomain/include/Neon/domain/interface/NghData.h b/libNeonDomain/include/Neon/domain/interface/NghData.h index 487c8fd7..b7de2fca 100644 --- a/libNeonDomain/include/Neon/domain/interface/NghData.h +++ b/libNeonDomain/include/Neon/domain/interface/NghData.h @@ -10,7 +10,7 @@ struct NghData { Type mData; bool mIsValid; - NEON_CUDA_HOST_DEVICE NghData(bool status = false) + NEON_CUDA_HOST_DEVICE NghData() { this->mIsValid = false; } From 406b41c6b66f73b27d0786c19d9f0035bc58b387 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 29 Aug 2023 09:20:12 +0200 Subject: [PATCH 48/94] WIP: cleaning. --- benchmarks/lbm/src/CellType.h | 4 +- benchmarks/lbm/src/ContainersD3Q19.h | 10 ++--- benchmarks/lbm/src/D3Q19.h | 57 ++++++++++++++------------ benchmarks/lbm/src/DeviceD3Q19.h | 44 ++++++++++---------- benchmarks/lbm/src/RunCavityTwoPop.cpp | 10 ++--- 5 files changed, 64 insertions(+), 61 deletions(-) diff --git a/benchmarks/lbm/src/CellType.h b/benchmarks/lbm/src/CellType.h index 1ca70c6f..57204a45 100644 --- a/benchmarks/lbm/src/CellType.h +++ b/benchmarks/lbm/src/CellType.h @@ -32,11 +32,11 @@ struct CellType // Converting to int to exportVti operator int() const { return int(classification); } - template + template static auto isWall(const uint32_t& wallNghBitFlag) -> bool { - return wallNghBitFlag & (uint32_t(1) << fwdRegIdx); + return wallNghBitFlag & (uint32_t(1) << fwdRegQ); } auto setWall(int fwdRegIdx) diff --git a/benchmarks/lbm/src/ContainersD3Q19.h b/benchmarks/lbm/src/ContainersD3Q19.h index 806ed1fe..a27fc60b 100644 --- a/benchmarks/lbm/src/ContainersD3Q19.h +++ b/benchmarks/lbm/src/ContainersD3Q19.h @@ -223,10 +223,10 @@ struct ContainerFactory([&](auto fwdRegIdx) { using M = typename Lattice::template RegisterMapper; - if constexpr (M::centerMemIdx != M::fwdMemIdx) { - CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); + if constexpr 
(M::centerMemQ != M::fwdMemQ) { + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); if (nghCellType.classification != CellType::bulk) { - cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << M::fwdMemIdx)); + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << M::fwdMemQ)); } } }); @@ -262,7 +262,7 @@ struct ContainerFactory([&](auto q) { using M = typename Lattice::template RegisterMapper; - p(gidx, M::fwdMemIdx) = pValues[M::fwdRegIdx]; + p(gidx, M::fwdMemQ) = pValues[M::fwdRegQ]; }); }; }); @@ -417,7 +417,7 @@ struct ContainerFactory([&](auto q) { using M = typename Lattice::template RegisterMapper; - fOut(gidx, M::fwdMemIdx) = Lattice::Registers::template getT(); + fOut(gidx, M::fwdMemQ) = Lattice::Registers::template getT(); }); } }; diff --git a/benchmarks/lbm/src/D3Q19.h b/benchmarks/lbm/src/D3Q19.h index 8659fc91..46529f59 100644 --- a/benchmarks/lbm/src/D3Q19.h +++ b/benchmarks/lbm/src/D3Q19.h @@ -102,8 +102,11 @@ struct D3Q19 return stencil[q]; } - static constexpr int fwdRegIdxListLen = (Q - 1) / 2; - static constexpr std::array fwdRegIdxList{0, 1, 2, 3, 4, 5, 6, 7, 8}; + // Identifying first half of the directions + // For each direction in the list, the opposite is not present. 
+ // Center is also removed + static constexpr int firstHalfDirectionsLen = (Q - 1) / 2; + static constexpr std::array firstHalfDirectionsList{0, 1, 2, 3, 4, 5, 6, 7, 8}; template static inline NEON_CUDA_HOST_DEVICE auto @@ -204,38 +207,38 @@ struct D3Q19 // template // struct MemMapper // { -// constexpr static int fwdMemIdx = fwMemIdx_; -// constexpr static int fwdX = Memory::stencil[fwdMemIdx].x; -// constexpr static int fwdY = Memory::stencil[fwdMemIdx].y; -// constexpr static int fwdZ = Memory::stencil[fwdMemIdx].z; +// constexpr static int fwdMemQ = fwMemIdx_; +// constexpr static int fwdMemQX = Memory::stencil[fwdMemQ].x; +// constexpr static int fwdY = Memory::stencil[fwdMemQ].y; +// constexpr static int fwdZ = Memory::stencil[fwdMemQ].z; // -// constexpr static int bkwMemIdx = Memory::opposite[fwdMemIdx]; -// constexpr static int bkwX = Memory::stencil[bkwMemIdx].x; -// constexpr static int bkwY = Memory::stencil[bkwMemIdx].y; -// constexpr static int bkwZ = Memory::stencil[bkwMemIdx].z; +// constexpr static int bkwMemQ = Memory::opposite[fwdMemQ]; +// constexpr static int bkwX = Memory::stencil[bkwMemQ].x; +// constexpr static int bkwY = Memory::stencil[bkwMemQ].y; +// constexpr static int bkwZ = Memory::stencil[bkwMemQ].z; // -// constexpr static int fwdRegIdx = Memory::template mapToRegisters(); -// constexpr static int centerRegIdx = Registers::center; -// constexpr static int centerMemIdx = Memory::center; +// constexpr static int fwdRegQ = Memory::template mapToRegisters(); +// constexpr static int centerRegQ = Registers::center; +// constexpr static int centerMemQ = Memory::center; // }; template struct RegisterMapper { - constexpr static int fwdRegIdx = fwdRegIdx_; - constexpr static int bkwRegIdx = Registers::opposite[fwdRegIdx]; - constexpr static int fwdMemIdx = Memory::template mapToMemory(); - constexpr static int bkwMemIdx = Memory::template mapToMemory(); - constexpr static int centerRegIdx = Registers::center; - constexpr static int 
centerMemIdx = Memory::center; - - constexpr static int fwdX = Memory::stencil[fwdMemIdx].x; - constexpr static int fwdY = Memory::stencil[fwdMemIdx].y; - constexpr static int fwdZ = Memory::stencil[fwdMemIdx].z; - - constexpr static int bkwX = Memory::stencil[bkwMemIdx].x; - constexpr static int bkwY = Memory::stencil[bkwMemIdx].y; - constexpr static int bkwZ = Memory::stencil[bkwMemIdx].z; + constexpr static int fwdRegQ = fwdRegIdx_; + constexpr static int bkwRegQ = Registers::opposite[fwdRegQ]; + constexpr static int fwdMemQ = Memory::template mapToMemory(); + constexpr static int bkwMemQ = Memory::template mapToMemory(); + constexpr static int centerRegQ = Registers::center; + constexpr static int centerMemQ = Memory::center; + + constexpr static int fwdMemQX = Memory::stencil[fwdMemQ].x; + constexpr static int fwdMemQY = Memory::stencil[fwdMemQ].y; + constexpr static int fwdMemQZ= Memory::stencil[fwdMemQ].z; + + constexpr static int bkwMemQX = Memory::stencil[bkwMemQ].x; + constexpr static int bkwMemQY = Memory::stencil[bkwMemQ].y; + constexpr static int bkwMemQZ = Memory::stencil[bkwMemQ].z; }; public: diff --git a/benchmarks/lbm/src/DeviceD3Q19.h b/benchmarks/lbm/src/DeviceD3Q19.h index 9c20d1b0..2e772635 100644 --- a/benchmarks/lbm/src/DeviceD3Q19.h +++ b/benchmarks/lbm/src/DeviceD3Q19.h @@ -32,13 +32,13 @@ struct DeviceD3Q19 using M = typename Lattice::template MappersIdxSetWithFwdMem; if constexpr (fwMemIdx == Lattice::Memory::center) { - popIn[M::centerRegIdx] = fin(gidx, M::centerMemIdx); + popIn[M::centerRegIdx] = fin(gidx, M::centerMemQ); } else { if (CellType::isWall()) { - popIn[M::fwdRegIdx] = fin(gidx, M::bkMemIdx) + + popIn[M::fwdRegQ] = fin(gidx, M::bkMemIdx) + fin.template getNghData(gidx, M::bkMemIdx)(); } else { - popIn[M::fwdRegIdx] = fin.template getNghData(gidx, fwMemIdx)(); + popIn[M::fwdRegQ] = fin.template getNghData(gidx, fwMemIdx)(); } } }); @@ -75,19 +75,19 @@ struct DeviceD3Q19 Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { 
using M = typename Lattice::template RegisterMapper; - if constexpr (M::fwdMemIdx == M::centerMemIdx) { - fOut(gidx, M::fwdMemIdx) = pOut[M::fwdRegIdx]; + if constexpr (M::fwdMemQ == M::centerMemQ) { + fOut(gidx, M::fwdMemQ) = pOut[M::fwdRegQ]; } else { - if (CellType::isWall(wallNghBitFlag)) { + if (CellType::isWall(wallNghBitFlag)) { // fout(i, opp[k]) = // pop_out + // f(nb, k); - fOut(gidx, M::bkwMemIdx) = - pOut[M::fwdRegIdx] + - fOut.template getNghData(gidx, M::fwdMemIdx)(); + fOut(gidx, M::bkwMemQ) = + pOut[M::fwdRegQ] + + fOut.template getNghData(gidx, M::fwdMemQ)(); } else { // fout(nb, k) = pop_out; - fOut.template writeNghData(gidx, M::fwdMemIdx, pOut[M::fwdRegIdx]); + fOut.template writeNghData(gidx, M::fwdMemQ, pOut[M::fwdRegQ]); } } }); @@ -160,21 +160,21 @@ struct DeviceD3Q19 // constexpr int regCenter = Lattice::Registers::center; // constexpr int regFir = Lattice::Registers::center; - Neon::ConstexprFor<0, Lattice::Registers::fwdRegIdxListLen, 1>( - [&](auto fwdRegIdxListIdx) { - using M = typename Lattice::template RegisterMapper; + Neon::ConstexprFor<0, Lattice::Registers::firstHalfDirectionsLen, 1>( + [&](auto q) { + using M = typename Lattice::template RegisterMapper; using T = typename Lattice::Registers; Compute eqFw; Compute eqBk; - const Compute ck_u = T::template getCk_u(u); + const Compute ck_u = T::template getCk_u(u); // double eq = rho * t[k] * // (1. + // 3. * ck_u + // 4.5 * ck_u * ck_u - // usqr); - eqFw = rho * T::t[M::fwdRegIdx] * + eqFw = rho * T::t[M::fwdRegQ] * (c1 + c3 * ck_u + c4dot5 * ck_u * ck_u - @@ -182,20 +182,20 @@ struct DeviceD3Q19 // double eqopp = eq - 6.* rho * t[k] * ck_u; eqBk = eqFw - - c6 * rho * c1over36 * T::t[M::fwdRegIdx] * ck_u; + c6 * rho * c1over36 * T::t[M::fwdRegQ] * ck_u; // pop_out = (1. 
- omega) * fin(i, k) + omega * eq; - pop[M::fwdRegIdx] = (c1 - omega) * static_cast(pop[M::fwdRegIdx]) + omega * eqFw; + pop[M::fwdRegQ] = (c1 - omega) * static_cast(pop[M::fwdRegQ]) + omega * eqFw; // pop_out_opp = (1. - omega) * fin(i, opp[k]) + omega * eqopp; - pop[M::bkwRegIdx] = (c1 - omega) * static_cast(pop[M::bkwRegIdx]) + omega * eqBk; + pop[M::bkwRegQ] = (c1 - omega) * static_cast(pop[M::bkwRegQ]) + omega * eqBk; }); { // Center; using T = typename Lattice::Registers; using M = typename Lattice::template RegisterMapper; // eq = rho * t[k] * (1. - usqr); - const Compute eqCenter = rho * T::t[M::fwdRegIdx] * (c1 - usqr); + const Compute eqCenter = rho * T::t[M::fwdRegQ] * (c1 - usqr); // fout(i, k) = (1. - omega) * fin(i, k) + omega * eq; - pop[Lattice::Registers::center] = (c1 - omega) * static_cast(pop[M::fwdRegIdx]) + omega * eqCenter; + pop[Lattice::Registers::center] = (c1 - omega) * static_cast(pop[M::fwdRegQ]) + omega * eqCenter; } } static inline NEON_CUDA_HOST_DEVICE auto @@ -205,7 +205,7 @@ struct DeviceD3Q19 { Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { using M = typename Lattice::template RegisterMapper; - pOut[M::fwdRegIdx] = fOut(gidx, M::fwdMemIdx); + pOut[M::fwdRegQ] = fOut(gidx, M::fwdMemQ); }); } @@ -216,7 +216,7 @@ struct DeviceD3Q19 { Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { using M = typename Lattice::template RegisterMapper; - fOut(gidx, M::fwdMemIdx) = pOut[M::fwdRegIdx]; + fOut(gidx, M::fwdMemQ) = pOut[M::fwdRegQ]; }); } }; diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cpp b/benchmarks/lbm/src/RunCavityTwoPop.cpp index 9eb22e53..e5245473 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cpp +++ b/benchmarks/lbm/src/RunCavityTwoPop.cpp @@ -60,10 +60,10 @@ auto run(Config& config, Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { using M = typename Lattice::template RegisterMapper; if (globalIdx.y == domainDim.y - 1) { - popVal = -6. 
* Lattice::Registers::template getT() * ulb * - (Lattice::Registers::template getDirection().v[0] * ulid.v[0] + - Lattice::Registers::template getDirection().v[1] * ulid.v[1] + - Lattice::Registers::template getDirection().v[2] * ulid.v[2]); + popVal = -6. * Lattice::Registers::template getT() * ulb * + (Lattice::Registers::template getDirection().v[0] * ulid.v[0] + + Lattice::Registers::template getDirection().v[1] * ulid.v[1] + + Lattice::Registers::template getDirection().v[2] * ulid.v[2]); } else { popVal = 0; } @@ -73,7 +73,7 @@ auto run(Config& config, cellClass = CellType::bulk; Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { using M = typename Lattice::template RegisterMapper; - p[q] = Lattice::Registers::template getT(); + p[q] = Lattice::Registers::template getT(); }); } }); From d7da72b9be3eefcb47af88f1958b59ece2cacd78 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 29 Aug 2023 15:06:29 +0200 Subject: [PATCH 49/94] WIP --- benchmarks/lbm/src/ContainersD3Q19.h | 5 +- benchmarks/lbm/src/D3Q19.h | 114 +++++++++++++-------------- benchmarks/lbm/src/DeviceD3Q19.h | 41 ++++++---- benchmarks/lbm/src/Lbm.h | 3 + 4 files changed, 87 insertions(+), 76 deletions(-) diff --git a/benchmarks/lbm/src/ContainersD3Q19.h b/benchmarks/lbm/src/ContainersD3Q19.h index a27fc60b..5c789ea5 100644 --- a/benchmarks/lbm/src/ContainersD3Q19.h +++ b/benchmarks/lbm/src/ContainersD3Q19.h @@ -225,7 +225,8 @@ struct ContainerFactory; if constexpr (M::centerMemQ != M::fwdMemQ) { CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); - if (nghCellType.classification != CellType::bulk) { + if (nghCellType.classification == CellType::bounceBack || + nghCellType.classification == CellType::movingWall) { cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << M::fwdMemQ)); } } @@ -417,7 +418,7 @@ struct ContainerFactory([&](auto q) { using M = typename Lattice::template RegisterMapper; - fOut(gidx, M::fwdMemQ) = 
Lattice::Registers::template getT(); + fOut(gidx, M::fwdMemQ) = Lattice::Registers::template getT(); }); } }; diff --git a/benchmarks/lbm/src/D3Q19.h b/benchmarks/lbm/src/D3Q19.h index 46529f59..a5c0e877 100644 --- a/benchmarks/lbm/src/D3Q19.h +++ b/benchmarks/lbm/src/D3Q19.h @@ -31,25 +31,25 @@ struct D3Q19 { using Self = D3Q19::Registers; static constexpr std::array stencil{ - Neon::index_3d(-1, 0, 0), - Neon::index_3d(0, -1, 0), - Neon::index_3d(0, 0, -1), - Neon::index_3d(-1, -1, 0), - Neon::index_3d(-1, 1, 0), - Neon::index_3d(-1, 0, -1), - Neon::index_3d(-1, 0, 1), - Neon::index_3d(0, -1, -1), - Neon::index_3d(0, -1, 1), - Neon::index_3d(0, 0, 0), - Neon::index_3d(1, 0, 0), - Neon::index_3d(0, 1, 0), - Neon::index_3d(0, 0, 1), - Neon::index_3d(1, 1, 0), - Neon::index_3d(1, -1, 0), - Neon::index_3d(1, 0, 1), - Neon::index_3d(1, 0, -1), - Neon::index_3d(0, 1, 1), - Neon::index_3d(0, 1, -1)}; + /*! 0 */ Neon::index_3d(-1, 0, 0), + /*! 1 */ Neon::index_3d(0, -1, 0), + /*! 2 */ Neon::index_3d(0, 0, -1), + /*! 3 */ Neon::index_3d(-1, -1, 0), + /*! 4 */ Neon::index_3d(-1, 1, 0), + /*! 5 */ Neon::index_3d(-1, 0, -1), + /*! 6 */ Neon::index_3d(-1, 0, 1), + /*! 7 */ Neon::index_3d(0, -1, -1), + /*! 8 */ Neon::index_3d(0, -1, 1), + /*! 9 */ Neon::index_3d(0, 0, 0), + /*! 10 */ Neon::index_3d(1, 0, 0), + /*! 11 */ Neon::index_3d(0, 1, 0), + /*! 12 */ Neon::index_3d(0, 0, 1), + /*! 13 */ Neon::index_3d(1, 1, 0), + /*! 14 */ Neon::index_3d(1, -1, 0), + /*! 15 */ Neon::index_3d(1, 0, 1), + /*! 16 */ Neon::index_3d(1, 0, -1), + /*! 17 */ Neon::index_3d(0, 1, 1), + /*! 18 */ Neon::index_3d(0, 1, -1)}; static constexpr int center = 9; /** Position of direction {0,0,0} */ @@ -105,41 +105,41 @@ struct D3Q19 // Identifying first half of the directions // For each direction in the list, the opposite is not present. 
// Center is also removed - static constexpr int firstHalfDirectionsLen = (Q - 1) / 2; - static constexpr std::array firstHalfDirectionsList{0, 1, 2, 3, 4, 5, 6, 7, 8}; + static constexpr int firstHalfQLen = (Q - 1) / 2; + static constexpr std::array firstHalfQList{0, 1, 2, 3, 4, 5, 6, 7, 8}; template static inline NEON_CUDA_HOST_DEVICE auto getCk_u(std::array const& u) -> Compute { - if constexpr (tegIdx == 0 || tegIdx == 9) { - return u[0]; + if constexpr (tegIdx == 0 || tegIdx == 10) { + return -u[0]; } - if constexpr (tegIdx == 1 || tegIdx == 10) { - return u[1]; + if constexpr (tegIdx == 1 || tegIdx == 11) { + return -u[1]; } - if constexpr (tegIdx == 2 || tegIdx == 11) { - return u[2]; + if constexpr (tegIdx == 2 || tegIdx == 12) { + return -u[2]; } - if constexpr (tegIdx == 3 || tegIdx == 12) { - return u[0] + u[1]; + if constexpr (tegIdx == 3 || tegIdx == 13) { + return -u[0] - u[1]; } - if constexpr (tegIdx == 4 || tegIdx == 13) { - return u[0] - u[1]; + if constexpr (tegIdx == 4 || tegIdx == 14) { + return -u[0] + u[1]; } - if constexpr (tegIdx == 5 || tegIdx == 14) { - return u[0] + u[2]; + if constexpr (tegIdx == 5 || tegIdx == 15) { + return -u[0] - u[2]; } - if constexpr (tegIdx == 6 || tegIdx == 15) { + if constexpr (tegIdx == 6 || tegIdx == 16) { - return u[0] - u[2]; + return -u[0] + u[2]; } - if constexpr (tegIdx == 7 || tegIdx == 16) { + if constexpr (tegIdx == 7 || tegIdx == 17) { - return u[1] + u[2]; + return -u[1] - u[2]; } - if constexpr (tegIdx == 8 || tegIdx == 17) { - return u[1] - u[2]; + if constexpr (tegIdx == 8 || tegIdx == 18) { + return -u[1] + u[2]; } } }; @@ -204,23 +204,23 @@ struct D3Q19 10, 11, 12, 13, 14, 15, 16, 17, 18, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8}; }; -// template -// struct MemMapper -// { -// constexpr static int fwdMemQ = fwMemIdx_; -// constexpr static int fwdMemQX = Memory::stencil[fwdMemQ].x; -// constexpr static int fwdY = Memory::stencil[fwdMemQ].y; -// constexpr static int fwdZ = Memory::stencil[fwdMemQ].z; -// -// 
constexpr static int bkwMemQ = Memory::opposite[fwdMemQ]; -// constexpr static int bkwX = Memory::stencil[bkwMemQ].x; -// constexpr static int bkwY = Memory::stencil[bkwMemQ].y; -// constexpr static int bkwZ = Memory::stencil[bkwMemQ].z; -// -// constexpr static int fwdRegQ = Memory::template mapToRegisters(); -// constexpr static int centerRegQ = Registers::center; -// constexpr static int centerMemQ = Memory::center; -// }; + // template + // struct MemMapper + // { + // constexpr static int fwdMemQ = fwMemIdx_; + // constexpr static int fwdMemQX = Memory::stencil[fwdMemQ].x; + // constexpr static int fwdY = Memory::stencil[fwdMemQ].y; + // constexpr static int fwdZ = Memory::stencil[fwdMemQ].z; + // + // constexpr static int bkwMemQ = Memory::opposite[fwdMemQ]; + // constexpr static int bkwX = Memory::stencil[bkwMemQ].x; + // constexpr static int bkwY = Memory::stencil[bkwMemQ].y; + // constexpr static int bkwZ = Memory::stencil[bkwMemQ].z; + // + // constexpr static int fwdRegQ = Memory::template mapToRegisters(); + // constexpr static int centerRegQ = Registers::center; + // constexpr static int centerMemQ = Memory::center; + // }; template struct RegisterMapper @@ -234,7 +234,7 @@ struct D3Q19 constexpr static int fwdMemQX = Memory::stencil[fwdMemQ].x; constexpr static int fwdMemQY = Memory::stencil[fwdMemQ].y; - constexpr static int fwdMemQZ= Memory::stencil[fwdMemQ].z; + constexpr static int fwdMemQZ = Memory::stencil[fwdMemQ].z; constexpr static int bkwMemQX = Memory::stencil[bkwMemQ].x; constexpr static int bkwMemQY = Memory::stencil[bkwMemQ].y; diff --git a/benchmarks/lbm/src/DeviceD3Q19.h b/benchmarks/lbm/src/DeviceD3Q19.h index 2e772635..1eb6e2e3 100644 --- a/benchmarks/lbm/src/DeviceD3Q19.h +++ b/benchmarks/lbm/src/DeviceD3Q19.h @@ -36,7 +36,7 @@ struct DeviceD3Q19 } else { if (CellType::isWall()) { popIn[M::fwdRegQ] = fin(gidx, M::bkMemIdx) + - fin.template getNghData(gidx, M::bkMemIdx)(); + fin.template getNghData(gidx, M::bkMemIdx)(); } else { 
popIn[M::fwdRegQ] = fin.template getNghData(gidx, fwMemIdx)(); } @@ -76,18 +76,25 @@ struct DeviceD3Q19 using M = typename Lattice::template RegisterMapper; if constexpr (M::fwdMemQ == M::centerMemQ) { - fOut(gidx, M::fwdMemQ) = pOut[M::fwdRegQ]; + // fOut(gidx, M::centerMemQ) = pOut[M::centerRegQ]; } else { if (CellType::isWall(wallNghBitFlag)) { + const auto pop_out = pOut[M::fwdRegQ]; + const auto f_nb_k = fOut.template getNghData(gidx, M::fwdMemQ)(); + // fout(i, opp[k]) = - // pop_out + - // f(nb, k); fOut(gidx, M::bkwMemQ) = - pOut[M::fwdRegQ] + - fOut.template getNghData(gidx, M::fwdMemQ)(); + // pop_out + + pop_out + + // f(nb, k); + f_nb_k; } else { - // fout(nb, k) = pop_out; - fOut.template writeNghData(gidx, M::fwdMemQ, pOut[M::fwdRegQ]); + // fout(nb, + fOut.template writeNghData(gidx, + // k) + M::fwdMemQ, + // = pop_out; + pOut[M::fwdRegQ]); } } }); @@ -151,7 +158,6 @@ struct DeviceD3Q19 { // constexpr Compute c1over18 = 1. / 18.; - constexpr Compute c1over36 = 1. / 36.; constexpr Compute c4dot5 = 4.5; constexpr Compute c3 = 3.; constexpr Compute c1 = 1.; @@ -160,7 +166,7 @@ struct DeviceD3Q19 // constexpr int regCenter = Lattice::Registers::center; // constexpr int regFir = Lattice::Registers::center; - Neon::ConstexprFor<0, Lattice::Registers::firstHalfDirectionsLen, 1>( + Neon::ConstexprFor<0, Lattice::Registers::firstHalfQLen, 1>( [&](auto q) { using M = typename Lattice::template RegisterMapper; using T = typename Lattice::Registers; @@ -169,6 +175,7 @@ struct DeviceD3Q19 Compute eqBk; const Compute ck_u = T::template getCk_u(u); + // double eq = rho * t[k] * // (1. + // 3. * ck_u + @@ -182,20 +189,20 @@ struct DeviceD3Q19 // double eqopp = eq - 6.* rho * t[k] * ck_u; eqBk = eqFw - - c6 * rho * c1over36 * T::t[M::fwdRegQ] * ck_u; + c6 * rho * T::t[M::fwdRegQ] * ck_u; - // pop_out = (1. - omega) * fin(i, k) + omega * eq; + // pop_out = (1. 
- omega) * fin(i, k) + omega * eq; pop[M::fwdRegQ] = (c1 - omega) * static_cast(pop[M::fwdRegQ]) + omega * eqFw; - // pop_out_opp = (1. - omega) * fin(i, opp[k]) + omega * eqopp; + // pop_out_opp = (1. - omega) * fin(i, opp[k]) + omega * eqopp; pop[M::bkwRegQ] = (c1 - omega) * static_cast(pop[M::bkwRegQ]) + omega * eqBk; }); { // Center; using T = typename Lattice::Registers; using M = typename Lattice::template RegisterMapper; - // eq = rho * t[k] * (1. - usqr); - const Compute eqCenter = rho * T::t[M::fwdRegQ] * (c1 - usqr); - // fout(i, k) = (1. - omega) * fin(i, k) + omega * eq; - pop[Lattice::Registers::center] = (c1 - omega) * static_cast(pop[M::fwdRegQ]) + omega * eqCenter; + // eq = rho * t[k] * (1. - usqr); + const Compute eqCenter = rho * T::t[M::centerRegQ] * (c1 - usqr); + // fout(i, k) = (1. - omega) * fin(i, k) + omega * eq; + pop[M::centerRegQ] = (c1 - omega) * static_cast(pop[M::centerRegQ]) + omega * eqCenter; } } static inline NEON_CUDA_HOST_DEVICE auto diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h index 886c1c6a..670010fb 100644 --- a/benchmarks/lbm/src/Lbm.h +++ b/benchmarks/lbm/src/Lbm.h @@ -36,6 +36,7 @@ struct Lbm Report& report, Lambda activeMask) { + configurations = config; reportPtr = &report; // Setting the backend @@ -242,12 +243,14 @@ struct Lbm computeRhoAndU.run(Neon::Backend::mainStreamIdx); u.updateHostData(Neon::Backend::mainStreamIdx); rho.updateHostData(Neon::Backend::mainStreamIdx); + //pop.updateHostData(Neon::Backend::mainStreamIdx); grid.getBackend().sync(Neon::Backend::mainStreamIdx); size_t numDigits = 5; std::string iterIdStr = std::to_string(iteration); iterIdStr = std::string(numDigits - std::min(numDigits, iterIdStr.length()), '0') + iterIdStr; + //pop.ioToVtk("pop_" + iterIdStr, "pop", false); u.ioToVtk("u_" + iterIdStr, "u", false); rho.ioToVtk("rho_" + iterIdStr, "rho", false); cellFlagField.template ioToVtk("cellFlagField_" + iterIdStr, "flag", false); From 
852eaf5d9fdd42e77e2d17d8eb6a789ba95ca3ef Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 29 Aug 2023 20:12:48 +0200 Subject: [PATCH 50/94] WIP --- benchmarks/lbm/src/DeviceD3Q19.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/benchmarks/lbm/src/DeviceD3Q19.h b/benchmarks/lbm/src/DeviceD3Q19.h index 1eb6e2e3..60a4033d 100644 --- a/benchmarks/lbm/src/DeviceD3Q19.h +++ b/benchmarks/lbm/src/DeviceD3Q19.h @@ -28,24 +28,23 @@ struct DeviceD3Q19 typename PopField::Partition const& fin, NEON_OUT Storage popIn[Lattice::Q]) { - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto fwMemIdx) { - using M = typename Lattice::template MappersIdxSetWithFwdMem; + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; - if constexpr (fwMemIdx == Lattice::Memory::center) { - popIn[M::centerRegIdx] = fin(gidx, M::centerMemQ); + if constexpr (M::fwdMemQ == M::centerMemQ) { + popIn[M::centerRegQ] = fin(gidx, M::centerMemQ); } else { if (CellType::isWall()) { popIn[M::fwdRegQ] = fin(gidx, M::bkMemIdx) + - fin.template getNghData(gidx, M::bkMemIdx)(); + fin.template getNghData(gidx, M::bkwMemIdx)(); } else { - popIn[M::fwdRegQ] = fin.template getNghData(gidx, fwMemIdx)(); + popIn[M::fwdRegQ] = fin.template getNghData(gidx, M::fwdMemIdx)(); } } }); } }; -#undef CAST_TO_COMPUTE } // namespace pull namespace push { @@ -76,7 +75,7 @@ struct DeviceD3Q19 using M = typename Lattice::template RegisterMapper; if constexpr (M::fwdMemQ == M::centerMemQ) { - // fOut(gidx, M::centerMemQ) = pOut[M::centerRegQ]; + fOut(gidx, M::centerMemQ) = pOut[M::centerRegQ]; } else { if (CellType::isWall(wallNghBitFlag)) { const auto pop_out = pOut[M::fwdRegQ]; From 389d8e1d0326f746a9bbec8892a962002d88649d Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 30 Aug 2023 11:37:15 +0200 Subject: [PATCH 51/94] Parametric Refactoring --- benchmarks/lbm/src/ContainersD3QXX.h | 378 +++++++++++++++++++++++++++ 
benchmarks/lbm/src/D3Q19.h | 6 + benchmarks/lbm/src/DeviceD3QXX.h | 231 ++++++++++++++++ benchmarks/lbm/src/Lbm.h | 18 +- 4 files changed, 625 insertions(+), 8 deletions(-) create mode 100644 benchmarks/lbm/src/ContainersD3QXX.h create mode 100644 benchmarks/lbm/src/DeviceD3QXX.h diff --git a/benchmarks/lbm/src/ContainersD3QXX.h b/benchmarks/lbm/src/ContainersD3QXX.h new file mode 100644 index 00000000..1655c6a5 --- /dev/null +++ b/benchmarks/lbm/src/ContainersD3QXX.h @@ -0,0 +1,378 @@ +#pragma once + +#include "./Methods.h" +#include "CellType.h" +#include "D3Q19.h" +#include "DeviceD3Q19.h" +#include "DeviceD3QXX.h" +#include "Methods.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + +/** + * Specialization for D3Q19 + */ +template +struct ContainerFactoryD3QXX +{ + using Lattice = Lattice_; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + using PullFunctions = pull::DeviceD3Q19; + using CommonFunctions = common::DeviceD3Q19; + using Device = DeviceD3QXX; +struct Pull{ + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! 
Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "D3Q19_TwoPop_Pull", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popIn[Lattice::Q]; + PullFunctions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + + Compute rho; + std::array u{.0, .0, .0}; + CommonFunctions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + PullFunctions::collideBgkUnrolled(gidx, + popIn, + rho, u, + usqr, omega, + NEON_OUT fOut); + } + }; + }); + return container; + } +}; +struct Push { + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! 
Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "D3Q19_TwoPop", + [=](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popIn[Lattice::Q]; + CommonFunctions::localLoad(gidx, fIn, NEON_OUT popIn); + + Compute rho; + std::array u{.0, .0, .0}; + Device::Common::macroscopic(popIn, + NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + Device::Common::collideBgkUnrolled(rho, u, + usqr, omega, + NEON_IO popIn); + + Device::Push::pushStream(gidx, cellInfo.wallNghBitflag, popIn, NEON_OUT fOut); + } + }; + }); + return container; + } +}; +struct Common { + + template + static auto + iteration([[maybe_unused]] Neon::set::StencilSemantic stencilSemantic, + [[maybe_unused]] const PopField fInField /*! Input population field */, + [[maybe_unused]] const CellTypeField& cellTypeField /*! Cell type field */, + [[maybe_unused]] const Compute omega /*! LBM omega parameter */, + [[maybe_unused]] PopField fOutField /*! 
Output Population field */) + -> Neon::set::Container + { + if constexpr (method_ == lbm::Method::push) { + using Factory = push::ContainerFactory, + Grid_>; + return Factory::iteration(stencilSemantic, + fInField, + cellTypeField, + omega, + fOutField); + } + if constexpr (method_ == lbm::Method::pull) { + using Factory = pull::ContainerFactory, + Grid_>; + return Factory::iteration(stencilSemantic, + fInField, + cellTypeField, + omega, + fOutField); + } + NEON_DEV_UNDER_CONSTRUCTION(""); + } + + + static auto + computeWallNghMask(const CellTypeField& infoInField, + CellTypeField& infoOutpeField) + + -> Neon::set::Container + { + Neon::set::Container container = infoInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& infoIn = L.load(infoInField, Neon::Pattern::STENCIL); + auto& infoOut = L.load(infoOutpeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellType = infoIn(gidx, 0); + cellType.wallNghBitflag = 0; + + if (cellType.classification == CellType::bulk) { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto fwdRegIdx) { + using M = typename Lattice::template RegisterMapper; + if constexpr (M::centerMemQ != M::fwdMemQ) { + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); + if (nghCellType.classification == CellType::bounceBack || + nghCellType.classification == CellType::movingWall) { + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << M::fwdMemQ)); + } + } + }); + infoOut(gidx, 0) = cellType; + } + }; + }); + return container; + } + + + template + static auto + userSettingBc(UserLambda userLambda, + PopField& pField, + CellTypeField& cellTypeField /*! 
Cell type field */) + -> Neon::set::Container + { + Neon::set::Container container = pField.getGrid().newContainer( + "UserSettingBc", + [&](Neon::set::Loader& L) -> auto { + auto& p = L.load(pField, Neon::Pattern::MAP); + auto& flag = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + const auto globalIdx = p.getGlobalIndex(gidx); + Storage pValues[Lattice::Q]; + CellType::Classification cellClass; + userLambda(globalIdx, pValues, cellClass); + + CellType flagVal(cellClass); + flag(gidx, 0) = flagVal; + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + p(gidx, M::fwdMemQ) = pValues[M::fwdRegQ]; + }); + }; + }); + return container; + } + + static auto + copyPopulation(PopField& fInField, + PopField& foutField) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto const& pIn = L.load(fInField, Neon::Pattern::MAP); + auto& pOut = L.load(foutField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + pOut(gidx, q) = pIn(gidx, q); + }); + }; + }); + return container; + } + + static auto + computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! 
output Population field */) + + -> Neon::set::Container + { + Neon::set::Container container = + fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + Compute rho = 0; + std::array u{.0, .0, .0}; + + Storage popIn[Lattice::Q]; + CommonFunctions::localLoad(gidx, fIn, NEON_OUT popIn); + + if (cellInfo.classification == CellType::bulk) { + CommonFunctions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + } else { + if (cellInfo.classification == CellType::movingWall) { + rho = 1.0; + u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), + static_cast(popIn[1]) / static_cast(6. * 1. / 18.), + static_cast(popIn[2]) / static_cast(6. * 1. / 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } + + static auto + problemSetup(PopField& fInField /*! 
inpout population field */, + PopField& fOutField, + CellTypeField& cellTypeField, + Neon::double_3d ulid, + double ulb) + + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&, ulid, ulb](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, Neon::Pattern::MAP); + auto& fOut = L.load(fOutField, Neon::Pattern::MAP); + auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + const auto globalIdx = fIn.getGlobalIndex(gidx); + const auto domainDim = fIn.getDomainSize(); + + CellType flagVal; + flagVal.classification = CellType::bulk; + flagVal.wallNghBitflag = 0; + + typename Lattice::Precision::Storage popVal = 0; + + if (globalIdx.x == 0 || globalIdx.x == domainDim.x - 1 || + globalIdx.y == 0 || globalIdx.y == domainDim.y - 1 || + globalIdx.z == 0 || globalIdx.z == domainDim.z - 1) { + flagVal.classification = CellType::bounceBack; + + if (globalIdx.y == domainDim.y - 1) { + flagVal.classification = CellType::movingWall; + } + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + if (globalIdx.y == domainDim.y - 1) { + popVal = -6. 
* Lattice::Memory::template getT() * ulb * + (Lattice::Memory::template getDirection().v[0] * ulid.v[0] + + Lattice::Memory::template getDirection().v[1] * ulid.v[1] + + Lattice::Memory::template getDirection().v[2] * ulid.v[2]); + } else { + popVal = 0; + } + fIn(gidx, q) = popVal; + fOut(gidx, q) = popVal; + }); + } else { + flagVal.classification = CellType::bulk; + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + fIn(gidx, q) = Lattice::Memory::template getT(); + fOut(gidx, q) = Lattice::Memory::template getT(); + }); + } + cellInfoPartition(gidx, 0) = flagVal; + }; + }); + return container; + } + + static auto + setToEquilibrium(PopField& fOutField, + CellTypeField& cellTypeField) + -> Neon::set::Container + { + Neon::set::Container container = fOutField.getGrid().newContainer( + "LBM_setToEquilibrium", + [&](Neon::set::Loader& L) -> auto { + auto& fOut = L.load(fOutField, Neon::Pattern::MAP); + auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + { // All pints are pre-set to bulk + CellType flagVal; + flagVal.classification = CellType::bulk; + cellInfoPartition(gidx, 0) = flagVal; + } + + { // All cells are pre-set to Equilibrium + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + fOut(gidx, M::fwdMemQ) = Lattice::Registers::template getT(); + }); + } + }; + }); + return container; + } +}; +}; \ No newline at end of file diff --git a/benchmarks/lbm/src/D3Q19.h b/benchmarks/lbm/src/D3Q19.h index a5c0e877..816e46f6 100644 --- a/benchmarks/lbm/src/D3Q19.h +++ b/benchmarks/lbm/src/D3Q19.h @@ -51,6 +51,12 @@ struct D3Q19 /*! 17 */ Neon::index_3d(0, 1, 1), /*! 
18 */ Neon::index_3d(0, 1, -1)}; + template + static inline NEON_CUDA_HOST_DEVICE auto + getComponentOfDirection() -> int{ + return stencil[qIdx].v[cIdx]; + } + static constexpr int center = 9; /** Position of direction {0,0,0} */ template diff --git a/benchmarks/lbm/src/DeviceD3QXX.h b/benchmarks/lbm/src/DeviceD3QXX.h new file mode 100644 index 00000000..c37927f9 --- /dev/null +++ b/benchmarks/lbm/src/DeviceD3QXX.h @@ -0,0 +1,231 @@ +#pragma once +#include "CellType.h" +#include "D3Q19.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + +template +struct DeviceD3QXX +{ + using Lattice = Lattice_; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + struct Pull + { + static inline NEON_CUDA_HOST_DEVICE auto + pullStream(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopField::Partition const& fin, + NEON_OUT Storage popIn[Lattice::Q]) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + + if constexpr (M::fwdMemQ == M::centerMemQ) { + popIn[M::centerRegQ] = fin(gidx, M::centerMemQ); + } else { + if (CellType::isWall()) { + popIn[M::fwdRegQ] = fin(gidx, M::bkMemIdx) + + fin.template getNghData(gidx, M::bkwMemIdx)(); + } else { + popIn[M::fwdRegQ] = fin.template getNghData(gidx, M::fwdMemIdx)(); + } + } + }); + } + }; + + struct Push + { + static inline NEON_CUDA_HOST_DEVICE auto + pushStream(Idx const& gidx, + const uint32_t& wallNghBitFlag, + NEON_OUT Storage pOut[Lattice::Q], + NEON_OUT typename PopField::Partition& fOut) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + + if constexpr 
(M::fwdMemQ == M::centerMemQ) { + fOut(gidx, M::centerMemQ) = pOut[M::centerRegQ]; + } else { + if (CellType::isWall(wallNghBitFlag)) { + const auto pop_out = pOut[M::fwdRegQ]; + const auto f_nb_k = fOut.template getNghData(gidx, M::fwdMemQ)(); + + // fout(i, opp[k]) = + fOut(gidx, M::bkwMemQ) = + // pop_out + + pop_out + + // f(nb, k); + f_nb_k; + } else { + // fout(nb, + fOut.template writeNghData(gidx, + // k) + M::fwdMemQ, + // = pop_out; + pOut[M::fwdRegQ]); + } + } + }); + } + }; + + + struct Common + { + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + + static inline NEON_CUDA_HOST_DEVICE auto + macroscopic(const Storage pop[Lattice::Q], + NEON_OUT Compute& rho, + NEON_OUT std::array& u) + -> void + { + if constexpr (Lattice::Q == 19) { +#define POP(IDX) static_cast(pop[IDX]) + const Compute X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6); + const Compute X_P1 = POP(10) + POP(13) + POP(14) + POP(15) + POP(16); + const Compute X_0 = POP(9) + POP(1) + POP(2) + POP(7) + POP(8) + POP(11) + POP(12) + POP(17) + POP(18); + + const Compute Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(14); + const Compute Y_P1 = POP(4) + POP(11) + POP(13) + POP(17) + POP(18); + + const Compute Z_M1 = POP(2) + POP(5) + POP(7) + POP(16) + POP(18); + const Compute Z_P1 = POP(6) + POP(8) + POP(12) + POP(15) + POP(17); +#undef POP + + rho = X_M1 + X_P1 + X_0; + u[0] = (X_P1 - X_M1) / rho; + u[1] = (Y_P1 - Y_M1) / rho; + u[2] = (Z_P1 - Z_M1) / rho; + return; + } + if constexpr (Lattice::Q == 27) { +#define POP(IDX) static_cast(pop[IDX]) + const Compute X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6) + POP(9) + POP(10) + POP(11) 
+ POP(12); + const Compute X_P1 = POP(14) + POP(17) + POP(18) + POP(19) + POP(20) + POP(23) + POP(24) + POP(25) + POP(26); + const Compute X_0 = POP(1) + POP(2) + POP(7) + POP(8) + POP(13) + POP(15) + POP(16) + POP(21) + POP(22); + + const Compute Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(9) + POP(10) + POP(18) + POP(25) + POP(26); + const Compute Y_P1 = POP(15) + POP(17) + POP(21) + POP(22) + POP(23) + POP(24) + POP(4) + POP(11) + POP(12); + + const Compute Z_M1 = POP(2) + POP(5) + POP(7) + POP(9) + POP(11) + POP(20) + POP(22) + POP(24) + POP(26); + const Compute Z_P1 = POP(16) + POP(19) + POP(21) + POP(23) + POP(25) + POP(6) + POP(8) + POP(10) + POP(12); +#undef POP + rho = X_M1 + X_P1 + X_0; + u[0] = (X_P1 - X_M1) / rho; + u[1] = (Y_P1 - Y_M1) / rho; + u[2] = (Z_P1 - Z_M1) / rho; + return; + } + printf("Error: macroscopic function does not support the selected lattice.\n"); + } + + static inline NEON_CUDA_HOST_DEVICE auto + collideBgkUnrolled(Compute const& rho /*! Density */, + std::array const& u /*! Velocity */, + Compute const& usqr /*! Usqr */, + Compute const& omega /*! Omega */, + NEON_IO Storage pop[Lattice::Q]) + + -> void + { + + // constexpr Compute c1over18 = 1. / 18.; + constexpr Compute c4dot5 = 4.5; + constexpr Compute c3 = 3.; + constexpr Compute c1 = 1.; + constexpr Compute c6 = 6.; + + // constexpr int regCenter = Lattice::Registers::center; + // constexpr int regFir = Lattice::Registers::center; + + Neon::ConstexprFor<0, Lattice::Registers::firstHalfQLen, 1>( + [&](auto q) { + using M = typename Lattice::template RegisterMapper; + using T = typename Lattice::Registers; + + Compute eqFw; + Compute eqBk; + + const Compute ck_u = u[0] * Lattice::Registers::template getComponentOfDirection() + + u[1] * Lattice::Registers::template getComponentOfDirection() + + u[2] * Lattice::Registers::template getComponentOfDirection(); + + // double eq = rho * t[k] * + // (1. + + // 3. 
* ck_u + + // 4.5 * ck_u * ck_u - + // usqr); + eqFw = rho * T::t[M::fwdRegQ] * + (c1 + + c3 * ck_u + + c4dot5 * ck_u * ck_u - + usqr); + + // double eqopp = eq - 6.* rho * t[k] * ck_u; + eqBk = eqFw - + c6 * rho * T::t[M::fwdRegQ] * ck_u; + + // pop_out = (1. - omega) * fin(i, k) + omega * eq; + pop[M::fwdRegQ] = (c1 - omega) * static_cast(pop[M::fwdRegQ]) + omega * eqFw; + // pop_out_opp = (1. - omega) * fin(i, opp[k]) + omega * eqopp; + pop[M::bkwRegQ] = (c1 - omega) * static_cast(pop[M::bkwRegQ]) + omega * eqBk; + }); + { // Center; + using T = typename Lattice::Registers; + using M = typename Lattice::template RegisterMapper; + // eq = rho * t[k] * (1. - usqr); + const Compute eqCenter = rho * T::t[M::centerRegQ] * (c1 - usqr); + // fout(i, k) = (1. - omega) * fin(i, k) + omega * eq; + pop[M::centerRegQ] = (c1 - omega) * static_cast(pop[M::centerRegQ]) + omega * eqCenter; + } + } + + static inline NEON_CUDA_HOST_DEVICE auto + localLoad(Idx const& gidx, + NEON_IN typename PopField::Partition const& fOut, + Storage NEON_RESTRICT pOut[Lattice::Q]) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + pOut[M::fwdRegQ] = fOut(gidx, M::fwdMemQ); + }); + } + + static inline NEON_CUDA_HOST_DEVICE auto + localStore(Idx const& gidx, + Storage NEON_RESTRICT pOut[Lattice::Q], + NEON_IN typename PopField::Partition& fOut) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + fOut(gidx, M::fwdMemQ) = pOut[M::fwdRegQ]; + }); + } + }; +}; \ No newline at end of file diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h index 670010fb..cdcc929f 100644 --- a/benchmarks/lbm/src/Lbm.h +++ b/benchmarks/lbm/src/Lbm.h @@ -5,6 +5,7 @@ #include "CellType.h" #include "ContainerFactory.h" #include "ContainersD3Q19.h" +#include "ContainersD3QXX.h" #include "D3Q19.h" #include "Methods.h" #include "Neon/Neon.h" @@ -29,7 +30,8 @@ struct Lbm using RhoField = 
typename Grid::template Field; using UField = typename Grid::template Field; - using CommonContainerFactory = common::ContainerFactory; + //using CommonContainerFactory = common::ContainerFactory; + using ContainerFactory = ContainerFactoryD3QXX; template Lbm(Config& config, @@ -89,7 +91,7 @@ struct Lbm { // Setting Equilibrium all population field for (auto& pField : pFieldList) { // Set all to eq - CommonContainerFactory::setToEquilibrium(pField, cellFlagField).run(Neon::Backend::mainStreamIdx); + ContainerFactory::Common::setToEquilibrium(pField, cellFlagField).run(Neon::Backend::mainStreamIdx); } } } @@ -100,13 +102,13 @@ struct Lbm { grid.getBackend().sync(Neon::Backend::mainStreamIdx); // Compute ngh mask - CommonContainerFactory::userSettingBc(bcSetFunction, + ContainerFactory::Common::userSettingBc(bcSetFunction, pFieldList[0], cellFlagField) .run(Neon::Backend::mainStreamIdx); for (int i = 1; i < int(pFieldList.size()); i++) { - CommonContainerFactory::copyPopulation(pFieldList[0], + ContainerFactory::Common::copyPopulation(pFieldList[0], pFieldList[i]) .run(Neon::Backend::mainStreamIdx); } @@ -115,7 +117,7 @@ struct Lbm Neon::Execution::device) .run(Neon::Backend::mainStreamIdx); grid.getBackend().sync(Neon::Backend::mainStreamIdx); - CommonContainerFactory::computeWallNghMask(cellFlagField, + ContainerFactory::Common::computeWallNghMask(cellFlagField, cellFlagField) .run(Neon::Backend::mainStreamIdx); } @@ -136,7 +138,7 @@ struct Lbm { iteration = 0; int skIdx = helpGetSkeletonIdx(); - auto even = common::ContainerFactory::template iteration( + auto even = ContainerFactory::Push::iteration( configurations.stencilSemantic, pFieldList.at(helpGetInputIdx()), cellFlagField, @@ -154,7 +156,7 @@ struct Lbm { iteration = 1; int skIdx = helpGetSkeletonIdx(); - auto odd = CommonContainerFactory::template iteration( + auto odd = ContainerFactory::Push::iteration( configurations.stencilSemantic, pFieldList.at(helpGetInputIdx()), cellFlagField, @@ -239,7 +241,7 @@ 
struct Lbm { grid.getBackend().syncAll(); auto& pop = pFieldList.at(helpGetOutputIdx()); - auto computeRhoAndU = CommonContainerFactory::computeRhoAndU(pop, cellFlagField, rho, u); + auto computeRhoAndU = ContainerFactory::Common::computeRhoAndU(pop, cellFlagField, rho, u); computeRhoAndU.run(Neon::Backend::mainStreamIdx); u.updateHostData(Neon::Backend::mainStreamIdx); rho.updateHostData(Neon::Backend::mainStreamIdx); From b8627f5c2b3866196ec954fc057bb0cedd0a2511 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 30 Aug 2023 21:36:55 +0200 Subject: [PATCH 52/94] WIP: test with D3Q27 --- benchmarks/lbm/src/ContainersD3QXX.h | 2 +- benchmarks/lbm/src/D3Q27.h | 59 +++++++++++++++++++------- benchmarks/lbm/src/DeviceD3QXX.h | 14 ------ benchmarks/lbm/src/RunCavityTwoPop.cpp | 2 +- 4 files changed, 45 insertions(+), 32 deletions(-) diff --git a/benchmarks/lbm/src/ContainersD3QXX.h b/benchmarks/lbm/src/ContainersD3QXX.h index 1655c6a5..88306dc1 100644 --- a/benchmarks/lbm/src/ContainersD3QXX.h +++ b/benchmarks/lbm/src/ContainersD3QXX.h @@ -96,7 +96,7 @@ struct Push { if (cellInfo.classification == CellType::bulk) { Storage popIn[Lattice::Q]; - CommonFunctions::localLoad(gidx, fIn, NEON_OUT popIn); + Device::Common::localLoad(gidx, fIn, NEON_OUT popIn); Compute rho; std::array u{.0, .0, .0}; diff --git a/benchmarks/lbm/src/D3Q27.h b/benchmarks/lbm/src/D3Q27.h index 9f2c7f95..5a95b815 100644 --- a/benchmarks/lbm/src/D3Q27.h +++ b/benchmarks/lbm/src/D3Q27.h @@ -85,6 +85,18 @@ struct D3Q27 8. / 27., 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 216., 1. / 216., 1. / 216., 1. 
/ 216.}; + + template + static constexpr auto getT() -> const typename Precision::Storage + { + return t[q]; + } + + template + static constexpr auto getDirection() -> const typename Neon::index_3d + { + return stencil[q]; + } }; struct Memory @@ -121,33 +133,29 @@ struct D3Q27 static constexpr int center = 13; /** Position of direction {0,0,0} */ - + + static constexpr std::array memoryToRegister{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}; + + static constexpr std::array registerToMemory{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}; + template - static constexpr auto mapToRegisters() + NEON_CUDA_HOST_DEVICE static constexpr auto mapToRegisters() -> int { - auto direction = stencil[go]; - for (int i = 0; i < Q; ++i) { - if (Registers::stencil[i] == direction) { - return i; - } - } + return memoryToRegister[go]; } template - static constexpr auto mapFromRegisters() + NEON_CUDA_HOST_DEVICE static constexpr auto mapToMemory() -> int { - auto direction = Registers::stencil[go]; - for (int i = 0; i < Q; ++i) { - if (Self::stencil[i] == direction) { - return i; - } - } + return registerToMemory[go]; } template - static constexpr auto getOpposite() + NEON_CUDA_HOST_DEVICE static constexpr auto getOpposite() -> int { auto opposite3d = stencil[go] * -1; @@ -197,4 +205,23 @@ struct D3Q27 } return vec; } + + template + struct RegisterMapper + { + constexpr static int fwdRegQ = fwdRegIdx_; + constexpr static int bkwRegQ = Registers::opposite[fwdRegQ]; + constexpr static int fwdMemQ = Memory::template mapToMemory(); + constexpr static int bkwMemQ = Memory::template mapToMemory(); + constexpr static int centerRegQ = Registers::center; + constexpr static int centerMemQ = Memory::center; + + constexpr static int fwdMemQX = Memory::stencil[fwdMemQ].x; + constexpr static int fwdMemQY = Memory::stencil[fwdMemQ].y; + constexpr static int fwdMemQZ = 
Memory::stencil[fwdMemQ].z; + + constexpr static int bkwMemQX = Memory::stencil[bkwMemQ].x; + constexpr static int bkwMemQY = Memory::stencil[bkwMemQ].y; + constexpr static int bkwMemQZ = Memory::stencil[bkwMemQ].z; + }; }; diff --git a/benchmarks/lbm/src/DeviceD3QXX.h b/benchmarks/lbm/src/DeviceD3QXX.h index c37927f9..a7d232bc 100644 --- a/benchmarks/lbm/src/DeviceD3QXX.h +++ b/benchmarks/lbm/src/DeviceD3QXX.h @@ -85,20 +85,6 @@ struct DeviceD3QXX struct Common { - using Lattice = D3Q19; - using Precision = Precision_; - using Compute = typename Precision::Compute; - using Storage = typename Precision::Storage; - using Grid = Grid_; - - using PopField = typename Grid::template Field; - using CellTypeField = typename Grid::template Field; - - using Idx = typename PopField::Idx; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; - - static inline NEON_CUDA_HOST_DEVICE auto macroscopic(const Storage pop[Lattice::Q], NEON_OUT Compute& rho, diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cpp b/benchmarks/lbm/src/RunCavityTwoPop.cpp index e5245473..e15b24c7 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cpp +++ b/benchmarks/lbm/src/RunCavityTwoPop.cpp @@ -25,7 +25,7 @@ auto run(Config& config, using Storage = Storage_; using Compute = Compute_; using Precision = Precision; - using Lattice = D3Q19; + using Lattice = D3Q27; // using PopulationField = typename Grid::template Field; // using PopField = typename Grid::template Field; From 6759005c85989b0252142a1e060102de31e588cd Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 31 Aug 2023 09:45:06 +0200 Subject: [PATCH 53/94] D3Q27 tested --- benchmarks/lbm/src/ContainersD3QXX.h | 5 +++-- benchmarks/lbm/src/D3Q27.h | 11 +++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/benchmarks/lbm/src/ContainersD3QXX.h b/benchmarks/lbm/src/ContainersD3QXX.h index 88306dc1..194ed222 100644 --- a/benchmarks/lbm/src/ContainersD3QXX.h +++ 
b/benchmarks/lbm/src/ContainersD3QXX.h @@ -247,6 +247,7 @@ struct Common { -> Neon::set::Container { + Neon::set::Container container = fInField.getGrid().newContainer( "LBM_iteration", @@ -264,10 +265,10 @@ struct Common { std::array u{.0, .0, .0}; Storage popIn[Lattice::Q]; - CommonFunctions::localLoad(gidx, fIn, NEON_OUT popIn); + Device::Common::localLoad(gidx, fIn, NEON_OUT popIn); if (cellInfo.classification == CellType::bulk) { - CommonFunctions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + Device::Common::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); } else { if (cellInfo.classification == CellType::movingWall) { rho = 1.0; diff --git a/benchmarks/lbm/src/D3Q27.h b/benchmarks/lbm/src/D3Q27.h index 5a95b815..e0221fe2 100644 --- a/benchmarks/lbm/src/D3Q27.h +++ b/benchmarks/lbm/src/D3Q27.h @@ -59,6 +59,12 @@ struct D3Q27 Neon::index_3d(1, -1, 1), Neon::index_3d(1, -1, -1)}; + template + static inline NEON_CUDA_HOST_DEVICE auto + getComponentOfDirection() -> int{ + return stencil[qIdx].v[cIdx]; + } + static constexpr int center = 13; /** Position of direction {0,0,0} */ template @@ -97,6 +103,11 @@ struct D3Q27 { return stencil[q]; } + // Identifying first half of the directions + // For each direction in the list, the opposite is not present. 
+ // Center is also removed + static constexpr int firstHalfQLen = (Q - 1) / 2; + static constexpr std::array firstHalfQList{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; }; struct Memory From 6adff5a4bea23b2cd95c3971d625d34f267fd490 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 31 Aug 2023 14:35:56 +0200 Subject: [PATCH 54/94] WIP: refactoring CLI --- benchmarks/lbm/src/Config.cpp | 117 +++++++++--------- benchmarks/lbm/src/Config.h | 84 +++++++++---- benchmarks/lbm/src/Lbm.h | 10 +- benchmarks/lbm/src/Report.cpp | 13 +- benchmarks/lbm/src/RunCavityTwoPop.cpp | 35 ++++-- .../include/Neon/domain/tools/SpaceCurves.h | 28 ++++- .../src/domain/tools/SpaceCurves.cpp | 100 ++++++++++++++- libNeonSet/include/Neon/set/StencilSemantic.h | 13 +- libNeonSet/include/Neon/set/TransferMode.h | 9 +- libNeonSet/src/set/StencilSemantic.cpp | 39 ++++-- libNeonSet/src/set/TransferMode.cpp | 36 ++++-- libNeonSkeleton/include/Neon/skeleton/Occ.h | 9 +- libNeonSkeleton/src/skeleton/Occ.cpp | 16 ++- 13 files changed, 366 insertions(+), 143 deletions(-) diff --git a/benchmarks/lbm/src/Config.cpp b/benchmarks/lbm/src/Config.cpp index 115125bd..43e2147a 100644 --- a/benchmarks/lbm/src/Config.cpp +++ b/benchmarks/lbm/src/Config.cpp @@ -21,37 +21,41 @@ auto Config::toString() const -> std::string return s.str(); }; - s << ".................. Re " << c.Re << std::endl; - s << "................. ulb " << c.ulb << std::endl; - s << "................... N " << c.N << std::endl; - s << "........... benchmark " << c.benchmark << std::endl; - s << "............... max_t " << c.max_t << std::endl; - s << "........ outFrequency " << c.outFrequency << std::endl; - s << "....... dataFrequency " << c.dataFrequency << std::endl; - s << "................. vti " << c.vti << std::endl; - - s << "........ benchIniIter " << c.benchIniIter << std::endl; - s << "........ benchMaxIter " << c.benchMaxIter << std::endl; - + s << "Neon Runtime Parameters" << std::endl; s << ".......... 
deviceType " << c.deviceType << std::endl; s << ".......... numDevices " << c.devices.size() << std::endl; s << "............. devices " << vecToSting(c.devices) << std::endl; s << ".......... reportFile " << c.reportFile << std::endl; s << "............ gridType " << c.gridType << std::endl; - s << "......... computeType " << c.computeType << std::endl; - s << "........... storeType " << c.storeType << std::endl; - s << "............... curve " << c.curve << std::endl; + s << ".......... spaceCurve " << c.spaceCurveCli.getStringOptions() << std::endl; + s << "................. occ " << c.occCli.getStringOptions() << std::endl; + s << "....... transferMode " << c.transferModeCli.getStringOptions() << std::endl; + s << ".... stencilSemantic " << c.stencilSemanticCli.getStringOptions() << std::endl; - s << ". ............... occ " << Neon::skeleton::OccUtils::toString(c.occ) << std::endl; - s << "....... transfer Mode " << Neon::set::TransferModeUtils::toString(c.transferMode) << std::endl; - s << "... transfer Semantic " << Neon::set::StencilSemanticUtils::toString(c.stencilSemantic) << std::endl; + s << "LBM Implementation" << std::endl; + s << "............ lattice " << c.lattice << std::endl; + s << "... streaming method " << c.streamingMethod << std::endl; + s << "......... computeType " << c.computeTypeStr << std::endl; + s << "........... storeType " << c.storeTypeStr << std::endl; - s << ". ............... nu " << mLbmParameters.nu << std::endl; + s << "Physics Parameters" << std::endl; + s << ".................. Re " << c.Re << std::endl; + s << "................. ulb " << c.ulb << std::endl; + s << "................... N " << c.N << std::endl; + s << "................. nu " << mLbmParameters.nu << std::endl; s << ".............. omega " << mLbmParameters.omega << std::endl; s << "................. dx " << mLbmParameters.dx << std::endl; s << "................. dt " << mLbmParameters.dt << std::endl; + s << "Test Parameters" << std::endl; + s << "........... 
benchmark " << c.benchmark << std::endl; + s << "............... max_t " << c.max_t << std::endl; + s << "................. vti " << c.vti << std::endl; + s << "........ benchIniIter " << c.benchIniIter << std::endl; + s << "........ benchMaxIter " << c.benchMaxIter << std::endl; + + return s.str(); } @@ -61,34 +65,34 @@ auto Config::parseArgs(const int argc, char* argv[]) auto& config = *this; auto cli = - (clipp::required("--deviceType") & clipp::value("deviceType", config.deviceType) % "Device ids to use", - clipp::required("--deviceIds") & clipp::integers("gpus", config.devices) % "Device ids to use", - clipp::option("--grid") & clipp::value("grid", config.gridType) % "Could be dGrid, eGrid, bGrid", - clipp::option("--domain-size") & clipp::integer("domain_size", config.N) % "Voxels along each dimension of the cube domain", - clipp::option("--warmup-iter") & clipp::integer("warmup_iter", config.benchIniIter) % "Number of iteration for warm up. max_iter = warmup_iter + timed_iters", - clipp::option("--max-iter") & clipp::integer("max_iter", config.benchMaxIter) % "Maximum solver iterations", - clipp::option("--repetitions") & clipp::integer("repetitions", config.repetitions) % "Number of times the benchmark is run.", - clipp::option("--report-filename ") & clipp::value("keeper_filename", config.reportFile) % "Output perf keeper filename", - - clipp::option("--computeFP") & clipp::value("computeFP", config.computeType) % "Could be double or float", - clipp::option("--storageFP") & clipp::value("storageFP", config.storeType) % "Could be double or float", - - clipp::option("--curve") & clipp::value("curve", config.curve) % "Could be sweep (the default), morton, or hilber", - ( - (clipp::option("--sOCC").set(config.occ, Neon::skeleton::Occ::standard) % "Standard OCC") | - (clipp::option("--nOCC").set(config.occ, Neon::skeleton::Occ::none) % "No OCC (on by default)")), - ( - (clipp::option("--put").set(config.transferMode, Neon::set::TransferMode::put) % "Set 
transfer mode to PUT") | - (clipp::option("--get").set(config.transferMode, Neon::set::TransferMode::get) % "Set transfer mode to GET (on by default)")), - ( - (clipp::option("--huLattice").set(config.stencilSemantic, Neon::set::StencilSemantic::streaming) % "Halo update with lattice semantic (on by default)") | - (clipp::option("--huGrid").set(config.stencilSemantic, Neon::set::StencilSemantic::standard) % "Halo update with grid semantic ")), - ( - (clipp::option("--benchmark").set(config.benchmark, true) % "Run benchmark mode") | - (clipp::option("--visual").set(config.benchmark, false) % "Run export partial data")), - - ( - clipp::option("--vti").set(config.vti, true) % "Standard OCC") + ( + + clipp::required("--deviceType") & clipp::value("deviceType", config.deviceType) % "Device type (cpu or gpu)", + clipp::required("--deviceIds") & clipp::integers("ids", config.devices) % "Device ids", + + clipp::option("--grid") & clipp::value("grid", config.gridType) % Config::getOptionList(config.gridTypeOptions, config.gridType), + clipp::option("--domain-size") & clipp::integer("domain_size", config.N) % "Voxels along each dimension of the cube domain", + clipp::option("--max-iter") & clipp::integer("max_iter", config.benchMaxIter) % "Maximum solver iterations", + clipp::option("--report-filename ") & clipp::value("keeper_filename", config.reportFile) % "Output perf keeper filename", + + clipp::option("--computeFP") & clipp::value("computeFP", config.computeTypeStr) % Config::getOptionList(config.gridTypeOptions, config.gridType), + clipp::option("--storageFP") & clipp::value("storageFP", config.storeTypeStr) % "double, float", + + clipp::option("--occ")([&config](const std::string& s) { config.occCli.set(s); }) % config.occCli.getDoc(), + clipp::option("--transferMode")([&config](const std::string& s) { config.transferModeCli.set(s); }) % config.transferModeCli.getDoc(), + clipp::option("--stencilSemantic")([&config](const std::string& s) { 
config.stencilSemanticCli.set(s); }) % config.stencilSemanticCli.getDoc(), + clipp::option("--spaceCurve")([&config](const std::string& s) { config.spaceCurveCli.set(s); }) % config.spaceCurveCli.getDoc(), + + clipp::option("--streamingMethod") & clipp::value("streamingMethod", config.streamingMethod) % Config::getOptionList(config.streamingMethodOption, config.streamingMethod), + clipp::option("--lattice") & clipp::value("lattice", config.lattice) % Config::getOptionList(config.latticeOptions, config.lattice), + ( + ( + clipp::option("--benchmark").set(config.benchmark, true) % "Run benchmark mode", + clipp::option("--warmup-iter") & clipp::integer("warmup_iter", config.benchIniIter) % "Number of iteration for warm up. max_iter = warmup_iter + timed_iters", + clipp::option("--repetitions") & clipp::integer("repetitions", config.repetitions) % "Number of times the benchmark is run." + + ) | + (clipp::option("--vti") & clipp::integer("OutputFrequency", config.vti) % "Voxels along each dimension of the cube domain")) ); @@ -96,20 +100,13 @@ auto Config::parseArgs(const int argc, char* argv[]) if (!clipp::parse(argc, argv, cli)) { auto fmt = clipp::doc_formatting{}.doc_column(31); std::cout << make_man_page(cli, argv[0], fmt) << '\n'; - return -1; - } + std::cout << '\n'; + std::cout << '\n'; + std::cout << "Export example" << '\n'; + std::cout << "./lbm --deviceType cpu --deviceIds 0 --grid dGrid --domain-size 100 --max-iter 2000 --nOCC --huGrid --vti 1" << '\n'; + std::cout << "Benchmark example " << '\n'; + std::cout << "./lbm --deviceType gpu --deviceIds 0 1 2 3 4 --grid dGrid --domain-size 100 --max-iter 2000 --computeFP double --storageFP double --nOCC --huGrid --benchmark --warmup-iter 10 --repetitions 5" << '\n'; - if (config.curve == "sweep") - config.spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::sweep; - if (config.curve == "morton") - config.spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::morton; - if (config.curve == "hilbert") - 
config.spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::hilbert; - - if (config.curve != "sweep" && config.curve != "morton" && config.curve != "hilbert") { - auto fmt = clipp::doc_formatting{}.doc_column(31); - std::cout << config.curve << " is not a supported configuration" << std::endl; - std::cout << make_man_page(cli, argv[0], fmt) << '\n'; return -1; } diff --git a/benchmarks/lbm/src/Config.h b/benchmarks/lbm/src/Config.h index 18695ce4..fbd7dd3c 100644 --- a/benchmarks/lbm/src/Config.h +++ b/benchmarks/lbm/src/Config.h @@ -17,29 +17,67 @@ struct LbmParameters struct Config { - double Re = 100.; // Reynolds number - double ulb = 0.04; // Velocity in lattice units - int N = 160; // Number of nodes in x-direction - bool benchmark = false; // Run in benchmark mode ? - double max_t = 10.0; // Non-benchmark mode: Total time in dim.less units - int outFrequency = 200; // Non-benchmark mode: Frequency in LU for output of terminal message and profiles (use 0 for no messages) - int dataFrequency = 0; // Non-benchmark mode: Frequency in LU of full data dump (use 0 for no data dump) - int benchIniIter = 1000; // Benchmark mode: Number of warmup iterations - int benchMaxIter = 2000; // Benchmark mode: Total number of iterations - int repetitions = 1; // Benchmark mode: number of time the test is run - std::string deviceType = "gpu"; - std::vector devices = std::vector(0); // Devices for the execution - std::string reportFile = "lbm-lid-driven-cavity-flow"; // Report file name - std::string gridType = "dGrid"; // Neon grid type - Neon::skeleton::Occ occ = Neon::skeleton::Occ::none; // Neon OCC type - Neon::set::TransferMode transferMode = Neon::set::TransferMode::get; // Neon transfer mode for halo update - Neon::set::StencilSemantic stencilSemantic = Neon::set::StencilSemantic::streaming; - bool vti = false; // Export vti file - std::string computeType = "double"; - std::string storeType = "double"; - std::string curve = "sweep"; - 
Neon::domain::tool::spaceCurves::EncoderType spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::sweep; - LbmParameters mLbmParameters; + double Re = 100.; // Reynolds number + double ulb = 0.04; // Velocity in lattice units + int N = 160; // Number of nodes in x-direction + bool benchmark = false; // Run in benchmark mode ? + double max_t = 10.0; // Non-benchmark mode: Total time in dim.less units + int outFrequency = 200; // Non-benchmark mode: Frequency in LU for output of terminal message and profiles (use 0 for no messages) + int dataFrequency = 0; // Non-benchmark mode: Frequency in LU of full data dump (use 0 for no data dump) + int benchIniIter = 1000; // Benchmark mode: Number of warmup iterations + int benchMaxIter = 2000; // Benchmark mode: Total number of iterations + int repetitions = 1; // Benchmark mode: number of time the test is run + + std::string deviceType = "gpu"; + std::vector devices = std::vector(0); // Devices for the execution + std::string reportFile = "lbm-lid-driven-cavity-flow"; // Report file name + + std::vector gridTypeOptions = {"dGrid", "eGrid", "bGrid"}; + std::string gridType = gridTypeOptions[0]; // Neon grid type + + Neon::skeleton::OccUtils::Cli occCli{Neon::skeleton::Occ::none}; // Neon OCC type + Neon::set::TransferModeUtils::Cli transferModeCli{Neon::set::TransferMode::get}; // Neon transfer mode for halo update + Neon::set::StencilSemanticUtils::Cli stencilSemanticCli{Neon::set::StencilSemantic::streaming}; + Neon::domain::tool::spaceCurves::EncoderTypeUtil::Cli spaceCurveCli{Neon::domain::tool::spaceCurves::EncoderType::sweep}; + + int vti = 0; // Export vti file + + std::vector computeTypeOptions = {"double", "float"}; + std::string computeTypeStr = computeTypeOptions[0]; + + std::vector storeTypeOptions = {"double", "float"}; + std::string storeTypeStr = storeTypeOptions[0]; + + + std::vector latticeOptions = {"d3q19", "d3q27"}; + std::string lattice = latticeOptions[0]; + + std::vector streamingMethodOption = 
{"push", "pull"}; + std::string streamingMethod = "push"; + + LbmParameters mLbmParameters; + + auto getOptionList(std::vector list, std::string defaultVal) -> std::string + { + std::stringstream s; + for (int i = 0; i < int(list.size()); i++) { + s << list[i]; + if (list[i] == defaultVal) { + s << " (default) "; + } + } + return s.str(); + } + + auto check(std::vector list, std::string userValue) -> bool + { + for (int i = 0; i < int(list.size()); i++) { + if (list[i] == userValue) { + return true; + } + } + return false; + } auto toString() const -> std::string; diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h index cdcc929f..ed51e013 100644 --- a/benchmarks/lbm/src/Lbm.h +++ b/benchmarks/lbm/src/Lbm.h @@ -62,7 +62,7 @@ struct Lbm [&](const Neon::index_3d& p) { return activeMask(p); }, Lattice::template getDirectionAsVector(), 1.0, 0.0, - config.spaceCurve); + config.spaceCurveCli.getOption()); // Allocating Populations for (int i = 0; i < lbm::MethodUtils::getNumberOfPFields(); i++) { @@ -139,7 +139,7 @@ struct Lbm iteration = 0; int skIdx = helpGetSkeletonIdx(); auto even = ContainerFactory::Push::iteration( - configurations.stencilSemantic, + configurations.stencilSemanticCli.getOption(), pFieldList.at(helpGetInputIdx()), cellFlagField, lbmParameters.omega, @@ -147,7 +147,7 @@ struct Lbm std::vector ops; skeleton.at(skIdx) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); - Neon::skeleton::Options opt(configurations.occ, configurations.transferMode); + Neon::skeleton::Options opt(configurations.occCli.getOption(), configurations.transferModeCli.getOption()); ops.push_back(even); std::stringstream appName; appName << "LBM_push_even"; @@ -157,7 +157,7 @@ struct Lbm iteration = 1; int skIdx = helpGetSkeletonIdx(); auto odd = ContainerFactory::Push::iteration( - configurations.stencilSemantic, + configurations.stencilSemanticCli.getOption(), pFieldList.at(helpGetInputIdx()), cellFlagField, lbmParameters.omega, @@ -165,7 +165,7 @@ struct Lbm 
std::vector ops; skeleton.at(skIdx) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); - Neon::skeleton::Options opt(configurations.occ, configurations.transferMode); + Neon::skeleton::Options opt(configurations.occCli.getOption(), configurations.transferModeCli.getOption()); ops.push_back(odd); std::stringstream appName; appName << "LBM_push_odd"; diff --git a/benchmarks/lbm/src/Report.cpp b/benchmarks/lbm/src/Report.cpp index 049d1735..308eee6d 100644 --- a/benchmarks/lbm/src/Report.cpp +++ b/benchmarks/lbm/src/Report.cpp @@ -27,14 +27,13 @@ Report::Report(const Config& c) mReport.addMember("reportFile", c.reportFile); mReport.addMember("gridType", c.gridType); - mReport.addMember("computeType", c.computeType); - mReport.addMember("storeType", c.storeType); - mReport.addMember("spaceCurve", Neon::domain::tool::spaceCurves::EncoderTypeUtil::toString(c.spaceCurve)); + mReport.addMember("computeTypeStr", c.computeTypeStr); + mReport.addMember("storeTypeStr", c.storeTypeStr); - - mReport.addMember("occ", Neon::skeleton::OccUtils::toString(c.occ)); - mReport.addMember("transferMode", Neon::set::TransferModeUtils::toString(c.transferMode)); - mReport.addMember("transferSemantic", Neon::set::StencilSemanticUtils::toString(c.stencilSemantic)); + c.occCli.addToReport(mReport); + c.transferModeCli.addToReport(mReport); + c.stencilSemanticCli.addToReport(mReport); + c.spaceCurveCli.addToReport(mReport); mReport.addMember("nu", c.mLbmParameters.nu); mReport.addMember("omega", c.mLbmParameters.omega); diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cpp b/benchmarks/lbm/src/RunCavityTwoPop.cpp index e15b24c7..4b1f56eb 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cpp +++ b/benchmarks/lbm/src/RunCavityTwoPop.cpp @@ -16,6 +16,7 @@ int backendWasReported = false; namespace details { template @@ -25,7 +26,7 @@ auto run(Config& config, using Storage = Storage_; using Compute = Compute_; using Precision = Precision; - using Lattice = D3Q27; + using Lattice = Lattice_;//D3Q27; 
// using PopulationField = typename Grid::template Field; // using PopField = typename Grid::template Field; @@ -80,19 +81,37 @@ auto run(Config& config, lbm.iterate(); } -template + +template auto runFilterMethod(Config& config, Report& report) -> void { - return run(config, report); + return run(config, report); +} + +template +auto runFilterLattice(Config& config, Report& report) -> void +{ + using Precision = Precision; + + if (config.lattice == "d3q19") { + using Lattice = D3Q19; + return runFilterMethod(config, report); + } + if (config.lattice == "d3q27") { + using Lattice = D3Q19; + return runFilterMethod(config, report); + } + NEON_DEV_UNDER_CONSTRUCTION(""); } + template auto runFilterComputeType(Config& config, Report& report) -> void { - if (config.computeType == "double") { - return runFilterMethod(config, report); + if (config.computeTypeStr == "double") { + return runFilterLattice(config, report); } - // if (config.computeType == "float") { + // if (config.computeTypeStr == "float") { // return run(config, report); // } NEON_DEV_UNDER_CONSTRUCTION(""); @@ -103,10 +122,10 @@ auto runFilterStoreType(Config& config, Report& report) -> void { - if (config.storeType == "double") { + if (config.storeTypeStr == "double") { return runFilterComputeType(config, report); } - // if (config.storeType == "float") { + // if (config.storeTypeStr == "float") { // return runFilterComputeType(config, report); // } NEON_DEV_UNDER_CONSTRUCTION(""); diff --git a/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h b/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h index f1fda5a5..e36cd223 100644 --- a/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h +++ b/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h @@ -1,5 +1,6 @@ #pragma once #include "Neon/Neon.h" +#include "Neon/Report.h" namespace Neon::domain::tool::spaceCurves { @@ -34,11 +35,30 @@ struct EncoderTypeUtil * Returns all valid configuration for DataView * @return */ - static auto validOptions() -> 
std::array; + static auto getOptions() -> std::array; static auto fromInt(int val) -> EncoderType; - + static auto fromString(const std::string& opt) -> EncoderType; static auto toInt(EncoderType encoderType) -> int; + + struct Cli + { + explicit Cli(std::string); + explicit Cli(EncoderType model); + Cli(); + + auto getOption() const -> EncoderType; + auto set(const std::string& opt) -> void; + auto getStringOptions() const -> std::string; + auto getDoc() const -> std::string; + + auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void; + auto addToReport(Neon::Report& report) const -> void; + + private: + bool mSet = false; + EncoderType mOption; + }; }; @@ -289,7 +309,6 @@ class Encoder } public: - static inline auto mortonEncode([[maybe_unused]] Neon::index_3d dim, Neon::index_3d idx) -> uint64_t { @@ -315,7 +334,8 @@ class Encoder return res; } - static inline auto encode(EncoderType type, Neon::index_3d dim, Neon::index_3d idx){ + static inline auto encode(EncoderType type, Neon::index_3d dim, Neon::index_3d idx) + { switch (type) { case EncoderType::morton: return mortonEncode(dim, idx); diff --git a/libNeonDomain/src/domain/tools/SpaceCurves.cpp b/libNeonDomain/src/domain/tools/SpaceCurves.cpp index 9bd700dd..d45ec571 100644 --- a/libNeonDomain/src/domain/tools/SpaceCurves.cpp +++ b/libNeonDomain/src/domain/tools/SpaceCurves.cpp @@ -3,11 +3,11 @@ namespace Neon::domain::tool::spaceCurves { -auto EncoderTypeUtil::validOptions() -> std::array +auto EncoderTypeUtil::getOptions() -> std::array { std::array options = {EncoderType::sweep, - EncoderType::morton, - EncoderType::hilbert}; + EncoderType::morton, + EncoderType::hilbert}; return options; } @@ -47,6 +47,17 @@ auto EncoderTypeUtil::fromInt(int val) -> EncoderType } } +auto EncoderTypeUtil::fromString(const std::string& occ) -> EncoderType +{ + std::array opts = getOptions(); + for (auto a : opts) { + if (toString(a) == occ) { + return a; + } + } + 
NEON_THROW_UNSUPPORTED_OPTION(""); +} + auto EncoderTypeUtil::toInt(EncoderType dataView) -> int { return static_cast(dataView); @@ -57,4 +68,87 @@ std::ostream& operator<<(std::ostream& os, EncoderType const& m) return os << std::string(EncoderTypeUtil::toString(m)); } + +EncoderTypeUtil::Cli::Cli() +{ + mSet = false; +} + +EncoderTypeUtil::Cli::Cli(std::string s) +{ + set(s); +} + +EncoderTypeUtil::Cli::Cli(EncoderType model) +{ + mOption = model; mSet = true; /* a value given at construction counts as set, otherwise getOption() raises on the default */ +} + +auto EncoderTypeUtil::Cli::getOption() const -> EncoderType +{ + if (!mSet) { + std::stringstream errorMsg; + errorMsg << "EncoderType was not set."; + NEON_ERROR(errorMsg.str()); + } + return mOption; +} + +auto EncoderTypeUtil::Cli::set(const std::string& opt) + -> void +{ + try { + mOption = EncoderTypeUtil::fromString(opt); + } catch (...) { + std::stringstream errorMsg; + errorMsg << "EncoderType: " << opt << " is not a valid option (valid options are {"; + auto options = EncoderTypeUtil::getOptions(); + int i = 0; + for (auto o : options) { + if (i != 0) { + errorMsg << ", "; /* separator only: the option name is appended exactly once below */ + } + errorMsg << EncoderTypeUtil::toString(o); + i = 1; + } + errorMsg << "})"; + NEON_ERROR(errorMsg.str()); + } + mSet = true; +} + +auto EncoderTypeUtil::Cli::getStringOptions() const -> std::string +{ + std::stringstream s; + auto options = EncoderTypeUtil::getOptions(); + int i = 0; + for (auto o : options) { + if (i != 0) { + s << ", "; + } + s << EncoderTypeUtil::toString(o); + i = 1; + } + std::string msg = s.str(); + return msg; +} + +auto EncoderTypeUtil::Cli::getDoc() const -> std::string +{ + std::stringstream s; + s << getStringOptions(); + if (mSet) { s << " default: " << EncoderTypeUtil::toString(mOption); } /* report the current value, not the whole option list */ + return s.str(); +} + +auto EncoderTypeUtil::Cli::addToReport(Neon::Report& report) const -> void +{ + report.addMember("EncoderType", EncoderTypeUtil::toString(this->getOption())); +} + +auto EncoderTypeUtil::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void +{ +
report.addMember("EncoderType", EncoderTypeUtil::toString(this->getOption()), &subBlock); +} + } // namespace Neon::domain::tool::spaceCurves diff --git a/libNeonSet/include/Neon/set/StencilSemantic.h b/libNeonSet/include/Neon/set/StencilSemantic.h index cd512ae7..3e22f4f4 100644 --- a/libNeonSet/include/Neon/set/StencilSemantic.h +++ b/libNeonSet/include/Neon/set/StencilSemantic.h @@ -2,6 +2,7 @@ #include #include +#include "Neon/Report.h" #include "Neon/core/core.h" namespace Neon::set { @@ -20,19 +21,23 @@ struct StencilSemanticUtils static auto toString(StencilSemantic opt) -> std::string; static auto fromString(const std::string& opt) -> StencilSemantic; static auto getOptions() -> std::array; - + struct Cli { explicit Cli(std::string); explicit Cli(StencilSemantic model); Cli(); - auto getOption() -> StencilSemantic; + auto getOption() const -> StencilSemantic; auto set(const std::string& opt) -> void; - auto getStringOptions() -> std::string; + auto getStringOptions() const -> std::string; + auto getDoc() const -> std::string; + + auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void; + auto addToReport(Neon::Report& report) const -> void; private: - bool mSet = false; + bool mSet = false; StencilSemantic mOption; }; }; diff --git a/libNeonSet/include/Neon/set/TransferMode.h b/libNeonSet/include/Neon/set/TransferMode.h index b6f4ec86..1b6881c8 100644 --- a/libNeonSet/include/Neon/set/TransferMode.h +++ b/libNeonSet/include/Neon/set/TransferMode.h @@ -3,6 +3,7 @@ #include #include "Neon/core/core.h" +#include "Neon/Report.h" namespace Neon::set { @@ -26,9 +27,13 @@ class TransferModeUtils explicit Cli(TransferMode model); Cli(); - auto getOption() -> TransferMode; + auto getOption() const -> TransferMode; auto set(const std::string& opt) -> void; - auto getStringOptions() -> std::string; + auto getStringOptions() const -> std::string; + auto getDoc () const -> std::string; + + auto addToReport(Neon::Report& report, 
Neon::Report::SubBlock& subBlock) const ->void; + auto addToReport(Neon::Report& report) const ->void; private: bool mSet = false; diff --git a/libNeonSet/src/set/StencilSemantic.cpp b/libNeonSet/src/set/StencilSemantic.cpp index 560b687a..f134fb8f 100644 --- a/libNeonSet/src/set/StencilSemantic.cpp +++ b/libNeonSet/src/set/StencilSemantic.cpp @@ -17,7 +17,7 @@ auto StencilSemanticUtils::toString(StencilSemantic option) -> std::string auto StencilSemanticUtils::fromString(const std::string& occ) -> StencilSemantic { - std::array opts{StencilSemantic::standard, StencilSemantic::streaming}; + std::array opts{StencilSemantic::standard, StencilSemantic::streaming}; for (auto a : opts) { if (toString(a) == occ) { return a; @@ -47,7 +47,7 @@ StencilSemanticUtils::Cli::Cli(StencilSemantic model) mOption = model; } -auto StencilSemanticUtils::Cli::getOption() -> StencilSemantic +auto StencilSemanticUtils::Cli::getOption() const -> StencilSemantic { if (!mSet) { std::stringstream errorMsg; @@ -66,13 +66,13 @@ auto StencilSemanticUtils::Cli::set(const std::string& opt) std::stringstream errorMsg; errorMsg << "TransferSemantic: " << opt << " is not a valid option (valid options are {"; auto options = StencilSemanticUtils::getOptions(); - int i = 0; + int i = 0; for (auto o : options) { - if(i!=0){ - errorMsg << ", "<< StencilSemanticUtils::toString(o) ; + if (i != 0) { + errorMsg << ", " << StencilSemanticUtils::toString(o); } errorMsg << StencilSemanticUtils::toString(o); - i=1; + i = 1; } errorMsg << "})"; NEON_ERROR(errorMsg.str()); @@ -80,19 +80,38 @@ auto StencilSemanticUtils::Cli::set(const std::string& opt) mSet = true; } -auto StencilSemanticUtils::Cli::getStringOptions() -> std::string +auto StencilSemanticUtils::Cli::getStringOptions() const -> std::string { std::stringstream s; auto options = StencilSemanticUtils::getOptions(); int i = 0; for (auto o : options) { if (i != 0) { - s << ", " ; + s << ", "; } s << StencilSemanticUtils::toString(o); i = 1; } - 
std::string msg= s.str(); + std::string msg = s.str(); return msg; } -} // namespace Neon + +auto StencilSemanticUtils::Cli::getDoc() const-> std::string +{ + std::stringstream s; + s << getStringOptions(); + s << " default: " << getStringOptions(); + return s.str(); +} + + +auto StencilSemanticUtils::Cli::addToReport(Neon::Report& report) const -> void +{ + report.addMember("StencilSemantic", StencilSemanticUtils::toString(this->getOption())); +} + +auto StencilSemanticUtils::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void +{ + report.addMember("StencilSemantic", StencilSemanticUtils::toString(this->getOption()), &subBlock); +} +} // namespace Neon::set diff --git a/libNeonSet/src/set/TransferMode.cpp b/libNeonSet/src/set/TransferMode.cpp index 9ef657eb..7c3668bf 100644 --- a/libNeonSet/src/set/TransferMode.cpp +++ b/libNeonSet/src/set/TransferMode.cpp @@ -47,7 +47,7 @@ TransferModeUtils::Cli::Cli(TransferMode model) mOption = model; } -auto TransferModeUtils::Cli::getOption() -> TransferMode +auto TransferModeUtils::Cli::getOption() const -> TransferMode { if (!mSet) { std::stringstream errorMsg; @@ -66,13 +66,13 @@ auto TransferModeUtils::Cli::set(const std::string& opt) std::stringstream errorMsg; errorMsg << "Transfer: " << opt << " is not a valid option (valid options are {"; auto options = TransferModeUtils::getOptions(); - int i = 0; + int i = 0; for (auto o : options) { - if(i!=0){ - errorMsg << ", "<< TransferModeUtils::toString(o) ; + if (i != 0) { + errorMsg << ", " << TransferModeUtils::toString(o); } errorMsg << TransferModeUtils::toString(o); - i=1; + i = 1; } errorMsg << "})"; NEON_ERROR(errorMsg.str()); @@ -80,19 +80,37 @@ auto TransferModeUtils::Cli::set(const std::string& opt) mSet = true; } -auto TransferModeUtils::Cli::getStringOptions() -> std::string +auto TransferModeUtils::Cli::getStringOptions() const -> std::string { std::stringstream s; auto options = TransferModeUtils::getOptions(); int i = 0; for 
(auto o : options) { if (i != 0) { - s << ", " ; + s << ", "; } s << TransferModeUtils::toString(o); i = 1; } - std::string msg= s.str(); + std::string msg = s.str(); return msg; } -} // namespace Neon + +auto TransferModeUtils::Cli::getDoc() const -> std::string +{ + std::stringstream s; + s << getStringOptions(); + s << " default: " << getStringOptions(); + return s.str(); +} + +auto TransferModeUtils::Cli::addToReport(Neon::Report& report) const -> void +{ + report.addMember("TransferMode", TransferModeUtils::toString(this->getOption())); +} + +auto TransferModeUtils::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void +{ + report.addMember("TransferMode", TransferModeUtils::toString(this->getOption()), &subBlock); +} +} // namespace Neon::set diff --git a/libNeonSkeleton/include/Neon/skeleton/Occ.h b/libNeonSkeleton/include/Neon/skeleton/Occ.h index a54f799a..4b2e3522 100644 --- a/libNeonSkeleton/include/Neon/skeleton/Occ.h +++ b/libNeonSkeleton/include/Neon/skeleton/Occ.h @@ -27,12 +27,13 @@ struct OccUtils explicit Cli(Occ model); Cli(); - auto getOption() -> Occ; + auto getOption() const -> Occ; auto set(const std::string& opt) -> void; - auto getStringOptions() -> std::string; + auto getStringOptions() const -> std::string; + auto getDoc() const -> std::string; - auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock)->void; - auto addToReport(Neon::Report& report)->void; + auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void; + auto addToReport(Neon::Report& report) const -> void; private: bool mSet = false; diff --git a/libNeonSkeleton/src/skeleton/Occ.cpp b/libNeonSkeleton/src/skeleton/Occ.cpp index 44ac9155..a3fddbe2 100644 --- a/libNeonSkeleton/src/skeleton/Occ.cpp +++ b/libNeonSkeleton/src/skeleton/Occ.cpp @@ -53,7 +53,7 @@ OccUtils::Cli::Cli(Occ model) mOption = model; } -auto OccUtils::Cli::getOption() -> Occ +auto OccUtils::Cli::getOption() const -> Occ { if 
(!mSet) { std::stringstream errorMsg; @@ -86,7 +86,7 @@ auto OccUtils::Cli::set(const std::string& opt) mSet = true; } -auto OccUtils::Cli::getStringOptions() -> std::string +auto OccUtils::Cli::getStringOptions() const -> std::string { std::stringstream s; auto options = OccUtils::getOptions(); @@ -102,12 +102,20 @@ auto OccUtils::Cli::getStringOptions() -> std::string return msg; } -auto OccUtils::Cli::addToReport(Neon::Report& report) -> void +auto OccUtils::Cli::getDoc() const -> std::string +{ + std::stringstream s; + s << getStringOptions(); + s << " default: " << getStringOptions(); + return s.str(); +} + +auto OccUtils::Cli::addToReport(Neon::Report& report) const -> void { report.addMember("Occ", OccUtils::toString(this->getOption())); } -auto OccUtils::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) -> void +auto OccUtils::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void { report.addMember("Occ", OccUtils::toString(this->getOption()), &subBlock); } From f59901754022a73b0ad9fd6119b4061818175618 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 31 Aug 2023 23:39:25 +0200 Subject: [PATCH 55/94] WIP --- benchmarks/lbm/src/Config.cpp | 2 +- benchmarks/lbm/src/Config.h | 2 -- benchmarks/lbm/src/Lbm.h | 2 +- benchmarks/lbm/src/Report.cpp | 3 +-- 4 files changed, 3 insertions(+), 6 deletions(-) diff --git a/benchmarks/lbm/src/Config.cpp b/benchmarks/lbm/src/Config.cpp index 43e2147a..95775e2d 100644 --- a/benchmarks/lbm/src/Config.cpp +++ b/benchmarks/lbm/src/Config.cpp @@ -78,7 +78,7 @@ auto Config::parseArgs(const int argc, char* argv[]) clipp::option("--computeFP") & clipp::value("computeFP", config.computeTypeStr) % Config::getOptionList(config.gridTypeOptions, config.gridType), clipp::option("--storageFP") & clipp::value("storageFP", config.storeTypeStr) % "double, float", - clipp::option("--occ")([&config](const std::string& s) { config.occCli.set(s); }) % config.occCli.getDoc(), + 
clipp::option("--occ") & clipp::value("occ")([&config](const std::string& s) { config.occCli.set(s); }) % config.occCli.getDoc(), clipp::option("--transferMode")([&config](const std::string& s) { config.transferModeCli.set(s); }) % config.transferModeCli.getDoc(), clipp::option("--stencilSemantic")([&config](const std::string& s) { config.stencilSemanticCli.set(s); }) % config.stencilSemanticCli.getDoc(), clipp::option("--spaceCurve")([&config](const std::string& s) { config.spaceCurveCli.set(s); }) % config.spaceCurveCli.getDoc(), diff --git a/benchmarks/lbm/src/Config.h b/benchmarks/lbm/src/Config.h index fbd7dd3c..481f02dd 100644 --- a/benchmarks/lbm/src/Config.h +++ b/benchmarks/lbm/src/Config.h @@ -22,8 +22,6 @@ struct Config int N = 160; // Number of nodes in x-direction bool benchmark = false; // Run in benchmark mode ? double max_t = 10.0; // Non-benchmark mode: Total time in dim.less units - int outFrequency = 200; // Non-benchmark mode: Frequency in LU for output of terminal message and profiles (use 0 for no messages) - int dataFrequency = 0; // Non-benchmark mode: Frequency in LU of full data dump (use 0 for no data dump) int benchIniIter = 1000; // Benchmark mode: Number of warmup iterations int benchMaxIter = 2000; // Benchmark mode: Total number of iterations int repetitions = 1; // Benchmark mode: number of time the test is run diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h index ed51e013..e8e6c54e 100644 --- a/benchmarks/lbm/src/Lbm.h +++ b/benchmarks/lbm/src/Lbm.h @@ -199,7 +199,7 @@ struct Lbm tie(start, clock_iter) = metrics::restartClock(bk, true); for (time_iter = 0; time_iter < configurations.benchMaxIter; ++time_iter) { - if (true) { + if (time_iter % configurations.vti) { bk.syncAll(); helpExportVti(); } diff --git a/benchmarks/lbm/src/Report.cpp b/benchmarks/lbm/src/Report.cpp index 308eee6d..a7d0995f 100644 --- a/benchmarks/lbm/src/Report.cpp +++ b/benchmarks/lbm/src/Report.cpp @@ -12,8 +12,7 @@ Report::Report(const 
Config& c) mReport.addMember("N", c.N); mReport.addMember("benchmark", c.benchmark); mReport.addMember("max_t", c.max_t); - mReport.addMember("outFrequency", c.outFrequency); - mReport.addMember("dataFrequency", c.dataFrequency); + mReport.addMember("vtiFrequency", c.vti); mReport.addMember("repetitions", c.repetitions); mReport.addMember("vti", c.vti); From 04133927a0186276e74092d9d9c720acd252d704 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 1 Sep 2023 13:52:19 +0200 Subject: [PATCH 56/94] WIP: CLI refactoring. --- benchmarks/lbm/src/Config.cpp | 6 +++--- benchmarks/lbm/src/Lbm.h | 2 +- benchmarks/lbm/src/RunCavityTwoPop.cpp | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks/lbm/src/Config.cpp b/benchmarks/lbm/src/Config.cpp index 95775e2d..a667f44f 100644 --- a/benchmarks/lbm/src/Config.cpp +++ b/benchmarks/lbm/src/Config.cpp @@ -79,9 +79,9 @@ auto Config::parseArgs(const int argc, char* argv[]) clipp::option("--storageFP") & clipp::value("storageFP", config.storeTypeStr) % "double, float", clipp::option("--occ") & clipp::value("occ")([&config](const std::string& s) { config.occCli.set(s); }) % config.occCli.getDoc(), - clipp::option("--transferMode")([&config](const std::string& s) { config.transferModeCli.set(s); }) % config.transferModeCli.getDoc(), - clipp::option("--stencilSemantic")([&config](const std::string& s) { config.stencilSemanticCli.set(s); }) % config.stencilSemanticCli.getDoc(), - clipp::option("--spaceCurve")([&config](const std::string& s) { config.spaceCurveCli.set(s); }) % config.spaceCurveCli.getDoc(), + clipp::option("--transferMode")& clipp::value("transferMode")([&config](const std::string& s) { config.transferModeCli.set(s); }) % config.transferModeCli.getDoc(), + clipp::option("--stencilSemantic")& clipp::value("stencilSemantic")([&config](const std::string& s) { config.stencilSemanticCli.set(s); }) % config.stencilSemanticCli.getDoc(), + clipp::option("--spaceCurve")& 
clipp::value("spaceCurve")([&config](const std::string& s) { config.spaceCurveCli.set(s); }) % config.spaceCurveCli.getDoc(), clipp::option("--streamingMethod") & clipp::value("streamingMethod", config.streamingMethod) % Config::getOptionList(config.streamingMethodOption, config.streamingMethod), clipp::option("--lattice") & clipp::value("lattice", config.lattice) % Config::getOptionList(config.latticeOptions, config.lattice), diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h index e8e6c54e..9e2d2596 100644 --- a/benchmarks/lbm/src/Lbm.h +++ b/benchmarks/lbm/src/Lbm.h @@ -199,7 +199,7 @@ struct Lbm tie(start, clock_iter) = metrics::restartClock(bk, true); for (time_iter = 0; time_iter < configurations.benchMaxIter; ++time_iter) { - if (time_iter % configurations.vti) { + if ((time_iter % configurations.vti)==0) { bk.syncAll(); helpExportVti(); } diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cpp b/benchmarks/lbm/src/RunCavityTwoPop.cpp index 4b1f56eb..4d0f03a0 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cpp +++ b/benchmarks/lbm/src/RunCavityTwoPop.cpp @@ -26,7 +26,7 @@ auto run(Config& config, using Storage = Storage_; using Compute = Compute_; using Precision = Precision; - using Lattice = Lattice_;//D3Q27; + using Lattice = Lattice_; // D3Q27; // using PopulationField = typename Grid::template Field; // using PopField = typename Grid::template Field; @@ -93,11 +93,11 @@ auto runFilterLattice(Config& config, Report& report) -> void { using Precision = Precision; - if (config.lattice == "d3q19") { + if (config.lattice == "d3q19" || config.lattice == "D3Q19") { using Lattice = D3Q19; return runFilterMethod(config, report); } - if (config.lattice == "d3q27") { + if (config.lattice == "d3q27" || config.lattice == "D3Q27") { using Lattice = D3Q19; return runFilterMethod(config, report); } From 6ff1aa63a5b96ae0820a6867e85aacfbe8de0cc1 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 1 Sep 2023 23:38:51 +0200 Subject: [PATCH 57/94] WIP: 
KBC for D3Q27 --- benchmarks/lbm/src/Collision.cpp | 127 +++++ benchmarks/lbm/src/Collision.h | 43 ++ benchmarks/lbm/src/Config.cpp | 18 +- benchmarks/lbm/src/Config.h | 5 +- benchmarks/lbm/src/ContainersD3QXX.h | 669 +++++++++++++------------ benchmarks/lbm/src/D3Q27.h | 75 ++- benchmarks/lbm/src/DeviceD3QXX.h | 94 ++++ benchmarks/lbm/src/Lbm.h | 3 +- benchmarks/lbm/src/Report.cpp | 11 +- benchmarks/lbm/src/RunCavityTwoPop.cpp | 35 +- libNeonSkeleton/src/skeleton/Occ.cpp | 3 +- 11 files changed, 734 insertions(+), 349 deletions(-) create mode 100644 benchmarks/lbm/src/Collision.cpp create mode 100644 benchmarks/lbm/src/Collision.h diff --git a/benchmarks/lbm/src/Collision.cpp b/benchmarks/lbm/src/Collision.cpp new file mode 100644 index 00000000..3f7510cd --- /dev/null +++ b/benchmarks/lbm/src/Collision.cpp @@ -0,0 +1,127 @@ +#include "Collision.h" + + +auto CollisionUtils::toString(Collision occ) -> std::string +{ + switch (occ) { + case Collision::bgk: { + return "bgk"; + } + case Collision::kbc: { + return "kbc"; + } + } + NEON_THROW_UNSUPPORTED_OPTION(""); +} + +auto CollisionUtils::fromString(const std::string& occ) -> Collision +{ + std::array occs{Collision::bgk, Collision::kbc}; + for (auto a : occs) { + if (toString(a) == occ) { + return a; + } + } + NEON_THROW_UNSUPPORTED_OPTION(""); +} + +auto CollisionUtils::getOptions() -> std::array +{ + std::array opts = {Collision::bgk, Collision::kbc}; + return opts; +} + +CollisionUtils::Cli::Cli() +{ + mSet = false; +} + +CollisionUtils::Cli::Cli(std::string s) +{ + set(s); +} + +CollisionUtils::Cli::Cli(Collision model) +{ + mOption = model; + mSet = true; +} + +auto CollisionUtils::Cli::getOption() const -> Collision +{ + if (!mSet) { + std::stringstream errorMsg; + errorMsg << "Collision model was not set."; + NEON_ERROR(errorMsg.str()); + } + return mOption; +} + +auto CollisionUtils::Cli::getOptionStr() const -> std::string +{ + if (!mSet) { + std::stringstream errorMsg; + errorMsg << "Collision model was 
not set."; + NEON_ERROR(errorMsg.str()); + } + return CollisionUtils::toString(mOption); +} + +auto CollisionUtils::Cli::set(const std::string& opt) + -> void +{ + try { + mOption = CollisionUtils::fromString(opt); + } catch (...) { + std::stringstream errorMsg; + errorMsg << "Collision: " << opt << " is not a valid option (valid options are {"; + auto options = CollisionUtils::getOptions(); + int i = 0; + for (auto o : options) { + if (i != 0) { + errorMsg << ", " << CollisionUtils::toString(o); + } + errorMsg << CollisionUtils::toString(o); + i = 1; + } + errorMsg << "})"; + NEON_ERROR(errorMsg.str()); + } + mSet = true; +} + +auto CollisionUtils::Cli::getAllOptionsStr() const -> std::string +{ + std::stringstream s; + auto options = CollisionUtils::getOptions(); + int i = 0; + for (auto o : options) { + if (i != 0) { + s << ", "; + } + s << CollisionUtils::toString(o); + i = 1; + } + std::string msg = s.str(); + return msg; +} + + +auto CollisionUtils::Cli::getDoc() const -> std::string +{ + std::stringstream s; + s << getAllOptionsStr(); + s << " default: " << CollisionUtils::toString(getOption()); + return s.str(); +} + +auto CollisionUtils::Cli::addToReport(Neon::Report& report) const -> void +{ + report.addMember("Collision", CollisionUtils::toString(this->getOption())); +} + +auto CollisionUtils::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void +{ + report.addMember("Collision", CollisionUtils::toString(this->getOption()), &subBlock); +} + diff --git a/benchmarks/lbm/src/Collision.h b/benchmarks/lbm/src/Collision.h new file mode 100644 index 00000000..c022a018 --- /dev/null +++ b/benchmarks/lbm/src/Collision.h @@ -0,0 +1,43 @@ +#pragma once +#include "Neon/Report.h" +#include "Neon/set/Backend.h" +#include "Neon/set/Containter.h" + + +enum class Collision +{ + bgk, + kbc +}; + +struct CollisionUtils +{ + static constexpr int nOptions = 2; + + static auto toString(Collision occ) -> std::string; + static auto 
fromString(const std::string& occ) -> Collision; + static auto getOptions() -> std::array; + + struct Cli + { + explicit Cli(std::string); + explicit Cli(Collision model); + Cli(); + + auto getOption() const -> Collision; + auto getOptionStr() const -> std::string; + + auto set(const std::string& opt) -> void; + auto getAllOptionsStr() const -> std::string; + auto getDoc() const -> std::string; + + auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void; + auto addToReport(Neon::Report& report) const -> void; + + private: + bool mSet = false; + Collision mOption; + }; +}; + + diff --git a/benchmarks/lbm/src/Config.cpp b/benchmarks/lbm/src/Config.cpp index a667f44f..a2744860 100644 --- a/benchmarks/lbm/src/Config.cpp +++ b/benchmarks/lbm/src/Config.cpp @@ -21,7 +21,7 @@ auto Config::toString() const -> std::string return s.str(); }; - s << "Neon Runtime Parameters" << std::endl; + s << "\n==>[Neon Runtime Parameters]" << std::endl; s << ".......... deviceType " << c.deviceType << std::endl; s << ".......... numDevices " << c.devices.size() << std::endl; s << "............. devices " << vecToSting(c.devices) << std::endl; @@ -30,16 +30,17 @@ auto Config::toString() const -> std::string s << ".......... spaceCurve " << c.spaceCurveCli.getStringOptions() << std::endl; s << "................. occ " << c.occCli.getStringOptions() << std::endl; - s << "....... transferMode " << c.transferModeCli.getStringOptions() << std::endl; - s << ".... stencilSemantic " << c.stencilSemanticCli.getStringOptions() << std::endl; + s << "........ transferMode " << c.transferModeCli.getStringOptions() << std::endl; + s << "..... stencilSemantic " << c.stencilSemanticCli.getStringOptions() << std::endl; - s << "LBM Implementation" << std::endl; - s << "............ lattice " << c.lattice << std::endl; - s << "... streaming method " << c.streamingMethod << std::endl; + s << "\n==>[LBM Implementation]" << std::endl; + s << "............. 
lattice " << c.lattice << std::endl; + s << ".... streaming method " << c.streamingMethod << std::endl; + s << "........... collision " << c.collisionCli.getOptionStr() << std::endl; s << "......... computeType " << c.computeTypeStr << std::endl; s << "........... storeType " << c.storeTypeStr << std::endl; - s << "Physics Parameters" << std::endl; + s << "\n==>[Physics Parameters]" << std::endl; s << ".................. Re " << c.Re << std::endl; s << "................. ulb " << c.ulb << std::endl; s << "................... N " << c.N << std::endl; @@ -48,7 +49,7 @@ auto Config::toString() const -> std::string s << "................. dx " << mLbmParameters.dx << std::endl; s << "................. dt " << mLbmParameters.dt << std::endl; - s << "Test Parameters" << std::endl; + s << "\n==>[Test Parameters]" << std::endl; s << "........... benchmark " << c.benchmark << std::endl; s << "............... max_t " << c.max_t << std::endl; s << "................. vti " << c.vti << std::endl; @@ -82,6 +83,7 @@ auto Config::parseArgs(const int argc, char* argv[]) clipp::option("--transferMode")& clipp::value("transferMode")([&config](const std::string& s) { config.transferModeCli.set(s); }) % config.transferModeCli.getDoc(), clipp::option("--stencilSemantic")& clipp::value("stencilSemantic")([&config](const std::string& s) { config.stencilSemanticCli.set(s); }) % config.stencilSemanticCli.getDoc(), clipp::option("--spaceCurve")& clipp::value("spaceCurve")([&config](const std::string& s) { config.spaceCurveCli.set(s); }) % config.spaceCurveCli.getDoc(), + clipp::option("--collision")& clipp::value("collision")([&config](const std::string& s) { config.collisionCli.set(s); }) % config.collisionCli.getDoc(), clipp::option("--streamingMethod") & clipp::value("streamingMethod", config.streamingMethod) % Config::getOptionList(config.streamingMethodOption, config.streamingMethod), clipp::option("--lattice") & clipp::value("lattice", config.lattice) % 
Config::getOptionList(config.latticeOptions, config.lattice), diff --git a/benchmarks/lbm/src/Config.h b/benchmarks/lbm/src/Config.h index 481f02dd..adb5824d 100644 --- a/benchmarks/lbm/src/Config.h +++ b/benchmarks/lbm/src/Config.h @@ -2,6 +2,7 @@ #include #include +#include "Collision.h" #include "Neon/core/tools/clipp.h" #include "Neon/domain/tools/SpaceCurves.h" #include "Neon/skeleton/Skeleton.h" @@ -37,8 +38,8 @@ struct Config Neon::set::TransferModeUtils::Cli transferModeCli{Neon::set::TransferMode::get}; // Neon transfer mode for halo update Neon::set::StencilSemanticUtils::Cli stencilSemanticCli{Neon::set::StencilSemantic::streaming}; Neon::domain::tool::spaceCurves::EncoderTypeUtil::Cli spaceCurveCli{Neon::domain::tool::spaceCurves::EncoderType::sweep}; - - int vti = 0; // Export vti file + CollisionUtils::Cli collisionCli{Collision::bgk}; + int vti = 0; // Export vti file std::vector computeTypeOptions = {"double", "float"}; std::string computeTypeStr = computeTypeOptions[0]; diff --git a/benchmarks/lbm/src/ContainersD3QXX.h b/benchmarks/lbm/src/ContainersD3QXX.h index 194ed222..ee492bdd 100644 --- a/benchmarks/lbm/src/ContainersD3QXX.h +++ b/benchmarks/lbm/src/ContainersD3QXX.h @@ -12,7 +12,7 @@ /** * Specialization for D3Q19 */ -template +template struct ContainerFactoryD3QXX { using Lattice = Lattice_; @@ -30,350 +30,379 @@ struct ContainerFactoryD3QXX using PullFunctions = pull::DeviceD3Q19; using CommonFunctions = common::DeviceD3Q19; - using Device = DeviceD3QXX; -struct Pull{ - static auto - iteration(Neon::set::StencilSemantic stencilSemantic, - const PopField& fInField /*! Input population field */, - const CellTypeField& cellTypeField /*! Cell type field */, - const Compute omega /*! LBM omega parameter */, - PopField& fOutField /*! 
Output Population field */) - -> Neon::set::Container - { - Neon::set::Container container = fInField.getGrid().newContainer( - "D3Q19_TwoPop_Pull", - [&, omega](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, - Neon::Pattern::STENCIL, stencilSemantic); - auto& fOut = L.load(fOutField); - const auto& cellInfoPartition = L.load(cellTypeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - CellType cellInfo = cellInfoPartition(gidx, 0); - if (cellInfo.classification == CellType::bulk) { - - Storage popIn[Lattice::Q]; - PullFunctions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); - - Compute rho; - std::array u{.0, .0, .0}; - CommonFunctions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); - - Compute usqr = 1.5 * (u[0] * u[0] + - u[1] * u[1] + - u[2] * u[2]); - - PullFunctions::collideBgkUnrolled(gidx, - popIn, - rho, u, - usqr, omega, - NEON_OUT fOut); - } - }; - }); - return container; - } -}; -struct Push { - static auto - iteration(Neon::set::StencilSemantic stencilSemantic, - const PopField& fInField /*! Input population field */, - const CellTypeField& cellTypeField /*! Cell type field */, - const Compute omega /*! LBM omega parameter */, - PopField& fOutField /*! 
Output Population field */) - -> Neon::set::Container - { - Neon::set::Container container = fInField.getGrid().newContainer( - "D3Q19_TwoPop", - [=](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, - Neon::Pattern::STENCIL, stencilSemantic); - auto fOut = L.load(fOutField); - const auto& cellInfoPartition = L.load(cellTypeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - CellType cellInfo = cellInfoPartition(gidx, 0); - if (cellInfo.classification == CellType::bulk) { - - Storage popIn[Lattice::Q]; - Device::Common::localLoad(gidx, fIn, NEON_OUT popIn); - - Compute rho; - std::array u{.0, .0, .0}; - Device::Common::macroscopic(popIn, - NEON_OUT rho, NEON_OUT u); - - Compute usqr = 1.5 * (u[0] * u[0] + - u[1] * u[1] + - u[2] * u[2]); - - Device::Common::collideBgkUnrolled(rho, u, - usqr, omega, - NEON_IO popIn); - - Device::Push::pushStream(gidx, cellInfo.wallNghBitflag, popIn, NEON_OUT fOut); - } - }; - }); - return container; - } -}; -struct Common { - - template - static auto - iteration([[maybe_unused]] Neon::set::StencilSemantic stencilSemantic, - [[maybe_unused]] const PopField fInField /*! Input population field */, - [[maybe_unused]] const CellTypeField& cellTypeField /*! Cell type field */, - [[maybe_unused]] const Compute omega /*! LBM omega parameter */, - [[maybe_unused]] PopField fOutField /*! 
Output Population field */) - -> Neon::set::Container + using Device = DeviceD3QXX; + + + struct Pull { - if constexpr (method_ == lbm::Method::push) { - using Factory = push::ContainerFactory, - Grid_>; - return Factory::iteration(stencilSemantic, - fInField, - cellTypeField, - omega, - fOutField); - } - if constexpr (method_ == lbm::Method::pull) { - using Factory = pull::ContainerFactory, - Grid_>; - return Factory::iteration(stencilSemantic, - fInField, - cellTypeField, - omega, - fOutField); - } - NEON_DEV_UNDER_CONSTRUCTION(""); - } + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "D3Q19_TwoPop_Pull", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + [[maybe_unused]] const Compute beta = omega * 0.5; + [[maybe_unused]] const Compute invBeta = 1.0 / beta; + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { - static auto - computeWallNghMask(const CellTypeField& infoInField, - CellTypeField& infoOutpeField) + Storage popIn[Lattice::Q]; + PullFunctions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); - -> Neon::set::Container - { - Neon::set::Container container = infoInField.getGrid().newContainer( - "LBM_iteration", - [&](Neon::set::Loader& L) -> auto { - auto& infoIn = L.load(infoInField, Neon::Pattern::STENCIL); - auto& infoOut = L.load(infoOutpeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename 
PopField::Idx& gidx) mutable { - CellType cellType = infoIn(gidx, 0); - cellType.wallNghBitflag = 0; - - if (cellType.classification == CellType::bulk) { - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto fwdRegIdx) { - using M = typename Lattice::template RegisterMapper; - if constexpr (M::centerMemQ != M::fwdMemQ) { - CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); - if (nghCellType.classification == CellType::bounceBack || - nghCellType.classification == CellType::movingWall) { - cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << M::fwdMemQ)); - } - } - }); - infoOut(gidx, 0) = cellType; - } - }; - }); - return container; - } - - - template - static auto - userSettingBc(UserLambda userLambda, - PopField& pField, - CellTypeField& cellTypeField /*! Cell type field */) - -> Neon::set::Container - { - Neon::set::Container container = pField.getGrid().newContainer( - "UserSettingBc", - [&](Neon::set::Loader& L) -> auto { - auto& p = L.load(pField, Neon::Pattern::MAP); - auto& flag = L.load(cellTypeField, Neon::Pattern::MAP); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - const auto globalIdx = p.getGlobalIndex(gidx); - Storage pValues[Lattice::Q]; - CellType::Classification cellClass; - userLambda(globalIdx, pValues, cellClass); - - CellType flagVal(cellClass); - flag(gidx, 0) = flagVal; - - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - using M = typename Lattice::template RegisterMapper; - p(gidx, M::fwdMemQ) = pValues[M::fwdRegQ]; - }); - }; - }); - return container; - } - - static auto - copyPopulation(PopField& fInField, - PopField& foutField) - -> Neon::set::Container - { - Neon::set::Container container = fInField.getGrid().newContainer( - "LBM_iteration", - [&](Neon::set::Loader& L) -> auto { - auto const& pIn = L.load(fInField, Neon::Pattern::MAP); - auto& pOut = L.load(foutField, Neon::Pattern::MAP); - - return [=] NEON_CUDA_HOST_DEVICE(const typename 
PopField::Idx& gidx) mutable { - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - pOut(gidx, q) = pIn(gidx, q); - }); - }; - }); - return container; - } - - static auto - computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, - const CellTypeField& cellTypeField /*! Cell type field */, - Rho& rhoField /*! output Population field */, - U& uField /*! output Population field */) - - -> Neon::set::Container - { + Compute rho; + std::array u{.0, .0, .0}; + CommonFunctions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); - Neon::set::Container container = - fInField.getGrid().newContainer( - "LBM_iteration", - [&](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, - Neon::Pattern::STENCIL); - auto& rhoXpu = L.load(rhoField); - auto& uXpu = L.load(uField); + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + if constexpr (CollisionId == Collision::bgk) { + PullFunctions::collideBgkUnrolled(gidx, + popIn, + rho, u, + usqr, omega, + NEON_OUT fOut); + } + if constexpr (CollisionId == Collision::kbc) { + PullFunctions::collideBgkUnrolled(gidx, + popIn, + rho, u, + usqr, omega, invBeta, + NEON_OUT fOut); + } + } + }; + }); + return container; + } + }; + struct Push + { + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! 
Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM-iteration", + [=](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto fOut = L.load(fOutField); const auto& cellInfoPartition = L.load(cellTypeField); + [[maybe_unused]] const Compute beta = omega * 0.5; + [[maybe_unused]] const Compute invBeta = 1.0 / beta; + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - CellType cellInfo = cellInfoPartition(gidx, 0); - Compute rho = 0; - std::array u{.0, .0, .0}; + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { - Storage popIn[Lattice::Q]; - Device::Common::localLoad(gidx, fIn, NEON_OUT popIn); + Storage popIn[Lattice::Q]; + Device::Common::localLoad(gidx, fIn, NEON_OUT popIn); - if (cellInfo.classification == CellType::bulk) { - Device::Common::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); - } else { - if (cellInfo.classification == CellType::movingWall) { - rho = 1.0; - u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), - static_cast(popIn[1]) / static_cast(6. * 1. / 18.), - static_cast(popIn[2]) / static_cast(6. * 1. 
/ 18.)}; + Compute rho; + std::array u{.0, .0, .0}; + Device::Common::macroscopic(popIn, + NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + + if constexpr (CollisionId == Collision::bgk) { + Device::Common::collideBgkUnrolled(rho, u, + usqr, omega, + NEON_IO popIn); } - } + if constexpr (CollisionId == Collision::kbc) { + Device::Common::collideKBCUnrolled(rho, u, + usqr, omega, + invBeta, + NEON_IO popIn); + } + - rhoXpu(gidx, 0) = static_cast(rho); - uXpu(gidx, 0) = static_cast(u[0]); - uXpu(gidx, 1) = static_cast(u[1]); - uXpu(gidx, 2) = static_cast(u[2]); + Device::Push::pushStream(gidx, cellInfo.wallNghBitflag, popIn, NEON_OUT fOut); + } }; }); - return container; - } + return container; + } + }; + struct Common + { + + template + static auto + iteration([[maybe_unused]] Neon::set::StencilSemantic stencilSemantic, + [[maybe_unused]] const PopField fInField /*! Input population field */, + [[maybe_unused]] const CellTypeField& cellTypeField /*! Cell type field */, + [[maybe_unused]] const Compute omega /*! LBM omega parameter */, + [[maybe_unused]] PopField fOutField /*! Output Population field */) + -> Neon::set::Container + { + if constexpr (method_ == lbm::Method::push) { + using Factory = push::ContainerFactory, + Grid_>; + return Factory::iteration(stencilSemantic, + fInField, + cellTypeField, + omega, + fOutField); + } + if constexpr (method_ == lbm::Method::pull) { + using Factory = pull::ContainerFactory, + Grid_>; + return Factory::iteration(stencilSemantic, + fInField, + cellTypeField, + omega, + fOutField); + } + NEON_DEV_UNDER_CONSTRUCTION(""); + } - static auto - problemSetup(PopField& fInField /*! 
inpout population field */, - PopField& fOutField, - CellTypeField& cellTypeField, - Neon::double_3d ulid, - double ulb) - -> Neon::set::Container - { - Neon::set::Container container = fInField.getGrid().newContainer( - "LBM_iteration", - [&, ulid, ulb](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, Neon::Pattern::MAP); - auto& fOut = L.load(fOutField, Neon::Pattern::MAP); - auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - const auto globalIdx = fIn.getGlobalIndex(gidx); - const auto domainDim = fIn.getDomainSize(); - - CellType flagVal; - flagVal.classification = CellType::bulk; - flagVal.wallNghBitflag = 0; - - typename Lattice::Precision::Storage popVal = 0; - - if (globalIdx.x == 0 || globalIdx.x == domainDim.x - 1 || - globalIdx.y == 0 || globalIdx.y == domainDim.y - 1 || - globalIdx.z == 0 || globalIdx.z == domainDim.z - 1) { - flagVal.classification = CellType::bounceBack; - - if (globalIdx.y == domainDim.y - 1) { - flagVal.classification = CellType::movingWall; + static auto + computeWallNghMask(const CellTypeField& infoInField, + CellTypeField& infoOutpeField) + + -> Neon::set::Container + { + Neon::set::Container container = infoInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& infoIn = L.load(infoInField, Neon::Pattern::STENCIL); + auto& infoOut = L.load(infoOutpeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellType = infoIn(gidx, 0); + cellType.wallNghBitflag = 0; + + if (cellType.classification == CellType::bulk) { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto fwdRegIdx) { + using M = typename Lattice::template RegisterMapper; + if constexpr (M::centerMemQ != M::fwdMemQ) { + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); + if (nghCellType.classification == CellType::bounceBack || + 
nghCellType.classification == CellType::movingWall) { + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << M::fwdMemQ)); + } + } + }); + infoOut(gidx, 0) = cellType; } + }; + }); + return container; + } + + + template + static auto + userSettingBc(UserLambda userLambda, + PopField& pField, + CellTypeField& cellTypeField /*! Cell type field */) + -> Neon::set::Container + { + Neon::set::Container container = pField.getGrid().newContainer( + "UserSettingBc", + [&](Neon::set::Loader& L) -> auto { + auto& p = L.load(pField, Neon::Pattern::MAP); + auto& flag = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + const auto globalIdx = p.getGlobalIndex(gidx); + Storage pValues[Lattice::Q]; + CellType::Classification cellClass; + userLambda(globalIdx, pValues, cellClass); + + CellType flagVal(cellClass); + flag(gidx, 0) = flagVal; Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - if (globalIdx.y == domainDim.y - 1) { - popVal = -6. 
* Lattice::Memory::template getT() * ulb * - (Lattice::Memory::template getDirection().v[0] * ulid.v[0] + - Lattice::Memory::template getDirection().v[1] * ulid.v[1] + - Lattice::Memory::template getDirection().v[2] * ulid.v[2]); - } else { - popVal = 0; - } - fIn(gidx, q) = popVal; - fOut(gidx, q) = popVal; + using M = typename Lattice::template RegisterMapper; + p(gidx, M::fwdMemQ) = pValues[M::fwdRegQ]; }); - } else { - flagVal.classification = CellType::bulk; + }; + }); + return container; + } + + static auto + copyPopulation(PopField& fInField, + PopField& foutField) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto const& pIn = L.load(fInField, Neon::Pattern::MAP); + auto& pOut = L.load(foutField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - fIn(gidx, q) = Lattice::Memory::template getT(); - fOut(gidx, q) = Lattice::Memory::template getT(); + pOut(gidx, q) = pIn(gidx, q); }); - } - cellInfoPartition(gidx, 0) = flagVal; - }; - }); - return container; - } - - static auto - setToEquilibrium(PopField& fOutField, - CellTypeField& cellTypeField) - -> Neon::set::Container - { - Neon::set::Container container = fOutField.getGrid().newContainer( - "LBM_setToEquilibrium", - [&](Neon::set::Loader& L) -> auto { - auto& fOut = L.load(fOutField, Neon::Pattern::MAP); - auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - { // All pints are pre-set to bulk + }; + }); + return container; + } + + static auto + computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! 
output Population field */) + + -> Neon::set::Container + { + + Neon::set::Container container = + fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + Compute rho = 0; + std::array u{.0, .0, .0}; + + Storage popIn[Lattice::Q]; + Device::Common::localLoad(gidx, fIn, NEON_OUT popIn); + + if (cellInfo.classification == CellType::bulk) { + Device::Common::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + } else { + if (cellInfo.classification == CellType::movingWall) { + rho = 1.0; + u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), + static_cast(popIn[1]) / static_cast(6. * 1. / 18.), + static_cast(popIn[2]) / static_cast(6. * 1. / 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } + + static auto + problemSetup(PopField& fInField /*! 
inpout population field */, + PopField& fOutField, + CellTypeField& cellTypeField, + Neon::double_3d ulid, + double ulb) + + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&, ulid, ulb](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, Neon::Pattern::MAP); + auto& fOut = L.load(fOutField, Neon::Pattern::MAP); + auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + const auto globalIdx = fIn.getGlobalIndex(gidx); + const auto domainDim = fIn.getDomainSize(); + CellType flagVal; flagVal.classification = CellType::bulk; + flagVal.wallNghBitflag = 0; + + typename Lattice::Precision::Storage popVal = 0; + + if (globalIdx.x == 0 || globalIdx.x == domainDim.x - 1 || + globalIdx.y == 0 || globalIdx.y == domainDim.y - 1 || + globalIdx.z == 0 || globalIdx.z == domainDim.z - 1) { + flagVal.classification = CellType::bounceBack; + + if (globalIdx.y == domainDim.y - 1) { + flagVal.classification = CellType::movingWall; + } + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + if (globalIdx.y == domainDim.y - 1) { + popVal = -6. 
* Lattice::Memory::template getT() * ulb * + (Lattice::Memory::template getDirection().v[0] * ulid.v[0] + + Lattice::Memory::template getDirection().v[1] * ulid.v[1] + + Lattice::Memory::template getDirection().v[2] * ulid.v[2]); + } else { + popVal = 0; + } + fIn(gidx, q) = popVal; + fOut(gidx, q) = popVal; + }); + } else { + flagVal.classification = CellType::bulk; + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + fIn(gidx, q) = Lattice::Memory::template getT(); + fOut(gidx, q) = Lattice::Memory::template getT(); + }); + } cellInfoPartition(gidx, 0) = flagVal; - } + }; + }); + return container; + } - { // All cells are pre-set to Equilibrium - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - using M = typename Lattice::template RegisterMapper; - fOut(gidx, M::fwdMemQ) = Lattice::Registers::template getT(); - }); - } - }; - }); - return container; - } -}; + static auto + setToEquilibrium(PopField& fOutField, + CellTypeField& cellTypeField) + -> Neon::set::Container + { + Neon::set::Container container = fOutField.getGrid().newContainer( + "LBM_setToEquilibrium", + [&](Neon::set::Loader& L) -> auto { + auto& fOut = L.load(fOutField, Neon::Pattern::MAP); + auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + { // All pints are pre-set to bulk + CellType flagVal; + flagVal.classification = CellType::bulk; + cellInfoPartition(gidx, 0) = flagVal; + } + + { // All cells are pre-set to Equilibrium + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + fOut(gidx, M::fwdMemQ) = Lattice::Registers::template getT(); + }); + } + }; + }); + return container; + } + }; }; \ No newline at end of file diff --git a/benchmarks/lbm/src/D3Q27.h b/benchmarks/lbm/src/D3Q27.h index e0221fe2..09b08cbd 100644 --- a/benchmarks/lbm/src/D3Q27.h +++ b/benchmarks/lbm/src/D3Q27.h @@ -57,11 +57,13 @@ struct D3Q27 Neon::index_3d(1, 1, 
1), Neon::index_3d(1, 1, -1), Neon::index_3d(1, -1, 1), + Neon::index_3d(1, -1, -1)}; template - static inline NEON_CUDA_HOST_DEVICE auto - getComponentOfDirection() -> int{ + static constexpr inline NEON_CUDA_HOST_DEVICE auto + getComponentOfDirection() -> int + { return stencil[qIdx].v[cIdx]; } @@ -82,8 +84,7 @@ struct D3Q27 static constexpr std::array opposite{ 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 13, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 - }; + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; static constexpr std::array t{ 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., @@ -92,14 +93,21 @@ struct D3Q27 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 216., 1. / 216., 1. / 216., 1. / 216.}; + template + static inline NEON_CUDA_HOST_DEVICE auto + getWeightOfDirection() -> int + { + return t[qIdx]; + } + template - static constexpr auto getT() -> const typename Precision::Storage + static constexpr NEON_CUDA_HOST_DEVICE auto getT() -> const typename Precision::Storage { return t[q]; } template - static constexpr auto getDirection() -> const typename Neon::index_3d + static constexpr NEON_CUDA_HOST_DEVICE auto getDirection() -> const typename Neon::index_3d { return stencil[q]; } @@ -108,6 +116,56 @@ struct D3Q27 // Center is also removed static constexpr int firstHalfQLen = (Q - 1) / 2; static constexpr std::array firstHalfQList{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + + struct Moment + { + int v[6]; + Moment(int a0, int a1, int a2, int a3, int a4, int a5) + { + v[0] = a0; + v[1] = a1; + v[2] = a2; + v[3] = a3; + v[4] = a4; + v[5] = a5; + } + }; + + static constexpr std::array latticeMoment{ + {1, 0, 0, 0, 0, 0}, + {0, 0, 0, 1, 0, 0}, + {0, 0, 0, 0, 0, 1}, + {1, 1, 0, 1, 0, 0}, + {1, -1, 0, 1, 0, 0}, + {1, 0, 1, 0, 0, 1}, + {1, 0, -1, 0, 0, 1}, + {0, 0, 0, 1, 1, 1}, + {0, 0, 0, 1, -1, 1}, + {1, 1, 1, 1, 1, 1}, + {1, 1, -1, 1, -1, 1}, + {1, -1, 1, 1, -1, 1}, + 
{1, -1, -1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0}, + {1, 0, 0, 0, 0, 0}, + {0, 0, 0, 1, 0, 0}, + {0, 0, 0, 0, 0, 1}, + {1, 1, 0, 1, 0, 0}, + {1, -1, 0, 1, 0, 0}, + {1, 0, 1, 0, 0, 1}, + {1, 0, -1, 0, 0, 1}, + {0, 0, 0, 1, 1, 1}, + {0, 0, 0, 1, -1, 1}, + {1, 1, 1, 1, 1, 1}, + {1, 1, -1, 1, -1, 1}, + {1, -1, 1, 1, -1, 1}, + {1, -1, -1, 1, 1, 1}}; + + template + static constexpr inline NEON_CUDA_HOST_DEVICE auto + getMomentByDirection() -> int + { + return latticeMoment[qIdx].v[mIdx]; + } }; struct Memory @@ -143,7 +201,7 @@ struct D3Q27 Neon::index_3d(1, -1, -1)}; - static constexpr int center = 13; /** Position of direction {0,0,0} */ + static constexpr int center = 13; /** Position of direction {0,0,0} */ static constexpr std::array memoryToRegister{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}; @@ -180,8 +238,7 @@ struct D3Q27 static constexpr std::array opposite{ 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 13, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 - }; + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; template static constexpr auto helpGetValueforT() diff --git a/benchmarks/lbm/src/DeviceD3QXX.h b/benchmarks/lbm/src/DeviceD3QXX.h index a7d232bc..d3716f28 100644 --- a/benchmarks/lbm/src/DeviceD3QXX.h +++ b/benchmarks/lbm/src/DeviceD3QXX.h @@ -213,5 +213,99 @@ struct DeviceD3QXX fOut(gidx, M::fwdMemQ) = pOut[M::fwdRegQ]; }); } + + static inline NEON_CUDA_HOST_DEVICE auto + collideKBCUnrolled(Compute const& rho /*! Density */, + std::array const& u /*! Velocity */, + Compute const& usqr /*! Usqr */, + Compute const& omega /*! Omega */, + Compute const& invBeta /*! 
invBeta */, + NEON_IO Storage pop[Lattice::Q]) + + -> void + { + if constexpr (Lattice::Q == 27) { + constexpr Compute tiny = Compute(1e-7); + + Compute Pi[6] = {0, 0, 0, 0, 0, 0}; + Compute e0 = 0; + Compute e1 = 0; + Compute deltaS[Lattice::Q]; + Compute fneq[Lattice::Q]; + Compute feq[Lattice::Q]; + const Compute beta = omega * 0.5; + + auto fdecompose_shear = [&](const int q) -> Compute { + const Compute Nxz = Pi[0] - Pi[5]; + const Compute Nyz = Pi[3] - Pi[5]; + if (q == 9) { + return (2.0 * Nxz - Nyz) / 6.0; + } else if (q == 18) { + return (2.0 * Nxz - Nyz) / 6.0; + } else if (q == 3) { + return (-Nxz + 2.0 * Nyz) / 6.0; + } else if (q == 6) { + return (-Nxz + 2.0 * Nyz) / 6.0; + } else if (q == 1) { + return (-Nxz - Nyz) / 6.0; + } else if (q == 2) { + return (-Nxz - Nyz) / 6.0; + } else if (q == 12 || q == 24) { + return Pi[1] / 4.0; + } else if (q == 21 || q == 15) { + return -Pi[1] / 4.0; + } else if (q == 10 || q == 20) { + return Pi[2] / 4.0; + } else if (q == 19 || q == 11) { + return -Pi[2] / 4.0; + } else if (q == 8 || q == 4) { + return Pi[4] / 4.0; + } else if (q == 7 || q == 5) { + return -Pi[4] / 4.0; + } else { + return Compute(0); + } + }; + + // equilibrium + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + const Compute cu = Compute(3) * + (u[0] * Lattice::Registers::template getComponentOfDirection() + + u[1] * Lattice::Registers::template getComponentOfDirection() + + u[2] * Lattice::Registers::template getComponentOfDirection()); + + feq[q] = rho * Lattice::Registers::template getWeightOfDirection() * (1. 
+ cu + 0.5 * cu * cu - usqr); + + fneq[q] = pop[q] - feq[q]; + }); + + // momentum_flux + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + Neon::ConstexprFor<0, 6, 1>([&](auto i) { + Pi[i] += fneq[q] * Lattice::Registers::template getMomentByDirection(); + }); + }); + + // fdecompose_shear + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + deltaS[q] = rho * fdecompose_shear(q); + + Compute deltaH = fneq[q] - deltaS[q]; + + e0 += (deltaS[q] * deltaH / feq[q]); + e1 += (deltaH * deltaH / feq[q]); + }); + + // gamma + Compute gamma = invBeta - (2.0 - invBeta) * e0 / (tiny + e1); + + + // fout + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + Compute deltaH = fneq[q] - deltaS[q]; + pop[q] = pop[q] - beta * (2.0 * deltaS[q] + gamma * deltaH); + }); + } + } }; }; \ No newline at end of file diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h index 9e2d2596..31a9d08f 100644 --- a/benchmarks/lbm/src/Lbm.h +++ b/benchmarks/lbm/src/Lbm.h @@ -17,6 +17,7 @@ int backendWasReported = false; template struct Lbm @@ -31,7 +32,7 @@ struct Lbm using UField = typename Grid::template Field; //using CommonContainerFactory = common::ContainerFactory; - using ContainerFactory = ContainerFactoryD3QXX; + using ContainerFactory = ContainerFactoryD3QXX; template Lbm(Config& config, diff --git a/benchmarks/lbm/src/Report.cpp b/benchmarks/lbm/src/Report.cpp index a7d0995f..98b9980c 100644 --- a/benchmarks/lbm/src/Report.cpp +++ b/benchmarks/lbm/src/Report.cpp @@ -12,7 +12,6 @@ Report::Report(const Config& c) mReport.addMember("N", c.N); mReport.addMember("benchmark", c.benchmark); mReport.addMember("max_t", c.max_t); - mReport.addMember("vtiFrequency", c.vti); mReport.addMember("repetitions", c.repetitions); mReport.addMember("vti", c.vti); @@ -26,13 +25,19 @@ Report::Report(const Config& c) mReport.addMember("reportFile", c.reportFile); mReport.addMember("gridType", c.gridType); - mReport.addMember("computeTypeStr", c.computeTypeStr); - 
mReport.addMember("storeTypeStr", c.storeTypeStr); + c.occCli.addToReport(mReport); c.transferModeCli.addToReport(mReport); c.stencilSemanticCli.addToReport(mReport); c.spaceCurveCli.addToReport(mReport); + c.collisionCli.addToReport(mReport); + + mReport.addMember("computeTypeStr", c.computeTypeStr); + mReport.addMember("storeTypeStr", c.storeTypeStr); + mReport.addMember("streamingMethod", c.streamingMethod); + mReport.addMember("lattice", c.lattice); + mReport.addMember("nu", c.mLbmParameters.nu); mReport.addMember("omega", c.mLbmParameters.omega); diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cpp b/benchmarks/lbm/src/RunCavityTwoPop.cpp index 4d0f03a0..655fb6c2 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cpp +++ b/benchmarks/lbm/src/RunCavityTwoPop.cpp @@ -16,6 +16,7 @@ int backendWasReported = false; namespace details { template lbm(config, + Lbm lbm(config, report, [](Neon::index_3d const&) { return true; }); auto ulb = config.ulb; @@ -82,10 +83,34 @@ auto run(Config& config, } -template +template auto runFilterMethod(Config& config, Report& report) -> void { - return run(config, report); + if (config.streamingMethod == "push") { + return run(config, report); + } + if (config.streamingMethod == "pull") { + return run(config, report); + } + NEON_DEV_UNDER_CONSTRUCTION(""); +} + +template +auto runFilterCollision(Config& config, Report& report) -> void +{ + if (config.collisionCli.getOption() == Collision::bgk) { + + return runFilterMethod(config, report); + } + if (config.collisionCli.getOption() == Collision::kbc) { + if(config.lattice != "d3q27" && config.lattice != "D3Q27"){ + Neon::NeonException e("runFilterCollision"); + e << "LBM kbc collision model only supports d3q27 lattice"; + NEON_THROW(e); + } + return runFilterMethod(config, report); + } + NEON_DEV_UNDER_CONSTRUCTION(""); } template @@ -95,11 +120,11 @@ auto runFilterLattice(Config& config, Report& report) -> void if (config.lattice == "d3q19" || config.lattice == "D3Q19") { using Lattice = 
D3Q19; - return runFilterMethod(config, report); + return runFilterCollision(config, report); } if (config.lattice == "d3q27" || config.lattice == "D3Q27") { using Lattice = D3Q19; - return runFilterMethod(config, report); + return runFilterCollision(config, report); } NEON_DEV_UNDER_CONSTRUCTION(""); } diff --git a/libNeonSkeleton/src/skeleton/Occ.cpp b/libNeonSkeleton/src/skeleton/Occ.cpp index a3fddbe2..7690ce55 100644 --- a/libNeonSkeleton/src/skeleton/Occ.cpp +++ b/libNeonSkeleton/src/skeleton/Occ.cpp @@ -51,6 +51,7 @@ OccUtils::Cli::Cli(std::string s) OccUtils::Cli::Cli(Occ model) { mOption = model; + mSet = true; } auto OccUtils::Cli::getOption() const -> Occ @@ -106,7 +107,7 @@ auto OccUtils::Cli::getDoc() const -> std::string { std::stringstream s; s << getStringOptions(); - s << " default: " << getStringOptions(); + s << " default: " << OccUtils::toString(getOption()); return s.str(); } From d0667a38bba355d20a9e2ad76b002975816eb63a Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 4 Sep 2023 12:09:27 +0200 Subject: [PATCH 58/94] Pull method. 
--- benchmarks/lbm/src/ContainersD3QXX.h | 253 ++++++++++++++++++--------- benchmarks/lbm/src/DeviceD3QXX.h | 21 ++- benchmarks/lbm/src/Lbm.h | 85 +++++++-- 3 files changed, 256 insertions(+), 103 deletions(-) diff --git a/benchmarks/lbm/src/ContainersD3QXX.h b/benchmarks/lbm/src/ContainersD3QXX.h index ee492bdd..a9e0b92b 100644 --- a/benchmarks/lbm/src/ContainersD3QXX.h +++ b/benchmarks/lbm/src/ContainersD3QXX.h @@ -28,8 +28,8 @@ struct ContainerFactoryD3QXX using Rho = typename Grid::template Field; using U = typename Grid::template Field; - using PullFunctions = pull::DeviceD3Q19; - using CommonFunctions = common::DeviceD3Q19; + // using PullFunctions = pull::DeviceD3Q19; + // using CommonFunctions = common::DeviceD3Q19; using Device = DeviceD3QXX; @@ -58,35 +58,132 @@ struct ContainerFactoryD3QXX if (cellInfo.classification == CellType::bulk) { Storage popIn[Lattice::Q]; - PullFunctions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + Device::Pull::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); Compute rho; std::array u{.0, .0, .0}; - CommonFunctions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + Device::Common::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); Compute usqr = 1.5 * (u[0] * u[0] + u[1] * u[1] + u[2] * u[2]); if constexpr (CollisionId == Collision::bgk) { - PullFunctions::collideBgkUnrolled(gidx, - popIn, - rho, u, - usqr, omega, - NEON_OUT fOut); + Device::Common::collideBgkUnrolled(rho, u, + usqr, omega, + NEON_IO popIn); + } + if constexpr (CollisionId == Collision::kbc) { + Device::Common::collideKBCUnrolled(rho, u, + usqr, omega, + invBeta, + NEON_IO popIn); + } + Device::Common::localStore(gidx, popIn, fOut); + } + }; + }); + return container; + } + + static auto + collideForStep0(const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! 
Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "D3Q19_TwoPop_Pull", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + [[maybe_unused]] const Compute beta = omega * 0.5; + [[maybe_unused]] const Compute invBeta = 1.0 / beta; + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popIn[Lattice::Q]; + Device::Common::localLoad(gidx, fIn, NEON_OUT popIn); + + Compute rho; + std::array u{.0, .0, .0}; + Device::Common::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + if constexpr (CollisionId == Collision::bgk) { + Device::Common::collideBgkUnrolled(rho, u, + usqr, omega, + NEON_IO popIn); } if constexpr (CollisionId == Collision::kbc) { - PullFunctions::collideBgkUnrolled(gidx, - popIn, - rho, u, - usqr, omega, invBeta, - NEON_OUT fOut); + Device::Common::collideKBCUnrolled(rho, u, + usqr, omega, + invBeta, + NEON_IO popIn); } + Device::Common::localStore(gidx, popIn, fOut); } }; }); return container; } + + static auto + computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! 
output Population field */) + + -> Neon::set::Container + { + + Neon::set::Container container = + fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + Compute rho = 0; + std::array u{.0, .0, .0}; + + Storage popIn[Lattice::Q]; + + if (cellInfo.classification == CellType::bulk) { + Storage popIn[Lattice::Q]; + Device::Pull::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + Device::Common::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + } else { + Device::Common::localLoad(gidx, fIn, NEON_OUT popIn); + if (cellInfo.classification == CellType::movingWall) { + rho = 1.0; + u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), + static_cast(popIn[1]) / static_cast(6. * 1. / 18.), + static_cast(popIn[2]) / static_cast(6. * 1. / 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } }; struct Push { @@ -137,14 +234,60 @@ struct ContainerFactoryD3QXX invBeta, NEON_IO popIn); } - - Device::Push::pushStream(gidx, cellInfo.wallNghBitflag, popIn, NEON_OUT fOut); } }; }); return container; } + + static auto + computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! 
output Population field */) + + -> Neon::set::Container + { + + Neon::set::Container container = + fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + Compute rho = 0; + std::array u{.0, .0, .0}; + + Storage popIn[Lattice::Q]; + Device::Common::localLoad(gidx, fIn, NEON_OUT popIn); + + if (cellInfo.classification == CellType::bulk) { + Device::Common::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + } else { + if (cellInfo.classification == CellType::movingWall) { + rho = 1.0; + u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), + static_cast(popIn[1]) / static_cast(6. * 1. / 18.), + static_cast(popIn[2]) / static_cast(6. * 1. 
/ 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } }; struct Common { @@ -159,24 +302,24 @@ struct ContainerFactoryD3QXX -> Neon::set::Container { if constexpr (method_ == lbm::Method::push) { - using Factory = push::ContainerFactory, - Grid_>; - return Factory::iteration(stencilSemantic, - fInField, - cellTypeField, - omega, - fOutField); + using FactoryPush = push::ContainerFactory, + Grid_>; + return FactoryPush::iteration(stencilSemantic, + fInField, + cellTypeField, + omega, + fOutField); } if constexpr (method_ == lbm::Method::pull) { - using Factory = pull::ContainerFactory, - Grid_>; - return Factory::iteration(stencilSemantic, - fInField, - cellTypeField, - omega, - fOutField); + using FactoryPull = pull::ContainerFactory, + Grid_>; + return FactoryPull::iteration(stencilSemantic, + fInField, + cellTypeField, + omega, + fOutField); } NEON_DEV_UNDER_CONSTRUCTION(""); } @@ -268,53 +411,7 @@ struct ContainerFactoryD3QXX return container; } - static auto - computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, - const CellTypeField& cellTypeField /*! Cell type field */, - Rho& rhoField /*! output Population field */, - U& uField /*! 
output Population field */) - - -> Neon::set::Container - { - - Neon::set::Container container = - fInField.getGrid().newContainer( - "LBM_iteration", - [&](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, - Neon::Pattern::STENCIL); - auto& rhoXpu = L.load(rhoField); - auto& uXpu = L.load(uField); - - const auto& cellInfoPartition = L.load(cellTypeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - CellType cellInfo = cellInfoPartition(gidx, 0); - Compute rho = 0; - std::array u{.0, .0, .0}; - - Storage popIn[Lattice::Q]; - Device::Common::localLoad(gidx, fIn, NEON_OUT popIn); - - if (cellInfo.classification == CellType::bulk) { - Device::Common::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); - } else { - if (cellInfo.classification == CellType::movingWall) { - rho = 1.0; - u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), - static_cast(popIn[1]) / static_cast(6. * 1. / 18.), - static_cast(popIn[2]) / static_cast(6. * 1. / 18.)}; - } - } - rhoXpu(gidx, 0) = static_cast(rho); - uXpu(gidx, 0) = static_cast(u[0]); - uXpu(gidx, 1) = static_cast(u[1]); - uXpu(gidx, 2) = static_cast(u[2]); - }; - }); - return container; - } static auto problemSetup(PopField& fInField /*! 
inpout population field */, diff --git a/benchmarks/lbm/src/DeviceD3QXX.h b/benchmarks/lbm/src/DeviceD3QXX.h index d3716f28..bfcfda9c 100644 --- a/benchmarks/lbm/src/DeviceD3QXX.h +++ b/benchmarks/lbm/src/DeviceD3QXX.h @@ -29,16 +29,21 @@ struct DeviceD3QXX NEON_OUT Storage popIn[Lattice::Q]) { Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - using M = typename Lattice::template RegisterMapper; + using QPullingReference = typename Lattice::template RegisterMapper; - if constexpr (M::fwdMemQ == M::centerMemQ) { - popIn[M::centerRegQ] = fin(gidx, M::centerMemQ); + if constexpr (QPullingReference::fwdRegQ == QPullingReference::centerRegQ) { + popIn[QPullingReference::centerRegQ] = fin(gidx, QPullingReference::centerMemQ); } else { - if (CellType::isWall()) { - popIn[M::fwdRegQ] = fin(gidx, M::bkMemIdx) + - fin.template getNghData(gidx, M::bkwMemIdx)(); + if (CellType::isWall(wallBitFlag)) { + // The cell in the opposite direction of the pull is a wall + popIn[QPullingReference::fwdRegQ] = fin(gidx, QPullingReference::bkwRegQ) + + fin.template getNghData(gidx, QPullingReference::fwdMemQ)(); } else { - popIn[M::fwdRegQ] = fin.template getNghData(gidx, M::fwdMemIdx)(); + popIn[QPullingReference::fwdRegQ] = fin.template getNghData(gidx, QPullingReference::fwdMemQ)(); } } }); @@ -206,7 +211,7 @@ struct DeviceD3QXX static inline NEON_CUDA_HOST_DEVICE auto localStore(Idx const& gidx, Storage NEON_RESTRICT pOut[Lattice::Q], - NEON_IN typename PopField::Partition& fOut) + NEON_OUT typename PopField::Partition& fOut) { Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { using M = typename Lattice::template RegisterMapper; diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h index 31a9d08f..19633163 100644 --- a/benchmarks/lbm/src/Lbm.h +++ b/benchmarks/lbm/src/Lbm.h @@ -17,7 +17,7 @@ int backendWasReported = false; template struct Lbm @@ -31,13 +31,13 @@ struct Lbm using RhoField = typename Grid::template Field; using UField = typename Grid::template Field; - 
//using CommonContainerFactory = common::ContainerFactory; - using ContainerFactory = ContainerFactoryD3QXX; + // using CommonContainerFactory = common::ContainerFactory; + using ContainerFactory = ContainerFactoryD3QXX; template - Lbm(Config& config, - Report& report, - Lambda activeMask) + Lbm(Config& config, + Report& report, + Lambda activeMask) { configurations = config; reportPtr = &report; @@ -82,7 +82,7 @@ struct Lbm cellFlagField = grid.template newField("cellFlags", 1, defaultCelltype); // Allocating rho and u - if (config.vti) { + if (config.vti != 0) { std::cout << "Allocating rho and u" << std::endl; using Storage = typename Precision::Storage; rho = grid.template newField("rho", 1, Storage(0.0)); @@ -104,13 +104,13 @@ struct Lbm grid.getBackend().sync(Neon::Backend::mainStreamIdx); // Compute ngh mask ContainerFactory::Common::userSettingBc(bcSetFunction, - pFieldList[0], - cellFlagField) + pFieldList[0], + cellFlagField) .run(Neon::Backend::mainStreamIdx); for (int i = 1; i < int(pFieldList.size()); i++) { ContainerFactory::Common::copyPopulation(pFieldList[0], - pFieldList[i]) + pFieldList[i]) .run(Neon::Backend::mainStreamIdx); } cellFlagField.newHaloUpdate(Neon::set::StencilSemantic::standard, @@ -119,7 +119,7 @@ struct Lbm .run(Neon::Backend::mainStreamIdx); grid.getBackend().sync(Neon::Backend::mainStreamIdx); ContainerFactory::Common::computeWallNghMask(cellFlagField, - cellFlagField) + cellFlagField) .run(Neon::Backend::mainStreamIdx); } @@ -129,7 +129,46 @@ struct Lbm // One collide if 2Pop - pull // One iteration if 2Pop = push if constexpr (lbm::Method::pull == method) { - NEON_DEV_UNDER_CONSTRUCTION(""); + // For pull we set up the system in a way that it does one single collide as first operation + using Compute = typename Precision::Compute; + auto lbmParameters = configurations.template getLbmParameters(); + { + skeleton = std::vector(2); + for (int itr_ : {0, 1}) { + iteration = itr_; + int skIdx = helpGetSkeletonIdx(); + auto even = 
ContainerFactory::Pull::iteration( + configurations.stencilSemanticCli.getOption(), + pFieldList.at(helpGetInputIdx()), + cellFlagField, + lbmParameters.omega, + pFieldList.at(helpGetOutputIdx())); + + std::vector ops; + skeleton.at(skIdx) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); + Neon::skeleton::Options opt(configurations.occCli.getOption(), configurations.transferModeCli.getOption()); + ops.push_back(even); + std::stringstream appName; + + if (iteration % 2 == 0) + appName << "LBM_push_even"; + else + appName << "LBM_pull_even"; + + skeleton.at(skIdx).sequence(ops, appName.str(), opt); + } + } + { + // Let's compute 1 collide operation to prepare the input of the first iteration + iteration = 1; + ContainerFactory::Pull::collideForStep0(pFieldList.at(helpGetInputIdx()), + cellFlagField, + lbmParameters.omega, + pFieldList.at(helpGetOutputIdx())) + .run(Neon::Backend::mainStreamIdx); + pFieldList[0].getBackend().syncAll(); + iteration = 0; + } return; } if constexpr (lbm::Method::push == method) { @@ -200,7 +239,7 @@ struct Lbm tie(start, clock_iter) = metrics::restartClock(bk, true); for (time_iter = 0; time_iter < configurations.benchMaxIter; ++time_iter) { - if ((time_iter % configurations.vti)==0) { + if ((time_iter % configurations.vti) == 0) { bk.syncAll(); helpExportVti(); } @@ -242,18 +281,30 @@ struct Lbm { grid.getBackend().syncAll(); auto& pop = pFieldList.at(helpGetOutputIdx()); - auto computeRhoAndU = ContainerFactory::Common::computeRhoAndU(pop, cellFlagField, rho, u); - computeRhoAndU.run(Neon::Backend::mainStreamIdx); + bool done = false; + if constexpr (method == lbm::Method::push) { + auto computeRhoAndU = ContainerFactory::Push::computeRhoAndU(pop, cellFlagField, rho, u); + computeRhoAndU.run(Neon::Backend::mainStreamIdx); + done= true; + } + if constexpr (method == lbm::Method::pull) { + auto computeRhoAndU = ContainerFactory::Pull::computeRhoAndU(pop, cellFlagField, rho, u); + computeRhoAndU.run(Neon::Backend::mainStreamIdx); + 
done= true; + } + if(!done){ + NEON_DEV_UNDER_CONSTRUCTION("helpExportVti"); + } u.updateHostData(Neon::Backend::mainStreamIdx); rho.updateHostData(Neon::Backend::mainStreamIdx); - //pop.updateHostData(Neon::Backend::mainStreamIdx); + // pop.updateHostData(Neon::Backend::mainStreamIdx); grid.getBackend().sync(Neon::Backend::mainStreamIdx); size_t numDigits = 5; std::string iterIdStr = std::to_string(iteration); iterIdStr = std::string(numDigits - std::min(numDigits, iterIdStr.length()), '0') + iterIdStr; - //pop.ioToVtk("pop_" + iterIdStr, "pop", false); + // pop.ioToVtk("pop_" + iterIdStr, "pop", false); u.ioToVtk("u_" + iterIdStr, "u", false); rho.ioToVtk("rho_" + iterIdStr, "rho", false); cellFlagField.template ioToVtk("cellFlagField_" + iterIdStr, "flag", false); From 0572e66ecde6b6d3eeaef5581c2cc071caa408df Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 4 Sep 2023 15:57:35 +0200 Subject: [PATCH 59/94] WIP: kbc --- benchmarks/lbm/src/D3Q27.h | 152 ++++++++++++++----------- benchmarks/lbm/src/DeviceD3QXX.h | 36 +++--- benchmarks/lbm/src/RunCavityTwoPop.cpp | 8 +- 3 files changed, 112 insertions(+), 84 deletions(-) diff --git a/benchmarks/lbm/src/D3Q27.h b/benchmarks/lbm/src/D3Q27.h index 09b08cbd..3830ecda 100644 --- a/benchmarks/lbm/src/D3Q27.h +++ b/benchmarks/lbm/src/D3Q27.h @@ -31,34 +31,33 @@ struct D3Q27 { using Self = D3Q27::Registers; static constexpr std::array stencil{ - Neon::index_3d(-1, 0, 0), - Neon::index_3d(0, -1, 0), - Neon::index_3d(0, 0, -1), - Neon::index_3d(-1, -1, 0), - Neon::index_3d(-1, 1, 0), - Neon::index_3d(-1, 0, -1), - Neon::index_3d(-1, 0, 1), - Neon::index_3d(0, -1, -1), - Neon::index_3d(0, -1, 1), - Neon::index_3d(-1, -1, -1), - Neon::index_3d(-1, -1, 1), - Neon::index_3d(-1, 1, -1), - Neon::index_3d(-1, 1, 1), - Neon::index_3d(0, 0, 0), - Neon::index_3d(1, 0, 0), - Neon::index_3d(0, 1, 0), - Neon::index_3d(0, 0, 1), - Neon::index_3d(1, 1, 0), - Neon::index_3d(1, -1, 0), - Neon::index_3d(1, 0, 1), - 
Neon::index_3d(1, 0, -1), - Neon::index_3d(0, 1, 1), - Neon::index_3d(0, 1, -1), - Neon::index_3d(1, 1, 1), - Neon::index_3d(1, 1, -1), - Neon::index_3d(1, -1, 1), - - Neon::index_3d(1, -1, -1)}; + /* 00 */ Neon::index_3d(-1, 0, 0), + /* 01 */ Neon::index_3d(0, -1, 0), + /* 02 */ Neon::index_3d(0, 0, -1), + /* 03 */ Neon::index_3d(-1, -1, 0), + /* 04 */ Neon::index_3d(-1, 1, 0), + /* 05 */ Neon::index_3d(-1, 0, -1), + /* 06 */ Neon::index_3d(-1, 0, 1), + /* 07 */ Neon::index_3d(0, -1, -1), + /* 08 */ Neon::index_3d(0, -1, 1), + /* 09 */ Neon::index_3d(-1, -1, -1), + /* 00 */ Neon::index_3d(-1, -1, 1), + /* 11 */ Neon::index_3d(-1, 1, -1), + /* 12 */ Neon::index_3d(-1, 1, 1), + /* 13 */ Neon::index_3d(0, 0, 0), + /* 14 */ Neon::index_3d(1, 0, 0), + /* 15 */ Neon::index_3d(0, 1, 0), + /* 16 */ Neon::index_3d(0, 0, 1), + /* 17 */ Neon::index_3d(1, 1, 0), + /* 18 */ Neon::index_3d(1, -1, 0), + /* 19 */ Neon::index_3d(1, 0, 1), + /* 20 */ Neon::index_3d(1, 0, -1), + /* 21 */ Neon::index_3d(0, 1, 1), + /* 22 */ Neon::index_3d(0, 1, -1), + /* 23 */ Neon::index_3d(1, 1, 1), + /* 24 */ Neon::index_3d(1, 1, -1), + /* 25 */ Neon::index_3d(1, -1, 1), + /* 26 */ Neon::index_3d(1, -1, -1)}; template static constexpr inline NEON_CUDA_HOST_DEVICE auto @@ -86,16 +85,38 @@ struct D3Q27 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - static constexpr std::array t{ - 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., - 1. / 216., 1. / 216., 1. / 216., 1. / 216., - 8. / 27., - 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., - 1. / 216., 1. / 216., 1. / 216., 1. / 216.}; + static constexpr std::array t{ + /* 00 */ 2. / 27., + /* 01 */ 2. / 27., + /* 02 */ 2. / 27., + /* 03 */ 1. / 54., + /* 04 */ 1. / 54., + /* 05 */ 1. / 54., + /* 06 */ 1. / 54., + /* 07 */ 1. / 54., + /* 08 */ 1. / 54., + /* 09 */ 1. / 216., + /* 00 */ 1. / 216., + /* 11 */ 1. / 216., + /* 12 */ 1. / 216., + /* 13 */ 8. 
/ 27., + /* 14 */ 2. / 27., + /* 15 */ 2. / 27., + /* 16 */ 2. / 27., + /* 17 */ 1. / 54., + /* 18 */ 1. / 54., + /* 19 */ 1. / 54., + /* 20 */ 1. / 54., + /* 21 */ 1. / 54., + /* 22 */ 1. / 54., + /* 23 */ 1. / 216., + /* 24 */ 1. / 216., + /* 25 */ 1. / 216., + /* 26 */ 1. / 216.}; template static inline NEON_CUDA_HOST_DEVICE auto - getWeightOfDirection() -> int + getWeightOfDirection() -> const typename Precision::Compute { return t[qIdx]; } @@ -119,8 +140,8 @@ struct D3Q27 struct Moment { - int v[6]; - Moment(int a0, int a1, int a2, int a3, int a4, int a5) + std::array v{0, 0, 0, 0, 0, 0}; + inline constexpr Moment(int a0, int a1, int a2, int a3, int a4, int a5) { v[0] = a0; v[1] = a1; @@ -132,37 +153,38 @@ struct D3Q27 }; static constexpr std::array latticeMoment{ - {1, 0, 0, 0, 0, 0}, - {0, 0, 0, 1, 0, 0}, - {0, 0, 0, 0, 0, 1}, - {1, 1, 0, 1, 0, 0}, - {1, -1, 0, 1, 0, 0}, - {1, 0, 1, 0, 0, 1}, - {1, 0, -1, 0, 0, 1}, - {0, 0, 0, 1, 1, 1}, - {0, 0, 0, 1, -1, 1}, - {1, 1, 1, 1, 1, 1}, - {1, 1, -1, 1, -1, 1}, - {1, -1, 1, 1, -1, 1}, - {1, -1, -1, 1, 1, 1}, - {0, 0, 0, 0, 0, 0}, - {1, 0, 0, 0, 0, 0}, - {0, 0, 0, 1, 0, 0}, - {0, 0, 0, 0, 0, 1}, - {1, 1, 0, 1, 0, 0}, - {1, -1, 0, 1, 0, 0}, - {1, 0, 1, 0, 0, 1}, - {1, 0, -1, 0, 0, 1}, - {0, 0, 0, 1, 1, 1}, - {0, 0, 0, 1, -1, 1}, - {1, 1, 1, 1, 1, 1}, - {1, 1, -1, 1, -1, 1}, - {1, -1, 1, 1, -1, 1}, - {1, -1, -1, 1, 1, 1}}; + Moment(1, 0, 0, 0, 0, 0), + Moment(0, 0, 0, 1, 0, 0), + Moment(0, 0, 0, 0, 0, 1), + Moment(1, 1, 0, 1, 0, 0), + Moment(1, -1, 0, 1, 0, 0), + Moment(1, 0, 1, 0, 0, 1), + Moment(1, 0, -1, 0, 0, 1), + Moment(0, 0, 0, 1, 1, 1), + Moment(0, 0, 0, 1, -1, 1), + Moment(1, 1, 1, 1, 1, 1), + Moment(1, 1, -1, 1, -1, 1), + Moment(1, -1, 1, 1, -1, 1), + Moment(1, -1, -1, 1, 1, 1), + Moment(0, 0, 0, 0, 0, 0), + Moment(1, 0, 0, 0, 0, 0), + Moment(0, 0, 0, 1, 0, 0), + Moment(0, 0, 0, 0, 0, 1), + Moment(1, 1, 0, 1, 0, 0), + Moment(1, -1, 0, 1, 0, 0), + Moment(1, 0, 1, 0, 0, 1), + Moment(1, 0, -1, 0, 0, 1), + 
Moment(0, 0, 0, 1, 1, 1), + Moment(0, 0, 0, 1, -1, 1), + Moment(1, 1, 1, 1, 1, 1), + Moment(1, 1, -1, 1, -1, 1), + Moment(1, -1, 1, 1, -1, 1), + Moment(1, -1, -1, 1, 1, 1)}; template static constexpr inline NEON_CUDA_HOST_DEVICE auto - getMomentByDirection() -> int + getMomentByDirection() + -> int { return latticeMoment[qIdx].v[mIdx]; } diff --git a/benchmarks/lbm/src/DeviceD3QXX.h b/benchmarks/lbm/src/DeviceD3QXX.h index bfcfda9c..73a2b4c1 100644 --- a/benchmarks/lbm/src/DeviceD3QXX.h +++ b/benchmarks/lbm/src/DeviceD3QXX.h @@ -3,7 +3,6 @@ #include "D3Q19.h" #include "Neon/Neon.h" #include "Neon/set/Containter.h" - template struct DeviceD3QXX { @@ -209,8 +208,8 @@ struct DeviceD3QXX } static inline NEON_CUDA_HOST_DEVICE auto - localStore(Idx const& gidx, - Storage NEON_RESTRICT pOut[Lattice::Q], + localStore(Idx const& gidx, + Storage NEON_RESTRICT pOut[Lattice::Q], NEON_OUT typename PopField::Partition& fOut) { Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { @@ -243,29 +242,29 @@ struct DeviceD3QXX auto fdecompose_shear = [&](const int q) -> Compute { const Compute Nxz = Pi[0] - Pi[5]; const Compute Nyz = Pi[3] - Pi[5]; - if (q == 9) { + if (q == 0 /* -1, 0, 0 */) { return (2.0 * Nxz - Nyz) / 6.0; - } else if (q == 18) { + } else if (q == 20 /* 1, 0, -1 */) { return (2.0 * Nxz - Nyz) / 6.0; - } else if (q == 3) { + }else if (q == 1 /* 0, -1, 0 */) { return (-Nxz + 2.0 * Nyz) / 6.0; - } else if (q == 6) { + } else if (q == 15 /* 0, 1, 0 */) { return (-Nxz + 2.0 * Nyz) / 6.0; - } else if (q == 1) { + } else if (q == 2 /* 0, 0, -1 */) { return (-Nxz - Nyz) / 6.0; - } else if (q == 2) { + } else if (q == 16 /* 0, 0, 1 */) { return (-Nxz - Nyz) / 6.0; - } else if (q == 12 || q == 24) { + } else if (q == 3 /* -1, -1, 0 */ || q == 17 /* 1, 1, 0 */) { return Pi[1] / 4.0; - } else if (q == 21 || q == 15) { + } else if (q == 18 /* 1, -1, 0 */ || q == 4 /* -1, 1, 0 */) { return -Pi[1] / 4.0; - } else if (q == 10 || q == 20) { + } else if (q == 5 /* -1, 0, -1 */ || q == 
19 /* 1, 0, 1 */) { return Pi[2] / 4.0; - } else if (q == 19 || q == 11) { + } else if (q == 20 /* 1, 0, -1 */ || q == 6 /* -1, 0, 1 */) { return -Pi[2] / 4.0; - } else if (q == 8 || q == 4) { + } else if (q == 21 /* 0, 1, 1 */ || q == 7 /* 0, -1, -1 */) { return Pi[4] / 4.0; - } else if (q == 7 || q == 5) { + } else if (q == 22 /* 0, 1, -1 */ || q == 8 /* 0, -1, 1 */) { return -Pi[4] / 4.0; } else { return Compute(0); @@ -279,8 +278,8 @@ struct DeviceD3QXX u[1] * Lattice::Registers::template getComponentOfDirection() + u[2] * Lattice::Registers::template getComponentOfDirection()); - feq[q] = rho * Lattice::Registers::template getWeightOfDirection() * (1. + cu + 0.5 * cu * cu - usqr); + feq[q] = rho * Lattice::Registers::template getWeightOfDirection() * (1. + cu + 0.5 * cu * cu - usqr); fneq[q] = pop[q] - feq[q]; }); @@ -309,7 +308,12 @@ struct DeviceD3QXX Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { Compute deltaH = fneq[q] - deltaS[q]; pop[q] = pop[q] - beta * (2.0 * deltaS[q] + gamma * deltaH); + if (pop[q] != pop[q]) { + printf("ERROR %d \n", Lattice::Q); + } }); + } else { + printf("ERROR %d \n", Lattice::Q); } } }; diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cpp b/benchmarks/lbm/src/RunCavityTwoPop.cpp index 655fb6c2..974012cf 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cpp +++ b/benchmarks/lbm/src/RunCavityTwoPop.cpp @@ -13,7 +13,8 @@ namespace CavityTwoPop { int backendWasReported = false; - +// #include +#include "/usr/include/fenv.h" namespace details { template auto runFilterMethod(Config& config, Report& report) -> void { + feenableexcept(FE_ALL_EXCEPT & ~FE_INEXACT); // Enable all floating point exceptions but FE_INEXACT if (config.streamingMethod == "push") { return run(config, report); } @@ -123,10 +125,10 @@ auto runFilterLattice(Config& config, Report& report) -> void return runFilterCollision(config, report); } if (config.lattice == "d3q27" || config.lattice == "D3Q27") { - using Lattice = D3Q19; + using Lattice = D3Q27; return 
runFilterCollision(config, report); } - NEON_DEV_UNDER_CONSTRUCTION(""); + NEON_DEV_UNDER_CONSTRUCTION("Lattice type not supported. Available options: D3Q19 and D3Q27"); } From 7c537e8e511b43845efa2c770cb1e8a402aacd82 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 4 Sep 2023 18:42:08 +0200 Subject: [PATCH 60/94] Fix for kbc --- benchmarks/lbm/src/DeviceD3QXX.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/benchmarks/lbm/src/DeviceD3QXX.h b/benchmarks/lbm/src/DeviceD3QXX.h index 73a2b4c1..3327f2dc 100644 --- a/benchmarks/lbm/src/DeviceD3QXX.h +++ b/benchmarks/lbm/src/DeviceD3QXX.h @@ -244,7 +244,7 @@ struct DeviceD3QXX const Compute Nyz = Pi[3] - Pi[5]; if (q == 0 /* -1, 0, 0 */) { return (2.0 * Nxz - Nyz) / 6.0; - } else if (q == 20 /* 1, 0, -1 */) { + } else if (q == 14 /* 1, 0, -1 */) { return (2.0 * Nxz - Nyz) / 6.0; }else if (q == 1 /* 0, -1, 0 */) { return (-Nxz + 2.0 * Nyz) / 6.0; @@ -308,9 +308,6 @@ struct DeviceD3QXX Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { Compute deltaH = fneq[q] - deltaS[q]; pop[q] = pop[q] - beta * (2.0 * deltaS[q] + gamma * deltaH); - if (pop[q] != pop[q]) { - printf("ERROR %d \n", Lattice::Q); - } }); } else { printf("ERROR %d \n", Lattice::Q); From d363a04b7f2df64592e2d667482fead1be6383f0 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 4 Sep 2023 21:21:16 +0200 Subject: [PATCH 61/94] WIP: AA --- benchmarks/lbm/src/ContainersD3QXX.h | 138 ++++++++++++++++++++++++++- benchmarks/lbm/src/Lbm.h | 11 ++- 2 files changed, 139 insertions(+), 10 deletions(-) diff --git a/benchmarks/lbm/src/ContainersD3QXX.h b/benchmarks/lbm/src/ContainersD3QXX.h index a9e0b92b..429f6a33 100644 --- a/benchmarks/lbm/src/ContainersD3QXX.h +++ b/benchmarks/lbm/src/ContainersD3QXX.h @@ -32,6 +32,135 @@ struct ContainerFactoryD3QXX // using CommonFunctions = common::DeviceD3Q19; using Device = DeviceD3QXX; + struct AA + { + struct Even + { + + static auto + iteration(const CellTypeField& 
cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + NEON_IO PopField& fpopField /*! Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fpopField.getGrid().newContainer( + "D3Q19_TwoPop_Pull", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fpop = L.load(fpopField); + const auto& cellInfoPartition = L.load(cellTypeField); + [[maybe_unused]] const Compute beta = omega * 0.5; + [[maybe_unused]] const Compute invBeta = 1.0 / beta; + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popIn[Lattice::Q]; + Device::Common::localLoad(gidx, fpop, NEON_OUT popIn); + + Compute rho; + std::array u{.0, .0, .0}; + Device::Common::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + if constexpr (CollisionId == Collision::bgk) { + Device::Common::collideBgkUnrolled(rho, u, + usqr, omega, + NEON_IO popIn); + } + if constexpr (CollisionId == Collision::kbc) { + Device::Common::collideKBCUnrolled(rho, u, + usqr, omega, + invBeta, + NEON_IO popIn); + } + Device::Common::localStore(gidx, popIn, fpop); + } + }; + }); + return container; + } + + static auto + computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! output Population field */) + + -> Neon::set::Container + { + return Pull::computeRhoAndU(fInField, cellTypeField, rhoField, uField); + } + }; + struct Odd + { + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + NEON_IO PopField& fPopField /*! 
Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fPopField.getGrid().newContainer( + "LBM-iteration", + [=](Neon::set::Loader& L) -> auto { + auto& fPop = L.load(fPopField, + Neon::Pattern::STENCIL, stencilSemantic); + const auto& cellInfoPartition = L.load(cellTypeField); + + [[maybe_unused]] const Compute beta = omega * 0.5; + [[maybe_unused]] const Compute invBeta = 1.0 / beta; + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popIn[Lattice::Q]; + Device::Pull::pullStream(gidx, cellInfo.wallNghBitflag, fPop, NEON_OUT popIn); + + Compute rho; + std::array u{.0, .0, .0}; + Device::Common::macroscopic(popIn, + NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + + if constexpr (CollisionId == Collision::bgk) { + Device::Common::collideBgkUnrolled(rho, u, + usqr, omega, + NEON_IO popIn); + } + if constexpr (CollisionId == Collision::kbc) { + Device::Common::collideKBCUnrolled(rho, u, + usqr, omega, + invBeta, + NEON_IO popIn); + } + Device::Push::pushStream(gidx, cellInfo.wallNghBitflag, popIn, NEON_OUT fPop); + } + }; + }); + return container; + } + + static auto + computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! output Population field */) + + -> Neon::set::Container + { + return Push::computeRhoAndU(fInField, cellTypeField, rhoField, uField); + } + }; + }; struct Pull { @@ -87,10 +216,10 @@ struct ContainerFactoryD3QXX } static auto - collideForStep0(const PopField& fInField /*! Input population field */, - const CellTypeField& cellTypeField /*! Cell type field */, - const Compute omega /*! LBM omega parameter */, - PopField& fOutField /*! 
Output Population field */) + localCollide(const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! Output Population field */) -> Neon::set::Container { Neon::set::Container container = fInField.getGrid().newContainer( @@ -412,7 +541,6 @@ struct ContainerFactoryD3QXX } - static auto problemSetup(PopField& fInField /*! inpout population field */, PopField& fOutField, diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h index 19633163..361040f7 100644 --- a/benchmarks/lbm/src/Lbm.h +++ b/benchmarks/lbm/src/Lbm.h @@ -161,10 +161,10 @@ struct Lbm { // Let's compute 1 collide operation to prepare the input of the first iteration iteration = 1; - ContainerFactory::Pull::collideForStep0(pFieldList.at(helpGetInputIdx()), - cellFlagField, - lbmParameters.omega, - pFieldList.at(helpGetOutputIdx())) + ContainerFactory::Pull::localCollide(pFieldList.at(helpGetInputIdx()), + cellFlagField, + lbmParameters.omega, + pFieldList.at(helpGetOutputIdx())) .run(Neon::Backend::mainStreamIdx); pFieldList[0].getBackend().syncAll(); iteration = 0; @@ -221,9 +221,10 @@ struct Lbm return; } if constexpr (lbm::Method::aa == method) { - NEON_DEV_UNDER_CONSTRUCTION(""); return; } + NEON_DEV_UNDER_CONSTRUCTION(""); + } auto iterate() -> void From 64d3d3018ef1fc7cf80326ffedb467683e745e51 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 5 Sep 2023 14:57:24 +0200 Subject: [PATCH 62/94] WIP --- benchmarks/lbm/src/Lbm.h | 59 ++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h index 361040f7..03bbb5c4 100644 --- a/benchmarks/lbm/src/Lbm.h +++ b/benchmarks/lbm/src/Lbm.h @@ -175,8 +175,8 @@ struct Lbm using Compute = typename Precision::Compute; auto lbmParameters = configurations.template getLbmParameters(); skeleton = std::vector(2); - { 
- iteration = 0; + for (int itr_ : {0, 1}) { + iteration = itr_; int skIdx = helpGetSkeletonIdx(); auto even = ContainerFactory::Push::iteration( configurations.stencilSemanticCli.getOption(), @@ -190,25 +190,46 @@ struct Lbm Neon::skeleton::Options opt(configurations.occCli.getOption(), configurations.transferModeCli.getOption()); ops.push_back(even); std::stringstream appName; - appName << "LBM_push_even"; + if (iteration % 2 == 0) + appName << "LBM_push_even"; + else + appName << "LBM_pull_even"; skeleton.at(skIdx).sequence(ops, appName.str(), opt); } + { iteration = 1; - int skIdx = helpGetSkeletonIdx(); - auto odd = ContainerFactory::Push::iteration( - configurations.stencilSemanticCli.getOption(), - pFieldList.at(helpGetInputIdx()), - cellFlagField, - lbmParameters.omega, - pFieldList.at(helpGetOutputIdx())); - + int skIdx = helpGetSkeletonIdx(); + skeleton.at(skIdx).run(); + iteration = 0; + } + return; + } + if constexpr (lbm::Method::aa == method) { + using Compute = typename Precision::Compute; + auto lbmParameters = configurations.template getLbmParameters(); + skeleton = std::vector(2); + for (int itr_ : {0, 1}) { + iteration = itr_; + int skIdx = helpGetSkeletonIdx(); + Neon::set::Container lbmIteration; + if ((iteration + 2) % 2 == 0) { + auto even = ContainerFactory::AA::Odd::iteration( + configurations.stencilSemanticCli.getOption(), + pFieldList.at(helpGetInputIdx()), + cellFlagField, + lbmParameters.omega, + pFieldList.at(helpGetOutputIdx())); + } std::vector ops; skeleton.at(skIdx) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); Neon::skeleton::Options opt(configurations.occCli.getOption(), configurations.transferModeCli.getOption()); - ops.push_back(odd); + ops.push_back(even); std::stringstream appName; - appName << "LBM_push_odd"; + if (iteration % 2 == 0) + appName << "LBM_push_even"; + else + appName << "LBM_pull_even"; skeleton.at(skIdx).sequence(ops, appName.str(), opt); } @@ -220,11 +241,7 @@ struct Lbm } return; } - if constexpr 
(lbm::Method::aa == method) { - return; - } NEON_DEV_UNDER_CONSTRUCTION(""); - } auto iterate() -> void @@ -282,18 +299,18 @@ struct Lbm { grid.getBackend().syncAll(); auto& pop = pFieldList.at(helpGetOutputIdx()); - bool done = false; + bool done = false; if constexpr (method == lbm::Method::push) { auto computeRhoAndU = ContainerFactory::Push::computeRhoAndU(pop, cellFlagField, rho, u); computeRhoAndU.run(Neon::Backend::mainStreamIdx); - done= true; + done = true; } if constexpr (method == lbm::Method::pull) { auto computeRhoAndU = ContainerFactory::Pull::computeRhoAndU(pop, cellFlagField, rho, u); computeRhoAndU.run(Neon::Backend::mainStreamIdx); - done= true; + done = true; } - if(!done){ + if (!done) { NEON_DEV_UNDER_CONSTRUCTION("helpExportVti"); } u.updateHostData(Neon::Backend::mainStreamIdx); From d203e9e21dfc907f5cc448e06a9f076ba9aa7245 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 5 Sep 2023 22:16:55 +0200 Subject: [PATCH 63/94] AA working for D3Q19 and bgk. --- benchmarks/lbm/src/ContainersD3QXX.h | 116 +++++++-------- benchmarks/lbm/src/DeviceD3QXX.h | 43 +++++- benchmarks/lbm/src/Lbm.h | 195 ++++++++++++++++--------- benchmarks/lbm/src/RunCavityTwoPop.cpp | 9 ++ 4 files changed, 234 insertions(+), 129 deletions(-) diff --git a/benchmarks/lbm/src/ContainersD3QXX.h b/benchmarks/lbm/src/ContainersD3QXX.h index 429f6a33..cfb931d6 100644 --- a/benchmarks/lbm/src/ContainersD3QXX.h +++ b/benchmarks/lbm/src/ContainersD3QXX.h @@ -36,6 +36,7 @@ struct ContainerFactoryD3QXX { struct Even { + // collide static auto iteration(const CellTypeField& cellTypeField /*! 
Cell type field */, @@ -46,7 +47,7 @@ struct ContainerFactoryD3QXX Neon::set::Container container = fpopField.getGrid().newContainer( "D3Q19_TwoPop_Pull", [&, omega](Neon::set::Loader& L) -> auto { - auto& fpop = L.load(fpopField); + auto& popMem = L.load(fpopField); const auto& cellInfoPartition = L.load(cellTypeField); [[maybe_unused]] const Compute beta = omega * 0.5; [[maybe_unused]] const Compute invBeta = 1.0 / beta; @@ -55,12 +56,12 @@ struct ContainerFactoryD3QXX CellType cellInfo = cellInfoPartition(gidx, 0); if (cellInfo.classification == CellType::bulk) { - Storage popIn[Lattice::Q]; - Device::Common::localLoad(gidx, fpop, NEON_OUT popIn); + Storage popRegisters[Lattice::Q]; + Device::Common::localLoad(gidx, popMem, NEON_OUT popRegisters); Compute rho; std::array u{.0, .0, .0}; - Device::Common::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + Device::Common::macroscopic(popRegisters, NEON_OUT rho, NEON_OUT u); Compute usqr = 1.5 * (u[0] * u[0] + u[1] * u[1] + @@ -69,15 +70,15 @@ struct ContainerFactoryD3QXX if constexpr (CollisionId == Collision::bgk) { Device::Common::collideBgkUnrolled(rho, u, usqr, omega, - NEON_IO popIn); + NEON_IO popRegisters); } if constexpr (CollisionId == Collision::kbc) { Device::Common::collideKBCUnrolled(rho, u, usqr, omega, invBeta, - NEON_IO popIn); + NEON_IO popRegisters); } - Device::Common::localStore(gidx, popIn, fpop); + Device::Common::localStoreOpposite(gidx, popRegisters, popMem); } }; }); @@ -92,25 +93,24 @@ struct ContainerFactoryD3QXX -> Neon::set::Container { - return Pull::computeRhoAndU(fInField, cellTypeField, rhoField, uField); + return Push::computeRhoAndU(fInField, cellTypeField, rhoField, uField); } }; struct Odd { + // pullStream - collide - pushStream + static auto - iteration(Neon::set::StencilSemantic stencilSemantic, - const CellTypeField& cellTypeField /*! Cell type field */, - const Compute omega /*! LBM omega parameter */, - NEON_IO PopField& fPopField /*! 
Output Population field */) + iteration(const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + NEON_IO PopField& fpopField /*! Output Population field */) -> Neon::set::Container { - Neon::set::Container container = fPopField.getGrid().newContainer( - "LBM-iteration", - [=](Neon::set::Loader& L) -> auto { - auto& fPop = L.load(fPopField, - Neon::Pattern::STENCIL, stencilSemantic); - const auto& cellInfoPartition = L.load(cellTypeField); - + Neon::set::Container container = fpopField.getGrid().newContainer( + "D3Q19_TwoPop_Pull", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fpop = L.load(fpopField); + const auto& cellInfoPartition = L.load(cellTypeField); [[maybe_unused]] const Compute beta = omega * 0.5; [[maybe_unused]] const Compute invBeta = 1.0 / beta; @@ -118,12 +118,12 @@ struct ContainerFactoryD3QXX CellType cellInfo = cellInfoPartition(gidx, 0); if (cellInfo.classification == CellType::bulk) { - Storage popIn[Lattice::Q]; - Device::Pull::pullStream(gidx, cellInfo.wallNghBitflag, fPop, NEON_OUT popIn); + Storage popRegisters[Lattice::Q]; + Device::AA::pullStream(gidx, cellInfo.wallNghBitflag, fpop, NEON_OUT popRegisters); Compute rho; std::array u{.0, .0, .0}; - Device::Common::macroscopic(popIn, + Device::Common::macroscopic(popRegisters, NEON_OUT rho, NEON_OUT u); Compute usqr = 1.5 * (u[0] * u[0] + @@ -134,15 +134,15 @@ struct ContainerFactoryD3QXX if constexpr (CollisionId == Collision::bgk) { Device::Common::collideBgkUnrolled(rho, u, usqr, omega, - NEON_IO popIn); + NEON_IO popRegisters); } if constexpr (CollisionId == Collision::kbc) { Device::Common::collideKBCUnrolled(rho, u, usqr, omega, invBeta, - NEON_IO popIn); + NEON_IO popRegisters); } - Device::Push::pushStream(gidx, cellInfo.wallNghBitflag, popIn, NEON_OUT fPop); + Device::Push::pushStream(gidx, cellInfo.wallNghBitflag, popRegisters, NEON_OUT fpop); } }; }); @@ -157,7 +157,7 @@ struct ContainerFactoryD3QXX -> 
Neon::set::Container { - return Push::computeRhoAndU(fInField, cellTypeField, rhoField, uField); + return Pull::computeRhoAndU(fInField, cellTypeField, rhoField, uField); } }; }; @@ -186,12 +186,12 @@ struct ContainerFactoryD3QXX CellType cellInfo = cellInfoPartition(gidx, 0); if (cellInfo.classification == CellType::bulk) { - Storage popIn[Lattice::Q]; - Device::Pull::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + Storage popRegisters[Lattice::Q]; + Device::Pull::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popRegisters); Compute rho; std::array u{.0, .0, .0}; - Device::Common::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + Device::Common::macroscopic(popRegisters, NEON_OUT rho, NEON_OUT u); Compute usqr = 1.5 * (u[0] * u[0] + u[1] * u[1] + @@ -200,15 +200,15 @@ struct ContainerFactoryD3QXX if constexpr (CollisionId == Collision::bgk) { Device::Common::collideBgkUnrolled(rho, u, usqr, omega, - NEON_IO popIn); + NEON_IO popRegisters); } if constexpr (CollisionId == Collision::kbc) { Device::Common::collideKBCUnrolled(rho, u, usqr, omega, invBeta, - NEON_IO popIn); + NEON_IO popRegisters); } - Device::Common::localStore(gidx, popIn, fOut); + Device::Common::localStore(gidx, popRegisters, fOut); } }; }); @@ -235,12 +235,12 @@ struct ContainerFactoryD3QXX CellType cellInfo = cellInfoPartition(gidx, 0); if (cellInfo.classification == CellType::bulk) { - Storage popIn[Lattice::Q]; - Device::Common::localLoad(gidx, fIn, NEON_OUT popIn); + Storage popRegisters[Lattice::Q]; + Device::Common::localLoad(gidx, fIn, NEON_OUT popRegisters); Compute rho; std::array u{.0, .0, .0}; - Device::Common::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + Device::Common::macroscopic(popRegisters, NEON_OUT rho, NEON_OUT u); Compute usqr = 1.5 * (u[0] * u[0] + u[1] * u[1] + @@ -249,15 +249,15 @@ struct ContainerFactoryD3QXX if constexpr (CollisionId == Collision::bgk) { Device::Common::collideBgkUnrolled(rho, u, usqr, omega, - NEON_IO popIn); + NEON_IO 
popRegisters); } if constexpr (CollisionId == Collision::kbc) { Device::Common::collideKBCUnrolled(rho, u, usqr, omega, invBeta, - NEON_IO popIn); + NEON_IO popRegisters); } - Device::Common::localStore(gidx, popIn, fOut); + Device::Common::localStore(gidx, popRegisters, fOut); } }; }); @@ -289,19 +289,19 @@ struct ContainerFactoryD3QXX Compute rho = 0; std::array u{.0, .0, .0}; - Storage popIn[Lattice::Q]; + Storage popRegisters[Lattice::Q]; if (cellInfo.classification == CellType::bulk) { - Storage popIn[Lattice::Q]; - Device::Pull::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); - Device::Common::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + Storage popRegisters[Lattice::Q]; + Device::Pull::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popRegisters); + Device::Common::macroscopic(popRegisters, NEON_OUT rho, NEON_OUT u); } else { - Device::Common::localLoad(gidx, fIn, NEON_OUT popIn); + Device::Common::localLoad(gidx, fIn, NEON_OUT popRegisters); if (cellInfo.classification == CellType::movingWall) { rho = 1.0; - u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), - static_cast(popIn[1]) / static_cast(6. * 1. / 18.), - static_cast(popIn[2]) / static_cast(6. * 1. / 18.)}; + u = std::array{static_cast(popRegisters[0]) / static_cast(6. * 1. / 18.), + static_cast(popRegisters[1]) / static_cast(6. * 1. / 18.), + static_cast(popRegisters[2]) / static_cast(6. * 1. 
/ 18.)}; } } @@ -339,12 +339,12 @@ struct ContainerFactoryD3QXX CellType cellInfo = cellInfoPartition(gidx, 0); if (cellInfo.classification == CellType::bulk) { - Storage popIn[Lattice::Q]; - Device::Common::localLoad(gidx, fIn, NEON_OUT popIn); + Storage popRegisters[Lattice::Q]; + Device::Common::localLoad(gidx, fIn, NEON_OUT popRegisters); Compute rho; std::array u{.0, .0, .0}; - Device::Common::macroscopic(popIn, + Device::Common::macroscopic(popRegisters, NEON_OUT rho, NEON_OUT u); Compute usqr = 1.5 * (u[0] * u[0] + @@ -355,15 +355,15 @@ struct ContainerFactoryD3QXX if constexpr (CollisionId == Collision::bgk) { Device::Common::collideBgkUnrolled(rho, u, usqr, omega, - NEON_IO popIn); + NEON_IO popRegisters); } if constexpr (CollisionId == Collision::kbc) { Device::Common::collideKBCUnrolled(rho, u, usqr, omega, invBeta, - NEON_IO popIn); + NEON_IO popRegisters); } - Device::Push::pushStream(gidx, cellInfo.wallNghBitflag, popIn, NEON_OUT fOut); + Device::Push::pushStream(gidx, cellInfo.wallNghBitflag, popRegisters, NEON_OUT fOut); } }; }); @@ -395,17 +395,17 @@ struct ContainerFactoryD3QXX Compute rho = 0; std::array u{.0, .0, .0}; - Storage popIn[Lattice::Q]; - Device::Common::localLoad(gidx, fIn, NEON_OUT popIn); + Storage popRegisters[Lattice::Q]; + Device::Common::localLoad(gidx, fIn, NEON_OUT popRegisters); if (cellInfo.classification == CellType::bulk) { - Device::Common::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + Device::Common::macroscopic(popRegisters, NEON_OUT rho, NEON_OUT u); } else { if (cellInfo.classification == CellType::movingWall) { rho = 1.0; - u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), - static_cast(popIn[1]) / static_cast(6. * 1. / 18.), - static_cast(popIn[2]) / static_cast(6. * 1. / 18.)}; + u = std::array{static_cast(popRegisters[0]) / static_cast(6. * 1. / 18.), + static_cast(popRegisters[1]) / static_cast(6. * 1. / 18.), + static_cast(popRegisters[2]) / static_cast(6. * 1. 
/ 18.)}; } } diff --git a/benchmarks/lbm/src/DeviceD3QXX.h b/benchmarks/lbm/src/DeviceD3QXX.h index 3327f2dc..f1b1d11e 100644 --- a/benchmarks/lbm/src/DeviceD3QXX.h +++ b/benchmarks/lbm/src/DeviceD3QXX.h @@ -49,6 +49,36 @@ struct DeviceD3QXX } }; + struct AA + { + static inline NEON_CUDA_HOST_DEVICE auto + pullStream(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopField::Partition const& fin, + NEON_OUT Storage popIn[Lattice::Q]) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using QPullingReference = typename Lattice::template RegisterMapper; + + if constexpr (QPullingReference::fwdRegQ == QPullingReference::centerRegQ) { + popIn[QPullingReference::centerRegQ] = fin(gidx, QPullingReference::centerMemQ); + } else { + if (CellType::isWall(wallBitFlag)) { + // The cell in the opposite direction of the pull is a wall + popIn[QPullingReference::fwdRegQ] = fin(gidx, QPullingReference::fwdRegQ) + + fin.template getNghData(gidx, QPullingReference::bkwMemQ)(); + } else { + popIn[QPullingReference::fwdRegQ] = fin.template getNghData(gidx, QPullingReference::bkwMemQ)(); + } + } + }); + } + }; + struct Push { static inline NEON_CUDA_HOST_DEVICE auto @@ -218,6 +248,17 @@ struct DeviceD3QXX }); } + static inline NEON_CUDA_HOST_DEVICE auto + localStoreOpposite(Idx const& gidx, + Storage NEON_RESTRICT pOut[Lattice::Q], + NEON_OUT typename PopField::Partition& fOut) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + fOut(gidx, M::bkwMemQ) = pOut[M::fwdRegQ]; + }); + } + static inline NEON_CUDA_HOST_DEVICE auto collideKBCUnrolled(Compute const& rho /*! Density */, std::array const& u /*! 
Velocity */, @@ -246,7 +287,7 @@ struct DeviceD3QXX return (2.0 * Nxz - Nyz) / 6.0; } else if (q == 14 /* 1, 0, -1 */) { return (2.0 * Nxz - Nyz) / 6.0; - }else if (q == 1 /* 0, -1, 0 */) { + } else if (q == 1 /* 0, -1, 0 */) { return (-Nxz + 2.0 * Nyz) / 6.0; } else if (q == 15 /* 0, 1, 0 */) { return (-Nxz + 2.0 * Nyz) / 6.0; diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h index 03bbb5c4..fb0dc1cf 100644 --- a/benchmarks/lbm/src/Lbm.h +++ b/benchmarks/lbm/src/Lbm.h @@ -70,6 +70,7 @@ struct Lbm std::stringstream name; name << "PopField_0" << i; using Storage = typename Precision::Storage; + std::cout << "Allocating population field (#" << std::to_string(i + 1) << std::endl; auto field = grid.template newField(name.str(), Lattice::Q, @@ -101,6 +102,7 @@ struct Lbm template auto setBC(Lambda bcSetFunction) -> void { + std::cout << "Setting the problem's boundary." <(); { skeleton = std::vector(2); - for (int itr_ : {0, 1}) { - iteration = itr_; - int skIdx = helpGetSkeletonIdx(); + for (int iteration : {0, 1}) { + iterationPhase.resetPhase(iteration); + int skIdx = iterationPhase.getSkeletonIdx(); auto even = ContainerFactory::Pull::iteration( configurations.stencilSemanticCli.getOption(), - pFieldList.at(helpGetInputIdx()), + pFieldList.at(iterationPhase.getInputIdx()), cellFlagField, lbmParameters.omega, - pFieldList.at(helpGetOutputIdx())); + pFieldList.at(iterationPhase.getOutputIdx())); std::vector ops; skeleton.at(skIdx) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); @@ -160,14 +162,14 @@ struct Lbm } { // Let's compute 1 collide operation to prepare the input of the first iteration - iteration = 1; - ContainerFactory::Pull::localCollide(pFieldList.at(helpGetInputIdx()), + iterationPhase.resetPhase(0); + ContainerFactory::Pull::localCollide(pFieldList.at(iterationPhase.getInputIdx()), cellFlagField, lbmParameters.omega, - pFieldList.at(helpGetOutputIdx())) + pFieldList.at(iterationPhase.getOutputIdx())) 
.run(Neon::Backend::mainStreamIdx); pFieldList[0].getBackend().syncAll(); - iteration = 0; + iterationPhase.updateIterationPhase(); } return; } @@ -175,15 +177,15 @@ struct Lbm using Compute = typename Precision::Compute; auto lbmParameters = configurations.template getLbmParameters(); skeleton = std::vector(2); - for (int itr_ : {0, 1}) { - iteration = itr_; - int skIdx = helpGetSkeletonIdx(); + for (int iteration : {0, 1}) { + iterationPhase.resetPhase(iteration); + int skIdx = iterationPhase.getSkeletonIdx(); auto even = ContainerFactory::Push::iteration( configurations.stencilSemanticCli.getOption(), - pFieldList.at(helpGetInputIdx()), + pFieldList.at(iterationPhase.getInputIdx()), cellFlagField, lbmParameters.omega, - pFieldList.at(helpGetOutputIdx())); + pFieldList.at(iterationPhase.getOutputIdx())); std::vector ops; skeleton.at(skIdx) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); @@ -198,10 +200,10 @@ struct Lbm } { - iteration = 1; - int skIdx = helpGetSkeletonIdx(); + iterationPhase.resetPhase(0); + int skIdx = iterationPhase.getSkeletonIdx(); skeleton.at(skIdx).run(); - iteration = 0; + iterationPhase.updateIterationPhase(); } return; } @@ -209,35 +211,36 @@ struct Lbm using Compute = typename Precision::Compute; auto lbmParameters = configurations.template getLbmParameters(); skeleton = std::vector(2); - for (int itr_ : {0, 1}) { - iteration = itr_; - int skIdx = helpGetSkeletonIdx(); + for (int iteration : {0, 1}) { + iterationPhase.resetPhase(iteration); + int skIdx = iterationPhase.getSkeletonIdx(); Neon::set::Container lbmIteration; - if ((iteration + 2) % 2 == 0) { - auto even = ContainerFactory::AA::Odd::iteration( - configurations.stencilSemanticCli.getOption(), - pFieldList.at(helpGetInputIdx()), + std::stringstream appName; + if (iterationPhase.getPhase() == IterationPhase::Phase::even) { + lbmIteration = ContainerFactory::AA::Even::iteration( + cellFlagField, + lbmParameters.omega, + pFieldList.at(0)); + appName << "LBM_push_even"; + 
} else { + lbmIteration = ContainerFactory::AA::Odd::iteration( cellFlagField, lbmParameters.omega, - pFieldList.at(helpGetOutputIdx())); + pFieldList.at(0)); + appName << "LBM_pull_even"; } std::vector ops; skeleton.at(skIdx) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); Neon::skeleton::Options opt(configurations.occCli.getOption(), configurations.transferModeCli.getOption()); - ops.push_back(even); - std::stringstream appName; - if (iteration % 2 == 0) - appName << "LBM_push_even"; - else - appName << "LBM_pull_even"; + ops.push_back(lbmIteration); skeleton.at(skIdx).sequence(ops, appName.str(), opt); } { - iteration = 1; - int skIdx = helpGetSkeletonIdx(); + iterationPhase.resetPhase(0); + int const skIdx = iterationPhase.getSkeletonIdx(); skeleton.at(skIdx).run(); - iteration = 0; + iterationPhase.updateIterationPhase(); } return; } @@ -270,35 +273,19 @@ struct Lbm tie(start, clock_iter) = metrics::restartClock(bk, false); } - skeleton[helpGetSkeletonIdx()].run(); + skeleton[iterationPhase.getSkeletonIdx()].run(); ++clock_iter; - ++iteration; + iterationPhase.updateIterationPhase(); } std::cout << "Iterations completed" << std::endl; metrics::recordMetrics(bk, configurations, *reportPtr, start, clock_iter); } - auto helpIterateOnce() -> void - { - if (lbm::Method::pull == method) { - NEON_DEV_UNDER_CONSTRUCTION(""); - return; - } - if (lbm::Method::push == method) { - skeleton.at(helpGetSkeletonIdx()).run(Neon::Backend::mainStreamIdx); - return; - } - if (lbm::Method::aa == method) { - NEON_DEV_UNDER_CONSTRUCTION(""); - return; - } - } - auto helpExportVti() -> void { grid.getBackend().syncAll(); - auto& pop = pFieldList.at(helpGetOutputIdx()); + auto& pop = pFieldList.at(iterationPhase.getOutputIdx()); bool done = false; if constexpr (method == lbm::Method::push) { auto computeRhoAndU = ContainerFactory::Push::computeRhoAndU(pop, cellFlagField, rho, u); @@ -310,6 +297,16 @@ struct Lbm computeRhoAndU.run(Neon::Backend::mainStreamIdx); done = true; } + 
if constexpr (method == lbm::Method::aa) { + if (iterationPhase.getPhase() == IterationPhase::Phase::even) { + auto computeRhoAndU = ContainerFactory::AA::Even::computeRhoAndU(pop, cellFlagField, rho, u); + computeRhoAndU.run(Neon::Backend::mainStreamIdx); + } else { + auto computeRhoAndU = ContainerFactory::AA::Odd::computeRhoAndU(pop, cellFlagField, rho, u); + computeRhoAndU.run(Neon::Backend::mainStreamIdx); + } + done = true; + } if (!done) { NEON_DEV_UNDER_CONSTRUCTION("helpExportVti"); } @@ -319,7 +316,7 @@ struct Lbm grid.getBackend().sync(Neon::Backend::mainStreamIdx); size_t numDigits = 5; - std::string iterIdStr = std::to_string(iteration); + std::string iterIdStr = std::to_string(iterationPhase.getCounter()); iterIdStr = std::string(numDigits - std::min(numDigits, iterIdStr.length()), '0') + iterIdStr; // pop.ioToVtk("pop_" + iterIdStr, "pop", false); @@ -370,26 +367,84 @@ struct Lbm #endif } - auto helpUpdateIterationCount() -> void - { - iteration++; - } - auto helpGetInputIdx() -> int - { - return iteration % 2; - } - auto helpGetOutputIdx() -> int + struct IterationPhase { - return (iteration + 1) % 2; - } - auto helpGetSkeletonIdx() -> int - { - return iteration % 2; - } + enum Phase + { + even, + odd, + }; + + private: + Phase state{Phase::even}; + + int counter = 0; + + public: + auto getCounter() const -> int + { + return counter; + } + + auto resetPhase(Phase newPhase) + { + state = newPhase; + counter = 0; + } + + auto resetPhase(int iteration) + { + if (iteration != 0 && iteration != 1) { + NEON_THROW_UNSUPPORTED_OPERATION(""); + } + state = iteration == 0 ? even : odd; + counter = 0; + } + + auto getPhase() const -> Phase + { + return state; + } + + auto updateIterationPhase() -> void + { + state = state == even ? odd : even; + counter++; + } + + auto getInputIdx() -> int + { + if constexpr (method == lbm::Method::pull || method == lbm::Method::push) { + return state == IterationPhase::even ? 
0 : 1; + } + if constexpr (method == lbm::Method::aa) { + return 0; + } + NEON_THROW_UNSUPPORTED_OPERATION("helpGetInputIdx"); + } + auto getOutputIdx() -> int + { + if constexpr (method == lbm::Method::pull || method == lbm::Method::push) { + return state == IterationPhase::even ? 1 : 0; + } + if constexpr (method == lbm::Method::aa) { + return 0; + } + NEON_THROW_UNSUPPORTED_OPERATION("helpGetInputIdx"); + } + + auto getSkeletonIdx() -> int + { + if constexpr (method == lbm::Method::pull || method == lbm::Method::push || method == lbm::Method::aa) { + return state == IterationPhase::even ? 0 : 1; + } + NEON_THROW_UNSUPPORTED_OPERATION("helpGetInputIdx"); + } + }; Config configurations; - int iteration = 0; + IterationPhase iterationPhase; bool prepDone = false; Grid grid; std::vector pFieldList; diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cpp b/benchmarks/lbm/src/RunCavityTwoPop.cpp index 974012cf..95edebe1 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cpp +++ b/benchmarks/lbm/src/RunCavityTwoPop.cpp @@ -89,11 +89,20 @@ auto runFilterMethod(Config& config, Report& report) -> void { feenableexcept(FE_ALL_EXCEPT & ~FE_INEXACT); // Enable all floating point exceptions but FE_INEXACT if (config.streamingMethod == "push") { + if(config.devices.size() != 1){ + NEON_THROW_UNSUPPORTED_OPERATION("We only support PUSH in a single device configuration for now.") + } return run(config, report); } if (config.streamingMethod == "pull") { return run(config, report); } + if (config.streamingMethod == "aa") { + if(config.devices.size() != 1){ + NEON_THROW_UNSUPPORTED_OPERATION("We only support AA in a single device configuration for now.") + } + return run(config, report); + } NEON_DEV_UNDER_CONSTRUCTION(""); } From abc4e28c897c352fdce33845d1877e0edeae8ad0 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Sun, 10 Sep 2023 12:24:35 +0200 Subject: [PATCH 64/94] Cleaning up LBM benchmarking --- benchmarks/lbm/lbm.py | 124 ++++++++++-------- 
benchmarks/lbm/src/Config.cpp | 17 ++- benchmarks/lbm/src/Config.h | 2 + benchmarks/lbm/src/Lbm.h | 14 +- benchmarks/lbm/src/Repoert.h | 2 +- benchmarks/lbm/src/Report.cpp | 6 +- benchmarks/lbm/src/RunCavityTwoPop.cpp | 89 +++++++++---- benchmarks/lbm/src/RunCavityTwoPop.h | 3 +- benchmarks/lbm/src/app.cpp | 13 +- .../include/Neon/domain/tools/SpaceCurves.h | 1 + .../src/domain/tools/SpaceCurves.cpp | 10 ++ libNeonSet/include/Neon/set/StencilSemantic.h | 1 + libNeonSet/include/Neon/set/TransferMode.h | 1 + libNeonSet/src/set/StencilSemantic.cpp | 10 ++ libNeonSet/src/set/TransferMode.cpp | 10 ++ 15 files changed, 204 insertions(+), 99 deletions(-) diff --git a/benchmarks/lbm/lbm.py b/benchmarks/lbm/lbm.py index 2ce5dcd3..b08fd00e 100644 --- a/benchmarks/lbm/lbm.py +++ b/benchmarks/lbm/lbm.py @@ -7,6 +7,11 @@ OCC_LIST = "nOCC sOCC".split() HU_LIST = "huGrid huLattice".split() CURVE_LIST = "sweep morton hilbert".split() +COLLISION_LIST = "bgk kbc".split() +LATTICE_LIST = "d3q19 d3q27".split() +STREAMINGMETHOD_LIST = "push pull aa".split() +TRANSFERMODE_LIST = "get put".split() +STENCILSEMANTIC_LIST = "grid, streaming".split() WARM_UP_ITER = 10 MAX_ITER = 10000 REPETITIONS = 5 @@ -34,26 +39,36 @@ def countAll(): if DEVICE_TYPE == 'gpu': for DEVICE in DEVICE_ID_LIST[1:]: DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) - for OCC in OCC_LIST: - for DOMAIN_SIZE in DOMAIN_SIZE_LIST: - for STORAGE_FP in STORAGE_FP_LIST: - for COMPUTE_FP in COMPUTE_FP_LIST: - for DEVICE_SET in DEVICE_SET_LIST: + for DEVICE_SET in DEVICE_SET_LIST: + for OCC in OCC_LIST: + for DOMAIN_SIZE in DOMAIN_SIZE_LIST: + for STORAGE_FP in STORAGE_FP_LIST: + for COMPUTE_FP in COMPUTE_FP_LIST: for GRID in GRID_LIST: for HU in HU_LIST: for CURVE in CURVE_LIST: - if STORAGE_FP == 'double' and COMPUTE_FP == 'float': - continue - if STORAGE_FP == 'float' and COMPUTE_FP == 'double': - continue + for LATTICE in LATTICE_LIST: + for TRANSFERMODE in TRANSFERMODE_LIST: + for STENCILSEMANTIC in 
STENCILSEMANTIC_LIST: + for COLLISION in COLLISION_LIST: + if LATTICE != "d3q27" and LATTICE != "D3Q27": + continue + for STREAMINGMETHOD in STREAMINGMETHOD_LIST: + if STREAMINGMETHOD != 'pull' and len(DEVICE_ID_LIST) != 1: + continue + + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + if STORAGE_FP == 'float' and COMPUTE_FP == 'double': + continue - counter += 1 + counter += 1 return counter SAMPLES = countAll() counter = 0 -command = './lbm-lid-driven-cavity-flow' +command = './lbm' # command = 'echo' with open(command + '.log', 'w') as fp: for DEVICE_TYPE in DEVICE_TYPE_LIST: @@ -69,50 +84,53 @@ def countAll(): for GRID in GRID_LIST: for HU in HU_LIST: for CURVE in CURVE_LIST: + for LATTICE in LATTICE_LIST: + for TRANSFERMODE in TRANSFERMODE_LIST: + for STENCILSEMANTIC in STENCILSEMANTIC_LIST: + for COLLISION in COLLISION_LIST: + if LATTICE != "d3q27" and LATTICE != "D3Q27": + continue + for STREAMINGMETHOD in STREAMINGMETHOD_LIST: + if STREAMINGMETHOD != 'pull' and len(DEVICE_ID_LIST) != 1: + continue - if STORAGE_FP == 'double' and COMPUTE_FP == 'float': - continue - if STORAGE_FP == 'float' and COMPUTE_FP == 'double': - continue - - parameters = [] - parameters.append('--deviceType ' + DEVICE_TYPE) - parameters.append('--deviceIds ' + DEVICE_SET) - parameters.append('--grid ' + GRID) - parameters.append('--domain-size ' + DOMAIN_SIZE) - parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) - parameters.append('--repetitions ' + str(REPETITIONS)) - parameters.append('--max-iter ' + str(MAX_ITER)) - parameters.append( - '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + - DEVICE_TYPE + '_' + - DEVICE_SET.replace(' ', '_') + '-' + - GRID + '_' + - DOMAIN_SIZE + '-' + - STORAGE_FP + '-' + COMPUTE_FP + '-' + - OCC + '-' + - HU + '-' + - CURVE) - parameters.append('--computeFP ' + COMPUTE_FP) - parameters.append('--storageFP ' + STORAGE_FP) - parameters.append('--curve ' + CURVE) + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + 
continue + if STORAGE_FP == 'float' and COMPUTE_FP == 'double': + continue - parameters.append('--benchmark') - parameters.append('--' + OCC) - parameters.append('--' + HU) + parameters = [] + parameters.append('--deviceType ' + DEVICE_TYPE) + parameters.append('--deviceIds ' + DEVICE_SET) + parameters.append('--grid ' + GRID) + parameters.append('--domain-size ' + DOMAIN_SIZE) + parameters.append('--max-iter ' + str(MAX_ITER)) + parameters.append('--report-filename ' + 'lbm') + parameters.append('--computeFP ' + COMPUTE_FP) + parameters.append('--storageFP ' + STORAGE_FP) + parameters.append('--occ ' + OCC) + parameters.append('--transferMode ' + TRANSFERMODE) + parameters.append('--stencilSemantic ' + STENCILSEMANTIC) + parameters.append('--spaceCurve ' + CURVE) + parameters.append('--collision ' + COLLISION) + parameters.append('--streamingMethod ' + STREAMINGMETHOD) + parameters.append('--lattice ' + LATTICE) + parameters.append('--benchmark ') + parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) + parameters.append('--repetitions ' + str(REPETITIONS)) - commandList = [] - commandList.append(command) - for el in parameters: - for s in el.split(): - commandList.append(s) + commandList = [] + commandList.append(command) + for el in parameters: + for s in el.split(): + commandList.append(s) - fp.write("\n-------------------------------------------\n") - fp.write(' '.join(commandList)) - fp.write("\n-------------------------------------------\n") - fp.flush() - print(' '.join(commandList)) - subprocess.run(commandList, text=True, stdout=fp) + fp.write("\n-------------------------------------------\n") + fp.write(' '.join(commandList)) + fp.write("\n-------------------------------------------\n") + fp.flush() + print(' '.join(commandList)) + subprocess.run(commandList, text=True, stdout=fp) - counter += 1 - printProgressBar(counter * 100.0 / SAMPLES, 'Progress') + counter += 1 + printProgressBar(counter * 100.0 / SAMPLES, 'Progress') diff --git 
a/benchmarks/lbm/src/Config.cpp b/benchmarks/lbm/src/Config.cpp index a2744860..12d28b12 100644 --- a/benchmarks/lbm/src/Config.cpp +++ b/benchmarks/lbm/src/Config.cpp @@ -80,10 +80,10 @@ auto Config::parseArgs(const int argc, char* argv[]) clipp::option("--storageFP") & clipp::value("storageFP", config.storeTypeStr) % "double, float", clipp::option("--occ") & clipp::value("occ")([&config](const std::string& s) { config.occCli.set(s); }) % config.occCli.getDoc(), - clipp::option("--transferMode")& clipp::value("transferMode")([&config](const std::string& s) { config.transferModeCli.set(s); }) % config.transferModeCli.getDoc(), - clipp::option("--stencilSemantic")& clipp::value("stencilSemantic")([&config](const std::string& s) { config.stencilSemanticCli.set(s); }) % config.stencilSemanticCli.getDoc(), - clipp::option("--spaceCurve")& clipp::value("spaceCurve")([&config](const std::string& s) { config.spaceCurveCli.set(s); }) % config.spaceCurveCli.getDoc(), - clipp::option("--collision")& clipp::value("collision")([&config](const std::string& s) { config.collisionCli.set(s); }) % config.collisionCli.getDoc(), + clipp::option("--transferMode") & clipp::value("transferMode")([&config](const std::string& s) { config.transferModeCli.set(s); }) % config.transferModeCli.getDoc(), + clipp::option("--stencilSemantic") & clipp::value("stencilSemantic")([&config](const std::string& s) { config.stencilSemanticCli.set(s); }) % config.stencilSemanticCli.getDoc(), + clipp::option("--spaceCurve") & clipp::value("spaceCurve")([&config](const std::string& s) { config.spaceCurveCli.set(s); }) % config.spaceCurveCli.getDoc(), + clipp::option("--collision") & clipp::value("collision")([&config](const std::string& s) { config.collisionCli.set(s); }) % config.collisionCli.getDoc(), clipp::option("--streamingMethod") & clipp::value("streamingMethod", config.streamingMethod) % Config::getOptionList(config.streamingMethodOption, config.streamingMethod), clipp::option("--lattice") & 
clipp::value("lattice", config.lattice) % Config::getOptionList(config.latticeOptions, config.lattice), @@ -114,6 +114,15 @@ auto Config::parseArgs(const int argc, char* argv[]) helpSetLbmParameters(); + std::stringstream s; + for (int i = 0; i < argc; i++) { + s << argv[i]; + if (i + 1 != argc) { + s << " "; + } + } + mArgv = s.str(); + return 0; } diff --git a/benchmarks/lbm/src/Config.h b/benchmarks/lbm/src/Config.h index adb5824d..f8222bef 100644 --- a/benchmarks/lbm/src/Config.h +++ b/benchmarks/lbm/src/Config.h @@ -56,6 +56,8 @@ struct Config LbmParameters mLbmParameters; + std::string mArgv; + auto getOptionList(std::vector list, std::string defaultVal) -> std::string { std::stringstream s; diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h index fb0dc1cf..aa47a45a 100644 --- a/benchmarks/lbm/src/Lbm.h +++ b/benchmarks/lbm/src/Lbm.h @@ -42,6 +42,7 @@ struct Lbm configurations = config; reportPtr = &report; + // Setting the backend Neon::Backend bk = [&] { if (config.deviceType == "cpu") { @@ -57,6 +58,8 @@ struct Lbm NEON_THROW(exce); }(); + auto [gridInitClockStart, notcare] = metrics::restartClock(bk, true); + // Setting the grid grid = Grid( bk, {config.N, config.N, config.N}, @@ -96,13 +99,16 @@ struct Lbm ContainerFactory::Common::setToEquilibrium(pField, cellFlagField).run(Neon::Backend::mainStreamIdx); } } + metrics::recordGridInitMetrics(bk, *reportPtr, gridInitClockStart); } // Lambda = void(*)(Neon::Index3d) -> std::tuple> template auto setBC(Lambda bcSetFunction) -> void { - std::cout << "Setting the problem's boundary." < void @@ -252,6 +259,7 @@ struct Lbm helpPrep(); // Iteration keep track of all iterations // clock_iter keeps tracks of the iteration done after the last clock reset + std::cout << "Starting main LBM loop." 
<< std::endl; auto& bk = grid.getBackend(); auto [start, clock_iter] = metrics::restartClock(bk, true); @@ -260,7 +268,7 @@ struct Lbm tie(start, clock_iter) = metrics::restartClock(bk, true); for (time_iter = 0; time_iter < configurations.benchMaxIter; ++time_iter) { - if ((time_iter % configurations.vti) == 0) { + if ((configurations.vti > 1) && ((time_iter % configurations.vti) == 0)) { bk.syncAll(); helpExportVti(); } @@ -278,7 +286,7 @@ struct Lbm ++clock_iter; iterationPhase.updateIterationPhase(); } - std::cout << "Iterations completed" << std::endl; + std::cout << "Iterations completed." << std::endl; metrics::recordMetrics(bk, configurations, *reportPtr, start, clock_iter); } diff --git a/benchmarks/lbm/src/Repoert.h b/benchmarks/lbm/src/Repoert.h index 4ca0827b..095bce9a 100644 --- a/benchmarks/lbm/src/Repoert.h +++ b/benchmarks/lbm/src/Repoert.h @@ -33,7 +33,7 @@ struct Report const std::string& unit) -> void; - auto save() + auto save(std::stringstream & testCode) -> void; void recordBk(Neon::Backend& backend); void recordGrid(Neon::domain::interface::GridBase& g); diff --git a/benchmarks/lbm/src/Report.cpp b/benchmarks/lbm/src/Report.cpp index 98b9980c..e332de43 100644 --- a/benchmarks/lbm/src/Report.cpp +++ b/benchmarks/lbm/src/Report.cpp @@ -7,6 +7,8 @@ Report::Report(const Config& c) { mFname = c.reportFile; + mReport.addMember("argv", c.mArgv); + mReport.addMember("Re", c.Re); mReport.addMember("ulb", c.ulb); mReport.addMember("N", c.N); @@ -89,7 +91,7 @@ auto Report::recordProblemSetupTime(double time, const std::string& unit) -> voi } auto Report:: - save() + save(std::stringstream & testCode) -> void { mReport.addMember("MLUPS", mMLUPS); @@ -97,7 +99,7 @@ auto Report:: mReport.addMember(std::string("Problem Setup Time (") + mtimeUnit + ")", mProblemSetupTime); mReport.addMember(std::string("Neon Grid Init Time (") + mtimeUnit + ")", mNeonGridInitTime); - mReport.write(mFname, true); + mReport.write(mFname + testCode.str(), true); } void 
Report::recordBk(Neon::Backend& backend) diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cpp b/benchmarks/lbm/src/RunCavityTwoPop.cpp index 95edebe1..2659e6c3 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cpp +++ b/benchmarks/lbm/src/RunCavityTwoPop.cpp @@ -17,18 +17,29 @@ int backendWasReported = false; #include "/usr/include/fenv.h" namespace details { template -auto run(Config& config, - Report& report) -> void +auto run(Config& config, + Report& report, + [[maybe_unused]] std::stringstream& code) -> void { using Storage = Storage_; using Compute = Compute_; using Precision = Precision; using Lattice = Lattice_; // D3Q27; + + code << "_" << config.deviceType; + for (auto const& id : config.devices) { + code << id; + } + code << "_SS" << config.stencilSemanticCli.getStringOption()<< "_"; + code << "_SF" << config.spaceCurveCli.getStringOption() << "_"; + code << "_TM" << config.transferModeCli.getStringOptions() << "_"; + + code << "__"; // using PopulationField = typename Grid::template Field; // using PopField = typename Grid::template Field; @@ -43,9 +54,9 @@ auto run(Config& config, Neon::index_3d domainDim(config.N, config.N, config.N); Lbm lbm(config, - report, - [](Neon::index_3d const&) { return true; }); - auto ulb = config.ulb; + report, + [](Neon::index_3d const&) { return true; }); + auto ulb = config.ulb; lbm.setBC([=] NEON_CUDA_HOST_DEVICE(Neon::index_3d const& globalIdx, NEON_OUT Storage p[Lattice::Q], NEON_OUT CellType::Classification& cellClass) { @@ -85,67 +96,82 @@ auto run(Config& config, template -auto runFilterMethod(Config& config, Report& report) -> void +auto runFilterMethod(Config& config, + Report& report, + std::stringstream& testCode) -> void { feenableexcept(FE_ALL_EXCEPT & ~FE_INEXACT); // Enable all floating point exceptions but FE_INEXACT if (config.streamingMethod == "push") { - if(config.devices.size() != 1){ + if (config.devices.size() != 1) { NEON_THROW_UNSUPPORTED_OPERATION("We only support PUSH in a single device 
configuration for now.") } - return run(config, report); + testCode << "_push"; + return run(config, report, testCode); } if (config.streamingMethod == "pull") { - return run(config, report); + testCode << "_pull"; + return run(config, report, testCode); } if (config.streamingMethod == "aa") { - if(config.devices.size() != 1){ + if (config.devices.size() != 1) { NEON_THROW_UNSUPPORTED_OPERATION("We only support AA in a single device configuration for now.") } - return run(config, report); + testCode << "_aa"; + return run(config, report, testCode); } NEON_DEV_UNDER_CONSTRUCTION(""); } template -auto runFilterCollision(Config& config, Report& report) -> void +auto runFilterCollision(Config& config, + Report& report, + std::stringstream& testCode) -> void { if (config.collisionCli.getOption() == Collision::bgk) { - - return runFilterMethod(config, report); + testCode << "_bgk"; + return runFilterMethod(config, report, testCode); } if (config.collisionCli.getOption() == Collision::kbc) { - if(config.lattice != "d3q27" && config.lattice != "D3Q27"){ + if (config.lattice != "d3q27" && config.lattice != "D3Q27") { Neon::NeonException e("runFilterCollision"); e << "LBM kbc collision model only supports d3q27 lattice"; NEON_THROW(e); } - return runFilterMethod(config, report); + testCode << "_kbc"; + return runFilterMethod(config, report, testCode); } NEON_DEV_UNDER_CONSTRUCTION(""); } template -auto runFilterLattice(Config& config, Report& report) -> void +auto runFilterLattice(Config& config, + Report& report, + std::stringstream& testCode) -> void { using Precision = Precision; if (config.lattice == "d3q19" || config.lattice == "D3Q19") { + testCode << "_D3Q19"; using Lattice = D3Q19; - return runFilterCollision(config, report); + return runFilterCollision(config, report, testCode); } if (config.lattice == "d3q27" || config.lattice == "D3Q27") { + testCode << "_D3Q27"; using Lattice = D3Q27; - return runFilterCollision(config, report); + return 
runFilterCollision(config, report, testCode); } NEON_DEV_UNDER_CONSTRUCTION("Lattice type not supported. Available options: D3Q19 and D3Q27"); } template -auto runFilterComputeType(Config& config, Report& report) -> void +auto runFilterComputeType(Config& config, + Report& report, + std::stringstream& testCode) { if (config.computeTypeStr == "double") { - return runFilterLattice(config, report); + testCode << "_SD"; + return runFilterLattice(config, report, testCode); } // if (config.computeTypeStr == "float") { // return run(config, report); @@ -154,15 +180,18 @@ auto runFilterComputeType(Config& config, Report& report) -> void } template -auto runFilterStoreType(Config& config, - Report& report) +auto runFilterStoreType(Config& config, + Report& report, + std::stringstream& testCode) -> void { if (config.storeTypeStr == "double") { - return runFilterComputeType(config, report); + testCode << "_CD"; + return runFilterComputeType(config, report, testCode); } // if (config.storeTypeStr == "float") { - // return runFilterComputeType(config, report); + // testCode << "_CS_"; + // return runFilterComputeType(config, report,testCode); // } NEON_DEV_UNDER_CONSTRUCTION(""); } @@ -174,11 +203,13 @@ constexpr bool skipTest = false; constexpr bool skipTest = false; #endif -auto run(Config& config, - Report& report) -> void +auto run(Config& config, + Report& report, + std::stringstream& testCode) -> void { if (config.gridType == "dGrid") { - return details::runFilterStoreType(config, report); + testCode << "___DG"; + return details::runFilterStoreType(config, report, testCode); } // if (config.gridType == "eGrid") { // if constexpr (!skipTest) { diff --git a/benchmarks/lbm/src/RunCavityTwoPop.h b/benchmarks/lbm/src/RunCavityTwoPop.h index d30f722e..0386d28e 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.h +++ b/benchmarks/lbm/src/RunCavityTwoPop.h @@ -8,5 +8,6 @@ namespace CavityTwoPop { auto run(Config& config, - Report& report) -> void; + Report& report, + 
std::stringstream&) -> void; } // namespace CavityTwoPop \ No newline at end of file diff --git a/benchmarks/lbm/src/app.cpp b/benchmarks/lbm/src/app.cpp index 6ed4b7d6..8cbfc1cf 100644 --- a/benchmarks/lbm/src/app.cpp +++ b/benchmarks/lbm/src/app.cpp @@ -3,9 +3,9 @@ #include "Repoert.h" #include "RunCavityTwoPop.h" +#include "Neon/Neon.h" #include "Neon/core/tools/clipp.h" #include "Neon/domain/dGrid.h" -#include "Neon/Neon.h" int main(int argc, char** argv) { @@ -35,13 +35,14 @@ int main(int argc, char** argv) std::cout << config.toString(); std::cout << "-------------------------------------------\n"; - Report report(config); - - for(int i=0; i EncoderType; auto set(const std::string& opt) -> void; auto getStringOptions() const -> std::string; + auto getStringOption() const -> std::string; auto getDoc() const -> std::string; auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void; diff --git a/libNeonDomain/src/domain/tools/SpaceCurves.cpp b/libNeonDomain/src/domain/tools/SpaceCurves.cpp index d45ec571..cca20e19 100644 --- a/libNeonDomain/src/domain/tools/SpaceCurves.cpp +++ b/libNeonDomain/src/domain/tools/SpaceCurves.cpp @@ -133,6 +133,16 @@ auto EncoderTypeUtil::Cli::getStringOptions() const -> std::string return msg; } +auto EncoderTypeUtil::Cli::getStringOption() const -> std::string +{ + if (!mSet) { + std::stringstream errorMsg; + errorMsg << "TransferSemantic was not set."; + NEON_ERROR(errorMsg.str()); + } + return EncoderTypeUtil::toString(mOption); +} + auto EncoderTypeUtil::Cli::getDoc() const -> std::string { std::stringstream s; diff --git a/libNeonSet/include/Neon/set/StencilSemantic.h b/libNeonSet/include/Neon/set/StencilSemantic.h index 3e22f4f4..aa5338fc 100644 --- a/libNeonSet/include/Neon/set/StencilSemantic.h +++ b/libNeonSet/include/Neon/set/StencilSemantic.h @@ -31,6 +31,7 @@ struct StencilSemanticUtils auto getOption() const -> StencilSemantic; auto set(const std::string& opt) -> void; auto 
getStringOptions() const -> std::string; + auto getStringOption() const -> std::string; auto getDoc() const -> std::string; auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void; diff --git a/libNeonSet/include/Neon/set/TransferMode.h b/libNeonSet/include/Neon/set/TransferMode.h index 1b6881c8..a335f5da 100644 --- a/libNeonSet/include/Neon/set/TransferMode.h +++ b/libNeonSet/include/Neon/set/TransferMode.h @@ -30,6 +30,7 @@ class TransferModeUtils auto getOption() const -> TransferMode; auto set(const std::string& opt) -> void; auto getStringOptions() const -> std::string; + auto getStringOption() const -> std::string; auto getDoc () const -> std::string; auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const ->void; diff --git a/libNeonSet/src/set/StencilSemantic.cpp b/libNeonSet/src/set/StencilSemantic.cpp index f134fb8f..c5b24a60 100644 --- a/libNeonSet/src/set/StencilSemantic.cpp +++ b/libNeonSet/src/set/StencilSemantic.cpp @@ -96,6 +96,16 @@ auto StencilSemanticUtils::Cli::getStringOptions() const -> std::string return msg; } +auto StencilSemanticUtils::Cli::getStringOption() const -> std::string +{ + if (!mSet) { + std::stringstream errorMsg; + errorMsg << "TransferSemantic was not set."; + NEON_ERROR(errorMsg.str()); + } + return StencilSemanticUtils::toString(mOption); +} + auto StencilSemanticUtils::Cli::getDoc() const-> std::string { std::stringstream s; diff --git a/libNeonSet/src/set/TransferMode.cpp b/libNeonSet/src/set/TransferMode.cpp index 7c3668bf..c2a30ab2 100644 --- a/libNeonSet/src/set/TransferMode.cpp +++ b/libNeonSet/src/set/TransferMode.cpp @@ -96,6 +96,16 @@ auto TransferModeUtils::Cli::getStringOptions() const -> std::string return msg; } +auto TransferModeUtils::Cli::getStringOption() const -> std::string +{ + if (!mSet) { + std::stringstream errorMsg; + errorMsg << "TransferMode was not set."; + NEON_ERROR(errorMsg.str()); + } + return TransferModeUtils::toString(mOption); +} + auto 
TransferModeUtils::Cli::getDoc() const -> std::string { std::stringstream s; From cf19169f24a512b4576d75d63d6b812670553d16 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Sun, 10 Sep 2023 12:26:19 +0200 Subject: [PATCH 65/94] Cleaning up LBM benchmarking --- benchmarks/lbm/src/DeviceD3QXX.h | 2 +- benchmarks/lbm/src/RunCavityTwoPop.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/lbm/src/DeviceD3QXX.h b/benchmarks/lbm/src/DeviceD3QXX.h index f1b1d11e..1c457895 100644 --- a/benchmarks/lbm/src/DeviceD3QXX.h +++ b/benchmarks/lbm/src/DeviceD3QXX.h @@ -265,7 +265,7 @@ struct DeviceD3QXX Compute const& usqr /*! Usqr */, Compute const& omega /*! Omega */, Compute const& invBeta /*! invBeta */, - NEON_IO Storage pop[Lattice::Q]) + [[maybe_unused]] NEON_IO Storage pop[Lattice::Q]) -> void { diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cpp b/benchmarks/lbm/src/RunCavityTwoPop.cpp index 2659e6c3..62a245a1 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cpp +++ b/benchmarks/lbm/src/RunCavityTwoPop.cpp @@ -37,7 +37,7 @@ auto run(Config& config, } code << "_SS" << config.stencilSemanticCli.getStringOption()<< "_"; code << "_SF" << config.spaceCurveCli.getStringOption() << "_"; - code << "_TM" << config.transferModeCli.getStringOptions() << "_"; + code << "_TM" << config.transferModeCli.getStringOption() << "_"; code << "__"; // using PopulationField = typename Grid::template Field; From 680b84e99e0bb3ea4e764ccce1a4a59964c49184 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Sun, 10 Sep 2023 17:20:36 +0200 Subject: [PATCH 66/94] cuda issues --- benchmarks/lbm/src/CellType.h | 4 +- benchmarks/lbm/src/ContainerFactory.h | 34 -- benchmarks/lbm/src/ContainersD3Q19.h | 429 ----------------- benchmarks/lbm/src/ContainersD3Q27.h | 227 --------- benchmarks/lbm/src/ContainersD3QXX.h | 77 ++- benchmarks/lbm/src/D3Q19.h | 31 +- benchmarks/lbm/src/D3Q27.h | 31 +- benchmarks/lbm/src/DeviceD3Q19.h | 229 --------- 
benchmarks/lbm/src/DeviceD3Q27.h | 217 --------- benchmarks/lbm/src/DeviceD3QXX.h | 16 +- benchmarks/lbm/src/Lbm.h | 2 - benchmarks/lbm/src/LbmSkeleton.h | 117 ----- benchmarks/lbm/src/LbmToolsTemplateOnly.h | 440 ------------------ ...RunCavityTwoPop.cpp => RunCavityTwoPop.cu} | 4 +- .../Neon/core/types/vec/vec3d_integer.tdecl.h | 5 + 15 files changed, 105 insertions(+), 1758 deletions(-) delete mode 100644 benchmarks/lbm/src/ContainerFactory.h delete mode 100644 benchmarks/lbm/src/ContainersD3Q19.h delete mode 100644 benchmarks/lbm/src/ContainersD3Q27.h delete mode 100644 benchmarks/lbm/src/DeviceD3Q19.h delete mode 100644 benchmarks/lbm/src/DeviceD3Q27.h delete mode 100644 benchmarks/lbm/src/LbmSkeleton.h delete mode 100644 benchmarks/lbm/src/LbmToolsTemplateOnly.h rename benchmarks/lbm/src/{RunCavityTwoPop.cpp => RunCavityTwoPop.cu} (99%) diff --git a/benchmarks/lbm/src/CellType.h b/benchmarks/lbm/src/CellType.h index 57204a45..47c0397b 100644 --- a/benchmarks/lbm/src/CellType.h +++ b/benchmarks/lbm/src/CellType.h @@ -33,13 +33,13 @@ struct CellType operator int() const { return int(classification); } template - static auto isWall(const uint32_t& wallNghBitFlag) + NEON_CUDA_HOST_DEVICE static auto isWall(const uint32_t& wallNghBitFlag) -> bool { return wallNghBitFlag & (uint32_t(1) << fwdRegQ); } - auto setWall(int fwdRegIdx) + NEON_CUDA_HOST_DEVICE auto setWall(int fwdRegIdx) -> void { wallNghBitflag = wallNghBitflag | ((uint32_t(1) << fwdRegIdx)); diff --git a/benchmarks/lbm/src/ContainerFactory.h b/benchmarks/lbm/src/ContainerFactory.h deleted file mode 100644 index 980c67ae..00000000 --- a/benchmarks/lbm/src/ContainerFactory.h +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once -#include "CellType.h" -#include "D3Q19.h" -#include "Neon/Neon.h" -#include "Neon/set/Containter.h" - -namespace pull { -template -struct ContainerFactory -{ -}; -} // namespace pull - -namespace push { -template -struct ContainerFactory -{ -}; -} // namespace push - -namespace common { 
-template -struct ContainerFactory -{ -}; -} // namespace common -#include "ContainersD3Q19.h" -#include "ContainersD3Q27.h" \ No newline at end of file diff --git a/benchmarks/lbm/src/ContainersD3Q19.h b/benchmarks/lbm/src/ContainersD3Q19.h deleted file mode 100644 index 5c789ea5..00000000 --- a/benchmarks/lbm/src/ContainersD3Q19.h +++ /dev/null @@ -1,429 +0,0 @@ -#pragma once - -#include "./Methods.h" -#include "CellType.h" -#include "D3Q19.h" -#include "DeviceD3Q19.h" -#include "Methods.h" -#include "Neon/Neon.h" -#include "Neon/set/Containter.h" - -namespace pull { -/** - * Specialization for D3Q19 - */ -template -struct ContainerFactory, - Grid_> -{ - using Lattice = D3Q19; - using Precision = Precision_; - using Compute = typename Precision::Compute; - using Storage = typename Precision::Storage; - using Grid = Grid_; - - using PopField = typename Grid::template Field; - using CellTypeField = typename Grid::template Field; - - using Idx = typename PopField::Idx; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; - - using PullFunctions = pull::DeviceD3Q19; - using CommonFunctions = common::DeviceD3Q19; - - static auto - iteration(Neon::set::StencilSemantic stencilSemantic, - const PopField& fInField /*! Input population field */, - const CellTypeField& cellTypeField /*! Cell type field */, - const Compute omega /*! LBM omega parameter */, - PopField& fOutField /*! 
Output Population field */) - -> Neon::set::Container - { - Neon::set::Container container = fInField.getGrid().newContainer( - "D3Q19_TwoPop_Pull", - [&, omega](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, - Neon::Pattern::STENCIL, stencilSemantic); - auto& fOut = L.load(fOutField); - const auto& cellInfoPartition = L.load(cellTypeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - CellType cellInfo = cellInfoPartition(gidx, 0); - if (cellInfo.classification == CellType::bulk) { - - Storage popIn[Lattice::Q]; - PullFunctions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); - - Compute rho; - std::array u{.0, .0, .0}; - CommonFunctions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); - - Compute usqr = 1.5 * (u[0] * u[0] + - u[1] * u[1] + - u[2] * u[2]); - - PullFunctions::collideBgkUnrolled(gidx, - popIn, - rho, u, - usqr, omega, - NEON_OUT fOut); - } - }; - }); - return container; - } -}; -} // namespace pull -namespace push { -/** - * Specialization for D3Q19 - */ -template -struct ContainerFactory, - Grid_> -{ - using Lattice = D3Q19; - using Precision = Precision_; - using Compute = typename Precision::Compute; - using Storage = typename Precision::Storage; - using Grid = Grid_; - - using PopField = typename Grid::template Field; - using CellTypeField = typename Grid::template Field; - - using Idx = typename PopField::Idx; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; - - using PushFunctions = push::DeviceD3Q19; - using CommonFunctions = common::DeviceD3Q19; - - static auto - iteration(Neon::set::StencilSemantic stencilSemantic, - const PopField& fInField /*! Input population field */, - const CellTypeField& cellTypeField /*! Cell type field */, - const Compute omega /*! LBM omega parameter */, - PopField& fOutField /*! 
Output Population field */) - -> Neon::set::Container - { - Neon::set::Container container = fInField.getGrid().newContainer( - "D3Q19_TwoPop", - [=](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, - Neon::Pattern::STENCIL, stencilSemantic); - auto fOut = L.load(fOutField); - const auto& cellInfoPartition = L.load(cellTypeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - CellType cellInfo = cellInfoPartition(gidx, 0); - if (cellInfo.classification == CellType::bulk) { - - Storage popIn[Lattice::Q]; - CommonFunctions::localLoad(gidx, fIn, NEON_OUT popIn); - - Compute rho; - std::array u{.0, .0, .0}; - CommonFunctions::macroscopic(popIn, - NEON_OUT rho, NEON_OUT u); - - Compute usqr = 1.5 * (u[0] * u[0] + - u[1] * u[1] + - u[2] * u[2]); - - CommonFunctions::collideBgkUnrolled(rho, u, - usqr, omega, - NEON_IO popIn); - - PushFunctions::pushStream(gidx, cellInfo.wallNghBitflag, popIn, NEON_OUT fOut); - } - }; - }); - return container; - } -}; -} // namespace push -namespace common { -/** - * Specialization for D3Q19 - */ -template -struct ContainerFactory, - Grid_> -{ - using Lattice = D3Q19; - using Precision = Precision_; - using Compute = typename Precision::Compute; - using Storage = typename Precision::Storage; - using Grid = Grid_; - - using PopField = typename Grid::template Field; - using CellTypeField = typename Grid::template Field; - - using Idx = typename PopField::Idx; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; - - using PullFunctions = pull::DeviceD3Q19; - using PushFunctions = push::DeviceD3Q19; - using CommonFunctions = common::DeviceD3Q19; - - template - static auto - iteration([[maybe_unused]] Neon::set::StencilSemantic stencilSemantic, - [[maybe_unused]] const PopField fInField /*! Input population field */, - [[maybe_unused]] const CellTypeField& cellTypeField /*! Cell type field */, - [[maybe_unused]] const Compute omega /*! 
LBM omega parameter */, - [[maybe_unused]] PopField fOutField /*! Output Population field */) - -> Neon::set::Container - { - if constexpr (method_ == lbm::Method::push) { - using Factory = push::ContainerFactory, - Grid_>; - return Factory::iteration(stencilSemantic, - fInField, - cellTypeField, - omega, - fOutField); - } - if constexpr (method_ == lbm::Method::pull) { - using Factory = pull::ContainerFactory, - Grid_>; - return Factory::iteration(stencilSemantic, - fInField, - cellTypeField, - omega, - fOutField); - } - NEON_DEV_UNDER_CONSTRUCTION(""); - } - - - static auto - computeWallNghMask(const CellTypeField& infoInField, - CellTypeField& infoOutpeField) - - -> Neon::set::Container - { - Neon::set::Container container = infoInField.getGrid().newContainer( - "LBM_iteration", - [&](Neon::set::Loader& L) -> auto { - auto& infoIn = L.load(infoInField, Neon::Pattern::STENCIL); - auto& infoOut = L.load(infoOutpeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - CellType cellType = infoIn(gidx, 0); - cellType.wallNghBitflag = 0; - - if (cellType.classification == CellType::bulk) { - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto fwdRegIdx) { - using M = typename Lattice::template RegisterMapper; - if constexpr (M::centerMemQ != M::fwdMemQ) { - CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); - if (nghCellType.classification == CellType::bounceBack || - nghCellType.classification == CellType::movingWall) { - cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << M::fwdMemQ)); - } - } - }); - infoOut(gidx, 0) = cellType; - } - }; - }); - return container; - } - - - template - static auto - userSettingBc(UserLambda userLambda, - PopField& pField, - CellTypeField& cellTypeField /*! 
Cell type field */) - -> Neon::set::Container - { - Neon::set::Container container = pField.getGrid().newContainer( - "UserSettingBc", - [&](Neon::set::Loader& L) -> auto { - auto& p = L.load(pField, Neon::Pattern::MAP); - auto& flag = L.load(cellTypeField, Neon::Pattern::MAP); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - const auto globalIdx = p.getGlobalIndex(gidx); - Storage pValues[Lattice::Q]; - CellType::Classification cellClass; - userLambda(globalIdx, pValues, cellClass); - - CellType flagVal(cellClass); - flag(gidx, 0) = flagVal; - - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - using M = typename Lattice::template RegisterMapper; - p(gidx, M::fwdMemQ) = pValues[M::fwdRegQ]; - }); - }; - }); - return container; - } - - static auto - copyPopulation(PopField& fInField, - PopField& foutField) - -> Neon::set::Container - { - Neon::set::Container container = fInField.getGrid().newContainer( - "LBM_iteration", - [&](Neon::set::Loader& L) -> auto { - auto const& pIn = L.load(fInField, Neon::Pattern::MAP); - auto& pOut = L.load(foutField, Neon::Pattern::MAP); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - pOut(gidx, q) = pIn(gidx, q); - }); - }; - }); - return container; - } - - static auto - computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, - const CellTypeField& cellTypeField /*! Cell type field */, - Rho& rhoField /*! output Population field */, - U& uField /*! 
output Population field */) - - -> Neon::set::Container - { - Neon::set::Container container = - fInField.getGrid().newContainer( - "LBM_iteration", - [&](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, - Neon::Pattern::STENCIL); - auto& rhoXpu = L.load(rhoField); - auto& uXpu = L.load(uField); - - const auto& cellInfoPartition = L.load(cellTypeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - CellType cellInfo = cellInfoPartition(gidx, 0); - Compute rho = 0; - std::array u{.0, .0, .0}; - - Storage popIn[Lattice::Q]; - CommonFunctions::localLoad(gidx, fIn, NEON_OUT popIn); - - if (cellInfo.classification == CellType::bulk) { - CommonFunctions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); - } else { - if (cellInfo.classification == CellType::movingWall) { - rho = 1.0; - u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), - static_cast(popIn[1]) / static_cast(6. * 1. / 18.), - static_cast(popIn[2]) / static_cast(6. * 1. / 18.)}; - } - } - - rhoXpu(gidx, 0) = static_cast(rho); - uXpu(gidx, 0) = static_cast(u[0]); - uXpu(gidx, 1) = static_cast(u[1]); - uXpu(gidx, 2) = static_cast(u[2]); - }; - }); - return container; - } - - static auto - problemSetup(PopField& fInField /*! 
inpout population field */, - PopField& fOutField, - CellTypeField& cellTypeField, - Neon::double_3d ulid, - double ulb) - - -> Neon::set::Container - { - Neon::set::Container container = fInField.getGrid().newContainer( - "LBM_iteration", - [&, ulid, ulb](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, Neon::Pattern::MAP); - auto& fOut = L.load(fOutField, Neon::Pattern::MAP); - auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - const auto globalIdx = fIn.getGlobalIndex(gidx); - const auto domainDim = fIn.getDomainSize(); - - CellType flagVal; - flagVal.classification = CellType::bulk; - flagVal.wallNghBitflag = 0; - - typename Lattice::Precision::Storage popVal = 0; - - if (globalIdx.x == 0 || globalIdx.x == domainDim.x - 1 || - globalIdx.y == 0 || globalIdx.y == domainDim.y - 1 || - globalIdx.z == 0 || globalIdx.z == domainDim.z - 1) { - flagVal.classification = CellType::bounceBack; - - if (globalIdx.y == domainDim.y - 1) { - flagVal.classification = CellType::movingWall; - } - - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - if (globalIdx.y == domainDim.y - 1) { - popVal = -6. 
* Lattice::Memory::template getT() * ulb * - (Lattice::Memory::template getDirection().v[0] * ulid.v[0] + - Lattice::Memory::template getDirection().v[1] * ulid.v[1] + - Lattice::Memory::template getDirection().v[2] * ulid.v[2]); - } else { - popVal = 0; - } - fIn(gidx, q) = popVal; - fOut(gidx, q) = popVal; - }); - } else { - flagVal.classification = CellType::bulk; - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - fIn(gidx, q) = Lattice::Memory::template getT(); - fOut(gidx, q) = Lattice::Memory::template getT(); - }); - } - cellInfoPartition(gidx, 0) = flagVal; - }; - }); - return container; - } - - static auto - setToEquilibrium(PopField& fOutField, - CellTypeField& cellTypeField) - -> Neon::set::Container - { - Neon::set::Container container = fOutField.getGrid().newContainer( - "LBM_setToEquilibrium", - [&](Neon::set::Loader& L) -> auto { - auto& fOut = L.load(fOutField, Neon::Pattern::MAP); - auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - { // All pints are pre-set to bulk - CellType flagVal; - flagVal.classification = CellType::bulk; - cellInfoPartition(gidx, 0) = flagVal; - } - - { // All cells are pre-set to Equilibrium - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - using M = typename Lattice::template RegisterMapper; - fOut(gidx, M::fwdMemQ) = Lattice::Registers::template getT(); - }); - } - }; - }); - return container; - } -}; -} // namespace common \ No newline at end of file diff --git a/benchmarks/lbm/src/ContainersD3Q27.h b/benchmarks/lbm/src/ContainersD3Q27.h deleted file mode 100644 index d5d024ea..00000000 --- a/benchmarks/lbm/src/ContainersD3Q27.h +++ /dev/null @@ -1,227 +0,0 @@ -#pragma once - -#include "CellType.h" -#include "D3Q27.h" -#include "DeviceD3Q27.h" -#include "Neon/Neon.h" -#include "Neon/set/Containter.h" -#if 0 -/** - * Specialization for D3Q27 - */ -template -struct ContainerFactory, - Grid_> -{ - using Lattice 
= D3Q27; - using Precision = Precision_; - using Compute = typename Precision::Compute; - using Storage = typename Precision::Storage; - using Grid = Grid_; - - using PopField = typename Grid::template Field; - using CellTypeField = typename Grid::template Field; - - using Idx = typename PopField::Idx; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; - - using Functions = DeviceD3Q19; - - static auto - iteration(Neon::set::StencilSemantic stencilSemantic, - const PopField& fInField /*! Input population field */, - const CellTypeField& cellTypeField /*! Cell type field */, - const Compute omega /*! LBM omega parameter */, - PopField& fOutField /*! Output Population field */) - -> Neon::set::Container - { - Neon::set::Container container = fInField.getGrid().newContainer( - "D3Q19_TwoPop", - [&, omega](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, - Neon::Pattern::STENCIL, stencilSemantic); - auto& fOut = L.load(fOutField); - const auto& cellInfoPartition = L.load(cellTypeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - CellType cellInfo = cellInfoPartition(gidx, 0); - if (cellInfo.classification == CellType::bulk) { - - Storage popIn[Lattice::Q]; - Functions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); - - Compute rho; - std::array u{.0, .0, .0}; - Functions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); - - Compute usqr = 1.5 * (u[0] * u[0] + - u[1] * u[1] + - u[2] * u[2]); - - Functions::collideBgkUnrolled(gidx, - popIn, - rho, u, - usqr, omega, - NEON_OUT fOut); - } - }; - }); - return container; - } - - - static auto - computeWallNghMask(const CellTypeField& infoInField, - CellTypeField& infoOutpeField) - - -> Neon::set::Container - { - Neon::set::Container container = infoInField.getGrid().newContainer( - "LBM_iteration", - [&](Neon::set::Loader& L) -> auto { - auto& infoIn = L.load(infoInField, Neon::Pattern::STENCIL); - auto& infoOut = 
L.load(infoOutpeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - CellType cellType = infoIn(gidx, 0); - cellType.wallNghBitflag = 0; - - if (cellType.classification == CellType::bulk) { - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { - if constexpr (GOMemoryId != Lattice::Memory::center) { - constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId]; - constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x; - constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y; - constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z; - - CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); - if (nghCellType.classification != CellType::bulk) { - cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOMemoryId)); - } - } - }); - - infoOut(gidx, 0) = cellType; - } - }; - }); - return container; - } - - - static auto - computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, - const CellTypeField& cellTypeField /*! Cell type field */, - Rho& rhoField /*! output Population field */, - U& uField /*! 
output Population field */) - - -> Neon::set::Container - { - Neon::set::Container container = fInField.getGrid().newContainer( - "LBM_iteration", - [&](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, - Neon::Pattern::STENCIL); - auto& rhoXpu = L.load(rhoField); - auto& uXpu = L.load(uField); - - const auto& cellInfoPartition = L.load(cellTypeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - CellType cellInfo = cellInfoPartition(gidx, 0); - Compute rho = 0; - std::array u{.0, .0, .0}; - Storage popIn[Lattice::Q]; - - if (cellInfo.classification == CellType::bulk) { - - Functions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); - Functions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); - - } else { - if (cellInfo.classification == CellType::movingWall) { - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GORegisterId) { - if constexpr (GORegisterId == Lattice::Registers::center) { - popIn[Lattice::Registers::center] = fIn(gidx, Lattice::Memory::center); - } else { - popIn[GORegisterId] = fIn(gidx, Lattice::Memory::template mapFromRegisters()); - } - }); - - rho = 1.0; - u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), - static_cast(popIn[1]) / static_cast(6. * 1. / 18.), - static_cast(popIn[2]) / static_cast(6. * 1. / 18.)}; - } - } - - rhoXpu(gidx, 0) = static_cast(rho); - uXpu(gidx, 0) = static_cast(u[0]); - uXpu(gidx, 1) = static_cast(u[1]); - uXpu(gidx, 2) = static_cast(u[2]); - }; - }); - return container; - } - - static auto - problemSetup(PopField& fInField /*! 
inpout population field */, - PopField& fOutField, - CellTypeField& cellTypeField, - Neon::double_3d ulid, - double ulb) - - -> Neon::set::Container - { - Neon::set::Container container = fInField.getGrid().newContainer( - "LBM_iteration", - [&, ulid, ulb](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, Neon::Pattern::MAP); - auto& fOut = L.load(fOutField, Neon::Pattern::MAP); - auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { - const auto globlalIdx = fIn.getGlobalIndex(gidx); - const auto domainDim = fIn.getDomainSize(); - CellType flagVal; - flagVal.classification = CellType::bulk; - flagVal.wallNghBitflag = 0; - typename Lattice::Precision::Storage val = 0; - - if (globlalIdx.x == 0 || globlalIdx.x == domainDim.x - 1 || - globlalIdx.y == 0 || globlalIdx.y == domainDim.y - 1 || - globlalIdx.z == 0 || globlalIdx.z == domainDim.z - 1) { - flagVal.classification = CellType::bounceBack; - - if (globlalIdx.y == domainDim.y - 1) { - flagVal.classification = CellType::movingWall; - } - - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - if (globlalIdx.y == domainDim.y - 1) { - val = -6. 
* Lattice::Memory::template getT() * ulb * - (Lattice::Memory::template getDirection().v[0] * ulid.v[0] + - Lattice::Memory::template getDirection().v[1] * ulid.v[1] + - Lattice::Memory::template getDirection().v[2] * ulid.v[2]); - } else { - val = 0; - } - fIn(gidx, q) = val; - fOut(gidx, q) = val; - }); - } else { - flagVal.classification = CellType::bulk; - cellInfoPartition(gidx, 0) = flagVal; - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - fIn(gidx, q) = Lattice::Memory::template getT(); - fOut(gidx, q) = Lattice::Memory::template getT(); - }); - } - }; - }); - return container; - } -}; -#endif \ No newline at end of file diff --git a/benchmarks/lbm/src/ContainersD3QXX.h b/benchmarks/lbm/src/ContainersD3QXX.h index cfb931d6..440d785a 100644 --- a/benchmarks/lbm/src/ContainersD3QXX.h +++ b/benchmarks/lbm/src/ContainersD3QXX.h @@ -3,7 +3,6 @@ #include "./Methods.h" #include "CellType.h" #include "D3Q19.h" -#include "DeviceD3Q19.h" #include "DeviceD3QXX.h" #include "Methods.h" #include "Neon/Neon.h" @@ -53,6 +52,9 @@ struct ContainerFactoryD3QXX [[maybe_unused]] const Compute invBeta = 1.0 / beta; return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + [[maybe_unused]] const Compute capturedOmega = omega; + [[maybe_unused]] const Compute capturedInvBeta = invBeta; + CellType cellInfo = cellInfoPartition(gidx, 0); if (cellInfo.classification == CellType::bulk) { @@ -69,13 +71,13 @@ struct ContainerFactoryD3QXX if constexpr (CollisionId == Collision::bgk) { Device::Common::collideBgkUnrolled(rho, u, - usqr, omega, + usqr, capturedOmega, NEON_IO popRegisters); } if constexpr (CollisionId == Collision::kbc) { Device::Common::collideKBCUnrolled(rho, u, - usqr, omega, - invBeta, + usqr, capturedOmega, + capturedInvBeta, NEON_IO popRegisters); } Device::Common::localStoreOpposite(gidx, popRegisters, popMem); @@ -115,6 +117,9 @@ struct ContainerFactoryD3QXX [[maybe_unused]] const Compute invBeta = 1.0 / beta; return [=] 
NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + [[maybe_unused]] const Compute capturedOmega = omega; + [[maybe_unused]] const Compute capturedInvBeta = invBeta; + CellType cellInfo = cellInfoPartition(gidx, 0); if (cellInfo.classification == CellType::bulk) { @@ -133,13 +138,13 @@ struct ContainerFactoryD3QXX if constexpr (CollisionId == Collision::bgk) { Device::Common::collideBgkUnrolled(rho, u, - usqr, omega, + usqr, capturedOmega, NEON_IO popRegisters); } if constexpr (CollisionId == Collision::kbc) { Device::Common::collideKBCUnrolled(rho, u, - usqr, omega, - invBeta, + usqr, capturedOmega, + capturedInvBeta, NEON_IO popRegisters); } Device::Push::pushStream(gidx, cellInfo.wallNghBitflag, popRegisters, NEON_OUT fpop); @@ -183,6 +188,9 @@ struct ContainerFactoryD3QXX [[maybe_unused]] const Compute invBeta = 1.0 / beta; return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + [[maybe_unused]] const Compute capturedOmega = omega; + [[maybe_unused]] const Compute capturedInvBeta = invBeta; + CellType cellInfo = cellInfoPartition(gidx, 0); if (cellInfo.classification == CellType::bulk) { @@ -199,13 +207,13 @@ struct ContainerFactoryD3QXX if constexpr (CollisionId == Collision::bgk) { Device::Common::collideBgkUnrolled(rho, u, - usqr, omega, + usqr, capturedOmega, NEON_IO popRegisters); } if constexpr (CollisionId == Collision::kbc) { Device::Common::collideKBCUnrolled(rho, u, - usqr, omega, - invBeta, + usqr, capturedOmega, + capturedInvBeta, NEON_IO popRegisters); } Device::Common::localStore(gidx, popRegisters, fOut); @@ -232,6 +240,9 @@ struct ContainerFactoryD3QXX [[maybe_unused]] const Compute invBeta = 1.0 / beta; return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + [[maybe_unused]] const Compute capturedOmega = omega; + [[maybe_unused]] const Compute capturedInvBeta = invBeta; + CellType cellInfo = cellInfoPartition(gidx, 0); if (cellInfo.classification == CellType::bulk) { @@ -248,13 
+259,13 @@ struct ContainerFactoryD3QXX if constexpr (CollisionId == Collision::bgk) { Device::Common::collideBgkUnrolled(rho, u, - usqr, omega, + usqr, capturedOmega, NEON_IO popRegisters); } if constexpr (CollisionId == Collision::kbc) { Device::Common::collideKBCUnrolled(rho, u, - usqr, omega, - invBeta, + usqr, capturedOmega, + capturedInvBeta, NEON_IO popRegisters); } Device::Common::localStore(gidx, popRegisters, fOut); @@ -336,6 +347,9 @@ struct ContainerFactoryD3QXX [[maybe_unused]] const Compute invBeta = 1.0 / beta; return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + [[maybe_unused]] const Compute capturedOmega = omega; + [[maybe_unused]] const Compute capturedInvBeta = invBeta; + CellType cellInfo = cellInfoPartition(gidx, 0); if (cellInfo.classification == CellType::bulk) { @@ -354,13 +368,13 @@ struct ContainerFactoryD3QXX if constexpr (CollisionId == Collision::bgk) { Device::Common::collideBgkUnrolled(rho, u, - usqr, omega, + usqr, capturedOmega, NEON_IO popRegisters); } if constexpr (CollisionId == Collision::kbc) { Device::Common::collideKBCUnrolled(rho, u, - usqr, omega, - invBeta, + usqr, capturedOmega, + capturedInvBeta, NEON_IO popRegisters); } Device::Push::pushStream(gidx, cellInfo.wallNghBitflag, popRegisters, NEON_OUT fOut); @@ -421,37 +435,6 @@ struct ContainerFactoryD3QXX struct Common { - template - static auto - iteration([[maybe_unused]] Neon::set::StencilSemantic stencilSemantic, - [[maybe_unused]] const PopField fInField /*! Input population field */, - [[maybe_unused]] const CellTypeField& cellTypeField /*! Cell type field */, - [[maybe_unused]] const Compute omega /*! LBM omega parameter */, - [[maybe_unused]] PopField fOutField /*! 
Output Population field */) - -> Neon::set::Container - { - if constexpr (method_ == lbm::Method::push) { - using FactoryPush = push::ContainerFactory, - Grid_>; - return FactoryPush::iteration(stencilSemantic, - fInField, - cellTypeField, - omega, - fOutField); - } - if constexpr (method_ == lbm::Method::pull) { - using FactoryPull = pull::ContainerFactory, - Grid_>; - return FactoryPull::iteration(stencilSemantic, - fInField, - cellTypeField, - omega, - fOutField); - } - NEON_DEV_UNDER_CONSTRUCTION(""); - } static auto diff --git a/benchmarks/lbm/src/D3Q19.h b/benchmarks/lbm/src/D3Q19.h index 816e46f6..cc29c225 100644 --- a/benchmarks/lbm/src/D3Q19.h +++ b/benchmarks/lbm/src/D3Q19.h @@ -29,8 +29,9 @@ struct D3Q19 struct Registers { + using Self = D3Q19::Registers; - static constexpr std::array stencil{ + static constexpr Neon::index_3d stencil[Q]{ /*! 0 */ Neon::index_3d(-1, 0, 0), /*! 1 */ Neon::index_3d(0, -1, 0), /*! 2 */ Neon::index_3d(0, 0, -1), @@ -52,9 +53,31 @@ struct D3Q19 /*! 18 */ Neon::index_3d(0, 1, -1)}; template - static inline NEON_CUDA_HOST_DEVICE auto - getComponentOfDirection() -> int{ - return stencil[qIdx].v[cIdx]; + NEON_CUDA_HOST_DEVICE static constexpr auto + getComponentOfDirection() -> int + { + constexpr Neon::index_3d s[Q]{ + /*! 0 */ Neon::index_3d(-1, 0, 0), + /*! 1 */ Neon::index_3d(0, -1, 0), + /*! 2 */ Neon::index_3d(0, 0, -1), + /*! 3 */ Neon::index_3d(-1, -1, 0), + /*! 4 */ Neon::index_3d(-1, 1, 0), + /*! 5 */ Neon::index_3d(-1, 0, -1), + /*! 6 */ Neon::index_3d(-1, 0, 1), + /*! 7 */ Neon::index_3d(0, -1, -1), + /*! 8 */ Neon::index_3d(0, -1, 1), + /*! 9 */ Neon::index_3d(0, 0, 0), + /*! 10 */ Neon::index_3d(1, 0, 0), + /*! 11 */ Neon::index_3d(0, 1, 0), + /*! 12 */ Neon::index_3d(0, 0, 1), + /*! 13 */ Neon::index_3d(1, 1, 0), + /*! 14 */ Neon::index_3d(1, -1, 0), + /*! 15 */ Neon::index_3d(1, 0, 1), + /*! 16 */ Neon::index_3d(1, 0, -1), + /*! 17 */ Neon::index_3d(0, 1, 1), + /*! 
18 */ Neon::index_3d(0, 1, -1)}; + + return s[qIdx].template getComponent(); } static constexpr int center = 9; /** Position of direction {0,0,0} */ diff --git a/benchmarks/lbm/src/D3Q27.h b/benchmarks/lbm/src/D3Q27.h index 3830ecda..4512ef83 100644 --- a/benchmarks/lbm/src/D3Q27.h +++ b/benchmarks/lbm/src/D3Q27.h @@ -63,7 +63,36 @@ struct D3Q27 static constexpr inline NEON_CUDA_HOST_DEVICE auto getComponentOfDirection() -> int { - return stencil[qIdx].v[cIdx]; + constexpr std::array s{ + /* 00 */ Neon::index_3d(-1, 0, 0), + /* 01 */ Neon::index_3d(0, -1, 0), + /* 02 */ Neon::index_3d(0, 0, -1), + /* 03 */ Neon::index_3d(-1, -1, 0), + /* 04 */ Neon::index_3d(-1, 1, 0), + /* 05 */ Neon::index_3d(-1, 0, -1), + /* 06 */ Neon::index_3d(-1, 0, 1), + /* 07 */ Neon::index_3d(0, -1, -1), + /* 08 */ Neon::index_3d(0, -1, 1), + /* 09 */ Neon::index_3d(-1, -1, -1), + /* 00 */ Neon::index_3d(-1, -1, 1), + /* 11 */ Neon::index_3d(-1, 1, -1), + /* 12 */ Neon::index_3d(-1, 1, 1), + /* 13 */ Neon::index_3d(0, 0, 0), + /* 14 */ Neon::index_3d(1, 0, 0), + /* 15 */ Neon::index_3d(0, 1, 0), + /* 16 */ Neon::index_3d(0, 0, 1), + /* 17 */ Neon::index_3d(1, 1, 0), + /* 18 */ Neon::index_3d(1, -1, 0), + /* 19 */ Neon::index_3d(1, 0, 1), + /* 20 */ Neon::index_3d(1, 0, -1), + /* 21 */ Neon::index_3d(0, 1, 1), + /* 22 */ Neon::index_3d(0, 1, -1), + /* 23 */ Neon::index_3d(1, 1, 1), + /* 24 */ Neon::index_3d(1, 1, -1), + /* 25 */ Neon::index_3d(1, -1, 1), + /* 26 */ Neon::index_3d(1, -1, -1)}; + + return s[qIdx].v[cIdx]; } static constexpr int center = 13; /** Position of direction {0,0,0} */ diff --git a/benchmarks/lbm/src/DeviceD3Q19.h b/benchmarks/lbm/src/DeviceD3Q19.h deleted file mode 100644 index 60a4033d..00000000 --- a/benchmarks/lbm/src/DeviceD3Q19.h +++ /dev/null @@ -1,229 +0,0 @@ -#pragma once -#include "CellType.h" -#include "D3Q19.h" -#include "Neon/Neon.h" -#include "Neon/set/Containter.h" - -namespace pull { -template -struct DeviceD3Q19 -{ - using Lattice = D3Q19; - using 
Precision = Precision_; - using Compute = typename Precision::Compute; - using Storage = typename Precision::Storage; - using Grid = Grid_; - - using PopField = typename Grid::template Field; - using CellTypeField = typename Grid::template Field; - - using Idx = typename PopField::Idx; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; - - - static inline NEON_CUDA_HOST_DEVICE auto - pullStream(Idx const& gidx, - const uint32_t& wallBitFlag, - typename PopField::Partition const& fin, - NEON_OUT Storage popIn[Lattice::Q]) - { - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - using M = typename Lattice::template RegisterMapper; - - if constexpr (M::fwdMemQ == M::centerMemQ) { - popIn[M::centerRegQ] = fin(gidx, M::centerMemQ); - } else { - if (CellType::isWall()) { - popIn[M::fwdRegQ] = fin(gidx, M::bkMemIdx) + - fin.template getNghData(gidx, M::bkwMemIdx)(); - } else { - popIn[M::fwdRegQ] = fin.template getNghData(gidx, M::fwdMemIdx)(); - } - } - }); - } -}; - -} // namespace pull - -namespace push { -template -struct DeviceD3Q19 -{ - using Lattice = D3Q19; - using Precision = Precision_; - using Compute = typename Precision::Compute; - using Storage = typename Precision::Storage; - using Grid = Grid_; - - using PopField = typename Grid::template Field; - using CellTypeField = typename Grid::template Field; - - using Idx = typename PopField::Idx; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; - - - static inline NEON_CUDA_HOST_DEVICE auto - pushStream(Idx const& gidx, - const uint32_t& wallNghBitFlag, - NEON_OUT Storage pOut[Lattice::Q], - NEON_OUT typename PopField::Partition& fOut) - { - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - using M = typename Lattice::template RegisterMapper; - - if constexpr (M::fwdMemQ == M::centerMemQ) { - fOut(gidx, M::centerMemQ) = pOut[M::centerRegQ]; - } else { - if (CellType::isWall(wallNghBitFlag)) { - const auto pop_out = pOut[M::fwdRegQ]; - 
const auto f_nb_k = fOut.template getNghData(gidx, M::fwdMemQ)(); - - // fout(i, opp[k]) = - fOut(gidx, M::bkwMemQ) = - // pop_out + - pop_out + - // f(nb, k); - f_nb_k; - } else { - // fout(nb, - fOut.template writeNghData(gidx, - // k) - M::fwdMemQ, - // = pop_out; - pOut[M::fwdRegQ]); - } - } - }); - } -}; -} // namespace push - - -namespace common { -template -struct DeviceD3Q19 -{ - using Lattice = D3Q19; - using Precision = Precision_; - using Compute = typename Precision::Compute; - using Storage = typename Precision::Storage; - using Grid = Grid_; - - using PopField = typename Grid::template Field; - using CellTypeField = typename Grid::template Field; - - using Idx = typename PopField::Idx; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; - - - static inline NEON_CUDA_HOST_DEVICE auto - macroscopic(const Storage pop[Lattice::Q], - NEON_OUT Compute& rho, - NEON_OUT std::array& u) - -> void - { - -#define POP(IDX) static_cast(pop[IDX]) - const Compute X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6); - const Compute X_P1 = POP(10) + POP(13) + POP(14) + POP(15) + POP(16); - const Compute X_0 = POP(9) + POP(1) + POP(2) + POP(7) + POP(8) + POP(11) + POP(12) + POP(17) + POP(18); - - const Compute Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(14); - const Compute Y_P1 = POP(4) + POP(11) + POP(13) + POP(17) + POP(18); - - const Compute Z_M1 = POP(2) + POP(5) + POP(7) + POP(16) + POP(18); - const Compute Z_P1 = POP(6) + POP(8) + POP(12) + POP(15) + POP(17); -#undef POP - - rho = X_M1 + X_P1 + X_0; - u[0] = (X_P1 - X_M1) / rho; - u[1] = (Y_P1 - Y_M1) / rho; - u[2] = (Z_P1 - Z_M1) / rho; - } - - - static inline NEON_CUDA_HOST_DEVICE auto - collideBgkUnrolled(Compute const& rho /*! Density */, - std::array const& u /*! Velocity */, - Compute const& usqr /*! Usqr */, - Compute const& omega /*! Omega */, - NEON_IO Storage pop[Lattice::Q]) - - -> void - { - - // constexpr Compute c1over18 = 1. 
/ 18.; - constexpr Compute c4dot5 = 4.5; - constexpr Compute c3 = 3.; - constexpr Compute c1 = 1.; - constexpr Compute c6 = 6.; - - // constexpr int regCenter = Lattice::Registers::center; - // constexpr int regFir = Lattice::Registers::center; - - Neon::ConstexprFor<0, Lattice::Registers::firstHalfQLen, 1>( - [&](auto q) { - using M = typename Lattice::template RegisterMapper; - using T = typename Lattice::Registers; - - Compute eqFw; - Compute eqBk; - - const Compute ck_u = T::template getCk_u(u); - - // double eq = rho * t[k] * - // (1. + - // 3. * ck_u + - // 4.5 * ck_u * ck_u - - // usqr); - eqFw = rho * T::t[M::fwdRegQ] * - (c1 + - c3 * ck_u + - c4dot5 * ck_u * ck_u - - usqr); - - // double eqopp = eq - 6.* rho * t[k] * ck_u; - eqBk = eqFw - - c6 * rho * T::t[M::fwdRegQ] * ck_u; - - // pop_out = (1. - omega) * fin(i, k) + omega * eq; - pop[M::fwdRegQ] = (c1 - omega) * static_cast(pop[M::fwdRegQ]) + omega * eqFw; - // pop_out_opp = (1. - omega) * fin(i, opp[k]) + omega * eqopp; - pop[M::bkwRegQ] = (c1 - omega) * static_cast(pop[M::bkwRegQ]) + omega * eqBk; - }); - { // Center; - using T = typename Lattice::Registers; - using M = typename Lattice::template RegisterMapper; - // eq = rho * t[k] * (1. - usqr); - const Compute eqCenter = rho * T::t[M::centerRegQ] * (c1 - usqr); - // fout(i, k) = (1. 
- omega) * fin(i, k) + omega * eq; - pop[M::centerRegQ] = (c1 - omega) * static_cast(pop[M::centerRegQ]) + omega * eqCenter; - } - } - static inline NEON_CUDA_HOST_DEVICE auto - localLoad(Idx const& gidx, - NEON_IN typename PopField::Partition const& fOut, - Storage NEON_RESTRICT pOut[Lattice::Q]) - { - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - using M = typename Lattice::template RegisterMapper; - pOut[M::fwdRegQ] = fOut(gidx, M::fwdMemQ); - }); - } - - static inline NEON_CUDA_HOST_DEVICE auto - localStore(Idx const& gidx, - Storage NEON_RESTRICT pOut[Lattice::Q], - NEON_IN typename PopField::Partition& fOut) - { - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - using M = typename Lattice::template RegisterMapper; - fOut(gidx, M::fwdMemQ) = pOut[M::fwdRegQ]; - }); - } -}; -} // namespace common \ No newline at end of file diff --git a/benchmarks/lbm/src/DeviceD3Q27.h b/benchmarks/lbm/src/DeviceD3Q27.h deleted file mode 100644 index f977492b..00000000 --- a/benchmarks/lbm/src/DeviceD3Q27.h +++ /dev/null @@ -1,217 +0,0 @@ -#pragma once -#include "CellType.h" -#include "D3Q27.h" -#include "Neon/Neon.h" -#include "Neon/set/Containter.h" - - -template -struct DeviceD3Q27 -{ - using Lattice = D3Q27; - using Precision = Precision_; - using Compute = typename Precision::Compute; - using Storage = typename Precision::Storage; - using Grid = Grid_; - - using PopField = typename Grid::template Field; - using CellTypeField = typename Grid::template Field; - - using Idx = typename PopField::Idx; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; - - - static inline NEON_CUDA_HOST_DEVICE auto - pullStream(Idx const& gidx, - const uint32_t& wallBitFlag, - typename PopField::Partition const& fin, - NEON_OUT Storage popIn[Lattice::Q]) - { - - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { - if constexpr (GOMemoryId == Lattice::Memory::center) { - popIn[Lattice::Registers::center] = fin(gidx, 
Lattice::Memory::center); - } else { - constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId]; - constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x; - constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y; - constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z; - constexpr int GORegistersId = Lattice::Memory::template mapToRegisters(); - - if (wallBitFlag & (uint32_t(1) << GOMemoryId)) { - popIn[GORegistersId] = - fin(gidx, BKMemoryId) + - fin.template getNghData(gidx, BKMemoryId)(); - } else { - popIn[GORegistersId] = - fin.template getNghData(gidx, GOMemoryId)(); - } - } - }); - } - - static inline NEON_CUDA_HOST_DEVICE auto - macroscopic(const Storage pop[Lattice::Q], - NEON_OUT Compute& rho, - NEON_OUT std::array& u) - -> void - { - -#define POP(IDX) static_cast(pop[IDX]) - const Compute X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6) + POP(9) + POP(10) + POP(11) + POP(12); - const Compute X_P1 = POP(14) + POP(17) + POP(18) + POP(19) + POP(20) + POP(23) + POP(24) + POP(25) + POP(26); - const Compute X_0 = POP(1) + POP(2) + POP(7) + POP(8) + POP(13) + POP(15) + POP(16) + POP(21) + POP(22); - - const Compute Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(9) + POP(10) + POP(18) + POP(25) + POP(26); - const Compute Y_P1 = POP(15) + POP(17) + POP(21) + POP(22) + POP(23) + POP(24) + POP(4) + POP(11) + POP(12); - - const Compute Z_M1 = POP(2) + POP(5) + POP(7) + POP(9) + POP(11) + POP(20) + POP(22) + POP(24) + POP(26); - const Compute Z_P1 = POP(16) + POP(19) + POP(21) + POP(23) + POP(25) + POP(6) + POP(8) + POP(10) + POP(12); -#undef POP - - rho = X_M1 + X_P1 + X_0; - u[0] = (X_P1 - X_M1) / rho; - u[0] = (Y_P1 - Y_M1) / rho; - u[0] = (Z_P1 - Z_M1) / rho; - } - - - static inline NEON_CUDA_HOST_DEVICE auto - collideBgkUnrolled(Idx const& i /*! Compute iterator */, - const Storage pop[Lattice::Q], - Compute const& rho /*! Density */, - std::array const& u /*! Velocity */, - Compute const& usqr /*! Usqr */, - Compute const& omega /*! 
Omega */, - typename PopField::Partition& fOut /*! Population */) - - -> void - { - const Compute cku1 = u[0] + u[1]; - const Compute cku2 = -u[0] + u[1]; - const Compute cku3 = u[0] + u[2]; - const Compute cku4 = -u[0] + u[2]; - const Compute cku5 = u[1] + u[2]; - const Compute cku6 = -u[1] + u[2]; - const Compute cku7 = u[0] + u[1] + u[2]; - const Compute cku8 = -u[0] + u[1] + u[2]; - const Compute cku9 = u[0] - u[1] + u[2]; - const Compute cku0 = u[0] + u[1] - u[2]; - - std::array feqRM; - - constexpr int F000 = 13; - constexpr int FM00 = 0; - constexpr int F0M0 = 1; - constexpr int F00M = 2; - constexpr int FMM0 = 3; - constexpr int FMP0 = 4; - constexpr int FM0M = 5; - constexpr int FM0P = 6; - constexpr int F0MM = 7; - constexpr int F0MP = 8; - constexpr int FMMM = 9; - constexpr int FMMP = 10; - constexpr int FMPM = 11; - constexpr int FMPP = 12; - constexpr int FP00 = 14; - constexpr int F0P0 = 15; - constexpr int F00P = 16; - constexpr int FPP0 = 17; - constexpr int FPM0 = 18; - constexpr int FP0P = 19; - constexpr int FP0M = 20; - constexpr int F0PP = 21; - constexpr int F0PM = 22; - constexpr int FPPP = 23; - constexpr int FPPM = 24; - constexpr int FPMP = 25; - constexpr int FPMM = 26; - - constexpr Compute c1over18 = 1. / 18.; - constexpr Compute c1over36 = 1. 
/ 36.; - constexpr Compute c4dot5 = 4.5; - constexpr Compute c3 = 3.; - constexpr Compute c1 = 1.; - constexpr Compute c6 = 6.; - - feqRM[F000] = rho * Lattice::Registers::t[F000] * (c1- usqr); - - feqRM[FM00] = rho * Lattice::Registers::t[FM00] * (c1- c3* u[0] + c4dot5* u[0] * u[0] - usqr); - feqRM[FP00] = rho * Lattice::Registers::t[FP00] * (c6 * u[0]) + feqRM[FM00]; - - feqRM[F0M0] = rho * Lattice::Registers::t[F0M0] * (c1- c3* u[1] + c4dot5* u[1] * u[1] - usqr); - feqRM[F0P0] = rho * Lattice::Registers::t[F0P0] * (c6 * u[1]) + feqRM[F0M0]; - - feqRM[F00M] = rho * Lattice::Registers::t[F00M] * (c1- c3* u[2] + c4dot5* u[2] * u[2] - usqr); - feqRM[F00P] = rho * Lattice::Registers::t[F00P] * (c6 * u[2]) + feqRM[F00M]; - - feqRM[FMM0] = rho * Lattice::Registers::t[FMM0] * (c1- c3* cku1 + c4dot5* cku1 * cku1 - usqr); - feqRM[FPP0] = rho * Lattice::Registers::t[FPP0] * (c6 * cku1) + feqRM[FMM0]; - feqRM[FPM0] = rho * Lattice::Registers::t[FPM0] * (c1- c3* cku2 + c4dot5* cku2 * cku2 - usqr); - feqRM[FMP0] = rho * Lattice::Registers::t[FMP0] * (c6 * cku2) + feqRM[FPM0]; - - feqRM[FM0M] = rho * Lattice::Registers::t[FM0M] * (c1- c3* cku3 + c4dot5* cku3 * cku3 - usqr); - feqRM[FP0P] = rho * Lattice::Registers::t[FP0P] * (c6 * cku3) + feqRM[FM0M]; - feqRM[FP0M] = rho * Lattice::Registers::t[FP0M] * (c1- c3* cku4 + c4dot5* cku4 * cku4 - usqr); - feqRM[FM0P] = rho * Lattice::Registers::t[FM0P] * (c6 * cku4) + feqRM[FP0M]; - - feqRM[F0MM] = rho * Lattice::Registers::t[F0MM] * (c1- c3* cku5 + c4dot5* cku5 * cku5 - usqr); - feqRM[F0PP] = rho * Lattice::Registers::t[F0PP] * (c6 * cku5) + feqRM[F0MM]; - feqRM[F0PM] = rho * Lattice::Registers::t[F0PM] * (c1- c3* cku6 + c4dot5* cku6 * cku6 - usqr); - feqRM[F0MP] = rho * Lattice::Registers::t[F0MP] * (c6 * cku6) + feqRM[F0PM]; - - feqRM[FMMM] = rho * Lattice::Registers::t[FMMM] * (c1- c3* cku7 + c4dot5* cku7 * cku7 - usqr); - feqRM[FPPP] = rho * Lattice::Registers::t[FPPP] * (c6 * cku7) + feqRM[FMMM]; - feqRM[FPMM] = rho * 
Lattice::Registers::t[FPMM] * (c1- c3* cku8 + c4dot5* cku8 * cku8 - usqr); - feqRM[FMPP] = rho * Lattice::Registers::t[FMPP] * (c6 * cku8) + feqRM[FPMM]; - feqRM[FMPM] = rho * Lattice::Registers::t[FMPM] * (c1- c3* cku9 + c4dot5* cku9 * cku9 - usqr); - feqRM[FPMP] = rho * Lattice::Registers::t[FPMP] * (c6 * cku9) + feqRM[FMPM]; - feqRM[FMMP] = rho * Lattice::Registers::t[FMMP] * (c1- c3* cku0 + c4dot5* cku0 * cku0 - usqr); - feqRM[FPPM] = rho * Lattice::Registers::t[FPPM] * (c6 * cku0) + feqRM[FMMP]; - - // BGK Collision based on the second-order equilibrium - std::array foutRM; - - foutRM[F000] = (c1- omega) * static_cast(pop[F000]) + omega * feqRM[F000]; - - foutRM[FP00] = (c1- omega) * static_cast(pop[FP00]) + omega * feqRM[FP00]; - foutRM[FM00] = (c1- omega) * static_cast(pop[FM00]) + omega * feqRM[FM00]; - - foutRM[F0P0] = (c1- omega) * static_cast(pop[F0P0]) + omega * feqRM[F0P0]; - foutRM[F0M0] = (c1- omega) * static_cast(pop[F0M0]) + omega * feqRM[F0M0]; - - foutRM[F00P] = (c1- omega) * static_cast(pop[F00P]) + omega * feqRM[F00P]; - foutRM[F00M] = (c1- omega) * static_cast(pop[F00M]) + omega * feqRM[F00M]; - - foutRM[FPP0] = (c1- omega) * static_cast(pop[FPP0]) + omega * feqRM[FPP0]; - foutRM[FMP0] = (c1- omega) * static_cast(pop[FMP0]) + omega * feqRM[FMP0]; - foutRM[FPM0] = (c1- omega) * static_cast(pop[FPM0]) + omega * feqRM[FPM0]; - foutRM[FMM0] = (c1- omega) * static_cast(pop[FMM0]) + omega * feqRM[FMM0]; - - foutRM[FP0P] = (c1- omega) * static_cast(pop[FP0P]) + omega * feqRM[FP0P]; - foutRM[FM0P] = (c1- omega) * static_cast(pop[FM0P]) + omega * feqRM[FM0P]; - foutRM[FP0M] = (c1- omega) * static_cast(pop[FP0M]) + omega * feqRM[FP0M]; - foutRM[FM0M] = (c1- omega) * static_cast(pop[FM0M]) + omega * feqRM[FM0M]; - - foutRM[F0PP] = (c1- omega) * static_cast(pop[F0PP]) + omega * feqRM[F0PP]; - foutRM[F0MP] = (c1- omega) * static_cast(pop[F0MP]) + omega * feqRM[F0MP]; - foutRM[F0PM] = (c1- omega) * static_cast(pop[F0PM]) + omega * feqRM[F0PM]; - 
foutRM[F0MM] = (c1- omega) * static_cast(pop[F0MM]) + omega * feqRM[F0MM]; - - foutRM[FPPP] = (c1- omega) * static_cast(pop[FPPP]) + omega * feqRM[FPPP]; - foutRM[FMPP] = (c1- omega) * static_cast(pop[FMPP]) + omega * feqRM[FMPP]; - foutRM[FPMP] = (c1- omega) * static_cast(pop[FPMP]) + omega * feqRM[FPMP]; - foutRM[FPPM] = (c1- omega) * static_cast(pop[FPPM]) + omega * feqRM[FPPM]; - foutRM[FMMP] = (c1- omega) * static_cast(pop[FMMP]) + omega * feqRM[FMMP]; - foutRM[FMPM] = (c1- omega) * static_cast(pop[FMPM]) + omega * feqRM[FMPM]; - foutRM[FPMM] = (c1- omega) * static_cast(pop[FPMM]) + omega * feqRM[FPMM]; - foutRM[FMMM] = (c1- omega) * static_cast(pop[FMMM]) + omega * feqRM[FMMM]; - - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { - fOut(i, GOMemoryId) = static_cast(foutRM[Lattice::Memory::template mapToRegisters()]); - }); - } -}; - diff --git a/benchmarks/lbm/src/DeviceD3QXX.h b/benchmarks/lbm/src/DeviceD3QXX.h index 1c457895..b7f487a2 100644 --- a/benchmarks/lbm/src/DeviceD3QXX.h +++ b/benchmarks/lbm/src/DeviceD3QXX.h @@ -250,8 +250,8 @@ struct DeviceD3QXX static inline NEON_CUDA_HOST_DEVICE auto localStoreOpposite(Idx const& gidx, - Storage NEON_RESTRICT pOut[Lattice::Q], - NEON_OUT typename PopField::Partition& fOut) + Storage NEON_RESTRICT pOut[Lattice::Q], + NEON_OUT typename PopField::Partition& fOut) { Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { using M = typename Lattice::template RegisterMapper; @@ -260,12 +260,12 @@ struct DeviceD3QXX } static inline NEON_CUDA_HOST_DEVICE auto - collideKBCUnrolled(Compute const& rho /*! Density */, - std::array const& u /*! Velocity */, - Compute const& usqr /*! Usqr */, - Compute const& omega /*! Omega */, - Compute const& invBeta /*! invBeta */, - [[maybe_unused]] NEON_IO Storage pop[Lattice::Q]) + collideKBCUnrolled(Compute const& rho /*! Density */, + std::array const& u /*! Velocity */, + Compute const& usqr /*! Usqr */, + Compute const& omega /*! Omega */, + Compute const& invBeta /*! 
invBeta */, + [[maybe_unused]] NEON_IO Storage pop[Lattice::Q]) -> void { diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h index aa47a45a..132a3d99 100644 --- a/benchmarks/lbm/src/Lbm.h +++ b/benchmarks/lbm/src/Lbm.h @@ -3,8 +3,6 @@ #include "./Metrics.h" #include "./Repoert.h" #include "CellType.h" -#include "ContainerFactory.h" -#include "ContainersD3Q19.h" #include "ContainersD3QXX.h" #include "D3Q19.h" #include "Methods.h" diff --git a/benchmarks/lbm/src/LbmSkeleton.h b/benchmarks/lbm/src/LbmSkeleton.h deleted file mode 100644 index 22ae8177..00000000 --- a/benchmarks/lbm/src/LbmSkeleton.h +++ /dev/null @@ -1,117 +0,0 @@ -#include "CellType.h" -#include "ContainerFactory.h" -#include "ContainersD3Q19.h" -#include "D3Q19.h" -#include "Neon/Neon.h" -#include "Neon/set/Backend.h" -#include "Neon/set/Containter.h" -#include "Neon/skeleton/Skeleton.h" - -template -struct LbmSkeleton -{ -}; - - -template -struct LbmSkeleton, - Grid_> -{ - using Lattice = D3Q19; - using Precision = Precision_; - using Compute = typename Precision::Compute; - using Storage = typename Precision::Storage; - using Grid = Grid_; - - using PopField = typename Grid::template Field; - using CellTypeField = typename Grid::template Field; - - using Idx = typename PopField::Idx; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; - - using ContainerFactory = common::ContainerFactory; - - LbmSkeleton(Neon::set::StencilSemantic stencilSemantic, - Neon::skeleton::Occ occ, - Neon::set::TransferMode transfer, - PopField& fIn /*! inpout population field */, - PopField& fOut, - CellTypeField& cellTypeField /*! Cell type field */, - Compute omega /*! 
LBM omega parameter */) - { - pop[0] = fIn; - pop[1] = fOut; - - setupSkeletons(0, stencilSemantic, occ, transfer, pop[0], pop[1], cellTypeField, omega); - setupSkeletons(1, stencilSemantic, occ, transfer, pop[1], pop[0], cellTypeField, omega); - - parity = 0; - } - - auto getInput() - -> PopField& - { - return pop[parity]; - } - - auto getOutput() - -> PopField& - { - int other = parity == 0 ? 1 : 0; - return pop[other]; - } - - auto run() - -> void - { - lbmTwoPop[parity].run(); - updateParity(); - } - - auto sync() - -> void - { - pop[0].getBackend().syncAll(); - } - - private: - auto updateParity() - -> void - { - parity = parity == 0 ? 1 : 0; - } - - auto setupSkeletons(int target, - Neon::set::StencilSemantic stencilSemantic, - Neon::skeleton::Occ occ, - Neon::set::TransferMode transfer, - PopField& inField /*! inpout population field */, - PopField& outField, - CellTypeField& cellTypeField /*! Cell type field */, - Compute omega /*! LBM omega parameter */) - { - std::vector ops; - lbmTwoPop[target] = Neon::skeleton::Skeleton(inField.getBackend()); - Neon::skeleton::Options opt(occ, transfer); - ops.push_back(ContainerFactory::template iteration(stencilSemantic, - inField, - cellTypeField, - omega, - outField)); - std::stringstream appName; - appName << "LBM_iteration_" << std::to_string(target); - lbmTwoPop[target].sequence(ops, appName.str(), opt); - } - - Neon::skeleton::Skeleton lbmTwoPop[2]; - PopField pop[2]; - int parity; -}; \ No newline at end of file diff --git a/benchmarks/lbm/src/LbmToolsTemplateOnly.h b/benchmarks/lbm/src/LbmToolsTemplateOnly.h deleted file mode 100644 index 489b3782..00000000 --- a/benchmarks/lbm/src/LbmToolsTemplateOnly.h +++ /dev/null @@ -1,440 +0,0 @@ -#include "CellType.h" -#include "D3Q19.h" -#include "Neon/Neon.h" -#include "Neon/set/Containter.h" - -#define COMPUTE_CAST(VAR) static_cast((VAR)) - -template -struct LbmContainersTemplateOnly -{ -}; - -/** - * Specialization for Lattice - * @tparam PopulationField - * @tparam 
LbmComputeType - */ -template -struct LbmContainersTemplateOnly, - PopulationField, - LbmComputeType> -{ - using LbmStoreType = typename PopulationField::Type; - using CellTypeField = typename PopulationField::Grid::template Field; - using Lattice = D3Q19; - using Idx = typename PopulationField::Idx; - using Grid = typename PopulationField::Grid; - using Rho = typename Grid::template Field; - using U = typename Grid::template Field; - -#define LOADPOP(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ - { \ - { /*GO*/ \ - if (wallBitFlag & (uint32_t(1) << GOid)) { \ - /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \ - popIn[GOid] = fin.template read(gidx); \ - } else { \ - popIn[GOid] = fin.template nghVal(gidx).value; \ - } \ - } \ - { /*BK*/ \ - if (wallBitFlag & (uint32_t(1) << BKid)) { \ - popIn[BKid] = fin.template read(gidx); \ - } else { \ - popIn[BKid] = fin.template nghVal(gidx).value; \ - } \ - } \ - } - static inline NEON_CUDA_HOST_DEVICE auto - loadPopulation(Idx const& gidx, - const uint32_t& wallBitFlag, - typename PopulationField::Partition const& fin, - NEON_OUT LbmStoreType popIn[19]) - { - // #pragma omp critical - // { - - LOADPOP(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10); - LOADPOP(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11); - LOADPOP(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12); - LOADPOP(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13); - LOADPOP(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14); - LOADPOP(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15); - LOADPOP(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16); - LOADPOP(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17); - LOADPOP(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18); - // } - // Treat the case of the center (c[k] = {0, 0, 0,}). 
- { - popIn[Lattice::centerDirection] = fin(i, Lattice::centerDirection); - } - } -#undef LOADPOP - -#define PULL_STREAM(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ - { \ - { /*GO*/ \ - if (wallBitFlag & (uint32_t(1) << GOid)) { \ - /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \ - popIn[GOid] = fin(gidx, BKid) + \ - fin.template getNghData(gidx, BKid)(); \ - } else { \ - popIn[GOid] = fin.template getNghData(gidx, GOid)(); \ - } \ - } \ - { /*BK*/ \ - if (wallBitFlag & (uint32_t(1) << BKid)) { \ - popIn[BKid] = fin(gidx, GOid) + fin.template getNghData(gidx, GOid)(); \ - } else { \ - popIn[BKid] = fin.template getNghData(gidx, BKid)(); \ - } \ - } \ - } - - static inline NEON_CUDA_HOST_DEVICE auto - pullStream(Idx const& gidx, - const uint32_t& wallBitFlag, - typename PopulationField::Partition const& fin, - NEON_OUT LbmStoreType popIn[19]) - { - // #pragma omp critical - // { -#if 0 - using TopologyByDirection = std::tuple; - constexpr std::array stencil{ - std::make_tuple(Neon::int32_3d(-1, 0, 0), /* GOid */ 0, /* --- */ Neon::int32_3d(1, 0, 0), /* BKid */ 10), - std::make_tuple(Neon::int32_3d(0, -1, 0), /* GOid */ 1, /* --- */ Neon::int32_3d(0, 1, 0), /* BKid */ 11), - std::make_tuple(Neon::int32_3d(0, 0, -1), /* GOid */ 2, /* --- */ Neon::int32_3d(0, 0, 1), /* BKid */ 12), - std::make_tuple(Neon::int32_3d(-1, -1, 0), /* GOid */ 3, /* --- */ Neon::int32_3d(1, 1, 0), /* BKid */ 13), - std::make_tuple(Neon::int32_3d(-1, 1, 0), /* GOid */ 4, /* --- */ Neon::int32_3d(1, -1, 0), /* BKid */ 14), - std::make_tuple(Neon::int32_3d(-1, 0, -1), /* GOid */ 5, /* --- */ Neon::int32_3d(1, 0, 1), /* BKid */ 15), - std::make_tuple(Neon::int32_3d(-1, 0, 1), /* GOid */ 6, /* --- */ Neon::int32_3d(1, 0, -1), /* BKid */ 16), - std::make_tuple(Neon::int32_3d(0, -1, -1), /* GOid */ 7, /* --- */ Neon::int32_3d(0, 1, 1), /* BKid */ 17), - std::make_tuple(Neon::int32_3d(0, -1, 1), /* GOid */ 8, /* --- */ Neon::int32_3d(0, 1, 
-1), /* BKid */ 18)}; - - - auto pullStream = [&]() { - static_assert(stencilIdx < 9); - constexpr int GOid = std::get<1>(stencil[stencilIdx]); - constexpr int BKid = std::get<3>(stencil[stencilIdx]); - constexpr Neon::int32_3d GoOffset = std::get<0>(stencil[stencilIdx]); - constexpr Neon::int32_3d BkOffset = std::get<2>(stencil[stencilIdx]); - { - if (wallBitFlag & (uint32_t(1) << GOid)) { - popIn[GOid] = fin(gidx, BKid) + - fin.template getNghData(gidx, BKid)(); - } else { - popIn[GOid] = fin.template getNghData(gidx, GOid)(); - } - } - { /*BK*/ - if (wallBitFlag & (uint32_t(1) << BKid)) { - popIn[BKid] = fin(gidx, GOid) + - fin.template getNghData(gidx, GOid)(); - } else { - popIn[BKid] = fin.template getNghData(gidx, BKid)(); - } - } - }; - pullStream.template operator()<0>(); - pullStream.template operator()<1>(); - pullStream.template operator()<2>(); - pullStream.template operator()<3>(); - pullStream.template operator()<4>(); - pullStream.template operator()<5>(); - pullStream.template operator()<6>(); - pullStream.template operator()<7>(); - pullStream.template operator()<8>(); -#endif - PULL_STREAM(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10); - PULL_STREAM(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11); - PULL_STREAM(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12); - PULL_STREAM(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13); - PULL_STREAM(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14); - PULL_STREAM(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15); - PULL_STREAM(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16); - PULL_STREAM(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17); - PULL_STREAM(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18); - - // } - // Treat the case of the center (c[k] = {0, 0, 0,}). 
- { - popIn[Lattice::centerDirection] = fin(gidx, Lattice::centerDirection); - } - } -#undef PULL_STREAM - - static inline NEON_CUDA_HOST_DEVICE auto - macroscopic(const LbmStoreType pop[Lattice::Q], - NEON_OUT LbmComputeType& rho, - NEON_OUT std::array& u) - -> void - { -#define POP(IDX) static_cast(pop[IDX]) - - const LbmComputeType X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6); - const LbmComputeType X_P1 = POP(10) + POP(13) + POP(14) + POP(15) + POP(16); - const LbmComputeType X_0 = POP(9) + POP(1) + POP(2) + POP(7) + POP(8) + POP(11) + POP(12) + POP(17) + POP(18); - - const LbmComputeType Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(14); - const LbmComputeType Y_P1 = POP(4) + POP(11) + POP(13) + POP(17) + POP(18); - - const LbmComputeType Z_M1 = POP(2) + POP(5) + POP(7) + POP(16) + POP(18); - const LbmComputeType Z_P1 = POP(6) + POP(8) + POP(12) + POP(15) + POP(17); - -#undef POP - - rho = X_M1 + X_P1 + X_0; - u[0] = (X_P1 - X_M1) / rho; - u[1] = (Y_P1 - Y_M1) / rho; - u[2] = (Z_P1 - Z_M1) / rho; - } - - - static inline NEON_CUDA_HOST_DEVICE auto - collideBgkUnrolled(Idx const& i /*! LbmComputeType iterator */, - const LbmStoreType pop[Lattice::Q], - LbmComputeType const& rho /*! Density */, - std::array const& u /*! Velocity */, - LbmComputeType const& usqr /*! Usqr */, - LbmComputeType const& omega /*! Omega */, - typename PopulationField::Partition& fOut /*! Population */) - - -> void - { - const LbmComputeType ck_u03 = u[0] + u[1]; - const LbmComputeType ck_u04 = u[0] - u[1]; - const LbmComputeType ck_u05 = u[0] + u[2]; - const LbmComputeType ck_u06 = u[0] - u[2]; - const LbmComputeType ck_u07 = u[1] + u[2]; - const LbmComputeType ck_u08 = u[1] - u[2]; - - const LbmComputeType eq_00 = rho * (1. / 18.) * (1. - 3. * u[0] + 4.5 * u[0] * u[0] - usqr); - const LbmComputeType eq_01 = rho * (1. / 18.) * (1. - 3. * u[1] + 4.5 * u[1] * u[1] - usqr); - const LbmComputeType eq_02 = rho * (1. / 18.) * (1. - 3. 
* u[2] + 4.5 * u[2] * u[2] - usqr); - const LbmComputeType eq_03 = rho * (1. / 36.) * (1. - 3. * ck_u03 + 4.5 * ck_u03 * ck_u03 - usqr); - const LbmComputeType eq_04 = rho * (1. / 36.) * (1. - 3. * ck_u04 + 4.5 * ck_u04 * ck_u04 - usqr); - const LbmComputeType eq_05 = rho * (1. / 36.) * (1. - 3. * ck_u05 + 4.5 * ck_u05 * ck_u05 - usqr); - const LbmComputeType eq_06 = rho * (1. / 36.) * (1. - 3. * ck_u06 + 4.5 * ck_u06 * ck_u06 - usqr); - const LbmComputeType eq_07 = rho * (1. / 36.) * (1. - 3. * ck_u07 + 4.5 * ck_u07 * ck_u07 - usqr); - const LbmComputeType eq_08 = rho * (1. / 36.) * (1. - 3. * ck_u08 + 4.5 * ck_u08 * ck_u08 - usqr); - - const LbmComputeType eqopp_00 = eq_00 + rho * (1. / 18.) * 6. * u[0]; - const LbmComputeType eqopp_01 = eq_01 + rho * (1. / 18.) * 6. * u[1]; - const LbmComputeType eqopp_02 = eq_02 + rho * (1. / 18.) * 6. * u[2]; - const LbmComputeType eqopp_03 = eq_03 + rho * (1. / 36.) * 6. * ck_u03; - const LbmComputeType eqopp_04 = eq_04 + rho * (1. / 36.) * 6. * ck_u04; - const LbmComputeType eqopp_05 = eq_05 + rho * (1. / 36.) * 6. * ck_u05; - const LbmComputeType eqopp_06 = eq_06 + rho * (1. / 36.) * 6. * ck_u06; - const LbmComputeType eqopp_07 = eq_07 + rho * (1. / 36.) * 6. * ck_u07; - const LbmComputeType eqopp_08 = eq_08 + rho * (1. / 36.) * 6. * ck_u08; - - const LbmComputeType pop_out_00 = (1. - omega) * static_cast(pop[0]) + omega * eq_00; - const LbmComputeType pop_out_01 = (1. - omega) * static_cast(pop[1]) + omega * eq_01; - const LbmComputeType pop_out_02 = (1. - omega) * static_cast(pop[2]) + omega * eq_02; - const LbmComputeType pop_out_03 = (1. - omega) * static_cast(pop[3]) + omega * eq_03; - const LbmComputeType pop_out_04 = (1. - omega) * static_cast(pop[4]) + omega * eq_04; - const LbmComputeType pop_out_05 = (1. - omega) * static_cast(pop[5]) + omega * eq_05; - const LbmComputeType pop_out_06 = (1. - omega) * static_cast(pop[6]) + omega * eq_06; - const LbmComputeType pop_out_07 = (1. 
- omega) * static_cast(pop[7]) + omega * eq_07; - const LbmComputeType pop_out_08 = (1. - omega) * static_cast(pop[8]) + omega * eq_08; - - const LbmComputeType pop_out_opp_00 = (1. - omega) * static_cast(pop[10]) + omega * eqopp_00; - const LbmComputeType pop_out_opp_01 = (1. - omega) * static_cast(pop[11]) + omega * eqopp_01; - const LbmComputeType pop_out_opp_02 = (1. - omega) * static_cast(pop[12]) + omega * eqopp_02; - const LbmComputeType pop_out_opp_03 = (1. - omega) * static_cast(pop[13]) + omega * eqopp_03; - const LbmComputeType pop_out_opp_04 = (1. - omega) * static_cast(pop[14]) + omega * eqopp_04; - const LbmComputeType pop_out_opp_05 = (1. - omega) * static_cast(pop[15]) + omega * eqopp_05; - const LbmComputeType pop_out_opp_06 = (1. - omega) * static_cast(pop[16]) + omega * eqopp_06; - const LbmComputeType pop_out_opp_07 = (1. - omega) * static_cast(pop[17]) + omega * eqopp_07; - const LbmComputeType pop_out_opp_08 = (1. - omega) * static_cast(pop[18]) + omega * eqopp_08; - - -#define COMPUTE_GO_AND_BACK(GOid, BKid) \ - { \ - fOut(i, GOid) = static_cast(pop_out_0##GOid); \ - fOut(i, BKid) = static_cast(pop_out_opp_0##GOid); \ - } - - COMPUTE_GO_AND_BACK(0, 10) - COMPUTE_GO_AND_BACK(1, 11) - COMPUTE_GO_AND_BACK(2, 12) - COMPUTE_GO_AND_BACK(3, 13) - COMPUTE_GO_AND_BACK(4, 14) - COMPUTE_GO_AND_BACK(5, 15) - COMPUTE_GO_AND_BACK(6, 16) - COMPUTE_GO_AND_BACK(7, 17) - COMPUTE_GO_AND_BACK(8, 18) - -#undef COMPUTE_GO_AND_BACK - - { - const LbmComputeType eq_09 = rho * (1. / 3.) * (1. - usqr); - const LbmComputeType pop_out_09 = (1. - omega) * - static_cast(pop[Lattice::centerDirection]) + - omega * eq_09; - fOut(i, Lattice::centerDirection) = static_cast(pop_out_09); - } - } - - static auto - iteration(Neon::set::StencilSemantic stencilSemantic, - const PopulationField& fInField /*! inpout population field */, - const CellTypeField& cellTypeField /*! Cell type field */, - const LbmComputeType omega /*! LBM omega parameter */, - PopulationField& fOutField /*! 
output Population field */) - -> Neon::set::Container - { - - Neon::set::Container container = fInField.getGrid().newContainer( - "LBM_iteration", - [&, omega](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, - Neon::Pattern::STENCIL, stencilSemantic); - auto& fOut = L.load(fOutField); - const auto& cellInfoPartition = L.load(cellTypeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { - CellType cellInfo = cellInfoPartition(gidx, 0); - if (cellInfo.classification == CellType::bulk) { - - LbmStoreType popIn[Lattice::Q]; - pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); - - LbmComputeType rho; - std::array u{.0, .0, .0}; - macroscopic(popIn, NEON_OUT rho, NEON_OUT u); - - LbmComputeType usqr = 1.5 * (u[0] * u[0] + - u[1] * u[1] + - u[2] * u[2]); - - collideBgkUnrolled(gidx, - popIn, - rho, u, - usqr, omega, - NEON_OUT fOut); - } - }; - }); - return container; - } - -#define COMPUTE_MASK_WALL(GOx, GOy, GOz, GOid, BKx, BKy, BKz, BKid) \ - { \ - { /*GO*/ \ - CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); \ - if (nghCellType.classification != CellType::bulk) { \ - cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOid)); \ - } \ - } \ - { /*BK*/ \ - CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); \ - if (nghCellType.classification != CellType::bulk) { \ - cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << BKid)); \ - } \ - } \ - } - - static auto - computeWallNghMask(const CellTypeField& infoInField, - CellTypeField& infoOutpeField) - - -> Neon::set::Container - { - Neon::set::Container container = infoInField.getGrid().newContainer( - "LBM_iteration", - [&](Neon::set::Loader& L) -> auto { - auto& infoIn = L.load(infoInField, - Neon::Pattern::STENCIL); - auto& infoOut = L.load(infoOutpeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { - 
CellType cellType = infoIn(gidx, 0); - cellType.wallNghBitflag = 0; - - if (cellType.classification == CellType::bulk) { - COMPUTE_MASK_WALL(-1, 0, 0, /* GOid */ 0, /* --- */ 1, 0, 0, /* BKid */ 10) - COMPUTE_MASK_WALL(0, -1, 0, /* GOid */ 1, /* --- */ 0, 1, 0, /* BKid */ 11) - COMPUTE_MASK_WALL(0, 0, -1, /* GOid */ 2, /* --- */ 0, 0, 1, /* BKid */ 12) - COMPUTE_MASK_WALL(-1, -1, 0, /* GOid */ 3, /* --- */ 1, 1, 0, /* BKid */ 13) - COMPUTE_MASK_WALL(-1, 1, 0, /* GOid */ 4, /* --- */ 1, -1, 0, /* BKid */ 14) - COMPUTE_MASK_WALL(-1, 0, -1, /* GOid */ 5, /* --- */ 1, 0, 1, /* BKid */ 15) - COMPUTE_MASK_WALL(-1, 0, 1, /* GOid */ 6, /* --- */ 1, 0, -1, /* BKid */ 16) - COMPUTE_MASK_WALL(0, -1, -1, /* GOid */ 7, /* --- */ 0, 1, 1, /* BKid */ 17) - COMPUTE_MASK_WALL(0, -1, 1, /* GOid */ 8, /* --- */ 0, 1, -1, /* BKid */ 18) - - infoOut(gidx, 0) = cellType; - } - }; - }); - return container; - } -#undef COMPUTE_MASK_WALL - -#define BC_LOAD(GOID, DKID) \ - popIn[GOID] = fIn(gidx, GOID); \ - popIn[DKID] = fIn(gidx, DKID); - - static auto - computeRhoAndU([[maybe_unused]] const PopulationField& fInField /*! inpout population field */, - const CellTypeField& cellTypeField /*! Cell type field */, - Rho& rhoField /*! output Population field */, - U& uField /*! 
output Population field */) - - -> Neon::set::Container - { - Neon::set::Container container = fInField.getGrid().newContainer( - "LBM_iteration", - [&](Neon::set::Loader& L) -> auto { - auto& fIn = L.load(fInField, - Neon::Pattern::STENCIL); - auto& rhoXpu = L.load(rhoField); - auto& uXpu = L.load(uField); - - const auto& cellInfoPartition = L.load(cellTypeField); - - return [=] NEON_CUDA_HOST_DEVICE(const typename PopulationField::Idx& gidx) mutable { - CellType cellInfo = cellInfoPartition(gidx, 0); - LbmComputeType rho = 0; - std::array u{.0, .0, .0}; - LbmStoreType popIn[Lattice::Q]; - - if (cellInfo.classification == CellType::bulk) { - pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); - macroscopic(popIn, NEON_OUT rho, NEON_OUT u); - } else { - if (cellInfo.classification == CellType::movingWall) { - BC_LOAD(0, 10) - BC_LOAD(1, 11) - BC_LOAD(2, 12) - BC_LOAD(3, 13) - BC_LOAD(4, 14) - BC_LOAD(5, 15) - BC_LOAD(6, 16) - BC_LOAD(7, 17) - BC_LOAD(8, 18) - popIn[9] = fIn(gidx, 9); - - rho = 1.0; - u = std::array{COMPUTE_CAST(popIn[0]) / COMPUTE_CAST(6. * 1. / 18.), - COMPUTE_CAST(popIn[1]) / COMPUTE_CAST(6. * 1. / 18.), - COMPUTE_CAST(popIn[2]) / COMPUTE_CAST(6. * 1. 
/ 18.)}; - } - } - - rhoXpu(gidx, 0) = static_cast(rho); - uXpu(gidx, 0) = static_cast(u[0]); - uXpu(gidx, 1) = static_cast(u[1]); - uXpu(gidx, 2) = static_cast(u[2]); - }; - }); - return container; - } -}; - -#undef COMPUTE_CAST \ No newline at end of file diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cpp b/benchmarks/lbm/src/RunCavityTwoPop.cu similarity index 99% rename from benchmarks/lbm/src/RunCavityTwoPop.cpp rename to benchmarks/lbm/src/RunCavityTwoPop.cu index 62a245a1..15438d8d 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cpp +++ b/benchmarks/lbm/src/RunCavityTwoPop.cu @@ -1,5 +1,8 @@ #include "Config.h" + #include "D3Q19.h" +#include "D3Q27.h" + #include "Neon/domain/bGrid.h" #include "Neon/domain/dGrid.h" #include "Neon/domain/details/dGridSoA/dGridSoA.h" @@ -7,7 +10,6 @@ #include "./Lbm.h" #include "CellType.h" -#include "LbmSkeleton.h" #include "Metrics.h" #include "Repoert.h" namespace CavityTwoPop { diff --git a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h index 3193d63d..e41c8f26 100644 --- a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h +++ b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h @@ -128,6 +128,11 @@ class Vec_3d NEON_CUDA_HOST_DEVICE inline void constexpr set(const Integer& xyz); + template + NEON_CUDA_HOST_DEVICE inline constexpr Integer getComponent() const + { + return v[componentId]; + } //---- [REDUCE SECTION] -------------------------------------------------------------------------------------------- //---- [REDUCE SECTION] -------------------------------------------------------------------------------------------- From 90c6ede5caac1f7deb30e1f177849f0c30736fd2 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Sun, 10 Sep 2023 18:13:03 +0200 Subject: [PATCH 67/94] WIP: fixing nvcc bug. 
--- benchmarks/lbm/src/D3Q19.h | 1 + benchmarks/lbm/src/RunCavityTwoPop.cu | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/benchmarks/lbm/src/D3Q19.h b/benchmarks/lbm/src/D3Q19.h index cc29c225..07d7c163 100644 --- a/benchmarks/lbm/src/D3Q19.h +++ b/benchmarks/lbm/src/D3Q19.h @@ -31,6 +31,7 @@ struct D3Q19 { using Self = D3Q19::Registers; + static constexpr Neon::index_3d stencil[Q]{ /*! 0 */ Neon::index_3d(-1, 0, 0), /*! 1 */ Neon::index_3d(0, -1, 0), diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cu index 15438d8d..e182b6c4 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm/src/RunCavityTwoPop.cu @@ -157,11 +157,11 @@ auto runFilterLattice(Config& config, using Lattice = D3Q19; return runFilterCollision(config, report, testCode); } - if (config.lattice == "d3q27" || config.lattice == "D3Q27") { - testCode << "_D3Q27"; - using Lattice = D3Q27; - return runFilterCollision(config, report, testCode); - } +// if (config.lattice == "d3q27" || config.lattice == "D3Q27") { +// testCode << "_D3Q27"; +// using Lattice = D3Q27; +// return runFilterCollision(config, report, testCode); +// } NEON_DEV_UNDER_CONSTRUCTION("Lattice type not supported. Available options: D3Q19 and D3Q27"); } From ad641738f11b50f96758986ece06595d20e9d5cd Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Sun, 10 Sep 2023 18:39:16 +0200 Subject: [PATCH 68/94] Updating script. 
--- benchmarks/lbm/lbm.py | 188 +++++++++++++++++++++--------------------- 1 file changed, 92 insertions(+), 96 deletions(-) diff --git a/benchmarks/lbm/lbm.py b/benchmarks/lbm/lbm.py index b08fd00e..09f516e9 100644 --- a/benchmarks/lbm/lbm.py +++ b/benchmarks/lbm/lbm.py @@ -1,20 +1,20 @@ -DOMAIN_SIZE_LIST = "64 128 192 256 320 384 448 512".split() -DEVICE_ID_LIST = "0 1 2 3 4 5 6 7".split() -DEVICE_TYPE_LIST = 'cpu gpu'.split() -GRID_LIST = "dGrid bGrid eGrid".split() -STORAGE_FP_LIST = "double float".split() -COMPUTE_FP_LIST = "double float".split() -OCC_LIST = "nOCC sOCC".split() -HU_LIST = "huGrid huLattice".split() -CURVE_LIST = "sweep morton hilbert".split() -COLLISION_LIST = "bgk kbc".split() -LATTICE_LIST = "d3q19 d3q27".split() -STREAMINGMETHOD_LIST = "push pull aa".split() -TRANSFERMODE_LIST = "get put".split() -STENCILSEMANTIC_LIST = "grid, streaming".split() -WARM_UP_ITER = 10 -MAX_ITER = 10000 -REPETITIONS = 5 +deviceType_LIST = 'cpu gpu'.split() +deviceIds_LIST= "0 1 2 3 4 5 6 7".split() +grid_LIST= "dGrid bGrid eGrid".split() +domainSize_LIST= "64 128 192 256 320 384 448 512".split() +computeFP_LIST= "double float".split() +storageFP_LIST= "double float".split() +occ_LIST="nOCC sOCC".split() +transferMode_LIST= "get put".split() +stencilSemantic_LIST= "grid, streaming".split() +spaceCurve_LIST= "sweep morton hilbert".split() +collision_LIST = "bgk kbc".split() +streamingMethod_LIST= "push pull aa".split() +lattice_LIST= "d3q19 d3q27".split() + +warmupIter_INT = 10 +repetitions_INT = 5 +maxIter_INT = 10000 import subprocess import sys @@ -34,35 +34,33 @@ def printProgressBar(value, label): def countAll(): counter = 0 - for DEVICE_TYPE in DEVICE_TYPE_LIST: - DEVICE_SET_LIST = [DEVICE_ID_LIST[0]] + for DEVICE_TYPE in deviceType_LIST: + DEVICE_SET_LIST = [deviceIds_LIST[0]] if DEVICE_TYPE == 'gpu': - for DEVICE in DEVICE_ID_LIST[1:]: + for DEVICE in deviceIds_LIST[1:]: DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) for DEVICE_SET in 
DEVICE_SET_LIST: - for OCC in OCC_LIST: - for DOMAIN_SIZE in DOMAIN_SIZE_LIST: - for STORAGE_FP in STORAGE_FP_LIST: - for COMPUTE_FP in COMPUTE_FP_LIST: - for GRID in GRID_LIST: - for HU in HU_LIST: - for CURVE in CURVE_LIST: - for LATTICE in LATTICE_LIST: - for TRANSFERMODE in TRANSFERMODE_LIST: - for STENCILSEMANTIC in STENCILSEMANTIC_LIST: - for COLLISION in COLLISION_LIST: - if LATTICE != "d3q27" and LATTICE != "D3Q27": + for OCC in occ_LIST: + for DOMAIN_SIZE in domainSize_LIST: + for STORAGE_FP in storageFP_LIST: + for COMPUTE_FP in computeFP_LIST: + for GRID in grid_LIST: + for CURVE in spaceCurve_LIST: + for LATTICE in lattice_LIST: + for TRANSFERMODE in transferMode_LIST: + for STENCILSEMANTIC in stencilSemantic_LIST: + for COLLISION in collision_LIST: + if LATTICE != "d3q27" and LATTICE != "D3Q27": + continue + for STREAMINGMETHOD in streamingMethod_LIST: + if STREAMINGMETHOD != 'pull' and len(deviceIds_LIST) != 1: + continue + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + if STORAGE_FP == 'float' and COMPUTE_FP == 'double': continue - for STREAMINGMETHOD in STREAMINGMETHOD_LIST: - if STREAMINGMETHOD != 'pull' and len(DEVICE_ID_LIST) != 1: - continue - - if STORAGE_FP == 'double' and COMPUTE_FP == 'float': - continue - if STORAGE_FP == 'float' and COMPUTE_FP == 'double': - continue - counter += 1 + counter += 1 return counter @@ -71,66 +69,64 @@ def countAll(): command = './lbm' # command = 'echo' with open(command + '.log', 'w') as fp: - for DEVICE_TYPE in DEVICE_TYPE_LIST: - DEVICE_SET_LIST = [DEVICE_ID_LIST[0]] + for DEVICE_TYPE in deviceType_LIST: + DEVICE_SET_LIST = [deviceIds_LIST[0]] if DEVICE_TYPE == 'gpu': - for DEVICE in DEVICE_ID_LIST[1:]: + for DEVICE in deviceIds_LIST[1:]: DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) for DEVICE_SET in DEVICE_SET_LIST: - for OCC in OCC_LIST: - for DOMAIN_SIZE in DOMAIN_SIZE_LIST: - for STORAGE_FP in STORAGE_FP_LIST: - for COMPUTE_FP in COMPUTE_FP_LIST: - for GRID in GRID_LIST: 
- for HU in HU_LIST: - for CURVE in CURVE_LIST: - for LATTICE in LATTICE_LIST: - for TRANSFERMODE in TRANSFERMODE_LIST: - for STENCILSEMANTIC in STENCILSEMANTIC_LIST: - for COLLISION in COLLISION_LIST: - if LATTICE != "d3q27" and LATTICE != "D3Q27": + for OCC in occ_LIST: + for DOMAIN_SIZE in domainSize_LIST: + for STORAGE_FP in storageFP_LIST: + for COMPUTE_FP in computeFP_LIST: + for GRID in grid_LIST: + for CURVE in spaceCurve_LIST: + for LATTICE in lattice_LIST: + for TRANSFERMODE in transferMode_LIST: + for STENCILSEMANTIC in stencilSemantic_LIST: + for COLLISION in collision_LIST: + if LATTICE != "d3q27" and LATTICE != "D3Q27": + continue + for STREAMINGMETHOD in streamingMethod_LIST: + if STREAMINGMETHOD != 'pull' and len(deviceIds_LIST) != 1: + continue + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + if STORAGE_FP == 'float' and COMPUTE_FP == 'double': continue - for STREAMINGMETHOD in STREAMINGMETHOD_LIST: - if STREAMINGMETHOD != 'pull' and len(DEVICE_ID_LIST) != 1: - continue - - if STORAGE_FP == 'double' and COMPUTE_FP == 'float': - continue - if STORAGE_FP == 'float' and COMPUTE_FP == 'double': - continue - parameters = [] - parameters.append('--deviceType ' + DEVICE_TYPE) - parameters.append('--deviceIds ' + DEVICE_SET) - parameters.append('--grid ' + GRID) - parameters.append('--domain-size ' + DOMAIN_SIZE) - parameters.append('--max-iter ' + str(MAX_ITER)) - parameters.append('--report-filename ' + 'lbm') - parameters.append('--computeFP ' + COMPUTE_FP) - parameters.append('--storageFP ' + STORAGE_FP) - parameters.append('--occ ' + OCC) - parameters.append('--transferMode ' + TRANSFERMODE) - parameters.append('--stencilSemantic ' + STENCILSEMANTIC) - parameters.append('--spaceCurve ' + CURVE) - parameters.append('--collision ' + COLLISION) - parameters.append('--streamingMethod ' + STREAMINGMETHOD) - parameters.append('--lattice ' + LATTICE) - parameters.append('--benchmark ') - parameters.append('--warmup-iter ' + 
str(WARM_UP_ITER)) - parameters.append('--repetitions ' + str(REPETITIONS)) + parameters = [] + parameters.append('--deviceType ' + DEVICE_TYPE) + parameters.append('--deviceIds ' + DEVICE_SET) + parameters.append('--grid ' + GRID) + parameters.append('--domain-size ' + DOMAIN_SIZE) + parameters.append('--max-iter ' + str(MAX_ITER)) + parameters.append('--report-filename ' + 'lbm') + parameters.append('--computeFP ' + COMPUTE_FP) + parameters.append('--storageFP ' + STORAGE_FP) + parameters.append('--occ ' + OCC) + parameters.append('--transferMode ' + TRANSFERMODE) + parameters.append('--stencilSemantic ' + STENCILSEMANTIC) + parameters.append('--spaceCurve ' + CURVE) + parameters.append('--collision ' + COLLISION) + parameters.append('--streamingMethod ' + STREAMINGMETHOD) + parameters.append('--lattice ' + LATTICE) + parameters.append('--benchmark ') + parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) + parameters.append('--repetitions ' + str(REPETITIONS)) - commandList = [] - commandList.append(command) - for el in parameters: - for s in el.split(): - commandList.append(s) + commandList = [] + commandList.append(command) + for el in parameters: + for s in el.split(): + commandList.append(s) - fp.write("\n-------------------------------------------\n") - fp.write(' '.join(commandList)) - fp.write("\n-------------------------------------------\n") - fp.flush() - print(' '.join(commandList)) - subprocess.run(commandList, text=True, stdout=fp) + fp.write("\n-------------------------------------------\n") + fp.write(' '.join(commandList)) + fp.write("\n-------------------------------------------\n") + fp.flush() + print(' '.join(commandList)) + subprocess.run(commandList, text=True, stdout=fp) - counter += 1 - printProgressBar(counter * 100.0 / SAMPLES, 'Progress') + counter += 1 + printProgressBar(counter * 100.0 / SAMPLES, 'Progress') From 59e316161642c3f9fa6c38f05372c944d1290e7e Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Sun, 10 Sep 2023 
18:45:27 +0200 Subject: [PATCH 69/94] Updating script. --- benchmarks/lbm/lbm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/lbm/lbm.py b/benchmarks/lbm/lbm.py index 09f516e9..c38282e9 100644 --- a/benchmarks/lbm/lbm.py +++ b/benchmarks/lbm/lbm.py @@ -50,7 +50,7 @@ def countAll(): for TRANSFERMODE in transferMode_LIST: for STENCILSEMANTIC in stencilSemantic_LIST: for COLLISION in collision_LIST: - if LATTICE != "d3q27" and LATTICE != "D3Q27": + if LATTICE != "d3q27" and LATTICE != "D3Q27" and COLLISION == 'kbc': continue for STREAMINGMETHOD in streamingMethod_LIST: if STREAMINGMETHOD != 'pull' and len(deviceIds_LIST) != 1: From 0112332b02ebf2d987cd9f54799729570d2c9303 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Sun, 10 Sep 2023 18:58:16 +0200 Subject: [PATCH 70/94] Updating script. --- benchmarks/lbm/lbm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/lbm/lbm.py b/benchmarks/lbm/lbm.py index c38282e9..10f9f4ce 100644 --- a/benchmarks/lbm/lbm.py +++ b/benchmarks/lbm/lbm.py @@ -50,10 +50,10 @@ def countAll(): for TRANSFERMODE in transferMode_LIST: for STENCILSEMANTIC in stencilSemantic_LIST: for COLLISION in collision_LIST: - if LATTICE != "d3q27" and LATTICE != "D3Q27" and COLLISION == 'kbc': + if LATTICE != "d3q27" and COLLISION == 'kbc': continue for STREAMINGMETHOD in streamingMethod_LIST: - if STREAMINGMETHOD != 'pull' and len(deviceIds_LIST) != 1: + if STREAMINGMETHOD != 'pull' and len(DEVICE_SET_LIST) != 1: continue if STORAGE_FP == 'double' and COMPUTE_FP == 'float': continue @@ -85,10 +85,10 @@ def countAll(): for TRANSFERMODE in transferMode_LIST: for STENCILSEMANTIC in stencilSemantic_LIST: for COLLISION in collision_LIST: - if LATTICE != "d3q27" and LATTICE != "D3Q27": + if LATTICE != "d3q27" and COLLISION == 'kbc': continue for STREAMINGMETHOD in streamingMethod_LIST: - if STREAMINGMETHOD != 'pull' and len(deviceIds_LIST) != 1: + if STREAMINGMETHOD != 'pull' and 
len(DEVICE_SET_LIST) != 1: continue if STORAGE_FP == 'double' and COMPUTE_FP == 'float': continue From 6b896f9bb6d62ed7fc6875a1e3f3edff88248c1e Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Sun, 10 Sep 2023 18:59:27 +0200 Subject: [PATCH 71/94] Updating script. --- benchmarks/lbm/lbm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/lbm/lbm.py b/benchmarks/lbm/lbm.py index 10f9f4ce..5394ceaf 100644 --- a/benchmarks/lbm/lbm.py +++ b/benchmarks/lbm/lbm.py @@ -100,7 +100,7 @@ def countAll(): parameters.append('--deviceIds ' + DEVICE_SET) parameters.append('--grid ' + GRID) parameters.append('--domain-size ' + DOMAIN_SIZE) - parameters.append('--max-iter ' + str(MAX_ITER)) + parameters.append('--max-iter ' + str(maxIter_INT)) parameters.append('--report-filename ' + 'lbm') parameters.append('--computeFP ' + COMPUTE_FP) parameters.append('--storageFP ' + STORAGE_FP) @@ -112,8 +112,8 @@ def countAll(): parameters.append('--streamingMethod ' + STREAMINGMETHOD) parameters.append('--lattice ' + LATTICE) parameters.append('--benchmark ') - parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) - parameters.append('--repetitions ' + str(REPETITIONS)) + parameters.append('--warmup-iter ' + str(warmupIter_INT)) + parameters.append('--repetitions ' + str(repetitions_INT)) commandList = [] commandList.append(command) From 65d829b4be8e6008608afeffc84b30ad2e3cee19 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Sun, 10 Sep 2023 19:07:25 +0200 Subject: [PATCH 72/94] Updating script. 
--- benchmarks/lbm/src/RunCavityTwoPop.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cu index e182b6c4..cb8976bb 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm/src/RunCavityTwoPop.cu @@ -40,7 +40,6 @@ auto run(Config& config, code << "_SS" << config.stencilSemanticCli.getStringOption()<< "_"; code << "_SF" << config.spaceCurveCli.getStringOption() << "_"; code << "_TM" << config.transferModeCli.getStringOption() << "_"; - code << "__"; // using PopulationField = typename Grid::template Field; @@ -209,8 +208,10 @@ auto run(Config& config, Report& report, std::stringstream& testCode) -> void { + testCode << "___" << config.N << "_"; + if (config.gridType == "dGrid") { - testCode << "___DG"; + testCode << "_DG"; return details::runFilterStoreType(config, report, testCode); } // if (config.gridType == "eGrid") { From 5133b11606ce4609799815c6fbacbdb91f897508 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Sun, 10 Sep 2023 19:36:40 +0200 Subject: [PATCH 73/94] Updating script. --- benchmarks/lbm/src/RunCavityTwoPop.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cu index cb8976bb..b3014461 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm/src/RunCavityTwoPop.cu @@ -174,9 +174,9 @@ auto runFilterComputeType(Config& config, testCode << "_SD"; return runFilterLattice(config, report, testCode); } - // if (config.computeTypeStr == "float") { - // return run(config, report); - // } + if (config.computeTypeStr == "float") { + return run(config, report); + } NEON_DEV_UNDER_CONSTRUCTION(""); } From 3f28bfd1ad013ce861574a632c2b529f76392206 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Sun, 10 Sep 2023 19:38:57 +0200 Subject: [PATCH 74/94] Updating script. 
--- benchmarks/lbm/src/RunCavityTwoPop.cu | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cu index b3014461..3503e8ef 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm/src/RunCavityTwoPop.cu @@ -37,7 +37,7 @@ auto run(Config& config, for (auto const& id : config.devices) { code << id; } - code << "_SS" << config.stencilSemanticCli.getStringOption()<< "_"; + code << "_SS" << config.stencilSemanticCli.getStringOption() << "_"; code << "_SF" << config.spaceCurveCli.getStringOption() << "_"; code << "_TM" << config.transferModeCli.getStringOption() << "_"; code << "__"; @@ -156,11 +156,11 @@ auto runFilterLattice(Config& config, using Lattice = D3Q19; return runFilterCollision(config, report, testCode); } -// if (config.lattice == "d3q27" || config.lattice == "D3Q27") { -// testCode << "_D3Q27"; -// using Lattice = D3Q27; -// return runFilterCollision(config, report, testCode); -// } + // if (config.lattice == "d3q27" || config.lattice == "D3Q27") { + // testCode << "_D3Q27"; + // using Lattice = D3Q27; + // return runFilterCollision(config, report, testCode); + // } NEON_DEV_UNDER_CONSTRUCTION("Lattice type not supported. 
Available options: D3Q19 and D3Q27"); } @@ -175,7 +175,8 @@ auto runFilterComputeType(Config& config, return runFilterLattice(config, report, testCode); } if (config.computeTypeStr == "float") { - return run(config, report); + testCode << "_SF"; + return runFilterLattice(config, report, testCode); } NEON_DEV_UNDER_CONSTRUCTION(""); } @@ -190,10 +191,10 @@ auto runFilterStoreType(Config& config, testCode << "_CD"; return runFilterComputeType(config, report, testCode); } - // if (config.storeTypeStr == "float") { - // testCode << "_CS_"; - // return runFilterComputeType(config, report,testCode); - // } + if (config.storeTypeStr == "float") { + testCode << "_CF"; + return runFilterComputeType(config, report, testCode); + } NEON_DEV_UNDER_CONSTRUCTION(""); } } // namespace details From be81ec7ecb5a9f80b6ce1087af8d2119f339d0f0 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Sun, 10 Sep 2023 22:07:38 +0200 Subject: [PATCH 75/94] Updating script. --- benchmarks/lbm/lbm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/lbm/lbm.py b/benchmarks/lbm/lbm.py index 5394ceaf..2c50f01a 100644 --- a/benchmarks/lbm/lbm.py +++ b/benchmarks/lbm/lbm.py @@ -4,9 +4,9 @@ domainSize_LIST= "64 128 192 256 320 384 448 512".split() computeFP_LIST= "double float".split() storageFP_LIST= "double float".split() -occ_LIST="nOCC sOCC".split() +occ_LIST="none".split() transferMode_LIST= "get put".split() -stencilSemantic_LIST= "grid, streaming".split() +stencilSemantic_LIST= "grid streaming".split() spaceCurve_LIST= "sweep morton hilbert".split() collision_LIST = "bgk kbc".split() streamingMethod_LIST= "push pull aa".split() From cb6b4376dbc25e4ebf3661f059863c053a591462 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 11 Sep 2023 08:52:29 +0200 Subject: [PATCH 76/94] Cleaning up for PR. 
--- benchmarks/CMakeLists.txt | 2 +- .../include/Neon/domain/details/eGrid/ePartition_imp.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index efb267c6..786a165e 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.19 FATAL_ERROR) -add_subdirectory(lbm) +# add_subdirectory(lbm) # add_subdirectory("lbm-lid-driven-cavity-flow") # add_subdirectory("lbm-flow-over-sphere") diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h index 1611106f..29980a61 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h @@ -67,7 +67,7 @@ ePartition::getNghData(eIndex gidx, T val = this->operator()(gidxxNgh, card); return NghData(val, isValidNeighbour); } - return NghData(isValidNeighbour); + return NghData(); } template Date: Mon, 11 Sep 2023 09:38:05 +0200 Subject: [PATCH 77/94] Cleaning up for PR. 
--- libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h | 3 ++- libNeonDomain/include/Neon/domain/tools/SpaceCurves.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h index d92a4351..1ae2bf1d 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h @@ -312,8 +312,9 @@ auto bField::initHaloUpdateTable() -> void T* srcMem = blockViewPartitions[Data::EndPoints::src]->mem(); T* dstMem = blockViewPartitions[Data::EndPoints::dst]->mem(); - Neon::size_4d srcBoundaryBuff(boundaryZBeginIdx[Data::EndPoints::src][static_cast(byDirection)], 0, 0, 0); Neon::size_4d dstGhostBuff(ghostZBeginIdx[Data::EndPoints::dst][static_cast(ByDirectionUtils::invert(byDirection))], 0, 0, 0); + Neon::size_4d srcBoundaryBuff(boundaryZBeginIdx[Data::EndPoints::src][static_cast(byDirection)], 0, 0, 0); + size_t transferDataBlockCount = mData->grid->mData->partitioner1D.getSpanLayout().getBoundsBoundary(setIdxVec[Data::EndPoints::src], byDirection).count; // std::cout << "To " << dstGhostBuff << " prt " << blockViewPartitions[Data::EndPoints::dst]->prtID() << " From " << srcBoundaryBuff << " prt " << blockViewPartitions[Data::EndPoints::src]->prtID() << std::endl; diff --git a/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h b/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h index 8535bfd0..b04c72c8 100644 --- a/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h +++ b/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h @@ -278,7 +278,7 @@ class Encoder uint64_t transform = 0; uint64_t out = 0; - for (int32_t i = 3 * (bits - 1); i >= 0; i -= 3) { + for (int32_t i = int(3 * (bits - 1)); i >= 0; i -= 3) { transform = lookupTable[transform | ((in >> i) & 7)]; out = (out << 3) | (transform & 7); transform &= ~7; From 
e4f43c4d6b4e083f54c0c078d0d7b3dee6d04c5b Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 11 Sep 2023 16:39:35 +0200 Subject: [PATCH 78/94] Cleaning up for PR. --- libNeonDomain/include/Neon/domain/tools/SpaceCurves.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h b/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h index b04c72c8..835fa066 100644 --- a/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h +++ b/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h @@ -298,7 +298,7 @@ class Encoder } - static inline auto splitBy3(unsigned int a) + static inline auto splitBy3(uint64_t a) { uint64_t x = a & 0x1fffff; // we only care about 21 bits x = (x | x << 32) & 0x1f00000000ffff; // shift left 32 bits, mask out bits 21-31 From 3c7f0921b790ffe631c20fe68512099cf019ca92 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 11 Sep 2023 21:58:07 +0200 Subject: [PATCH 79/94] Issue with nvcc fixed. --- benchmarks/CMakeLists.txt | 2 +- benchmarks/lbm/src/D3Q19.h | 33 +++++---------------------- benchmarks/lbm/src/D3Q27.h | 33 ++------------------------- benchmarks/lbm/src/DeviceD3QXX.h | 12 +++++++--- benchmarks/lbm/src/RunCavityTwoPop.cu | 6 ++++- 5 files changed, 23 insertions(+), 63 deletions(-) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 786a165e..efb267c6 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.19 FATAL_ERROR) -# add_subdirectory(lbm) +add_subdirectory(lbm) # add_subdirectory("lbm-lid-driven-cavity-flow") # add_subdirectory("lbm-flow-over-sphere") diff --git a/benchmarks/lbm/src/D3Q19.h b/benchmarks/lbm/src/D3Q19.h index 07d7c163..422a69d3 100644 --- a/benchmarks/lbm/src/D3Q19.h +++ b/benchmarks/lbm/src/D3Q19.h @@ -32,7 +32,7 @@ struct D3Q19 using Self = D3Q19::Registers; - static constexpr Neon::index_3d stencil[Q]{ + static constexpr std::array stencil{ /*! 
0 */ Neon::index_3d(-1, 0, 0), /*! 1 */ Neon::index_3d(0, -1, 0), /*! 2 */ Neon::index_3d(0, 0, -1), @@ -54,31 +54,10 @@ struct D3Q19 /*! 18 */ Neon::index_3d(0, 1, -1)}; template - NEON_CUDA_HOST_DEVICE static constexpr auto + static constexpr inline auto getComponentOfDirection() -> int { - constexpr Neon::index_3d s[Q]{ - /*! 0 */ Neon::index_3d(-1, 0, 0), - /*! 1 */ Neon::index_3d(0, -1, 0), - /*! 2 */ Neon::index_3d(0, 0, -1), - /*! 3 */ Neon::index_3d(-1, -1, 0), - /*! 4 */ Neon::index_3d(-1, 1, 0), - /*! 5 */ Neon::index_3d(-1, 0, -1), - /*! 6 */ Neon::index_3d(-1, 0, 1), - /*! 7 */ Neon::index_3d(0, -1, -1), - /*! 8 */ Neon::index_3d(0, -1, 1), - /*! 9 */ Neon::index_3d(0, 0, 0), - /*! 10 */ Neon::index_3d(1, 0, 0), - /*! 11 */ Neon::index_3d(0, 1, 0), - /*! 12 */ Neon::index_3d(0, 0, 1), - /*! 13 */ Neon::index_3d(1, 1, 0), - /*! 14 */ Neon::index_3d(1, -1, 0), - /*! 15 */ Neon::index_3d(1, 0, 1), - /*! 16 */ Neon::index_3d(1, 0, -1), - /*! 17 */ Neon::index_3d(0, 1, 1), - /*! 18 */ Neon::index_3d(0, 1, -1)}; - - return s[qIdx].template getComponent(); + return Self::stencil[qIdx].template getComponent(); } static constexpr int center = 9; /** Position of direction {0,0,0} */ @@ -210,21 +189,21 @@ struct D3Q19 template - NEON_CUDA_HOST_DEVICE static constexpr auto mapToRegisters() + static constexpr auto mapToRegisters() -> int { return memoryToRegister[go]; } template - NEON_CUDA_HOST_DEVICE static constexpr auto mapToMemory() + static constexpr auto mapToMemory() -> int { return registerToMemory[go]; } template - NEON_CUDA_HOST_DEVICE static constexpr auto getOpposite() + static constexpr auto getOpposite() -> int { return opposite[go]; diff --git a/benchmarks/lbm/src/D3Q27.h b/benchmarks/lbm/src/D3Q27.h index 4512ef83..59a97850 100644 --- a/benchmarks/lbm/src/D3Q27.h +++ b/benchmarks/lbm/src/D3Q27.h @@ -60,39 +60,10 @@ struct D3Q27 /* 26 */ Neon::index_3d(1, -1, -1)}; template - static constexpr inline NEON_CUDA_HOST_DEVICE auto + static constexpr 
inline auto getComponentOfDirection() -> int { - constexpr std::array s{ - /* 00 */ Neon::index_3d(-1, 0, 0), - /* 01 */ Neon::index_3d(0, -1, 0), - /* 02 */ Neon::index_3d(0, 0, -1), - /* 03 */ Neon::index_3d(-1, -1, 0), - /* 04 */ Neon::index_3d(-1, 1, 0), - /* 05 */ Neon::index_3d(-1, 0, -1), - /* 06 */ Neon::index_3d(-1, 0, 1), - /* 07 */ Neon::index_3d(0, -1, -1), - /* 08 */ Neon::index_3d(0, -1, 1), - /* 09 */ Neon::index_3d(-1, -1, -1), - /* 00 */ Neon::index_3d(-1, -1, 1), - /* 11 */ Neon::index_3d(-1, 1, -1), - /* 12 */ Neon::index_3d(-1, 1, 1), - /* 13 */ Neon::index_3d(0, 0, 0), - /* 14 */ Neon::index_3d(1, 0, 0), - /* 15 */ Neon::index_3d(0, 1, 0), - /* 16 */ Neon::index_3d(0, 0, 1), - /* 17 */ Neon::index_3d(1, 1, 0), - /* 18 */ Neon::index_3d(1, -1, 0), - /* 19 */ Neon::index_3d(1, 0, 1), - /* 20 */ Neon::index_3d(1, 0, -1), - /* 21 */ Neon::index_3d(0, 1, 1), - /* 22 */ Neon::index_3d(0, 1, -1), - /* 23 */ Neon::index_3d(1, 1, 1), - /* 24 */ Neon::index_3d(1, 1, -1), - /* 25 */ Neon::index_3d(1, -1, 1), - /* 26 */ Neon::index_3d(1, -1, -1)}; - - return s[qIdx].v[cIdx]; + return Self::stencil[qIdx].v[cIdx]; } static constexpr int center = 13; /** Position of direction {0,0,0} */ diff --git a/benchmarks/lbm/src/DeviceD3QXX.h b/benchmarks/lbm/src/DeviceD3QXX.h index b7f487a2..fc275308 100644 --- a/benchmarks/lbm/src/DeviceD3QXX.h +++ b/benchmarks/lbm/src/DeviceD3QXX.h @@ -326,9 +326,15 @@ struct DeviceD3QXX // momentum_flux Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { - Neon::ConstexprFor<0, 6, 1>([&](auto i) { - Pi[i] += fneq[q] * Lattice::Registers::template getMomentByDirection(); - }); +// Neon::ConstexprFor<0, 6, 1>([&](auto i) { +// Pi[i] += fneq[q] * Lattice::Registers::template getMomentByDirection(); +// }); + Pi[0] += fneq[q] * Lattice::Registers::template getMomentByDirection(); + Pi[1] += fneq[q] * Lattice::Registers::template getMomentByDirection(); + Pi[2] += fneq[q] * Lattice::Registers::template getMomentByDirection(); + Pi[3] += 
fneq[q] * Lattice::Registers::template getMomentByDirection(); + Pi[4] += fneq[q] * Lattice::Registers::template getMomentByDirection(); + Pi[5] += fneq[q] * Lattice::Registers::template getMomentByDirection(); }); // fdecompose_shear diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cu index 3503e8ef..67ba0ca3 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm/src/RunCavityTwoPop.cu @@ -139,7 +139,11 @@ auto runFilterCollision(Config& config, NEON_THROW(e); } testCode << "_kbc"; - return runFilterMethod(config, report, testCode); + using Precision = Precision; + using L = D3Q27; + if constexpr (std::is_same_v) { + return runFilterMethod(config, report, testCode); + } } NEON_DEV_UNDER_CONSTRUCTION(""); } From 74a0ae0532d67a6dbce77ffbb223548fd14d57b0 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 13 Sep 2023 09:30:58 +0200 Subject: [PATCH 80/94] Fix for win compilation --- libNeonDomain/include/Neon/domain/tools/SpaceCurves.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h b/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h index 835fa066..add3f51e 100644 --- a/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h +++ b/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h @@ -321,7 +321,7 @@ class Encoder -> uint64_t { uint64_t mortonEncoded = mortonEncode(dim, idx); - uint64_t bits = std::ceil(std::log2(dim.newType().rMax())); + uint64_t bits = uint64_t(std::ceil(std::log2(dim.newType().rMax()))); return mortonToHilbert3D(mortonEncoded, bits); } From 2c474ed5bea9c91228d781f0d74614dc44223d41 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Thu, 14 Sep 2023 09:29:45 +0200 Subject: [PATCH 81/94] Fixing CUDA C++ issues for D3Q19 --- benchmarks/lbm/lbm.py | 2 +- benchmarks/lbm/src/D3Q19.h | 391 +++++++++++++++++--------- benchmarks/lbm/src/DeviceD3QXX.h | 18 +- benchmarks/lbm/src/RunCavityTwoPop.cu | 8 +- 4 
files changed, 267 insertions(+), 152 deletions(-) diff --git a/benchmarks/lbm/lbm.py b/benchmarks/lbm/lbm.py index 2c50f01a..730dd05c 100644 --- a/benchmarks/lbm/lbm.py +++ b/benchmarks/lbm/lbm.py @@ -1,6 +1,6 @@ deviceType_LIST = 'cpu gpu'.split() deviceIds_LIST= "0 1 2 3 4 5 6 7".split() -grid_LIST= "dGrid bGrid eGrid".split() +grid_LIST= "dGrid bGrid_4_4_4".split() domainSize_LIST= "64 128 192 256 320 384 448 512".split() computeFP_LIST= "double float".split() storageFP_LIST= "double float".split() diff --git a/benchmarks/lbm/src/D3Q19.h b/benchmarks/lbm/src/D3Q19.h index 422a69d3..11a3408a 100644 --- a/benchmarks/lbm/src/D3Q19.h +++ b/benchmarks/lbm/src/D3Q19.h @@ -32,83 +32,121 @@ struct D3Q19 using Self = D3Q19::Registers; - static constexpr std::array stencil{ - /*! 0 */ Neon::index_3d(-1, 0, 0), - /*! 1 */ Neon::index_3d(0, -1, 0), - /*! 2 */ Neon::index_3d(0, 0, -1), - /*! 3 */ Neon::index_3d(-1, -1, 0), - /*! 4 */ Neon::index_3d(-1, 1, 0), - /*! 5 */ Neon::index_3d(-1, 0, -1), - /*! 6 */ Neon::index_3d(-1, 0, 1), - /*! 7 */ Neon::index_3d(0, -1, -1), - /*! 8 */ Neon::index_3d(0, -1, 1), - /*! 9 */ Neon::index_3d(0, 0, 0), - /*! 10 */ Neon::index_3d(1, 0, 0), - /*! 11 */ Neon::index_3d(0, 1, 0), - /*! 12 */ Neon::index_3d(0, 0, 1), - /*! 13 */ Neon::index_3d(1, 1, 0), - /*! 14 */ Neon::index_3d(1, -1, 0), - /*! 15 */ Neon::index_3d(1, 0, 1), - /*! 16 */ Neon::index_3d(1, 0, -1), - /*! 17 */ Neon::index_3d(0, 1, 1), - /*! 
18 */ Neon::index_3d(0, 1, -1)}; - - template - static constexpr inline auto - getComponentOfDirection() -> int - { - return Self::stencil[qIdx].template getComponent(); - } - static constexpr int center = 9; /** Position of direction {0,0,0} */ - template - static constexpr auto getOpposite() - -> int + template + static constexpr auto getVelocityComponent() -> int { - auto opposite3d = stencil[go] * -1; - for (int i = 0; i < Q; ++i) { - if (stencil[i] == opposite3d) { - return i; - } - } + static_assert(myQ < Q); + static_assert(myXYZ < 3); + +#define ADD_COMPONENT(QQ, XXX, YYY, ZZZ) \ + if constexpr ((myQ) == (QQ)) { \ + if constexpr ((myXYZ) == 0) { \ + return XXX; \ + } \ + if constexpr ((myXYZ) == 1) { \ + return YYY; \ + } \ + if constexpr ((myXYZ) == 2) { \ + return ZZZ; \ + } \ + } + + ADD_COMPONENT(0, -1, 0, 0) + ADD_COMPONENT(1, 0, -1, 0) + ADD_COMPONENT(2, 0, 0, -1) + ADD_COMPONENT(3, -1, -1, 0) + ADD_COMPONENT(4, -1, 1, 0) + ADD_COMPONENT(5, -1, 0, -1) + ADD_COMPONENT(6, -1, 0, 1) + ADD_COMPONENT(7, 0, -1, -1) + ADD_COMPONENT(8, 0, -1, 1) + ADD_COMPONENT(9, 0, 0, 0) + ADD_COMPONENT(10, 1, 0, 0) + ADD_COMPONENT(11, 0, 1, 0) + ADD_COMPONENT(12, 0, 0, 1) + ADD_COMPONENT(13, 1, 1, 0) + ADD_COMPONENT(14, 1, -1, 0) + ADD_COMPONENT(15, 1, 0, 1) + ADD_COMPONENT(16, 1, 0, -1) + ADD_COMPONENT(17, 0, 1, 1) + ADD_COMPONENT(18, 0, 1, -1) + +#undef ADD_COMPONENT } - static constexpr std::array opposite{ - 10, 11, 12, 13, 14, 15, 16, 17, 18, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8}; - - static constexpr std::array t{ - 1. / 18. /*! 0 */, - 1. / 18. /*! 1 */, - 1. / 18. /*! 2 */, - 1. / 36. /*! 3 */, - 1. / 36. /*! 4 */, - 1. / 36. /*! 5 */, - 1. / 36. /*! 6 */, - 1. / 36. /*! 7 */, - 1. / 36. /*! 8 */, - 1. / 3. /*! 9 */, - 1. / 18. /*! 10 */, - 1. / 18. /*! 11 */, - 1. / 18. /*! 12 */, - 1. / 36. /*! 13 */, - 1. / 36. /*! 14 */, - 1. / 36. /*! 15 */, - 1. / 36. /*! 16 */, - 1. / 36. /*! 17 */, - 1. / 36. /*! 
18 */ - }; + template + static constexpr auto getOpposite() -> int + { + static_assert(myQ < Q); - template - static constexpr auto getT() -> const typename Precision::Storage +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + ADD_COMPONENT(0, 10) + ADD_COMPONENT(1, 11) + ADD_COMPONENT(2, 12) + ADD_COMPONENT(3, 13) + ADD_COMPONENT(4, 14) + ADD_COMPONENT(5, 15) + ADD_COMPONENT(6, 16) + ADD_COMPONENT(7, 17) + ADD_COMPONENT(8, 18) + ADD_COMPONENT(9, 9) + ADD_COMPONENT(10, 0) + ADD_COMPONENT(11, 1) + ADD_COMPONENT(12, 2) + ADD_COMPONENT(13, 3) + ADD_COMPONENT(14, 4) + ADD_COMPONENT(15, 5) + ADD_COMPONENT(16, 6) + ADD_COMPONENT(17, 7) + ADD_COMPONENT(18, 8) +#undef ADD_COMPONENT + } + + template + static constexpr auto getT() -> typename Precision::Storage { - return t[q]; + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + + ADD_COMPONENT(0, 1. / 18.) + ADD_COMPONENT(1, 1. / 18.) + ADD_COMPONENT(2, 1. / 18.) + ADD_COMPONENT(3, 1. / 36.) + ADD_COMPONENT(4, 1. / 36.) + ADD_COMPONENT(5, 1. / 36.) + ADD_COMPONENT(6, 1. / 36.) + ADD_COMPONENT(7, 1. / 36.) + ADD_COMPONENT(8, 1. / 36.) + ADD_COMPONENT(9, 1. / 3.) + ADD_COMPONENT(10, 1. / 18.) + ADD_COMPONENT(11, 1. / 18.) + ADD_COMPONENT(12, 1. / 18.) + ADD_COMPONENT(13, 1. / 36.) + ADD_COMPONENT(14, 1. / 36.) + ADD_COMPONENT(15, 1. / 36.) + ADD_COMPONENT(16, 1. / 36.) + ADD_COMPONENT(17, 1. / 36.) + ADD_COMPONENT(18, 1. / 36.) 
+ +#undef ADD_COMPONENT } + template - static constexpr auto getDirection() -> const typename Neon::index_3d + static constexpr auto getVelocity() -> const typename Neon::index_3d { - return stencil[q]; + return Neon::index_3d(getVelocityComponent, + getVelocityComponent, + getVelocityComponent); } // Identifying first half of the directions @@ -119,7 +157,8 @@ struct D3Q19 template static inline NEON_CUDA_HOST_DEVICE auto - getCk_u(std::array const& u) -> Compute + getCk_u(std::array const& u) + -> Compute { if constexpr (tegIdx == 0 || tegIdx == 10) { return -u[0]; @@ -157,99 +196,167 @@ struct D3Q19 { using Self = D3Q19::Memory; - static constexpr std::array stencil{ - Neon::index_3d(-1, 0, 0), - Neon::index_3d(0, -1, 0), - Neon::index_3d(0, 0, -1), - Neon::index_3d(-1, -1, 0), - Neon::index_3d(-1, 1, 0), - Neon::index_3d(-1, 0, -1), - Neon::index_3d(-1, 0, 1), - Neon::index_3d(0, -1, -1), - Neon::index_3d(0, -1, 1), - Neon::index_3d(0, 0, 0), - Neon::index_3d(1, 0, 0), - Neon::index_3d(0, 1, 0), - Neon::index_3d(0, 0, 1), - Neon::index_3d(1, 1, 0), - Neon::index_3d(1, -1, 0), - Neon::index_3d(1, 0, 1), - Neon::index_3d(1, 0, -1), - Neon::index_3d(0, 1, 1), - Neon::index_3d(0, 1, -1)}; + template + static constexpr auto getVelocityComponent() -> int + { + static_assert(myQ < Q); + static_assert(myXYZ < 3); + +#define ADD_COMPONENT(QQ, XXX, YYY, ZZZ) \ + if constexpr ((myQ) == (QQ)) { \ + if constexpr ((myXYZ) == 0) { \ + return XXX; \ + } \ + if constexpr ((myXYZ) == 1) { \ + return YYY; \ + } \ + if constexpr ((myXYZ) == 2) { \ + return ZZZ; \ + } \ + } + ADD_COMPONENT(0, -1, 0, 0) + ADD_COMPONENT(1, 0, -1, 0) + ADD_COMPONENT(2, 0, 0, -1) + ADD_COMPONENT(3, -1, -1, 0) + ADD_COMPONENT(4, -1, 1, 0) + ADD_COMPONENT(5, -1, 0, -1) + ADD_COMPONENT(6, -1, 0, 1) + ADD_COMPONENT(7, 0, -1, -1) + ADD_COMPONENT(8, 0, -1, 1) + ADD_COMPONENT(9, 0, 0, 0) + ADD_COMPONENT(10, 1, 0, 0) + ADD_COMPONENT(11, 0, 1, 0) + ADD_COMPONENT(12, 0, 0, 1) + ADD_COMPONENT(13, 1, 1, 0) + 
ADD_COMPONENT(14, 1, -1, 0) + ADD_COMPONENT(15, 1, 0, 1) + ADD_COMPONENT(16, 1, 0, -1) + ADD_COMPONENT(17, 0, 1, 1) + ADD_COMPONENT(18, 0, 1, -1) + +#undef ADD_COMPONENT + } static constexpr int center = 9; /** Position of direction {0,0,0} */ - static constexpr std::array memoryToRegister{ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; - - static constexpr std::array registerToMemory{ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; - - - template - static constexpr auto mapToRegisters() + template + static constexpr auto mapToRegisters() -> int { - return memoryToRegister[go]; + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + ADD_COMPONENT(0, 0) + ADD_COMPONENT(1, 1) + ADD_COMPONENT(2, 2) + ADD_COMPONENT(3, 3) + ADD_COMPONENT(4, 4) + ADD_COMPONENT(5, 5) + ADD_COMPONENT(6, 6) + ADD_COMPONENT(7, 7) + ADD_COMPONENT(8, 8) + ADD_COMPONENT(9, 9) + ADD_COMPONENT(10, 10) + ADD_COMPONENT(11, 11) + ADD_COMPONENT(12, 12) + ADD_COMPONENT(13, 13) + ADD_COMPONENT(14, 14) + ADD_COMPONENT(15, 15) + ADD_COMPONENT(16, 16) + ADD_COMPONENT(17, 17) + ADD_COMPONENT(18, 18) +#undef ADD_COMPONENT } - template - static constexpr auto mapToMemory() + template + static constexpr auto mapToMemory() -> int { - return registerToMemory[go]; + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + ADD_COMPONENT(0, 0) + ADD_COMPONENT(1, 1) + ADD_COMPONENT(2, 2) + ADD_COMPONENT(3, 3) + ADD_COMPONENT(4, 4) + ADD_COMPONENT(5, 5) + ADD_COMPONENT(6, 6) + ADD_COMPONENT(7, 7) + ADD_COMPONENT(8, 8) + ADD_COMPONENT(9, 9) + ADD_COMPONENT(10, 10) + ADD_COMPONENT(11, 11) + ADD_COMPONENT(12, 12) + ADD_COMPONENT(13, 13) + ADD_COMPONENT(14, 14) + ADD_COMPONENT(15, 15) + ADD_COMPONENT(16, 16) + ADD_COMPONENT(17, 17) + ADD_COMPONENT(18, 18) +#undef ADD_COMPONENT } - template - static constexpr auto getOpposite() - -> int + template + static 
constexpr auto getOpposite() -> int { - return opposite[go]; - } + static_assert(myQ < Q); - static constexpr std::array opposite{ - 10, 11, 12, 13, 14, 15, 16, 17, 18, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8}; +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + ADD_COMPONENT(0, 10) + ADD_COMPONENT(1, 11) + ADD_COMPONENT(2, 12) + ADD_COMPONENT(3, 13) + ADD_COMPONENT(4, 14) + ADD_COMPONENT(5, 15) + ADD_COMPONENT(6, 16) + ADD_COMPONENT(7, 17) + ADD_COMPONENT(8, 18) + ADD_COMPONENT(9, 9) + ADD_COMPONENT(10, 0) + ADD_COMPONENT(11, 1) + ADD_COMPONENT(12, 2) + ADD_COMPONENT(13, 3) + ADD_COMPONENT(14, 4) + ADD_COMPONENT(15, 5) + ADD_COMPONENT(16, 6) + ADD_COMPONENT(17, 7) + ADD_COMPONENT(18, 8) +#undef ADD_COMPONENT + } }; - // template - // struct MemMapper - // { - // constexpr static int fwdMemQ = fwMemIdx_; - // constexpr static int fwdMemQX = Memory::stencil[fwdMemQ].x; - // constexpr static int fwdY = Memory::stencil[fwdMemQ].y; - // constexpr static int fwdZ = Memory::stencil[fwdMemQ].z; - // - // constexpr static int bkwMemQ = Memory::opposite[fwdMemQ]; - // constexpr static int bkwX = Memory::stencil[bkwMemQ].x; - // constexpr static int bkwY = Memory::stencil[bkwMemQ].y; - // constexpr static int bkwZ = Memory::stencil[bkwMemQ].z; - // - // constexpr static int fwdRegQ = Memory::template mapToRegisters(); - // constexpr static int centerRegQ = Registers::center; - // constexpr static int centerMemQ = Memory::center; - // }; template struct RegisterMapper { constexpr static int fwdRegQ = fwdRegIdx_; - constexpr static int bkwRegQ = Registers::opposite[fwdRegQ]; + constexpr static int bkwRegQ = Registers::template getOpposite(); constexpr static int fwdMemQ = Memory::template mapToMemory(); constexpr static int bkwMemQ = Memory::template mapToMemory(); constexpr static int centerRegQ = Registers::center; constexpr static int centerMemQ = Memory::center; - constexpr static int fwdMemQX = Memory::stencil[fwdMemQ].x; - constexpr static int 
fwdMemQY = Memory::stencil[fwdMemQ].y; - constexpr static int fwdMemQZ = Memory::stencil[fwdMemQ].z; + constexpr static int fwdMemQX = Memory::template getVelocityComponent(); + constexpr static int fwdMemQY = Memory::template getVelocityComponent(); + constexpr static int fwdMemQZ = Memory::template getVelocityComponent(); - constexpr static int bkwMemQX = Memory::stencil[bkwMemQ].x; - constexpr static int bkwMemQY = Memory::stencil[bkwMemQ].y; - constexpr static int bkwMemQZ = Memory::stencil[bkwMemQ].z; + constexpr static int bkwMemQX = Memory::template getVelocityComponent(); + constexpr static int bkwMemQY = Memory::template getVelocityComponent(); + constexpr static int bkwMemQZ = Memory::template getVelocityComponent(); }; + public: template static auto getDirectionAsVector() @@ -257,14 +364,22 @@ struct D3Q19 { std::vector vec; if constexpr (mappingType == RegisterMapping) { - for (auto const& a : Registers::stencil) { - vec.push_back(a); - } + Neon::ConstexprFor<0, Q, 1>( + [&vec](auto q) { + Neon::index_3d val(Registers::template getVelocityComponent(), + Registers::template getVelocityComponent(), + Registers::template getVelocityComponent()); + vec.push_back(val); + }); } else if constexpr (mappingType == MemoryMapping) { - for (auto const& a : Memory::stencil) { - vec.push_back(a); - } + Neon::ConstexprFor<0, Q, 1>( + [&vec](auto q) { + Neon::index_3d val(Memory::template getVelocityComponent(), + Memory::template getVelocityComponent(), + Memory::template getVelocityComponent()); + vec.push_back(val); + }); } return vec; } -}; +}; \ No newline at end of file diff --git a/benchmarks/lbm/src/DeviceD3QXX.h b/benchmarks/lbm/src/DeviceD3QXX.h index fc275308..57a6b2b4 100644 --- a/benchmarks/lbm/src/DeviceD3QXX.h +++ b/benchmarks/lbm/src/DeviceD3QXX.h @@ -192,16 +192,16 @@ struct DeviceD3QXX Compute eqFw; Compute eqBk; - const Compute ck_u = u[0] * Lattice::Registers::template getComponentOfDirection() + - u[1] * Lattice::Registers::template 
getComponentOfDirection() + - u[2] * Lattice::Registers::template getComponentOfDirection(); + const Compute ck_u = u[0] * Lattice::Registers::template getVelocityComponent() + + u[1] * Lattice::Registers::template getVelocityComponent() + + u[2] * Lattice::Registers::template getVelocityComponent(); // double eq = rho * t[k] * // (1. + // 3. * ck_u + // 4.5 * ck_u * ck_u - // usqr); - eqFw = rho * T::t[M::fwdRegQ] * + eqFw = rho * T::template getT() * (c1 + c3 * ck_u + c4dot5 * ck_u * ck_u - @@ -209,7 +209,7 @@ struct DeviceD3QXX // double eqopp = eq - 6.* rho * t[k] * ck_u; eqBk = eqFw - - c6 * rho * T::t[M::fwdRegQ] * ck_u; + c6 * rho * T::template getT() * ck_u; // pop_out = (1. - omega) * fin(i, k) + omega * eq; pop[M::fwdRegQ] = (c1 - omega) * static_cast(pop[M::fwdRegQ]) + omega * eqFw; @@ -220,7 +220,7 @@ struct DeviceD3QXX using T = typename Lattice::Registers; using M = typename Lattice::template RegisterMapper; // eq = rho * t[k] * (1. - usqr); - const Compute eqCenter = rho * T::t[M::centerRegQ] * (c1 - usqr); + const Compute eqCenter = rho * T::template getT() * (c1 - usqr); // fout(i, k) = (1. 
- omega) * fin(i, k) + omega * eq; pop[M::centerRegQ] = (c1 - omega) * static_cast(pop[M::centerRegQ]) + omega * eqCenter; } @@ -326,9 +326,9 @@ struct DeviceD3QXX // momentum_flux Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { -// Neon::ConstexprFor<0, 6, 1>([&](auto i) { -// Pi[i] += fneq[q] * Lattice::Registers::template getMomentByDirection(); -// }); + // Neon::ConstexprFor<0, 6, 1>([&](auto i) { + // Pi[i] += fneq[q] * Lattice::Registers::template getMomentByDirection(); + // }); Pi[0] += fneq[q] * Lattice::Registers::template getMomentByDirection(); Pi[1] += fneq[q] * Lattice::Registers::template getMomentByDirection(); Pi[2] += fneq[q] * Lattice::Registers::template getMomentByDirection(); diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cu index 67ba0ca3..ccb5589f 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm/src/RunCavityTwoPop.cu @@ -76,9 +76,9 @@ auto run(Config& config, using M = typename Lattice::template RegisterMapper; if (globalIdx.y == domainDim.y - 1) { popVal = -6. 
* Lattice::Registers::template getT() * ulb * - (Lattice::Registers::template getDirection().v[0] * ulid.v[0] + - Lattice::Registers::template getDirection().v[1] * ulid.v[1] + - Lattice::Registers::template getDirection().v[2] * ulid.v[2]); + (Lattice::Registers::template getVelocityComponent() * ulid.v[0] + + Lattice::Registers::template getVelocityComponent() * ulid.v[1] + + Lattice::Registers::template getVelocityComponent() * ulid.v[2]); } else { popVal = 0; } @@ -142,7 +142,7 @@ auto runFilterCollision(Config& config, using Precision = Precision; using L = D3Q27; if constexpr (std::is_same_v) { - return runFilterMethod(config, report, testCode); + return runFilterMethod(config, report, testCode); } } NEON_DEV_UNDER_CONSTRUCTION(""); From eea7cf83a366adc405b7da59e5dd9736a17af988 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 15 Sep 2023 15:01:28 +0200 Subject: [PATCH 82/94] Fixing CUDA C++ issues for D3Q19 - bgk --- benchmarks/lbm/src/D3Q27.h | 625 ++++++++++++++++---------- benchmarks/lbm/src/DeviceD3QXX.h | 20 +- benchmarks/lbm/src/RunCavityTwoPop.cu | 10 +- 3 files changed, 407 insertions(+), 248 deletions(-) diff --git a/benchmarks/lbm/src/D3Q27.h b/benchmarks/lbm/src/D3Q27.h index 59a97850..535dd2de 100644 --- a/benchmarks/lbm/src/D3Q27.h +++ b/benchmarks/lbm/src/D3Q27.h @@ -29,255 +29,425 @@ struct D3Q27 struct Registers { + using Self = D3Q27::Registers; - static constexpr std::array stencil{ - /* 00 */ Neon::index_3d(-1, 0, 0), - /* 01 */ Neon::index_3d(0, -1, 0), - /* 02 */ Neon::index_3d(0, 0, -1), - /* 03 */ Neon::index_3d(-1, -1, 0), - /* 04 */ Neon::index_3d(-1, 1, 0), - /* 05 */ Neon::index_3d(-1, 0, -1), - /* 06 */ Neon::index_3d(-1, 0, 1), - /* 07 */ Neon::index_3d(0, -1, -1), - /* 08 */ Neon::index_3d(0, -1, 1), - /* 09 */ Neon::index_3d(-1, -1, -1), - /* 00 */ Neon::index_3d(-1, -1, 1), - /* 11 */ Neon::index_3d(-1, 1, -1), - /* 12 */ Neon::index_3d(-1, 1, 1), - /* 13 */ Neon::index_3d(0, 0, 0), - /* 14 */ Neon::index_3d(1, 
0, 0), - /* 15 */ Neon::index_3d(0, 1, 0), - /* 16 */ Neon::index_3d(0, 0, 1), - /* 17 */ Neon::index_3d(1, 1, 0), - /* 18 */ Neon::index_3d(1, -1, 0), - /* 19 */ Neon::index_3d(1, 0, 1), - /* 20 */ Neon::index_3d(1, 0, -1), - /* 21 */ Neon::index_3d(0, 1, 1), - /* 22 */ Neon::index_3d(0, 1, -1), - /* 23 */ Neon::index_3d(1, 1, 1), - /* 24 */ Neon::index_3d(1, 1, -1), - /* 25 */ Neon::index_3d(1, -1, 1), - /* 26 */ Neon::index_3d(1, -1, -1)}; - - template - static constexpr inline auto - getComponentOfDirection() -> int - { - return Self::stencil[qIdx].v[cIdx]; - } static constexpr int center = 13; /** Position of direction {0,0,0} */ + // Identifying first half of the directions + // For each direction in the list, the opposite is not present. + // Center is also removed + static constexpr int firstHalfQLen = (Q - 1) / 2; + static constexpr std::array firstHalfQList{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - template - static constexpr auto getOpposite() - -> int + template + static constexpr auto getVelocityComponent() -> int { - auto opposite3d = stencil[go] * -1; - for (int i = 0; i < Q; ++i) { - if (stencil[i] == opposite3d) { - return i; - } - } + static_assert(myQ < Q); + static_assert(myXYZ < 3); + +#define ADD_COMPONENT(QQ, XXX, YYY, ZZZ) \ + if constexpr ((myQ) == (QQ)) { \ + if constexpr ((myXYZ) == 0) { \ + return XXX; \ + } \ + if constexpr ((myXYZ) == 1) { \ + return YYY; \ + } \ + if constexpr ((myXYZ) == 2) { \ + return ZZZ; \ + } \ + } + + ADD_COMPONENT(0, -1, 0, 0) + ADD_COMPONENT(1, 0, -1, 0) + ADD_COMPONENT(2, 0, 0, -1) + ADD_COMPONENT(3, -1, -1, 0) + ADD_COMPONENT(4, -1, 1, 0) + ADD_COMPONENT(5, -1, 0, -1) + ADD_COMPONENT(6, -1, 0, 1) + ADD_COMPONENT(7, 0, -1, -1) + ADD_COMPONENT(8, 0, -1, 1) + ADD_COMPONENT(9, -1, -1, -1) + ADD_COMPONENT(10, -1, -1, 1) + ADD_COMPONENT(11, -1, 1, -1) + ADD_COMPONENT(12, -1, 1, 1) + ADD_COMPONENT(13, 0, 0, 0) + ADD_COMPONENT(14, 1, 0, 0) + ADD_COMPONENT(15, 0, 1, 0) + ADD_COMPONENT(16, 0, 0, 1) + 
ADD_COMPONENT(17, 1, 1, 0) + ADD_COMPONENT(18, 1, -1, 0) + ADD_COMPONENT(19, 1, 0, 1) + ADD_COMPONENT(20, 1, 0, -1) + ADD_COMPONENT(21, 0, 1, 1) + ADD_COMPONENT(22, 0, 1, -1) + ADD_COMPONENT(23, 1, 1, 1) + ADD_COMPONENT(24, 1, 1, -1) + ADD_COMPONENT(25, 1, -1, 1) + ADD_COMPONENT(26, 1, -1, -1) + +#undef ADD_COMPONENT } - static constexpr std::array opposite{ - 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, - 13, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - - static constexpr std::array t{ - /* 00 */ 2. / 27., - /* 01 */ 2. / 27., - /* 02 */ 2. / 27., - /* 03 */ 1. / 54., - /* 04 */ 1. / 54., - /* 05 */ 1. / 54., - /* 06 */ 1. / 54., - /* 07 */ 1. / 54., - /* 08 */ 1. / 54., - /* 09 */ 1. / 216., - /* 00 */ 1. / 216., - /* 11 */ 1. / 216., - /* 12 */ 1. / 216., - /* 13 */ 8. / 27., - /* 14 */ 2. / 27., - /* 15 */ 2. / 27., - /* 16 */ 2. / 27., - /* 17 */ 1. / 54., - /* 18 */ 1. / 54., - /* 19 */ 1. / 54., - /* 20 */ 1. / 54., - /* 21 */ 1. / 54., - /* 22 */ 1. / 54., - /* 23 */ 1. / 216., - /* 24 */ 1. / 216., - /* 25 */ 1. / 216., - /* 26 */ 1. 
/ 216.}; - - template - static inline NEON_CUDA_HOST_DEVICE auto - getWeightOfDirection() -> const typename Precision::Compute + template + static constexpr auto getOpposite() -> int { - return t[qIdx]; + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + + + ADD_COMPONENT(0, 14) + ADD_COMPONENT(1, 15) + ADD_COMPONENT(2, 16) + ADD_COMPONENT(3, 17) + ADD_COMPONENT(4, 18) + ADD_COMPONENT(5, 19) + ADD_COMPONENT(6, 20) + ADD_COMPONENT(7, 21) + ADD_COMPONENT(8, 22) + ADD_COMPONENT(9, 23) + ADD_COMPONENT(10, 24) + ADD_COMPONENT(11, 25) + ADD_COMPONENT(12, 26) + ADD_COMPONENT(13, 13) + ADD_COMPONENT(14, 0) + ADD_COMPONENT(15, 1) + ADD_COMPONENT(16, 2) + ADD_COMPONENT(17, 3) + ADD_COMPONENT(18, 4) + ADD_COMPONENT(19, 5) + ADD_COMPONENT(20, 6) + ADD_COMPONENT(21, 7) + ADD_COMPONENT(22, 8) + ADD_COMPONENT(23, 9) + ADD_COMPONENT(24, 10) + ADD_COMPONENT(25, 11) + ADD_COMPONENT(26, 12) + + +#undef ADD_COMPONENT } - template - static constexpr NEON_CUDA_HOST_DEVICE auto getT() -> const typename Precision::Storage + template + static constexpr auto getT() -> typename Precision::Storage { - return t[q]; + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + + ADD_COMPONENT(0, 2. / 27.) + ADD_COMPONENT(1, 2. / 27.) + ADD_COMPONENT(2, 2. / 27.) + ADD_COMPONENT(3, 1. / 54.) + ADD_COMPONENT(4, 1. / 54.) + ADD_COMPONENT(5, 1. / 54.) + ADD_COMPONENT(6, 1. / 54.) + ADD_COMPONENT(7, 1. / 54.) + ADD_COMPONENT(8, 1. / 54.) + ADD_COMPONENT(9, 1. / 216.) + ADD_COMPONENT(10, 1. / 216.) + ADD_COMPONENT(11, 1. / 216.) + ADD_COMPONENT(12, 1. / 216.) + ADD_COMPONENT(13, 8. / 27.) + ADD_COMPONENT(14, 2. / 27.) + ADD_COMPONENT(15, 2. / 27.) + ADD_COMPONENT(16, 2. / 27.) + ADD_COMPONENT(17, 1. / 54.) + ADD_COMPONENT(18, 1. / 54.) + ADD_COMPONENT(19, 1. / 54.) + ADD_COMPONENT(20, 1. / 54.) + ADD_COMPONENT(21, 1. / 54.) + ADD_COMPONENT(22, 1. / 54.) + ADD_COMPONENT(23, 1. 
/ 216.) + ADD_COMPONENT(24, 1. / 216.) + ADD_COMPONENT(25, 1. / 216.) + ADD_COMPONENT(26, 1. / 216.) + +#undef ADD_COMPONENT } - template - static constexpr NEON_CUDA_HOST_DEVICE auto getDirection() -> const typename Neon::index_3d + template + static constexpr auto getMomentumComponet() -> typename Precision::Storage { - return stencil[q]; + static_assert(myQ < Q); + static_assert(mementumID < 6); + +#define ADD_COMPONENT(QQ, AA, BB, CC, DD, EE, FF) \ + if constexpr ((myQ) == (QQ)) { \ + if constexpr ((mementumID) == 0) { \ + return AA; \ + } \ + if constexpr ((mementumID) == 1) { \ + return BB; \ + } \ + if constexpr ((mementumID) == 2) { \ + return CC; \ + } \ + if constexpr ((mementumID) == 3) { \ + return DD; \ + } \ + if constexpr ((mementumID) == 4) { \ + return EE; \ + } \ + if constexpr ((mementumID) == 5) { \ + return FF; \ + } \ + } + + ADD_COMPONENT(0, 1, 0, 0, 0, 0, 0) + ADD_COMPONENT(1, 0, 0, 0, 1, 0, 0) + ADD_COMPONENT(2, 0, 0, 0, 0, 0, 1) + ADD_COMPONENT(3, 1, 1, 0, 1, 0, 0) + ADD_COMPONENT(4, 1, -1, 0, 1, 0, 0) + ADD_COMPONENT(5, 1, 0, 1, 0, 0, 1) + ADD_COMPONENT(6, 1, 0, -1, 0, 0, 1) + ADD_COMPONENT(7, 0, 0, 0, 1, 1, 1) + ADD_COMPONENT(8, 0, 0, 0, 1, -1, 1) + ADD_COMPONENT(9, 1, 1, 1, 1, 1, 1) + ADD_COMPONENT(10, 1, 1, -1, 1, -1, 1) + ADD_COMPONENT(11, 1, -1, 1, 1, -1, 1) + ADD_COMPONENT(12, 1, -1, -1, 1, 1, 1) + ADD_COMPONENT(13, 0, 0, 0, 0, 0, 0) + ADD_COMPONENT(14, 1, 0, 0, 0, 0, 0) + ADD_COMPONENT(15, 0, 0, 0, 1, 0, 0) + ADD_COMPONENT(16, 0, 0, 0, 0, 0, 1) + ADD_COMPONENT(17, 1, 1, 0, 1, 0, 0) + ADD_COMPONENT(18, 1, -1, 0, 1, 0, 0) + ADD_COMPONENT(19, 1, 0, 1, 0, 0, 1) + ADD_COMPONENT(20, 1, 0, -1, 0, 0, 1) + ADD_COMPONENT(21, 0, 0, 0, 1, 1, 1) + ADD_COMPONENT(22, 0, 0, 0, 1, -1, 1) + ADD_COMPONENT(23, 1, 1, 1, 1, 1, 1) + ADD_COMPONENT(24, 1, 1, -1, 1, -1, 1) + ADD_COMPONENT(25, 1, -1, 1, 1, -1, 1) + ADD_COMPONENT(26, 1, -1, -1, 1, 1, 1) + +#undef ADD_COMPONENT } - // Identifying first half of the directions - // For each direction in the 
list, the opposite is not present. - // Center is also removed - static constexpr int firstHalfQLen = (Q - 1) / 2; - static constexpr std::array firstHalfQList{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - struct Moment - { - std::array v{0, 0, 0, 0, 0, 0}; - inline constexpr Moment(int a0, int a1, int a2, int a3, int a4, int a5) - { - v[0] = a0; - v[1] = a1; - v[2] = a2; - v[3] = a3; - v[4] = a4; - v[5] = a5; - } - }; - - static constexpr std::array latticeMoment{ - Moment(1, 0, 0, 0, 0, 0), - Moment(0, 0, 0, 1, 0, 0), - Moment(0, 0, 0, 0, 0, 1), - Moment(1, 1, 0, 1, 0, 0), - Moment(1, -1, 0, 1, 0, 0), - Moment(1, 0, 1, 0, 0, 1), - Moment(1, 0, -1, 0, 0, 1), - Moment(0, 0, 0, 1, 1, 1), - Moment(0, 0, 0, 1, -1, 1), - Moment(1, 1, 1, 1, 1, 1), - Moment(1, 1, -1, 1, -1, 1), - Moment(1, -1, 1, 1, -1, 1), - Moment(1, -1, -1, 1, 1, 1), - Moment(0, 0, 0, 0, 0, 0), - Moment(1, 0, 0, 0, 0, 0), - Moment(0, 0, 0, 1, 0, 0), - Moment(0, 0, 0, 0, 0, 1), - Moment(1, 1, 0, 1, 0, 0), - Moment(1, -1, 0, 1, 0, 0), - Moment(1, 0, 1, 0, 0, 1), - Moment(1, 0, -1, 0, 0, 1), - Moment(0, 0, 0, 1, 1, 1), - Moment(0, 0, 0, 1, -1, 1), - Moment(1, 1, 1, 1, 1, 1), - Moment(1, 1, -1, 1, -1, 1), - Moment(1, -1, 1, 1, -1, 1), - Moment(1, -1, -1, 1, 1, 1)}; - - template - static constexpr inline NEON_CUDA_HOST_DEVICE auto - getMomentByDirection() - -> int + + template + static constexpr auto getVelocity() -> const typename Neon::index_3d { - return latticeMoment[qIdx].v[mIdx]; + return Neon::index_3d(getVelocityComponent, + getVelocityComponent, + getVelocityComponent); } + + // // Identifying first half of the directions + // // For each direction in the list, the opposite is not present. 
+ // // Center is also removed + // static constexpr int firstHalfQLen = (Q - 1) / 2; + // static constexpr std::array firstHalfQList{0, 1, 2, 3, 4, 5, 6, 7, 8}; }; struct Memory { using Self = D3Q27::Memory; - static constexpr std::array stencil{ - Neon::index_3d(-1, 0, 0), - Neon::index_3d(0, -1, 0), - Neon::index_3d(0, 0, -1), - Neon::index_3d(-1, -1, 0), - Neon::index_3d(-1, 1, 0), - Neon::index_3d(-1, 0, -1), - Neon::index_3d(-1, 0, 1), - Neon::index_3d(0, -1, -1), - Neon::index_3d(0, -1, 1), - Neon::index_3d(-1, -1, -1), - Neon::index_3d(-1, -1, 1), - Neon::index_3d(-1, 1, -1), - Neon::index_3d(-1, 1, 1), - Neon::index_3d(0, 0, 0), - Neon::index_3d(1, 0, 0), - Neon::index_3d(0, 1, 0), - Neon::index_3d(0, 0, 1), - Neon::index_3d(1, 1, 0), - Neon::index_3d(1, -1, 0), - Neon::index_3d(1, 0, 1), - Neon::index_3d(1, 0, -1), - Neon::index_3d(0, 1, 1), - Neon::index_3d(0, 1, -1), - Neon::index_3d(1, 1, 1), - Neon::index_3d(1, 1, -1), - Neon::index_3d(1, -1, 1), - Neon::index_3d(1, -1, -1)}; + template + static constexpr auto getVelocityComponent() -> int + { + static_assert(myQ < Q); + static_assert(myXYZ < 3); + +#define ADD_COMPONENT(QQ, XXX, YYY, ZZZ) \ + if constexpr ((myQ) == (QQ)) { \ + if constexpr ((myXYZ) == 0) { \ + return XXX; \ + } \ + if constexpr ((myXYZ) == 1) { \ + return YYY; \ + } \ + if constexpr ((myXYZ) == 2) { \ + return ZZZ; \ + } \ + } - static constexpr int center = 13; /** Position of direction {0,0,0} */ - - static constexpr std::array memoryToRegister{ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}; + ADD_COMPONENT(0, -1, 0, 0) + ADD_COMPONENT(1, 0, -1, 0) + ADD_COMPONENT(2, 0, 0, -1) + ADD_COMPONENT(3, -1, -1, 0) + ADD_COMPONENT(4, -1, 1, 0) + ADD_COMPONENT(5, -1, 0, -1) + ADD_COMPONENT(6, -1, 0, 1) + ADD_COMPONENT(7, 0, -1, -1) + ADD_COMPONENT(8, 0, -1, 1) + ADD_COMPONENT(9, -1, -1, -1) + ADD_COMPONENT(10, -1, -1, 1) + ADD_COMPONENT(11, -1, 1, -1) + ADD_COMPONENT(12, -1, 1, 1) + 
ADD_COMPONENT(13, 0, 0, 0) + ADD_COMPONENT(14, 1, 0, 0) + ADD_COMPONENT(15, 0, 1, 0) + ADD_COMPONENT(16, 0, 0, 1) + ADD_COMPONENT(17, 1, 1, 0) + ADD_COMPONENT(18, 1, -1, 0) + ADD_COMPONENT(19, 1, 0, 1) + ADD_COMPONENT(20, 1, 0, -1) + ADD_COMPONENT(21, 0, 1, 1) + ADD_COMPONENT(22, 0, 1, -1) + ADD_COMPONENT(23, 1, 1, 1) + ADD_COMPONENT(24, 1, 1, -1) + ADD_COMPONENT(25, 1, -1, 1) + ADD_COMPONENT(26, 1, -1, -1) + +#undef ADD_COMPONENT + } - static constexpr std::array registerToMemory{ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}; + static constexpr int center = 13; /** Position of direction {0,0,0} */ - template - NEON_CUDA_HOST_DEVICE static constexpr auto mapToRegisters() + template + static constexpr auto mapToRegisters() -> int { - return memoryToRegister[go]; + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + ADD_COMPONENT(0, 0) + ADD_COMPONENT(1, 1) + ADD_COMPONENT(2, 2) + ADD_COMPONENT(3, 3) + ADD_COMPONENT(4, 4) + ADD_COMPONENT(5, 5) + ADD_COMPONENT(6, 6) + ADD_COMPONENT(7, 7) + ADD_COMPONENT(8, 8) + ADD_COMPONENT(9, 9) + ADD_COMPONENT(10, 10) + ADD_COMPONENT(11, 11) + ADD_COMPONENT(12, 12) + ADD_COMPONENT(13, 13) + ADD_COMPONENT(14, 14) + ADD_COMPONENT(15, 15) + ADD_COMPONENT(16, 16) + ADD_COMPONENT(17, 17) + ADD_COMPONENT(18, 18) + + ADD_COMPONENT(19, 19) + ADD_COMPONENT(20, 20) + ADD_COMPONENT(21, 21) + ADD_COMPONENT(22, 22) + ADD_COMPONENT(23, 23) + ADD_COMPONENT(24, 24) + ADD_COMPONENT(25, 25) + ADD_COMPONENT(26, 26) + +#undef ADD_COMPONENT } - template - NEON_CUDA_HOST_DEVICE static constexpr auto mapToMemory() + template + static constexpr auto mapToMemory() -> int { - return registerToMemory[go]; + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + ADD_COMPONENT(0, 0) + ADD_COMPONENT(1, 1) + ADD_COMPONENT(2, 2) + ADD_COMPONENT(3, 3) + ADD_COMPONENT(4, 4) + 
ADD_COMPONENT(5, 5) + ADD_COMPONENT(6, 6) + ADD_COMPONENT(7, 7) + ADD_COMPONENT(8, 8) + ADD_COMPONENT(9, 9) + ADD_COMPONENT(10, 10) + ADD_COMPONENT(11, 11) + ADD_COMPONENT(12, 12) + ADD_COMPONENT(13, 13) + ADD_COMPONENT(14, 14) + ADD_COMPONENT(15, 15) + ADD_COMPONENT(16, 16) + ADD_COMPONENT(17, 17) + ADD_COMPONENT(18, 18) + + ADD_COMPONENT(19, 19) + ADD_COMPONENT(20, 20) + ADD_COMPONENT(21, 21) + ADD_COMPONENT(22, 22) + ADD_COMPONENT(23, 23) + ADD_COMPONENT(24, 24) + ADD_COMPONENT(25, 25) + ADD_COMPONENT(26, 26) +#undef ADD_COMPONENT } - template - NEON_CUDA_HOST_DEVICE static constexpr auto getOpposite() - -> int + template + static constexpr auto getOpposite() -> int { - auto opposite3d = stencil[go] * -1; - for (int i = 0; i < Q; ++i) { - if (stencil[i] == opposite3d) { - return i; - } - } + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + ADD_COMPONENT(0, 14) + ADD_COMPONENT(1, 15) + ADD_COMPONENT(2, 16) + ADD_COMPONENT(3, 17) + ADD_COMPONENT(4, 18) + ADD_COMPONENT(5, 19) + ADD_COMPONENT(6, 20) + ADD_COMPONENT(7, 21) + ADD_COMPONENT(8, 22) + ADD_COMPONENT(9, 23) + ADD_COMPONENT(10, 24) + ADD_COMPONENT(11, 25) + ADD_COMPONENT(12, 26) + ADD_COMPONENT(13, 13) + ADD_COMPONENT(14, 0) + ADD_COMPONENT(15, 1) + ADD_COMPONENT(16, 2) + ADD_COMPONENT(17, 3) + ADD_COMPONENT(18, 4) + ADD_COMPONENT(19, 5) + ADD_COMPONENT(20, 6) + ADD_COMPONENT(21, 7) + ADD_COMPONENT(22, 8) + ADD_COMPONENT(23, 9) + ADD_COMPONENT(24, 10) + ADD_COMPONENT(25, 11) + ADD_COMPONENT(26, 12) +#undef ADD_COMPONENT } + }; - static constexpr std::array opposite{ - 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, - 13, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; - template - static constexpr auto helpGetValueforT() - -> typename Precision::Storage - { - auto goInRegisterSpace = Self::template mapToRegisters(); - return Registers::t[goInRegisterSpace]; - } + template + struct RegisterMapper + { + constexpr static int fwdRegQ = 
fwdRegIdx_; + constexpr static int bkwRegQ = Registers::template getOpposite(); + constexpr static int fwdMemQ = Memory::template mapToMemory(); + constexpr static int bkwMemQ = Memory::template mapToMemory(); + constexpr static int centerRegQ = Registers::center; + constexpr static int centerMemQ = Memory::center; - static constexpr std::array t{ - 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., - 1. / 216., 1. / 216., 1. / 216., 1. / 216., - 8. / 27., - 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., - 1. / 216., 1. / 216., 1. / 216., 1. / 216.}; + constexpr static int fwdMemQX = Memory::template getVelocityComponent(); + constexpr static int fwdMemQY = Memory::template getVelocityComponent(); + constexpr static int fwdMemQZ = Memory::template getVelocityComponent(); + + constexpr static int bkwMemQX = Memory::template getVelocityComponent(); + constexpr static int bkwMemQY = Memory::template getVelocityComponent(); + constexpr static int bkwMemQZ = Memory::template getVelocityComponent(); }; + public: template static auto getDirectionAsVector() @@ -285,33 +455,22 @@ struct D3Q27 { std::vector vec; if constexpr (mappingType == RegisterMapping) { - for (auto const& a : Registers::stencil) { - vec.push_back(a); - } + Neon::ConstexprFor<0, Q, 1>( + [&vec](auto q) { + Neon::index_3d val(Registers::template getVelocityComponent(), + Registers::template getVelocityComponent(), + Registers::template getVelocityComponent()); + vec.push_back(val); + }); } else if constexpr (mappingType == MemoryMapping) { - for (auto const& a : Memory::stencil) { - vec.push_back(a); - } + Neon::ConstexprFor<0, Q, 1>( + [&vec](auto q) { + Neon::index_3d val(Memory::template getVelocityComponent(), + Memory::template getVelocityComponent(), + Memory::template getVelocityComponent()); + vec.push_back(val); + }); } return vec; } - - template - struct RegisterMapper - { - constexpr static int fwdRegQ = 
fwdRegIdx_; - constexpr static int bkwRegQ = Registers::opposite[fwdRegQ]; - constexpr static int fwdMemQ = Memory::template mapToMemory(); - constexpr static int bkwMemQ = Memory::template mapToMemory(); - constexpr static int centerRegQ = Registers::center; - constexpr static int centerMemQ = Memory::center; - - constexpr static int fwdMemQX = Memory::stencil[fwdMemQ].x; - constexpr static int fwdMemQY = Memory::stencil[fwdMemQ].y; - constexpr static int fwdMemQZ = Memory::stencil[fwdMemQ].z; - - constexpr static int bkwMemQX = Memory::stencil[bkwMemQ].x; - constexpr static int bkwMemQY = Memory::stencil[bkwMemQ].y; - constexpr static int bkwMemQZ = Memory::stencil[bkwMemQ].z; - }; -}; +}; \ No newline at end of file diff --git a/benchmarks/lbm/src/DeviceD3QXX.h b/benchmarks/lbm/src/DeviceD3QXX.h index 57a6b2b4..4864f829 100644 --- a/benchmarks/lbm/src/DeviceD3QXX.h +++ b/benchmarks/lbm/src/DeviceD3QXX.h @@ -315,12 +315,12 @@ struct DeviceD3QXX // equilibrium Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { const Compute cu = Compute(3) * - (u[0] * Lattice::Registers::template getComponentOfDirection() + - u[1] * Lattice::Registers::template getComponentOfDirection() + - u[2] * Lattice::Registers::template getComponentOfDirection()); + (u[0] * Lattice::Registers::template getVelocityComponent() + + u[1] * Lattice::Registers::template getVelocityComponent() + + u[2] * Lattice::Registers::template getVelocityComponent()); - feq[q] = rho * Lattice::Registers::template getWeightOfDirection() * (1. + cu + 0.5 * cu * cu - usqr); + feq[q] = rho * Lattice::Registers::template getT() * (1. 
+ cu + 0.5 * cu * cu - usqr); fneq[q] = pop[q] - feq[q]; }); @@ -329,12 +329,12 @@ struct DeviceD3QXX // Neon::ConstexprFor<0, 6, 1>([&](auto i) { // Pi[i] += fneq[q] * Lattice::Registers::template getMomentByDirection(); // }); - Pi[0] += fneq[q] * Lattice::Registers::template getMomentByDirection(); - Pi[1] += fneq[q] * Lattice::Registers::template getMomentByDirection(); - Pi[2] += fneq[q] * Lattice::Registers::template getMomentByDirection(); - Pi[3] += fneq[q] * Lattice::Registers::template getMomentByDirection(); - Pi[4] += fneq[q] * Lattice::Registers::template getMomentByDirection(); - Pi[5] += fneq[q] * Lattice::Registers::template getMomentByDirection(); + Pi[0] += fneq[q] * Lattice::Registers::template getMomentumComponet(); + Pi[1] += fneq[q] * Lattice::Registers::template getMomentumComponet(); + Pi[2] += fneq[q] * Lattice::Registers::template getMomentumComponet(); + Pi[3] += fneq[q] * Lattice::Registers::template getMomentumComponet(); + Pi[4] += fneq[q] * Lattice::Registers::template getMomentumComponet(); + Pi[5] += fneq[q] * Lattice::Registers::template getMomentumComponet(); }); // fdecompose_shear diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cu index ccb5589f..f1024006 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm/src/RunCavityTwoPop.cu @@ -160,11 +160,11 @@ auto runFilterLattice(Config& config, using Lattice = D3Q19; return runFilterCollision(config, report, testCode); } - // if (config.lattice == "d3q27" || config.lattice == "D3Q27") { - // testCode << "_D3Q27"; - // using Lattice = D3Q27; - // return runFilterCollision(config, report, testCode); - // } + if (config.lattice == "d3q27" || config.lattice == "D3Q27") { + testCode << "_D3Q27"; + using Lattice = D3Q27; + return runFilterCollision(config, report, testCode); + } NEON_DEV_UNDER_CONSTRUCTION("Lattice type not supported. 
Available options: D3Q19 and D3Q27"); } From f69c3b7cec5014cf0bef62771da038db0f6300b7 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Fri, 15 Sep 2023 16:44:00 +0200 Subject: [PATCH 83/94] Adding remote write support to bGrid. --- benchmarks/lbm/src/RunCavityTwoPop.cu | 18 +++++++-------- .../Neon/domain/details/bGrid/bPartition.h | 10 ++++++++- .../domain/details/bGrid/bPartition_imp.h | 22 +++++++++++++++++-- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cu index f1024006..ff9662fb 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm/src/RunCavityTwoPop.cu @@ -229,15 +229,15 @@ auto run(Config& config, // if (config.gridType == "bGrid" || config.gridType == "bGrid_8_8_8") { // return details::runFilterStoreType(config, report); // } - // if (config.gridType == "bGrid_4_4_4") { - // if constexpr (!skipTest) { - // using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>; - // using Grid = Neon::domain::details::bGrid::bGrid; - // return details::runFilterStoreType(config, report); - // } else { - // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") - // } - // } + if (config.gridType == "bGrid_4_4_4") { + if constexpr (!skipTest) { + using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report, testCode); + } else { + NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + } + } // if (config.gridType == "bGrid_2_2_2") { // if constexpr (!skipTest) { // using Sblock = Neon::domain::details::bGrid::StaticBlock<2, 2, 2>; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index 1a36e8ea..fc596898 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -111,6 +111,14 @@ class bPartition LambdaNOTValid funIfNOTValid = nullptr) const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void>; + template + NEON_CUDA_HOST_DEVICE inline auto + writeNghData(const Idx& gidx, + int card, + T value) + -> bool; /** * Gets the global coordinates of the cartesian point. @@ -135,7 +143,7 @@ class bPartition getBlockViewIdx(const Idx& cell) const -> BlockViewGridIdx; - + NEON_CUDA_HOST_DEVICE inline auto helpGetPitch(const Idx& cell, int card) const -> uint32_t; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h index 231a4d1b..75d2006b 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h @@ -81,7 +81,7 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition:: template inline NEON_CUDA_HOST_DEVICE auto bPartition:: - operator()(const Idx& cell, +operator()(const Idx& cell, int card) -> T& { return mMem[helpGetPitch(cell, card)]; @@ -89,7 +89,7 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition:: template inline NEON_CUDA_HOST_DEVICE auto bPartition:: - operator()(const Idx& cell, +operator()(const Idx& cell, int card) const -> const T& { return mMem[helpGetPitch(cell, card)]; @@ -414,6 +414,24 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: return; } +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + 
writeNghData(const Idx& gidx, + int card, + T value) + -> bool +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(gidx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + return false; + } + mMem[pitch] = value; + return true; +} + template NEON_CUDA_HOST_DEVICE inline auto bPartition::isActive(const Idx& cell, From ef494dfaeecd1b96c03247a2683423ddd5d54d3c Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 18 Sep 2023 23:10:56 +0200 Subject: [PATCH 84/94] WIP --- libNeonSet/include/Neon/set/DevSet.h | 35 +++++++++--- libNeonSet/include/Neon/set/LambdaExecutor.h | 55 +++++++++++++++++-- .../container/CudaLaunchCompileTimeHints.h | 21 +++++++ .../Neon/set/container/DeviceContainer.h | 6 +- 4 files changed, 101 insertions(+), 16 deletions(-) create mode 100644 libNeonSet/include/Neon/set/container/CudaLaunchCompileTimeHints.h diff --git a/libNeonSet/include/Neon/set/DevSet.h b/libNeonSet/include/Neon/set/DevSet.h index 5ac38250..5e8b03b7 100644 --- a/libNeonSet/include/Neon/set/DevSet.h +++ b/libNeonSet/include/Neon/set/DevSet.h @@ -20,6 +20,7 @@ #include "Neon/set/LambdaExecutor.h" #include "Neon/set/LaunchParameters.h" #include "Neon/set/Transfer.h" +#include "Neon/set/container/CudaLaunchCompileTimeHints.h" #include "Neon/set/memory/memDevSet.h" #include "Neon/set/memory/memSet.h" #include "Neon/sys/global/GpuSysGlobal.h" @@ -222,7 +223,9 @@ class DevSet auto newLaunchParameters() const -> LaunchParameters; - template + template inline auto launchLambdaOnSpan( Neon::Execution execution, const Neon::set::KernelConfig& kernelConfig, @@ -236,9 +239,11 @@ class DevSet switch (mode) { case Neon::Runtime::stream: { if (execution == Neon::Execution::device) { - this->template helpLaunchLambdaOnSpanCUDA(kernelConfig, - dataSetContainer, - lambdaHolder); + this->template helpLaunchLambdaOnSpanCUDA(kernelConfig, + dataSetContainer, + lambdaHolder); return; } #if defined(NEON_OS_LINUX) || defined(NEON_OS_MAC) @@ -352,7 +357,9 @@ class 
DevSet } } - template + template inline auto helpLaunchLambdaOnSpanCUDA([[maybe_unused]] const Neon::set::KernelConfig& kernelConfig, [[maybe_unused]] DataSetContainer& dataSetContainer, [[maybe_unused]] std::function; - } else { - executor = (void*)Neon::set::details::blockSpan::launchLambdaOnSpanCUDA; + if constexpr (!CudaLaunchCompilerTimeHints::initialized) { + if constexpr (!details::ExecutionThreadSpanUtils::isBlockSpan(DataSetContainer::executionThreadSpan)) { + executor = (void*)Neon::set::details::denseSpan::launchLambdaOnSpanCUDA; + } else { + executor = (void*)Neon::set::details::blockSpan::launchLambdaOnSpanCUDA; + } + } + + if constexpr (CudaLaunchCompilerTimeHints::initialized) { + if constexpr (!details::ExecutionThreadSpanUtils::isBlockSpan(DataSetContainer::executionThreadSpan)) { + executor = (void*)Neon::set::details::denseSpan::launchLambdaOnSpanCUDAWithCompilerHints; + } else { + executor = (void*)Neon::set::details::blockSpan::launchLambdaOnSpanCUDAWithCompilerHints; + } } dev.kernel.template cudaLaunchKernel(gpuStreamSet[setIdx.idx()], launchInfoSet[setIdx.idx()], diff --git a/libNeonSet/include/Neon/set/LambdaExecutor.h b/libNeonSet/include/Neon/set/LambdaExecutor.h index 4ffe2501..825e86a7 100644 --- a/libNeonSet/include/Neon/set/LambdaExecutor.h +++ b/libNeonSet/include/Neon/set/LambdaExecutor.h @@ -36,6 +36,38 @@ NEON_CUDA_KERNEL auto launchLambdaOnSpanCUDA(typename DataSetContainer::Span spa } } } + +template +__launch_bounds__(CudaLaunchCompilerTimeHints::maxThreadsPerBlock) + NEON_CUDA_KERNEL auto launchLambdaOnSpanCUDAWithCompilerHints(typename DataSetContainer::Span span, + UserLambda userLambdaTa) + -> void +{ + typename DataSetContainer::Idx e; + if constexpr (DataSetContainer::executionThreadSpan == ExecutionThreadSpan::d1) { + if (span.setAndValidate(e, + threadIdx.x + blockIdx.x * blockDim.x)) { + userLambdaTa(e); + } + } + if constexpr (DataSetContainer::executionThreadSpan == ExecutionThreadSpan::d2) { + if 
(span.setAndValidate(e, + threadIdx.x + blockIdx.x * blockDim.x, + threadIdx.y + blockIdx.y * blockDim.y)) { + userLambdaTa(e); + } + } + if constexpr (DataSetContainer::executionThreadSpan == ExecutionThreadSpan::d3) { + if (span.setAndValidate(e, + threadIdx.x + blockIdx.x * blockDim.x, + threadIdx.y + blockIdx.y * blockDim.y, + threadIdx.z + blockIdx.z * blockDim.z)) { + userLambdaTa(e); + } + } +} #endif @@ -48,9 +80,9 @@ void launchLambdaOnSpanOMP(Neon::Integer_3d const& gridDim, { if constexpr (DataSetContainer::executionThreadSpan == ExecutionThreadSpan::d1) { #ifdef NEON_OS_WINDOWS -//#pragma omp parallel for default(shared) +// #pragma omp parallel for default(shared) #else - #pragma omp parallel for simd default(shared) +#pragma omp parallel for simd default(shared) #endif for (IndexType x = 0; x < gridDim.x; x++) { typename DataSetContainer::Idx e; @@ -65,7 +97,7 @@ void launchLambdaOnSpanOMP(Neon::Integer_3d const& gridDim, #ifdef NEON_OS_WINDOWS #pragma omp parallel for default(shared) #else -// #pragma omp parallel for simd collapse(2) default(shared) + // #pragma omp parallel for simd collapse(2) default(shared) #endif for (IndexType y = 0; y < gridDim.y; y++) { for (IndexType x = 0; x < gridDim.x; x++) { @@ -81,7 +113,7 @@ void launchLambdaOnSpanOMP(Neon::Integer_3d const& gridDim, #ifdef NEON_OS_WINDOWS #pragma omp parallel for default(shared) #else -// #pragma omp parallel for simd collapse(1) default(shared) schedule(guided) + // #pragma omp parallel for simd collapse(1) default(shared) schedule(guided) #endif for (IndexType z = 0; z < gridDim.z; z++) { for (IndexType y = 0; y < gridDim.y; y++) { @@ -113,6 +145,21 @@ NEON_CUDA_KERNEL auto launchLambdaOnSpanCUDA(typename DataSetContainer::Span spa } } } + +template +NEON_CUDA_KERNEL auto launchLambdaOnSpanCUDAWithCompilerHints(typename DataSetContainer::Span span, + UserLambda userLambdaTa) + -> void +{ + typename DataSetContainer::Idx e; + if constexpr (DataSetContainer::executionThreadSpan == 
ExecutionThreadSpan::d1b3) { + if (span.setAndValidateGPUDevice(e)) { + userLambdaTa(e); + } + } +} #endif diff --git a/libNeonSet/include/Neon/set/container/CudaLaunchCompileTimeHints.h b/libNeonSet/include/Neon/set/container/CudaLaunchCompileTimeHints.h new file mode 100644 index 00000000..84fee176 --- /dev/null +++ b/libNeonSet/include/Neon/set/container/CudaLaunchCompileTimeHints.h @@ -0,0 +1,21 @@ +#pragma once + +#include "Neon/core/core.h" + + +namespace Neon::set::container { + +template +struct CudaLaunchCompileTimeHint +{ + public: + static constexpr bool initialized = inited__; + static constexpr int maxThreadsPerBlock = maxThreadsPerBlock__; + static constexpr int minBlocksPerMultiprocessor = minBlocksPerMultiprocessor__; + static constexpr int maxBlocksPerCluster = maxBlocksPerCluster__; +}; + +} // namespace Neon::set::container diff --git a/libNeonSet/include/Neon/set/container/DeviceContainer.h b/libNeonSet/include/Neon/set/container/DeviceContainer.h index 6f729894..ae3bf957 100644 --- a/libNeonSet/include/Neon/set/container/DeviceContainer.h +++ b/libNeonSet/include/Neon/set/container/DeviceContainer.h @@ -6,8 +6,8 @@ namespace Neon::set::internal { -template +template < typename DataIteratorContainerT, + typename UserComputeLambdaT, typename CudaLaunchCompileTimeHintT = Neon::set::container::CudaLaunchCompileTimeHint> struct DeviceContainer : ContainerAPI { public: @@ -93,7 +93,7 @@ struct DeviceContainer : ContainerAPI Neon::set::KernelConfig kernelConfig(dataView, bk, streamIdx, this->getLaunchParameters(dataView)); if (ContainerExecutionType::device == this->getContainerExecutionType()) { - bk.devSet().template launchLambdaOnSpan( + bk.devSet().template launchLambdaOnSpan( mExecution, kernelConfig, m_dataIteratorContainer, From ddf430a1537efa91d98f8d93133d8d90190d2404 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 20 Sep 2023 22:07:30 +0200 Subject: [PATCH 85/94] Fixing lbm benchmark template initialization --- 
benchmarks/lbm/src/RunCavityTwoPop.cu | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cu index ff9662fb..6a4e84df 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm/src/RunCavityTwoPop.cu @@ -107,18 +107,18 @@ auto runFilterMethod(Config& config, NEON_THROW_UNSUPPORTED_OPERATION("We only support PUSH in a single device configuration for now.") } testCode << "_push"; - return run(config, report, testCode); + return run(config, report, testCode); } if (config.streamingMethod == "pull") { testCode << "_pull"; - return run(config, report, testCode); + return run(config, report, testCode); } if (config.streamingMethod == "aa") { if (config.devices.size() != 1) { NEON_THROW_UNSUPPORTED_OPERATION("We only support AA in a single device configuration for now.") } testCode << "_aa"; - return run(config, report, testCode); + return run(config, report, testCode); } NEON_DEV_UNDER_CONSTRUCTION(""); } @@ -130,7 +130,7 @@ auto runFilterCollision(Config& config, { if (config.collisionCli.getOption() == Collision::bgk) { testCode << "_bgk"; - return runFilterMethod(config, report, testCode); + return runFilterMethod(config, report, testCode); } if (config.collisionCli.getOption() == Collision::kbc) { if (config.lattice != "d3q27" && config.lattice != "D3Q27") { @@ -139,10 +139,9 @@ auto runFilterCollision(Config& config, NEON_THROW(e); } testCode << "_kbc"; - using Precision = Precision; - using L = D3Q27; + using L = D3Q27>; if constexpr (std::is_same_v) { - return runFilterMethod(config, report, testCode); + return runFilterMethod(config, report, testCode); } } NEON_DEV_UNDER_CONSTRUCTION(""); @@ -153,17 +152,17 @@ auto runFilterLattice(Config& config, Report& report, std::stringstream& testCode) -> void { - using Precision = Precision; + using P = Precision; if (config.lattice == "d3q19" || config.lattice == "D3Q19") { testCode << "_D3Q19"; 
- using Lattice = D3Q19; - return runFilterCollision(config, report, testCode); + using L = D3Q19

; + return runFilterCollision(config, report, testCode); } if (config.lattice == "d3q27" || config.lattice == "D3Q27") { testCode << "_D3Q27"; - using Lattice = D3Q27; - return runFilterCollision(config, report, testCode); + using L = D3Q27

; + return runFilterCollision(config, report, testCode); } NEON_DEV_UNDER_CONSTRUCTION("Lattice type not supported. Available options: D3Q19 and D3Q27"); } From 728c1869d9523af89c6e150cbeca55f934b131c1 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 9 Oct 2023 10:14:50 +0200 Subject: [PATCH 86/94] Dropping kernel bound mechanisms. --- benchmarks/lbm/src/RunCavityTwoPop.cu | 40 ++++++++++++------- .../domain/tools/partitioning/SpanLayout.cpp | 2 +- libNeonSkeleton/include/Neon/skeleton/Occ.h | 4 +- libNeonSkeleton/src/skeleton/Occ.cpp | 12 +++++- 4 files changed, 41 insertions(+), 17 deletions(-) diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cu index 6a4e84df..d1e03933 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm/src/RunCavityTwoPop.cu @@ -33,13 +33,14 @@ auto run(Config& config, using Precision = Precision; using Lattice = Lattice_; // D3Q27; - code << "_" << config.deviceType; + code << "_" << config.deviceType << "_"; for (auto const& id : config.devices) { code << id; } - code << "_SS" << config.stencilSemanticCli.getStringOption() << "_"; - code << "_SF" << config.spaceCurveCli.getStringOption() << "_"; - code << "_TM" << config.transferModeCli.getStringOption() << "_"; + code << "_SS" << config.stencilSemanticCli.getStringOption(); + code << "_SF" << config.spaceCurveCli.getStringOption(); + code << "_TM" << config.transferModeCli.getStringOption(); + code << "_Occ" << config.occCli.getStringOption(); code << "__"; // using PopulationField = typename Grid::template Field; @@ -159,11 +160,11 @@ auto runFilterLattice(Config& config, using L = D3Q19

; return runFilterCollision(config, report, testCode); } - if (config.lattice == "d3q27" || config.lattice == "D3Q27") { - testCode << "_D3Q27"; - using L = D3Q27

; - return runFilterCollision(config, report, testCode); - } + if (config.lattice == "d3q27" || config.lattice == "D3Q27") { + testCode << "_D3Q27"; + using L = D3Q27

; + return runFilterCollision(config, report, testCode); + } NEON_DEV_UNDER_CONSTRUCTION("Lattice type not supported. Available options: D3Q19 and D3Q27"); } @@ -174,11 +175,11 @@ auto runFilterComputeType(Config& config, std::stringstream& testCode) { if (config.computeTypeStr == "double") { - testCode << "_SD"; + testCode << "_Sdouble"; return runFilterLattice(config, report, testCode); } if (config.computeTypeStr == "float") { - testCode << "_SF"; + testCode << "_Sfloat"; return runFilterLattice(config, report, testCode); } NEON_DEV_UNDER_CONSTRUCTION(""); @@ -191,11 +192,11 @@ auto runFilterStoreType(Config& config, -> void { if (config.storeTypeStr == "double") { - testCode << "_CD"; + testCode << "_Cdouble"; return runFilterComputeType(config, report, testCode); } if (config.storeTypeStr == "float") { - testCode << "_CF"; + testCode << "_Cfloat"; return runFilterComputeType(config, report, testCode); } NEON_DEV_UNDER_CONSTRUCTION(""); @@ -213,9 +214,10 @@ auto run(Config& config, std::stringstream& testCode) -> void { testCode << "___" << config.N << "_"; + testCode << "_numDevs_" << config.devices.size(); if (config.gridType == "dGrid") { - testCode << "_DG"; + testCode << "_dGrid"; return details::runFilterStoreType(config, report, testCode); } // if (config.gridType == "eGrid") { @@ -230,6 +232,7 @@ auto run(Config& config, // } if (config.gridType == "bGrid_4_4_4") { if constexpr (!skipTest) { + testCode << "_bGrid_4_4_4"; using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>; using Grid = Neon::domain::details::bGrid::bGrid; return details::runFilterStoreType(config, report, testCode); @@ -237,6 +240,15 @@ auto run(Config& config, NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") } } + // if (config.gridType == "bGrid_8_8_8") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<8, 8, 8>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report, testCode); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } // if (config.gridType == "bGrid_2_2_2") { // if constexpr (!skipTest) { // using Sblock = Neon::domain::details::bGrid::StaticBlock<2, 2, 2>; diff --git a/libNeonDomain/src/domain/tools/partitioning/SpanLayout.cpp b/libNeonDomain/src/domain/tools/partitioning/SpanLayout.cpp index 591bd07f..9a81de6b 100644 --- a/libNeonDomain/src/domain/tools/partitioning/SpanLayout.cpp +++ b/libNeonDomain/src/domain/tools/partitioning/SpanLayout.cpp @@ -207,7 +207,7 @@ auto SpanLayout::findPossiblyLocalPointOffset( byDomain); auto const infoPtr = mapper.getMetadata(point); if (infoPtr != nullptr) { - return {true, *infoPtr, byPartition, byDirection, byDomain}; + return {true, int32_t(*infoPtr), byPartition, byDirection, byDomain}; } } } diff --git a/libNeonSkeleton/include/Neon/skeleton/Occ.h b/libNeonSkeleton/include/Neon/skeleton/Occ.h index 4b2e3522..041d178f 100644 --- a/libNeonSkeleton/include/Neon/skeleton/Occ.h +++ b/libNeonSkeleton/include/Neon/skeleton/Occ.h @@ -32,6 +32,8 @@ struct OccUtils auto getStringOptions() const -> std::string; auto getDoc() const -> std::string; + auto getStringOption() const -> std::string; + auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void; auto addToReport(Neon::Report& report) const -> void; @@ -42,4 +44,4 @@ struct OccUtils }; -} // namespace Neon::skeleton \ No newline at end of file +} // namespace Neon::skeleton diff --git a/libNeonSkeleton/src/skeleton/Occ.cpp b/libNeonSkeleton/src/skeleton/Occ.cpp index 
7690ce55..44ba2cd9 100644 --- a/libNeonSkeleton/src/skeleton/Occ.cpp +++ b/libNeonSkeleton/src/skeleton/Occ.cpp @@ -48,6 +48,16 @@ OccUtils::Cli::Cli(std::string s) set(s); } +auto OccUtils::Cli::getStringOption() const -> std::string +{ + if (!mSet) { + std::stringstream errorMsg; + errorMsg << "Occ was not set."; + NEON_ERROR(errorMsg.str()); + } + return OccUtils::toString(mOption); +} + OccUtils::Cli::Cli(Occ model) { mOption = model; @@ -121,4 +131,4 @@ auto OccUtils::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& su report.addMember("Occ", OccUtils::toString(this->getOption()), &subBlock); } -} // namespace Neon::skeleton \ No newline at end of file +} // namespace Neon::skeleton From 45e82ba0a8f8327ec61961ff5ec3a78845e5c5ef Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 9 Oct 2023 19:15:14 +0200 Subject: [PATCH 87/94] Removing debugging command. --- benchmarks/lbm/src/RunCavityTwoPop.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cu index d1e03933..55d6bac5 100644 --- a/benchmarks/lbm/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm/src/RunCavityTwoPop.cu @@ -16,7 +16,8 @@ namespace CavityTwoPop { int backendWasReported = false; // #include -#include "/usr/include/fenv.h" +// #include "/usr/include/fenv.h" + namespace details { template void { - feenableexcept(FE_ALL_EXCEPT & ~FE_INEXACT); // Enable all floating point exceptions but FE_INEXACT + //feenableexcept(FE_ALL_EXCEPT & ~FE_INEXACT); // Enable all floating point exceptions but FE_INEXACT if (config.streamingMethod == "push") { if (config.devices.size() != 1) { NEON_THROW_UNSUPPORTED_OPERATION("We only support PUSH in a single device configuration for now.") From a79ef8b2c7b516d0a9eb94518cb6f8088c19f844 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Mon, 9 Oct 2023 19:17:26 +0200 Subject: [PATCH 88/94] Fixing print messages. 
--- benchmarks/lbm/src/Config.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/lbm/src/Config.cpp b/benchmarks/lbm/src/Config.cpp index 12d28b12..4a380539 100644 --- a/benchmarks/lbm/src/Config.cpp +++ b/benchmarks/lbm/src/Config.cpp @@ -28,10 +28,10 @@ auto Config::toString() const -> std::string s << ".......... reportFile " << c.reportFile << std::endl; s << "............ gridType " << c.gridType << std::endl; - s << ".......... spaceCurve " << c.spaceCurveCli.getStringOptions() << std::endl; - s << "................. occ " << c.occCli.getStringOptions() << std::endl; - s << "........ transferMode " << c.transferModeCli.getStringOptions() << std::endl; - s << "..... stencilSemantic " << c.stencilSemanticCli.getStringOptions() << std::endl; + s << ".......... spaceCurve " << c.spaceCurveCli.getStringOption() << std::endl; + s << "................. occ " << c.occCli.getStringOption() << std::endl; + s << "........ transferMode " << c.transferModeCli.getStringOption() << std::endl; + s << "..... stencilSemantic " << c.stencilSemanticCli.getStringOption() << std::endl; s << "\n==>[LBM Implementation]" << std::endl; s << "............. 
lattice " << c.lattice << std::endl; From 7e23387ba43ec8dcc831473cc7997e79588e958a Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 10 Oct 2023 11:25:40 +0200 Subject: [PATCH 89/94] Lattice halo update --- benchmarks/lbm/src/Config.cpp | 16 +++ benchmarks/lbm/src/Lbm.h | 29 +++-- .../Neon/domain/details/dGrid/dField_imp.h | 112 +++++++++++++++++- .../src/sUt_skeleton.Stencil.cu | 12 +- .../unit/skeleton-stencil/src/runHelper.h | 11 +- .../unit/skeleton-stencil/src/stencil.cu | 10 +- 6 files changed, 167 insertions(+), 23 deletions(-) diff --git a/benchmarks/lbm/src/Config.cpp b/benchmarks/lbm/src/Config.cpp index 4a380539..ae30c720 100644 --- a/benchmarks/lbm/src/Config.cpp +++ b/benchmarks/lbm/src/Config.cpp @@ -109,6 +109,22 @@ auto Config::parseArgs(const int argc, char* argv[]) std::cout << "Benchmark example " << '\n'; std::cout << "./lbm --deviceType gpu --deviceIds 0 1 2 3 4 --grid dGrid --domain-size 100 --max-iter 2000 --computeFP double --storageFP double --nOCC --huGrid --benchmark --warmup-iter 10 --repetitions 5" << '\n'; + std::cout <<" ./lbm --deviceType gpu\\\n" + " --deviceIds 0\\\n" + " --grid dGrid\\\n" + " --domain-size 100\\\n" + " --max-iter 1000\\\n" + " --computeFP float\\\n" + " --storageFP float\\\n" + " --occ none\\\n" + " --transferMode put\\\n" + " --stencilSemantic grid\\\n" + " --spaceCurve sweep\\\n" + " --collision bgk\\\n" + " --streamingMethod pull\\\n" + " --lattice d3q19\\\n" + " --vti 10"; + return -1; } diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h index 132a3d99..ce465ce4 100644 --- a/benchmarks/lbm/src/Lbm.h +++ b/benchmarks/lbm/src/Lbm.h @@ -127,6 +127,10 @@ struct Lbm ContainerFactory::Common::computeWallNghMask(cellFlagField, cellFlagField) .run(Neon::Backend::mainStreamIdx); + cellFlagField.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::get, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); 
metrics::recordProblemSetupMetrics(grid.getBackend(), *reportPtr, setBcClockStart); } @@ -157,12 +161,17 @@ struct Lbm ops.push_back(even); std::stringstream appName; - if (iteration % 2 == 0) - appName << "LBM_push_even"; - else + if (skIdx % 2 == 0) appName << "LBM_pull_even"; + else + appName << "LBM_pull_odd"; skeleton.at(skIdx).sequence(ops, appName.str(), opt); + + if (skIdx % 2 == 0) + skeleton.at(skIdx).ioToDot("lbm-pull-even","lbm_pull_even",true); + else + skeleton.at(skIdx).ioToDot("lbm-pull-odd","lbm_pull_even", true); } } { @@ -200,7 +209,7 @@ struct Lbm if (iteration % 2 == 0) appName << "LBM_push_even"; else - appName << "LBM_pull_even"; + appName << "LBM_push_odd"; skeleton.at(skIdx).sequence(ops, appName.str(), opt); } @@ -226,13 +235,13 @@ struct Lbm cellFlagField, lbmParameters.omega, pFieldList.at(0)); - appName << "LBM_push_even"; + appName << "LBM_aa_even"; } else { lbmIteration = ContainerFactory::AA::Odd::iteration( cellFlagField, lbmParameters.omega, pFieldList.at(0)); - appName << "LBM_pull_even"; + appName << "LBM_aa_even"; } std::vector ops; skeleton.at(skIdx) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); @@ -299,6 +308,10 @@ struct Lbm done = true; } if constexpr (method == lbm::Method::pull) { + pop.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::get, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); auto computeRhoAndU = ContainerFactory::Pull::computeRhoAndU(pop, cellFlagField, rho, u); computeRhoAndU.run(Neon::Backend::mainStreamIdx); done = true; @@ -326,8 +339,8 @@ struct Lbm iterIdStr = std::string(numDigits - std::min(numDigits, iterIdStr.length()), '0') + iterIdStr; // pop.ioToVtk("pop_" + iterIdStr, "pop", false); - u.ioToVtk("u_" + iterIdStr, "u", false); - rho.ioToVtk("rho_" + iterIdStr, "rho", false); + u.ioToVtk("u_" + iterIdStr, "u", false, Neon::IoFileType::BINARY); + rho.ioToVtk("rho_" + iterIdStr, "rho", false, Neon::IoFileType::BINARY); cellFlagField.template 
ioToVtk("cellFlagField_" + iterIdStr, "flag", false); #if 0 diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dField_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dField_imp.h index 49f57dbd..11dda19e 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dField_imp.h @@ -26,7 +26,8 @@ dField::dField(const std::string& fieldUserName, T(0), dataUse, memoryOptions, - haloStatus) { + haloStatus) +{ // only works if dims in x and y direction for all partitions match for (int i = 0; i < dims.size() - 1; ++i) { @@ -88,7 +89,7 @@ dField::dField(const std::string& fieldUserName, { // Setting up partitions Neon::aGrid const& aGrid = mData->grid->helpFieldMemoryAllocator(); - mData->memoryField = aGrid.newField(fieldUserName + "-storage", cardinality, T(), dataUse, memoryOptions); + mData->memoryField = aGrid.newField(fieldUserName + "-storage", cardinality, T(), dataUse, memoryOptions); // const int setCardinality = mData->grid->getBackend().getDeviceCount(); mData->partitionTable.forEachConfiguration( [&](Neon::Execution execution, @@ -306,7 +307,7 @@ auto dField::operator()(const Neon::index_3d& idxGlobal, auto& partition = mData->partitionTable.getPartition(Neon::Execution::host, partitionIdx, Neon::DataView::STANDARD); - auto& span = mData->grid->getSpan(Neon::Execution::host,partitionIdx, Neon::DataView::STANDARD); + auto& span = mData->grid->getSpan(Neon::Execution::host, partitionIdx, Neon::DataView::STANDARD); Idx idx; bool isOk = span.setAndValidate(idx, localIDx.x, localIDx.y, localIDx.z); if (!isOk) { @@ -326,7 +327,7 @@ auto dField::getReference(const Neon::index_3d& idxGlobal, auto& partition = mData->partitionTable.getPartition(Neon::Execution::host, partitionIdx, Neon::DataView::STANDARD); - auto& span = mData->grid->getSpan(Neon::Execution::host,partitionIdx, Neon::DataView::STANDARD); + auto& span = mData->grid->getSpan(Neon::Execution::host, partitionIdx, 
Neon::DataView::STANDARD); Idx idx; bool isOk = span.setAndValidate(idx, localIDx.x, localIDx.y, localIDx.z); if (!isOk) { @@ -484,6 +485,81 @@ auto dField::initHaloUpdateTable() transfersVec.push_back(transfer); } }); + + mData->latticeHaloUpdateTable.forEachPutConfiguration( + bk, [&](Neon::SetIdx setIdxSrc, + Execution execution, + Neon::domain::tool::partitioning::ByDirection byDirection, + std::vector& transfersVec) { + { + using namespace Neon::domain::tool::partitioning; + + Neon::SetIdx setIdxDst = getNghSetIdx(setIdxSrc, byDirection); + + int r = grid.getStencil().getRadius(); + + std::array partitions; + std::array, Data::EndPointsUtils::nConfigs> ghostZBeginIdx; + std::array, Data::EndPointsUtils::nConfigs> boundaryZBeginIdx; + std::array memPhyDim; + + partitions[Data::EndPoints::dst] = &this->getPartition(execution, setIdxDst, Neon::DataView::STANDARD); + partitions[Data::EndPoints::src] = &this->getPartition(execution, setIdxSrc, Neon::DataView::STANDARD); + + for (auto endPoint : {Data::EndPoints::dst, Data::EndPoints::src}) { + ghostZBeginIdx[endPoint][static_cast(ByDirection::down)] = 0; + boundaryZBeginIdx[endPoint][static_cast(ByDirection::down)] = r; + boundaryZBeginIdx[endPoint][static_cast(ByDirection::up)] = partitions[endPoint]->dim().z; + ghostZBeginIdx[endPoint][static_cast(ByDirection::up)] = partitions[endPoint]->dim().z + r; + + memPhyDim[endPoint] = Neon::size_4d( + 1, + size_t(partitions[endPoint]->dim().x), + size_t(partitions[endPoint]->dim().x) * partitions[endPoint]->dim().y, + size_t(partitions[endPoint]->dim().x) * partitions[endPoint]->dim().y * (partitions[endPoint]->dim().z + 2 * r)); + } + + for (int j = 0; j < this->getCardinality(); j++) { + auto const& stencil = this->getGrid().getStencil(); + if (this->getCardinality() != stencil.nPoints()) { + continue; + } + T* srcMem = partitions[Data::EndPoints::src]->mem(); + T* dstMem = partitions[Data::EndPoints::dst]->mem(); + + Neon::size_4d srcBoundaryBuff(0, 0, 
boundaryZBeginIdx[Data::EndPoints::src][static_cast(byDirection)], j); + Neon::size_4d dstGhostBuff(0, 0, ghostZBeginIdx[Data::EndPoints::dst][static_cast(ByDirectionUtils::invert(byDirection))], j); + + // std::cout << "To " << dstGhostBuff << " prt " << partitions[Data::EndPoints::dst]->prtID() << " From " << srcBoundaryBuff << "(src dim" << partitions[Data::EndPoints::src]->dim() << ")" << std::endl; + // std::cout << "dst mem " << partitions[Data::EndPoints::dst]->mem() << " " << std::endl; + // std::cout << "dst pitch " << (dstGhostBuff * memPhyDim[Data::EndPoints::dst]).rSum() << " " << std::endl; + // std::cout << "dst dstGhostBuff " << dstGhostBuff << " " << std::endl; + // std::cout << "dst pitch all" << memPhyDim[Data::EndPoints::dst] << " " << std::endl; + + Neon::set::MemoryTransfer transfer({setIdxDst, dstMem + (dstGhostBuff * memPhyDim[Data::EndPoints::dst]).rSum(), dstGhostBuff}, + {setIdxSrc, srcMem + (srcBoundaryBuff * memPhyDim[Data::EndPoints::src]).rSum(), srcBoundaryBuff}, + sizeof(T) * + r * + partitions[Data::EndPoints::src]->dim().x * + partitions[Data::EndPoints::src]->dim().y); + if (ByDirection::up == byDirection && bk.isLastDevice(setIdxSrc)) { + return; + } + + if (ByDirection::down == byDirection && bk.isFirstDevice(setIdxSrc)) { + return; + } + if (ByDirection::up == byDirection && !(stencil.points()[j].z > 0)) { + continue; + } + if (ByDirection::down == byDirection && !(stencil.points()[j].z < 0)) { + continue; + } + // std::cout << transfer.toString() << std::endl; + transfersVec.push_back(transfer); + } + } + }); // // mData->latticeHaloUpdateTable.forEachPutConfiguration( // bk, [&](Neon::SetIdx setIdxSrc, @@ -608,7 +684,33 @@ auto dField:: execution); } } else { - NEON_DEV_UNDER_CONSTRUCTION(""); + auto transfers = bk.template newDataSet>(); + if (this->getMemoryOptions().getOrder() == Neon::MemoryLayout::structOfArrays) { + for (auto byDirection : {tool::partitioning::ByDirection::up, + tool::partitioning::ByDirection::down}) { 
+ + auto const& tableEntryByDir = mData->latticeHaloUpdateTable.get(transferMode, + execution, + byDirection); + + tableEntryByDir.forEachSeq([&](SetIdx setIdx, auto const& tableEntryByDirBySetIdx) { + transfers[setIdx].insert(std::end(transfers[setIdx]), + std::begin(tableEntryByDirBySetIdx), + std::end(tableEntryByDirBySetIdx)); + }); + } + dataTransferContainer = + Neon::set::Container::factoryDataTransfer( + *this, + transferMode, + stencilSemantic, + transfers, + execution); + + + } else { + NEON_DEV_UNDER_CONSTRUCTION(""); + } } Neon::set::Container SyncContainer = Neon::set::Container::factorySynchronization( diff --git a/libNeonSkeleton/tests/unit/sUt_skeletonOnStreams/src/sUt_skeleton.Stencil.cu b/libNeonSkeleton/tests/unit/sUt_skeletonOnStreams/src/sUt_skeleton.Stencil.cu index 0170936c..2e2a2929 100644 --- a/libNeonSkeleton/tests/unit/sUt_skeletonOnStreams/src/sUt_skeleton.Stencil.cu +++ b/libNeonSkeleton/tests/unit/sUt_skeletonOnStreams/src/sUt_skeleton.Stencil.cu @@ -160,7 +160,7 @@ void SingleStencil(TestData& data, } template -void SingleStencilOCC(TestData& data) +void SingleStencilStandardOCC(TestData& data) { SingleStencil(data, Neon::skeleton::Occ::standard, Neon::set::TransferMode::get); } @@ -208,4 +208,14 @@ TEST(SingleStencil_NoOCC, bGrid) // using Grid = Neon::dGrid; using Type = int32_t; runAllTestConfiguration("bGrid_t", SingleStencilNoOCC, nGpus, 1); +} + +TEST(SingleStencil_StandardOCC, bGrid) +{ + int nGpus = 1; + using Grid = Neon::bGrid; + // using Grid = Neon::domain::eGrid; + // using Grid = Neon::dGrid; + using Type = int32_t; + runAllTestConfiguration("bGrid_t", SingleStencilStandardOCC, nGpus, 1); } \ No newline at end of file diff --git a/libNeonSkeleton/tests/unit/skeleton-stencil/src/runHelper.h b/libNeonSkeleton/tests/unit/skeleton-stencil/src/runHelper.h index 4858b819..4e975e88 100644 --- a/libNeonSkeleton/tests/unit/skeleton-stencil/src/runHelper.h +++ b/libNeonSkeleton/tests/unit/skeleton-stencil/src/runHelper.h @@ -22,10 
+22,11 @@ using namespace Neon::domain::tool::testing; using namespace Neon::domain::tool; template -void runAllTestConfiguration(const std::string& gname, - std::function&)> f, - int nGpus, - int minNumGpus) +void runAllTestConfiguration(const std::string& gname, + std::function&, Neon::skeleton::Occ)> f, + Neon::skeleton::Occ occ, + int nGpus, + int minNumGpus) { if (Neon::sys::globalSpace::gpuSysObjStorage.numDevs() > 0) { std::vector nGpuTest; @@ -69,7 +70,7 @@ void runAllTestConfiguration(const std::string& gname, NEON_INFO(testData.toString()); - f(testData); + f(testData, occ); } } } diff --git a/libNeonSkeleton/tests/unit/skeleton-stencil/src/stencil.cu b/libNeonSkeleton/tests/unit/skeleton-stencil/src/stencil.cu index 0e88980a..b657aa14 100644 --- a/libNeonSkeleton/tests/unit/skeleton-stencil/src/stencil.cu +++ b/libNeonSkeleton/tests/unit/skeleton-stencil/src/stencil.cu @@ -59,7 +59,8 @@ auto laplaceOnIntegers(const Field& filedA, template -void singleStencil(TestData& data) +void singleStencil(TestData& data, + Neon::skeleton::Occ occ) { using Type = typename TestData::Type; @@ -82,7 +83,8 @@ void singleStencil(TestData& data) ops.push_back(laplaceOnIntegers(Y, X)); Neon::skeleton::Skeleton skl(data.getBackend()); - skl.sequence(ops, "sUt_dGridStencil"); + Neon::skeleton::Options opt(occ, Neon::set::TransferMode::get); + skl.sequence(ops, "sUt_dGridStencil", opt); for (int j = 0; j < nIterations; j++) { skl.run(); @@ -114,7 +116,7 @@ TEST(singleStencil, dGrid) using Grid = Neon::dGrid; using Type = int32_t; constexpr int C = 0; - runAllTestConfiguration("dGrid", singleStencil, nGpus, 1); + runAllTestConfiguration("dGrid", singleStencil, Neon::skeleton::Occ::none, nGpus, 1); } TEST(singleStencil, bGridSingleGpu) @@ -123,5 +125,5 @@ TEST(singleStencil, bGridSingleGpu) using Grid = Neon::bGrid; using Type = int32_t; constexpr int C = 0; - runAllTestConfiguration("bGrid", singleStencil, nGpus, 1); + runAllTestConfiguration("bGrid", singleStencil, 
Neon::skeleton::Occ::none, nGpus, 1); } \ No newline at end of file From 207d25287dc9d3f1ab2c481c5a3851a3faca11cd Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 10 Oct 2023 14:18:02 +0200 Subject: [PATCH 90/94] Fixing issue with dSpan and dataView. --- .../Neon/domain/details/dGrid/dGrid_imp.h | 22 ++++++++++--------- .../include/Neon/domain/details/dGrid/dSpan.h | 9 ++++---- .../Neon/domain/details/dGrid/dSpan_imp.h | 20 ++++++++--------- .../src/domain/details/dGrid/dGrid.cpp | 2 +- 4 files changed, 28 insertions(+), 25 deletions(-) diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h index a6fbf1aa..a263400a 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h @@ -91,15 +91,17 @@ dGrid::dGrid(const Neon::Backend& backend, Neon::DataView dw, dSpan& span) { span.mDataView = dw; - span.mZHaloRadius = setCardinality == 1 ? 0 : mData->halo.z; - span.mZBoundaryRadius = mData->halo.z; + span.mZghostRadius = setCardinality == 1 ? 0 : mData->halo.z; + span.mZboundaryRadius = mData->halo.z; + span.mMaxZInDomain = mData->partitionDims[setIdx].z; switch (dw) { case Neon::DataView::STANDARD: { // Only works z partitions. assert(mData->halo.x == 0 && mData->halo.y == 0); - span.mDim = mData->partitionDims[setIdx]; + span.mSpanDim = mData->partitionDims[setIdx]; + break; } case Neon::DataView::BOUNDARY: { @@ -107,8 +109,8 @@ dGrid::dGrid(const Neon::Backend& backend, // Only works z partitions. assert(mData->halo.x == 0 && mData->halo.y == 0); - span.mDim = mData->partitionDims[setIdx]; - span.mDim.z = span.mZBoundaryRadius * 2; + span.mSpanDim = mData->partitionDims[setIdx]; + span.mSpanDim.z = span.mZboundaryRadius * 2; break; } @@ -117,12 +119,12 @@ dGrid::dGrid(const Neon::Backend& backend, // Only works z partitions. 
assert(mData->halo.x == 0 && mData->halo.y == 0); - span.mDim = mData->partitionDims[setIdx]; - span.mDim.z = span.mDim.z - span.mZBoundaryRadius * 2; - if (span.mDim.z <= 0 && setCardinality > 1) { + span.mSpanDim = mData->partitionDims[setIdx]; + span.mSpanDim.z = span.mSpanDim.z - span.mZboundaryRadius * 2; + if (span.mSpanDim.z <= 0 && setCardinality > 1) { NeonException exp("dGrid"); exp << "The grid size is too small to support the data view model correctly \n"; - exp << span.mDim << " for setIdx " << setIdx << " and device " << getDevSet().devId(setIdx); + exp << span.mSpanDim << " for setIdx " << setIdx << " and device " << getDevSet().devId(setIdx); NEON_THROW(exp); } @@ -140,7 +142,7 @@ dGrid::dGrid(const Neon::Backend& backend, Neon::DataView dw, int& count) { if (Execution::host == execution) { - count = mData->spanTable.getSpan(Neon::Execution::host, setIdx, dw).mDim.rMul(); + count = mData->spanTable.getSpan(Neon::Execution::host, setIdx, dw).mSpanDim.rMul(); } }); } diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan.h b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan.h index 74ab5ff3..c81baace 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan.h @@ -43,11 +43,12 @@ class dSpan private: Neon::DataView mDataView; - int mZHaloRadius; - int mZBoundaryRadius; - Neon::index_3d mDim /** Dimension of the span, its values depends on the mDataView*/; + int mZghostRadius; + int mZboundaryRadius; + int mMaxZInDomain; + Neon::index_3d mSpanDim /** Dimension of the span, its values depends on the mDataView*/; }; -} // namespace Neon::domain::details::dGrid +} // namespace Neon::domain::deta ils::dGrid #include "dSpan_imp.h" \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h index 9fb56572..37bea7d7 100644 --- 
a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h @@ -14,25 +14,25 @@ dSpan::setAndValidate(Idx& idx, idx.setLocation().y = int(y); idx.setLocation().z = int(z); - if (idx.getLocation() < mDim) { + if (idx.getLocation() < mSpanDim) { res = true; } switch (mDataView) { case Neon::DataView::STANDARD: { - idx.setLocation().z += mZHaloRadius; + idx.setLocation().z += mZghostRadius; return res; } case Neon::DataView::INTERNAL: { - idx.setLocation().z += mZHaloRadius + mZBoundaryRadius; + idx.setLocation().z += mZghostRadius + mZboundaryRadius; return res; } case Neon::DataView::BOUNDARY: { - idx.setLocation().z += idx.getLocation().z < mZBoundaryRadius - ? 0 - : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); - idx.setLocation().z += mZHaloRadius; + idx.setLocation().z += idx.getLocation().z < mZboundaryRadius + ? 0 + : (mMaxZInDomain - 1) + (-1 * mZboundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); + idx.setLocation().z += mZghostRadius; return res; } @@ -51,19 +51,19 @@ NEON_CUDA_HOST_DEVICE inline auto dSpan::helpGetDataView() NEON_CUDA_HOST_DEVICE inline auto dSpan::helpGetZHaloRadius() const -> int const& { - return mZHaloRadius; + return mZghostRadius; } NEON_CUDA_HOST_DEVICE inline auto dSpan::helpGetZBoundaryRadius() const -> int const& { - return mZBoundaryRadius; + return mZboundaryRadius; } NEON_CUDA_HOST_DEVICE inline auto dSpan::helpGetDim() const -> Neon::index_3d const& { - return mDim; + return mSpanDim; } } // namespace Neon::domain::details::dGrid \ No newline at end of file diff --git a/libNeonDomain/src/domain/details/dGrid/dGrid.cpp b/libNeonDomain/src/domain/details/dGrid/dGrid.cpp index 890642b3..ec8b24d8 100644 --- a/libNeonDomain/src/domain/details/dGrid/dGrid.cpp +++ b/libNeonDomain/src/domain/details/dGrid/dGrid.cpp @@ -59,7 +59,7 
@@ auto dGrid::getLaunchParameters(const Neon::DataView dataView, auto dimsByDataView = getBackend().devSet().newDataSet([&](Neon::SetIdx const& setIdx, auto& value) { - value = getSpan(Neon::Execution::host, setIdx, dataView).mDim; + value = getSpan(Neon::Execution::host, setIdx, dataView).mSpanDim; }); ret.set(Neon::sys::GpuLaunchInfo::domainGridMode, From b574b495d658ddd6c8640b0432247581a1859a3b Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 10 Oct 2023 17:12:55 +0200 Subject: [PATCH 91/94] Fixing issue with dSpan and dataView. --- libNeonCore/include/Neon/core/types/Macros.h | 2 +- libNeonDomain/tests/domain-map/src/gtests.cpp | 9 +++ libNeonDomain/tests/domain-map/src/map.cu | 71 ++++++++++++++++++- libNeonDomain/tests/domain-map/src/map.h | 13 +++- .../unit/skeleton-stencil/src/runHelper.h | 12 ++-- .../unit/skeleton-stencil/src/stencil.cu | 21 ++++-- 6 files changed, 114 insertions(+), 14 deletions(-) diff --git a/libNeonCore/include/Neon/core/types/Macros.h b/libNeonCore/include/Neon/core/types/Macros.h index 5e909d3a..bcecdbb7 100644 --- a/libNeonCore/include/Neon/core/types/Macros.h +++ b/libNeonCore/include/Neon/core/types/Macros.h @@ -206,7 +206,7 @@ #define NEON_RESTRICT restrict #endif -#ifdef NEON_COMPILER_CUDA +#if defined(NEON_COMPILER_CUDA) && !defined(_WIN32) #define NEON_RESTRICT __restrict__ #endif diff --git a/libNeonDomain/tests/domain-map/src/gtests.cpp b/libNeonDomain/tests/domain-map/src/gtests.cpp index 50d6e34d..c48511b7 100644 --- a/libNeonDomain/tests/domain-map/src/gtests.cpp +++ b/libNeonDomain/tests/domain-map/src/gtests.cpp @@ -13,6 +13,15 @@ TEST(domain_map, dGrid) 1); } +TEST(domain_map_dataView, dGrid) +{ + int nGpus = 2; + using Type = int64_t; + runAllTestConfiguration(std::function(map::dataView::run), + nGpus, + 2); +} + TEST(domain_map, eGrid) { int nGpus = 3; diff --git a/libNeonDomain/tests/domain-map/src/map.cu b/libNeonDomain/tests/domain-map/src/map.cu index b001d832..2ed92ddb 100644 --- 
a/libNeonDomain/tests/domain-map/src/map.cu +++ b/libNeonDomain/tests/domain-map/src/map.cu @@ -1,10 +1,10 @@ #include #include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" #include "TestInformation.h" #include "gtest/gtest.h" -#include "Neon/domain/details/dGridSoA/dGridSoA.h" namespace map { @@ -32,6 +32,27 @@ auto mapContainer_axpy(int streamIdx, }); } +template +auto mapContainer_add(int streamIdx, + typename Field::Type& val, + Field& fieldB) + -> Neon::set::Container +{ + const auto& grid = fieldB.getGrid(); + return grid.newContainer( + "mapContainer_axpy", + [&, val](Neon::set::Loader& loader) { + auto b = loader.load(fieldB); + + return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable { + for (int i = 0; i < b.cardinality(); i++) { + // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val); + b(e, i) += val; + } + }; + }); +} + using namespace Neon::domain::tool::testing; template @@ -78,5 +99,53 @@ template auto run(TestData&) - template auto run(TestData&) -> void; template auto run(TestData&) -> void; +namespace dataView { +template +auto run(TestData& data) -> void +{ + + using Type = typename TestData::Type; + auto& grid = data.getGrid(); + const std::string appName = TestInformation::fullName(grid.getImplementationName()); + + data.resetValuesToLinear(1, 100); + T val = T(33); + + { // NEON + const Neon::index_3d dim = grid.getDimension(); + std::vector elements; + + auto& X = data.getField(FieldNames::X); + auto& Y = data.getField(FieldNames::Y); + + + mapContainer_axpy(Neon::Backend::mainStreamIdx, + val, X, Y) + .run(0, Neon::DataView::BOUNDARY); + + mapContainer_axpy(Neon::Backend::mainStreamIdx, + val, X, Y) + .run(0, Neon::DataView::INTERNAL); + + X.updateHostData(0); + Y.updateHostData(0); + + data.getBackend().sync(0); + } + + { // Golden data + auto& X = data.getIODomain(FieldNames::X); + auto& Y = data.getIODomain(FieldNames::Y); + data.axpy(&val, X, 
Y); + } + + bool isOk = data.compare(FieldNames::Y); + ASSERT_TRUE(isOk); +} +template auto run(TestData&) -> void; +template auto run(TestData&) -> void; +template auto run(TestData&) -> void; +template auto run(TestData&) -> void; +} // namespace dataView } // namespace map \ No newline at end of file diff --git a/libNeonDomain/tests/domain-map/src/map.h b/libNeonDomain/tests/domain-map/src/map.h index 16073657..99864a3f 100644 --- a/libNeonDomain/tests/domain-map/src/map.h +++ b/libNeonDomain/tests/domain-map/src/map.h @@ -3,8 +3,8 @@ #include #include "Neon/domain/Grids.h" -#include "Neon/domain/tools/TestData.h" #include "Neon/domain/details/dGridSoA/dGridSoA.h" +#include "Neon/domain/tools/TestData.h" namespace map { @@ -18,5 +18,16 @@ extern template auto run(TestData(TestData&) -> void; extern template auto run(TestData&) -> void; +namespace dataView { + +template +auto run(TestData& data) -> void; + +extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; + +} // namespace dataView } // namespace map diff --git a/libNeonSkeleton/tests/unit/skeleton-stencil/src/runHelper.h b/libNeonSkeleton/tests/unit/skeleton-stencil/src/runHelper.h index 4e975e88..8cd53082 100644 --- a/libNeonSkeleton/tests/unit/skeleton-stencil/src/runHelper.h +++ b/libNeonSkeleton/tests/unit/skeleton-stencil/src/runHelper.h @@ -22,11 +22,11 @@ using namespace Neon::domain::tool::testing; using namespace Neon::domain::tool; template -void runAllTestConfiguration(const std::string& gname, - std::function&, Neon::skeleton::Occ)> f, - Neon::skeleton::Occ occ, - int nGpus, - int minNumGpus) +void runAllTestConfiguration(const std::string& gname, + std::function&, Neon::skeleton::Occ)> f, + Neon::skeleton::Occ occ, + int nGpus, + int minNumGpus) { if (Neon::sys::globalSpace::gpuSysObjStorage.numDevs() > 0) { std::vector nGpuTest; @@ -70,7 +70,7 @@ void 
runAllTestConfiguration(const std::string& NEON_INFO(testData.toString()); - f(testData, occ); + f(gname, testData, occ); } } } diff --git a/libNeonSkeleton/tests/unit/skeleton-stencil/src/stencil.cu b/libNeonSkeleton/tests/unit/skeleton-stencil/src/stencil.cu index b657aa14..095959f9 100644 --- a/libNeonSkeleton/tests/unit/skeleton-stencil/src/stencil.cu +++ b/libNeonSkeleton/tests/unit/skeleton-stencil/src/stencil.cu @@ -59,7 +59,8 @@ auto laplaceOnIntegers(const Field& filedA, template -void singleStencil(TestData& data, +void singleStencil(std::string testName, + TestData& data, Neon::skeleton::Occ occ) { using Type = typename TestData::Type; @@ -84,7 +85,8 @@ void singleStencil(TestData& data, Neon::skeleton::Skeleton skl(data.getBackend()); Neon::skeleton::Options opt(occ, Neon::set::TransferMode::get); - skl.sequence(ops, "sUt_dGridStencil", opt); + skl.sequence(ops, testName, opt); + skl.ioToDot(testName, testName, true); for (int j = 0; j < nIterations; j++) { skl.run(); @@ -110,16 +112,25 @@ void singleStencil(TestData& data, ASSERT_TRUE(isOk); } -TEST(singleStencil, dGrid) +TEST(skeleton_stencil_occ_none, dGrid) { int nGpus = 1; using Grid = Neon::dGrid; using Type = int32_t; constexpr int C = 0; - runAllTestConfiguration("dGrid", singleStencil, Neon::skeleton::Occ::none, nGpus, 1); + runAllTestConfiguration("skeleton_stencil_occ_none_dGrid", singleStencil, Neon::skeleton::Occ::none, nGpus, 1); } -TEST(singleStencil, bGridSingleGpu) +TEST(skeleton_stencil_occ_standard, dGrid) +{ + int nGpus = 1; + using Grid = Neon::dGrid; + using Type = int32_t; + constexpr int C = 0; + runAllTestConfiguration("skeleton_stencil_occ_standard_dGrid", singleStencil, Neon::skeleton::Occ::standard, nGpus, 1); +} + +TEST(skeleton_stencil, bGridSingleGpu) { int nGpus = 1; using Grid = Neon::bGrid; From 2e533abb4bd1f9a68e327c9d172074750db8e90f Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Tue, 10 Oct 2023 17:54:20 +0200 Subject: [PATCH 92/94] Fixing CLI for lbm 
unitoform. --- benchmarks/lbm/src/Config.h | 2 +- libNeonSet/include/Neon/set/StencilSemantic.h | 2 +- libNeonSet/include/Neon/set/container/Loader_imp.h | 2 +- libNeonSet/src/set/StencilSemantic.cpp | 10 +++++----- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/benchmarks/lbm/src/Config.h b/benchmarks/lbm/src/Config.h index f8222bef..b5a1607a 100644 --- a/benchmarks/lbm/src/Config.h +++ b/benchmarks/lbm/src/Config.h @@ -36,7 +36,7 @@ struct Config Neon::skeleton::OccUtils::Cli occCli{Neon::skeleton::Occ::none}; // Neon OCC type Neon::set::TransferModeUtils::Cli transferModeCli{Neon::set::TransferMode::get}; // Neon transfer mode for halo update - Neon::set::StencilSemanticUtils::Cli stencilSemanticCli{Neon::set::StencilSemantic::streaming}; + Neon::set::StencilSemanticUtils::Cli stencilSemanticCli{Neon::set::StencilSemantic::lattice}; Neon::domain::tool::spaceCurves::EncoderTypeUtil::Cli spaceCurveCli{Neon::domain::tool::spaceCurves::EncoderType::sweep}; CollisionUtils::Cli collisionCli{Collision::bgk}; int vti = 0; // Export vti file diff --git a/libNeonSet/include/Neon/set/StencilSemantic.h b/libNeonSet/include/Neon/set/StencilSemantic.h index aa5338fc..28b596dc 100644 --- a/libNeonSet/include/Neon/set/StencilSemantic.h +++ b/libNeonSet/include/Neon/set/StencilSemantic.h @@ -10,7 +10,7 @@ namespace Neon::set { enum struct StencilSemantic { standard = 0 /*< Transfer for halo update on grid structure */, - streaming = 1 /*< Transfer for halo update on lattice structure */ + lattice = 1 /*< Transfer for halo update on lattice structure */ }; diff --git a/libNeonSet/include/Neon/set/container/Loader_imp.h b/libNeonSet/include/Neon/set/container/Loader_imp.h index e134effe..c9682ff9 100644 --- a/libNeonSet/include/Neon/set/container/Loader_imp.h +++ b/libNeonSet/include/Neon/set/container/Loader_imp.h @@ -115,7 +115,7 @@ auto Loader:: if (compute == Neon::Pattern::STENCIL && (stencilSemantic == StencilSemantic::standard || - stencilSemantic == 
StencilSemantic::streaming)) { + stencilSemantic == StencilSemantic::lattice)) { Neon::NeonException exp("Loader"); exp << "Loading a non const field for a stencil operation is not supported in Neon"; NEON_THROW(exp); diff --git a/libNeonSet/src/set/StencilSemantic.cpp b/libNeonSet/src/set/StencilSemantic.cpp index c5b24a60..0e6b2114 100644 --- a/libNeonSet/src/set/StencilSemantic.cpp +++ b/libNeonSet/src/set/StencilSemantic.cpp @@ -5,11 +5,11 @@ namespace Neon::set { auto StencilSemanticUtils::toString(StencilSemantic option) -> std::string { switch (option) { - case StencilSemantic::streaming: { - return "streaming"; + case StencilSemantic::lattice: { + return "lattice"; } case StencilSemantic::standard: { - return "grid"; + return "standard"; } } NEON_THROW_UNSUPPORTED_OPTION(""); @@ -17,7 +17,7 @@ auto StencilSemanticUtils::toString(StencilSemantic option) -> std::string auto StencilSemanticUtils::fromString(const std::string& occ) -> StencilSemantic { - std::array opts{StencilSemantic::standard, StencilSemantic::streaming}; + std::array opts{StencilSemantic::standard, StencilSemantic::lattice}; for (auto a : opts) { if (toString(a) == occ) { return a; @@ -28,7 +28,7 @@ auto StencilSemanticUtils::fromString(const std::string& occ) -> StencilSemantic auto StencilSemanticUtils::getOptions() -> std::array { - std::array opts = {StencilSemantic::standard, StencilSemantic::streaming}; + std::array opts = {StencilSemantic::standard, StencilSemantic::lattice}; return opts; } From bef23c1af36156d2b0b14d6d9bb62cd09be15e8e Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 11 Oct 2023 07:59:17 +0200 Subject: [PATCH 93/94] Fixing windows compilation --- libNeonCore/include/Neon/core/types/Macros.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/libNeonCore/include/Neon/core/types/Macros.h b/libNeonCore/include/Neon/core/types/Macros.h index bcecdbb7..d9f47914 100644 --- a/libNeonCore/include/Neon/core/types/Macros.h +++ 
b/libNeonCore/include/Neon/core/types/Macros.h @@ -206,8 +206,12 @@ #define NEON_RESTRICT restrict #endif -#if defined(NEON_COMPILER_CUDA) && !defined(_WIN32) +#if defined(NEON_COMPILER_CUDA) +#if!defined(_WIN32) #define NEON_RESTRICT __restrict__ +#else +#define NEON_RESTRICT +#endif #endif #ifdef NEON_COMPILER_CLANG From b2235b4f57ee0b8fa6d7acaf35672d90ba681377 Mon Sep 17 00:00:00 2001 From: Massimiliano Meneghin Date: Wed, 11 Oct 2023 09:17:19 +0200 Subject: [PATCH 94/94] Fixing windows compilation --- benchmarks/lbm/src/ContainersD3QXX.h | 2 +- benchmarks/lbm/src/DeviceD3QXX.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/lbm/src/ContainersD3QXX.h b/benchmarks/lbm/src/ContainersD3QXX.h index 440d785a..bb4adb0d 100644 --- a/benchmarks/lbm/src/ContainersD3QXX.h +++ b/benchmarks/lbm/src/ContainersD3QXX.h @@ -454,7 +454,7 @@ struct ContainerFactoryD3QXX cellType.wallNghBitflag = 0; if (cellType.classification == CellType::bulk) { - Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto fwdRegIdx) { + Neon::ConstexprFor<0, Lattice::Q, 1>([&, gidx](auto fwdRegIdx) { using M = typename Lattice::template RegisterMapper; if constexpr (M::centerMemQ != M::fwdMemQ) { CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); diff --git a/benchmarks/lbm/src/DeviceD3QXX.h b/benchmarks/lbm/src/DeviceD3QXX.h index 4864f829..d7d16550 100644 --- a/benchmarks/lbm/src/DeviceD3QXX.h +++ b/benchmarks/lbm/src/DeviceD3QXX.h @@ -208,8 +208,7 @@ struct DeviceD3QXX usqr); // double eqopp = eq - 6.* rho * t[k] * ck_u; - eqBk = eqFw - - c6 * rho * T::template getT() * ck_u; + eqBk = eqFw - c6 * rho * T::template getT() * ck_u; // pop_out = (1. - omega) * fin(i, k) + omega * eq; pop[M::fwdRegQ] = (c1 - omega) * static_cast(pop[M::fwdRegQ]) + omega * eqFw;