diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 10c30fea..efb267c6 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,4 +1,5 @@ cmake_minimum_required(VERSION 3.19 FATAL_ERROR) -add_subdirectory("lbm-lid-driven-cavity-flow") +add_subdirectory(lbm) +# add_subdirectory("lbm-lid-driven-cavity-flow") # add_subdirectory("lbm-flow-over-sphere") diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py index 5aebe104..2ce5dcd3 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py +++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py @@ -4,9 +4,11 @@ GRID_LIST = "dGrid bGrid eGrid".split() STORAGE_FP_LIST = "double float".split() COMPUTE_FP_LIST = "double float".split() -OCC_LIST = "nOCC".split() +OCC_LIST = "nOCC sOCC".split() +HU_LIST = "huGrid huLattice".split() +CURVE_LIST = "sweep morton hilbert".split() WARM_UP_ITER = 10 -MAX_ITER = 100 +MAX_ITER = 10000 REPETITIONS = 5 import subprocess @@ -38,60 +40,79 @@ def countAll(): for COMPUTE_FP in COMPUTE_FP_LIST: for DEVICE_SET in DEVICE_SET_LIST: for GRID in GRID_LIST: - if STORAGE_FP == 'double' and COMPUTE_FP == 'float': - continue + for HU in HU_LIST: + for CURVE in CURVE_LIST: + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + if STORAGE_FP == 'float' and COMPUTE_FP == 'double': + continue - counter += 1 + counter += 1 return counter SAMPLES = countAll() counter = 0 command = './lbm-lid-driven-cavity-flow' +# command = 'echo' with open(command + '.log', 'w') as fp: for DEVICE_TYPE in DEVICE_TYPE_LIST: DEVICE_SET_LIST = [DEVICE_ID_LIST[0]] if DEVICE_TYPE == 'gpu': for DEVICE in DEVICE_ID_LIST[1:]: DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) - for OCC in OCC_LIST: - for DOMAIN_SIZE in DOMAIN_SIZE_LIST: - for STORAGE_FP in STORAGE_FP_LIST: - for COMPUTE_FP in COMPUTE_FP_LIST: - for DEVICE_SET in 
DEVICE_SET_LIST: + for DEVICE_SET in DEVICE_SET_LIST: + for OCC in OCC_LIST: + for DOMAIN_SIZE in DOMAIN_SIZE_LIST: + for STORAGE_FP in STORAGE_FP_LIST: + for COMPUTE_FP in COMPUTE_FP_LIST: for GRID in GRID_LIST: - if STORAGE_FP == 'double' and COMPUTE_FP == 'float': - continue + for HU in HU_LIST: + for CURVE in CURVE_LIST: + + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + if STORAGE_FP == 'float' and COMPUTE_FP == 'double': + continue + + parameters = [] + parameters.append('--deviceType ' + DEVICE_TYPE) + parameters.append('--deviceIds ' + DEVICE_SET) + parameters.append('--grid ' + GRID) + parameters.append('--domain-size ' + DOMAIN_SIZE) + parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) + parameters.append('--repetitions ' + str(REPETITIONS)) + parameters.append('--max-iter ' + str(MAX_ITER)) + parameters.append( + '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + + DEVICE_TYPE + '_' + + DEVICE_SET.replace(' ', '_') + '-' + + GRID + '_' + + DOMAIN_SIZE + '-' + + STORAGE_FP + '-' + COMPUTE_FP + '-' + + OCC + '-' + + HU + '-' + + CURVE) + parameters.append('--computeFP ' + COMPUTE_FP) + parameters.append('--storageFP ' + STORAGE_FP) + parameters.append('--curve ' + CURVE) - parameters = [] - parameters.append('--deviceType ' + DEVICE_TYPE) - parameters.append('--deviceIds ' + DEVICE_SET) - parameters.append('--grid ' + GRID) - parameters.append('--domain-size ' + DOMAIN_SIZE) - parameters.append('--warmup-iter ' + str(WARM_UP_ITER)) - parameters.append('--repetitions ' + str(REPETITIONS)) - parameters.append('--max-iter ' + str(MAX_ITER)) - parameters.append( - '--report-filename ' + 'lbm-lid-driven-cavity-flow___' + - DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' + - STORAGE_FP + '_' + COMPUTE_FP + '_' + - DEVICE_SET.replace(' ', '_') + '_' + OCC) - parameters.append('--computeFP ' + COMPUTE_FP) - parameters.append('--storageFP ' + STORAGE_FP) - parameters.append('--benchmark') - parameters.append('--' + OCC) + 
parameters.append('--benchmark') + parameters.append('--' + OCC) + parameters.append('--' + HU) - commandList = [] - commandList.append(command) - for el in parameters: - for s in el.split(): - commandList.append(s) + commandList = [] + commandList.append(command) + for el in parameters: + for s in el.split(): + commandList.append(s) - fp.write("\n-------------------------------------------\n") - fp.write(' '.join(commandList)) - fp.write("\n-------------------------------------------\n") - fp.flush() - subprocess.run(commandList, text=True, stdout=fp) + fp.write("\n-------------------------------------------\n") + fp.write(' '.join(commandList)) + fp.write("\n-------------------------------------------\n") + fp.flush() + print(' '.join(commandList)) + subprocess.run(commandList, text=True, stdout=fp) - counter += 1 - printProgressBar(counter * 100.0 / SAMPLES, 'Progress') + counter += 1 + printProgressBar(counter * 100.0 / SAMPLES, 'Progress') diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/CellType.h b/benchmarks/lbm-lid-driven-cavity-flow/src/CellType.h index 7037b6ae..1ca70c6f 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/CellType.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/CellType.h @@ -22,13 +22,28 @@ struct CellType classification = c; wallNghBitflag = n; } + NEON_CUDA_HOST_DEVICE explicit CellType(Classification c) { classification = c; wallNghBitflag = 0; } + // Converting to int to exportVti + operator int() const { return int(classification); } + + template + static auto isWall(const uint32_t& wallNghBitFlag) + -> bool + { + return wallNghBitFlag & (uint32_t(1) << fwdRegIdx); + } + auto setWall(int fwdRegIdx) + -> void + { + wallNghBitflag = wallNghBitflag | ((uint32_t(1) << fwdRegIdx)); + } uint32_t wallNghBitflag; Classification classification; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Config.cpp b/benchmarks/lbm-lid-driven-cavity-flow/src/Config.cpp index 165dcff5..115125bd 100644 --- 
a/benchmarks/lbm-lid-driven-cavity-flow/src/Config.cpp +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Config.cpp @@ -41,6 +41,7 @@ auto Config::toString() const -> std::string s << "......... computeType " << c.computeType << std::endl; s << "........... storeType " << c.storeType << std::endl; + s << "............... curve " << c.curve << std::endl; s << ". ............... occ " << Neon::skeleton::OccUtils::toString(c.occ) << std::endl; s << "....... transfer Mode " << Neon::set::TransferModeUtils::toString(c.transferMode) << std::endl; @@ -60,43 +61,58 @@ auto Config::parseArgs(const int argc, char* argv[]) auto& config = *this; auto cli = - ( - clipp::required("--deviceType") & clipp::value("deviceType", config.deviceType) % "Device ids to use", - clipp::required("--deviceIds") & clipp::integers("gpus", config.devices) % "Device ids to use", - clipp::option("--grid") & clipp::value("grid", config.gridType) % "Could be dGrid, eGrid, bGrid", - clipp::option("--domain-size") & clipp::integer("domain_size", config.N) % "Voxels along each dimension of the cube domain", - clipp::option("--warmup-iter") & clipp::integer("warmup_iter", config.benchIniIter) % "Number of iteration for warm up. 
max_iter = warmup_iter + timed_iters", - clipp::option("--max-iter") & clipp::integer("max_iter", config.benchMaxIter) % "Maximum solver iterations", - clipp::option("--repetitions") & clipp::integer("repetitions", config.repetitions) % "Number of times the benchmark is run.", - clipp::option("--report-filename ") & clipp::value("keeper_filename", config.reportFile) % "Output perf keeper filename", - - clipp::option("--computeFP") & clipp::value("computeFP", config.computeType) % "Could be double or float", - clipp::option("--storageFP") & clipp::value("storageFP", config.storeType) % "Could be double or float", - - ( - (clipp::option("--sOCC").set(config.occ, Neon::skeleton::Occ::standard) % "Standard OCC") | - (clipp::option("--nOCC").set(config.occ, Neon::skeleton::Occ::none) % "No OCC (on by default)")), - ( - (clipp::option("--put").set(config.transferMode, Neon::set::TransferMode::put) % "Set transfer mode to PUT") | - (clipp::option("--get").set(config.transferMode, Neon::set::TransferMode::get) % "Set transfer mode to GET (on by default)")), - ( - (clipp::option("--huLattice").set(config.stencilSemantic, Neon::set::StencilSemantic::streaming) % "Halo update with lattice semantic (on by default)") | - (clipp::option("--huGrid").set(config.stencilSemantic, Neon::set::StencilSemantic::standard) % "Halo update with grid semantic ")), - ( - (clipp::option("--benchmark").set(config.benchmark, true) % "Run benchmark mode") | - (clipp::option("--visual").set(config.benchmark, false) % "Run export partial data")), - - ( - clipp::option("--vti").set(config.vti, true) % "Standard OCC") + (clipp::required("--deviceType") & clipp::value("deviceType", config.deviceType) % "Device ids to use", + clipp::required("--deviceIds") & clipp::integers("gpus", config.devices) % "Device ids to use", + clipp::option("--grid") & clipp::value("grid", config.gridType) % "Could be dGrid, eGrid, bGrid", + clipp::option("--domain-size") & clipp::integer("domain_size", config.N) % "Voxels 
along each dimension of the cube domain", + clipp::option("--warmup-iter") & clipp::integer("warmup_iter", config.benchIniIter) % "Number of iteration for warm up. max_iter = warmup_iter + timed_iters", + clipp::option("--max-iter") & clipp::integer("max_iter", config.benchMaxIter) % "Maximum solver iterations", + clipp::option("--repetitions") & clipp::integer("repetitions", config.repetitions) % "Number of times the benchmark is run.", + clipp::option("--report-filename ") & clipp::value("keeper_filename", config.reportFile) % "Output perf keeper filename", + + clipp::option("--computeFP") & clipp::value("computeFP", config.computeType) % "Could be double or float", + clipp::option("--storageFP") & clipp::value("storageFP", config.storeType) % "Could be double or float", + + clipp::option("--curve") & clipp::value("curve", config.curve) % "Could be sweep (the default), morton, or hilbert", + ( + (clipp::option("--sOCC").set(config.occ, Neon::skeleton::Occ::standard) % "Standard OCC") | + (clipp::option("--nOCC").set(config.occ, Neon::skeleton::Occ::none) % "No OCC (on by default)")), + ( + (clipp::option("--put").set(config.transferMode, Neon::set::TransferMode::put) % "Set transfer mode to PUT") | + (clipp::option("--get").set(config.transferMode, Neon::set::TransferMode::get) % "Set transfer mode to GET (on by default)")), + ( + (clipp::option("--huLattice").set(config.stencilSemantic, Neon::set::StencilSemantic::streaming) % "Halo update with lattice semantic (on by default)") | + (clipp::option("--huGrid").set(config.stencilSemantic, Neon::set::StencilSemantic::standard) % "Halo update with grid semantic ")), + ( + (clipp::option("--benchmark").set(config.benchmark, true) % "Run benchmark mode") | + (clipp::option("--visual").set(config.benchmark, false) % "Run export partial data")), + + ( + clipp::option("--vti").set(config.vti, true) % "Export vti file") ); + if (!clipp::parse(argc, argv, cli)) { auto fmt = clipp::doc_formatting{}.doc_column(31); std::cout << 
make_man_page(cli, argv[0], fmt) << '\n'; return -1; } + if (config.curve == "sweep") + config.spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::sweep; + if (config.curve == "morton") + config.spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::morton; + if (config.curve == "hilbert") + config.spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::hilbert; + + if (config.curve != "sweep" && config.curve != "morton" && config.curve != "hilbert") { + auto fmt = clipp::doc_formatting{}.doc_column(31); + std::cout << config.curve << " is not a supported configuration" << std::endl; + std::cout << make_man_page(cli, argv[0], fmt) << '\n'; + return -1; + } + helpSetLbmParameters(); return 0; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Config.h b/benchmarks/lbm-lid-driven-cavity-flow/src/Config.h index af32972e..18695ce4 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/Config.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Config.h @@ -3,6 +3,7 @@ #include #include #include "Neon/core/tools/clipp.h" +#include "Neon/domain/tools/SpaceCurves.h" #include "Neon/skeleton/Skeleton.h" template @@ -16,28 +17,29 @@ struct LbmParameters struct Config { - double Re = 100.; // Reynolds number - double ulb = 0.04; // Velocity in lattice units - int N = 160; // Number of nodes in x-direction - bool benchmark = false; // Run in benchmark mode ? 
- double max_t = 10.0; // Non-benchmark mode: Total time in dim.less units - int outFrequency = 200; // Non-benchmark mode: Frequency in LU for output of terminal message and profiles (use 0 for no messages) - int dataFrequency = 0; // Non-benchmark mode: Frequency in LU of full data dump (use 0 for no data dump) - int benchIniIter = 1000; // Benchmark mode: Number of warmup iterations - int benchMaxIter = 2000; // Benchmark mode: Total number of iterations - int repetitions = 1; // Benchmark mode: number of time the test is run - std::string deviceType = "gpu"; - std::vector devices = std::vector(0); // Devices for the execution - std::string reportFile = "lbm-lid-driven-cavity-flow"; // Report file name - std::string gridType = "dGrid"; // Neon grid type - Neon::skeleton::Occ occ = Neon::skeleton::Occ::none; // Neon OCC type - Neon::set::TransferMode transferMode = Neon::set::TransferMode::get; // Neon transfer mode for halo update - Neon::set::StencilSemantic stencilSemantic = Neon::set::StencilSemantic::streaming; - bool vti = false; // Export vti file - std::string computeType = "double"; - std::string storeType = "double"; - - LbmParameters mLbmParameters; + double Re = 100.; // Reynolds number + double ulb = 0.04; // Velocity in lattice units + int N = 160; // Number of nodes in x-direction + bool benchmark = false; // Run in benchmark mode ? 
+ double max_t = 10.0; // Non-benchmark mode: Total time in dim.less units + int outFrequency = 200; // Non-benchmark mode: Frequency in LU for output of terminal message and profiles (use 0 for no messages) + int dataFrequency = 0; // Non-benchmark mode: Frequency in LU of full data dump (use 0 for no data dump) + int benchIniIter = 1000; // Benchmark mode: Number of warmup iterations + int benchMaxIter = 2000; // Benchmark mode: Total number of iterations + int repetitions = 1; // Benchmark mode: number of time the test is run + std::string deviceType = "gpu"; + std::vector devices = std::vector(0); // Devices for the execution + std::string reportFile = "lbm-lid-driven-cavity-flow"; // Report file name + std::string gridType = "dGrid"; // Neon grid type + Neon::skeleton::Occ occ = Neon::skeleton::Occ::none; // Neon OCC type + Neon::set::TransferMode transferMode = Neon::set::TransferMode::get; // Neon transfer mode for halo update + Neon::set::StencilSemantic stencilSemantic = Neon::set::StencilSemantic::streaming; + bool vti = false; // Export vti file + std::string computeType = "double"; + std::string storeType = "double"; + std::string curve = "sweep"; + Neon::domain::tool::spaceCurves::EncoderType spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::sweep; + LbmParameters mLbmParameters; auto toString() const -> std::string; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h new file mode 100644 index 00000000..ce5f69a2 --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h @@ -0,0 +1,34 @@ +#pragma once +#include "CellType.h" +#include "D3Q19.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + +namespace pull { +template +struct ContainerFactory +{ +}; +} // namespace pull + +namespace push { +template +struct ContainerFactory +{ +}; +} // namespace push + +namespace common { +template +struct ContainerFactory +{ +}; +} // namespace common 
+#include "ContainersD3Q19.h" +#include "ContainersD3Q27.h" \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h new file mode 100644 index 00000000..fcbda83d --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h @@ -0,0 +1,392 @@ +#pragma once + +#include "CellType.h" +#include "D3Q19.h" +#include "DeviceD3Q19.h" +#include "Methods.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + +namespace pull { +/** + * Specialization for D3Q19 + */ +template +struct ContainerFactory, + Grid_> +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + using PullFunctions = pull::DeviceD3Q19; + using CommonFunctions = common::DeviceD3Q19; + + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! 
Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "D3Q19_TwoPop_Pull", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popIn[Lattice::Q]; + PullFunctions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + + Compute rho; + std::array u{.0, .0, .0}; + CommonFunctions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + PullFunctions::collideBgkUnrolled(gidx, + popIn, + rho, u, + usqr, omega, + NEON_OUT fOut); + } + }; + }); + return container; + } + +}; +} // namespace pull +namespace push { +/** + * Specialization for D3Q19 + */ +template +struct ContainerFactory, + Grid_> +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + using PushFunctions = push::DeviceD3Q19; + using CommonFunctions = common::DeviceD3Q19; + + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! 
Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "D3Q19_TwoPop", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popIn[Lattice::Q]; + PushFunctions::localLoad(gidx, fIn, NEON_OUT popIn); + + Compute rho; + std::array u{.0, .0, .0}; + CommonFunctions::macroscopic(popIn, + NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + CommonFunctions::collideBgkUnrolled(gidx, + rho, u, + usqr, omega, + NEON_IO popIn); + + PushFunctions::pushStream(gidx, cellInfo.wallNghBitflag, popIn, NEON_OUT fOut); + } + }; + }); + return container; + } + + + static auto + computeWallNghMask(const CellTypeField& infoInField, + CellTypeField& infoOutpeField) + + -> Neon::set::Container + { + Neon::set::Container container = infoInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& infoIn = L.load(infoInField, Neon::Pattern::STENCIL); + auto& infoOut = L.load(infoOutpeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellType = infoIn(gidx, 0); + cellType.wallNghBitflag = 0; + + if (cellType.classification == CellType::bulk) { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { + if constexpr (GOMemoryId != Lattice::Memory::center) { + constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId]; + constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x; + constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y; + constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z; + + CellType nghCellType = 
infoIn.template getNghData(gidx, 0, CellType::undefined)(); + if (nghCellType.classification != CellType::bulk) { + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOMemoryId)); + } + } + }); + + infoOut(gidx, 0) = cellType; + } + }; + }); + return container; + } +}; +} // namespace push +namespace common { +/** + * Specialization for D3Q19 + */ +template +struct ContainerFactory, + Grid_> +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + using PullFunctions = pull::DeviceD3Q19; + using PushFunctions = push::DeviceD3Q19; + using CommonFunctions = common::DeviceD3Q19; + + template + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! 
Output Population field */) + -> Neon::set::Container + { + if constexpr (method == int(Method::push)) { + using Factory = push::ContainerFactory, + Grid_>; + return Factory::iteration(stencilSemantic, + fInField, + cellTypeField, + omega, + fOutField); + } + if constexpr (method == int(Method::pull)) { + using Factory = pull::ContainerFactory, + Grid_>; + return Factory::iteration(stencilSemantic, + fInField, + cellTypeField, + omega, + fOutField); + } + } + + + static auto + computeWallNghMask(const CellTypeField& infoInField, + CellTypeField& infoOutpeField) + + -> Neon::set::Container + { + Neon::set::Container container = infoInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& infoIn = L.load(infoInField, Neon::Pattern::STENCIL); + auto& infoOut = L.load(infoOutpeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellType = infoIn(gidx, 0); + cellType.wallNghBitflag = 0; + + if (cellType.classification == CellType::bulk) { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { + if constexpr (GOMemoryId != Lattice::Memory::center) { + constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId]; + constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x; + constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y; + constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z; + + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); + if (nghCellType.classification != CellType::bulk) { + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOMemoryId)); + } + } + }); + + infoOut(gidx, 0) = cellType; + } + }; + }); + return container; + } + + + static auto + computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! 
output Population field */) + + -> Neon::set::Container + { + Neon::set::Container container = + fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + Compute rho = 0; + std::array u{.0, .0, .0}; + + Storage popIn[Lattice::Q]; + CommonFunctions::localLoad(gidx, fIn, NEON_OUT popIn); + + if (cellInfo.classification == CellType::bulk) { + CommonFunctions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + } else { + if (cellInfo.classification == CellType::movingWall) { + rho = 1.0; + u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), + static_cast(popIn[1]) / static_cast(6. * 1. / 18.), + static_cast(popIn[2]) / static_cast(6. * 1. / 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } + + static auto + problemSetup(PopField& fInField /*! 
inpout population field */, + PopField& fOutField, + CellTypeField& cellTypeField, + Neon::double_3d ulid, + double ulb) + + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&, ulid, ulb](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, Neon::Pattern::MAP); + auto& fOut = L.load(fOutField, Neon::Pattern::MAP); + auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + const auto globalIdx = fIn.getGlobalIndex(gidx); + const auto domainDim = fIn.getDomainSize(); + + CellType flagVal; + flagVal.classification = CellType::bulk; + flagVal.wallNghBitflag = 0; + + typename Lattice::Precision::Storage popVal = 0; + + if (globalIdx.x == 0 || globalIdx.x == domainDim.x - 1 || + globalIdx.y == 0 || globalIdx.y == domainDim.y - 1 || + globalIdx.z == 0 || globalIdx.z == domainDim.z - 1) { + flagVal.classification = CellType::bounceBack; + + if (globalIdx.y == domainDim.y - 1) { + flagVal.classification = CellType::movingWall; + } + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + if (globalIdx.y == domainDim.y - 1) { + popVal = -6. 
* Lattice::Memory::template getT() * ulb * + (Lattice::Memory::template getDirection().v[0] * ulid.v[0] + + Lattice::Memory::template getDirection().v[1] * ulid.v[1] + + Lattice::Memory::template getDirection().v[2] * ulid.v[2]); + } else { + popVal = 0; + } + fIn(gidx, q) = popVal; + fOut(gidx, q) = popVal; + }); + } else { + flagVal.classification = CellType::bulk; + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + fIn(gidx, q) = Lattice::Memory::template getT(); + fOut(gidx, q) = Lattice::Memory::template getT(); + }); + } + cellInfoPartition(gidx, 0) = flagVal; + }; + }); + return container; + } +}; +} // namespace common \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h new file mode 100644 index 00000000..d5d024ea --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h @@ -0,0 +1,227 @@ +#pragma once + +#include "CellType.h" +#include "D3Q27.h" +#include "DeviceD3Q27.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" +#if 0 +/** + * Specialization for D3Q27 + */ +template +struct ContainerFactory, + Grid_> +{ + using Lattice = D3Q27; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + using Functions = DeviceD3Q19; + + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! 
Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "D3Q19_TwoPop", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popIn[Lattice::Q]; + Functions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + + Compute rho; + std::array u{.0, .0, .0}; + Functions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + Functions::collideBgkUnrolled(gidx, + popIn, + rho, u, + usqr, omega, + NEON_OUT fOut); + } + }; + }); + return container; + } + + + static auto + computeWallNghMask(const CellTypeField& infoInField, + CellTypeField& infoOutpeField) + + -> Neon::set::Container + { + Neon::set::Container container = infoInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& infoIn = L.load(infoInField, Neon::Pattern::STENCIL); + auto& infoOut = L.load(infoOutpeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellType = infoIn(gidx, 0); + cellType.wallNghBitflag = 0; + + if (cellType.classification == CellType::bulk) { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { + if constexpr (GOMemoryId != Lattice::Memory::center) { + constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId]; + constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x; + constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y; + constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z; + + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); + if 
(nghCellType.classification != CellType::bulk) { + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOMemoryId)); + } + } + }); + + infoOut(gidx, 0) = cellType; + } + }; + }); + return container; + } + + + static auto + computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! output Population field */) + + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + Compute rho = 0; + std::array u{.0, .0, .0}; + Storage popIn[Lattice::Q]; + + if (cellInfo.classification == CellType::bulk) { + + Functions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn); + Functions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u); + + } else { + if (cellInfo.classification == CellType::movingWall) { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GORegisterId) { + if constexpr (GORegisterId == Lattice::Registers::center) { + popIn[Lattice::Registers::center] = fIn(gidx, Lattice::Memory::center); + } else { + popIn[GORegisterId] = fIn(gidx, Lattice::Memory::template mapFromRegisters()); + } + }); + + rho = 1.0; + u = std::array{static_cast(popIn[0]) / static_cast(6. * 1. / 18.), + static_cast(popIn[1]) / static_cast(6. * 1. / 18.), + static_cast(popIn[2]) / static_cast(6. * 1. 
/ 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } + + static auto + problemSetup(PopField& fInField /*! inpout population field */, + PopField& fOutField, + CellTypeField& cellTypeField, + Neon::double_3d ulid, + double ulb) + + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&, ulid, ulb](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, Neon::Pattern::MAP); + auto& fOut = L.load(fOutField, Neon::Pattern::MAP); + auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + const auto globlalIdx = fIn.getGlobalIndex(gidx); + const auto domainDim = fIn.getDomainSize(); + CellType flagVal; + flagVal.classification = CellType::bulk; + flagVal.wallNghBitflag = 0; + typename Lattice::Precision::Storage val = 0; + + if (globlalIdx.x == 0 || globlalIdx.x == domainDim.x - 1 || + globlalIdx.y == 0 || globlalIdx.y == domainDim.y - 1 || + globlalIdx.z == 0 || globlalIdx.z == domainDim.z - 1) { + flagVal.classification = CellType::bounceBack; + + if (globlalIdx.y == domainDim.y - 1) { + flagVal.classification = CellType::movingWall; + } + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + if (globlalIdx.y == domainDim.y - 1) { + val = -6. 
* Lattice::Memory::template getT() * ulb * + (Lattice::Memory::template getDirection().v[0] * ulid.v[0] + + Lattice::Memory::template getDirection().v[1] * ulid.v[1] + + Lattice::Memory::template getDirection().v[2] * ulid.v[2]); + } else { + val = 0; + } + fIn(gidx, q) = val; + fOut(gidx, q) = val; + }); + } else { + flagVal.classification = CellType::bulk; + cellInfoPartition(gidx, 0) = flagVal; + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + fIn(gidx, q) = Lattice::Memory::template getT(); + fOut(gidx, q) = Lattice::Memory::template getT(); + }); + } + }; + }); + return container; + } +}; +#endif \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h index 15e8e0b1..4f9d4c8b 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h @@ -3,113 +3,72 @@ #include "Neon/Neon.h" #include "Neon/set/Backend.h" #include "Neon/set/memory/memSet.h" +#include "Precision.h" -template -struct D3Q19Template + +/** In each lattice we define two indexing schema + * - the first one works at the register level. For this one we relay on the same sequence of directions defined by the STLBM code. + * - the second one works at the memory (RAM) level and it defines how lattice direction are stored in Neon fields. + * + * We keep this two aspect separate to experiment on different order of directions in memory as the order will impact the number of halo update transitions. 
+ * + */ +template +struct D3Q19 { public: + D3Q19() = delete; + static constexpr int Q = 19; /** number of directions */ static constexpr int D = 3; /** Space dimension */ + using Precision = Precision_; + using Self = D3Q19; - static constexpr int centerDirection = 9; /** Position of direction {0,0,0} */ - static constexpr int goRangeBegin = 0; /** Symmetry is represented as "go" direction and the "back" their opposite */ - static constexpr int goRangeEnd = 8; - static constexpr int goBackOffset = 10; /** Offset to compute apply symmetry */ - + static constexpr int RegisterMapping = 1; + static constexpr int MemoryMapping = 2; - explicit D3Q19Template(const Neon::Backend& backend) + struct Registers { - // The discrete velocities of the Lattice mesh. - c_vect = std::vector( - { - {-1, 0, 0} /*! 0 Symmetry first section (GO) */, - {0, -1, 0} /*! 1 */, - {0, 0, -1} /*! 2 */, - {-1, -1, 0} /*! 3 */, - {-1, 1, 0} /*! 4 */, - {-1, 0, -1} /*! 5 */, - {-1, 0, 1} /*! 6 */, - {0, -1, -1} /*! 7 */, - {0, -1, 1} /*! 8 */, - {0, 0, 0} /*! 9 The center */, - {1, 0, 0} /*! 10 Symmetry mirror section (BK) */, - {0, 1, 0} /*! 11 */, - {0, 0, 1} /*! 12 */, - {1, 1, 0} /*! 13 */, - {1, -1, 0} /*! 14 */, - {1, 0, 1} /*! 15 */, - {1, 0, -1} /*! 16 */, - {0, 1, 1} /*! 17 */, - {0, 1, -1} /*! 18 */, - }); - - auto c_neon = backend.devSet().newMemSet( - Neon::DataUse::HOST_DEVICE, - 1, - Neon::MemoryOptions(), - backend.devSet().newDataSet([&](Neon::SetIdx const&, auto& val) { - val = c_vect.size(); - })); - - for (Neon::SetIdx i = 0; i < backend.devSet().setCardinality(); i++) { - for (int j = 0; j < int(c_vect.size()); j++) { - c_neon.eRef(i, j).x = static_cast(c_vect[j].x); - c_neon.eRef(i, j).y = static_cast(c_vect[j].y); - c_neon.eRef(i, j).z = static_cast(c_vect[j].z); - } - } - // The opposite of a given direction. - std::vector opp_vect = { - 10 /*! 0 */, - 11 /*! 1 */, - 12 /*! 2 */, - 13 /*! 3 */, - 14 /*! 4 */, - 15 /*! 5 */, - 16 /*! 6 */, - 17 /*! 7 */, - 18 /*! 
8 */, - 9 /*! 9 */, - 0 /*! 10 */, - 1 /*! 11 */, - 2 /*! 12 */, - 3 /*! 13 */, - 4 /*! 14 */, - 5 /*! 15 */, - 6 /*! 16 */, - 7 /*! 17 */, - 8 /*! 18 */, - }; + using Self = D3Q19::Registers; + static constexpr std::array stencil{ + Neon::index_3d(-1, 0, 0), + Neon::index_3d(0, -1, 0), + Neon::index_3d(0, 0, -1), + Neon::index_3d(-1, -1, 0), + Neon::index_3d(-1, 1, 0), + Neon::index_3d(-1, 0, -1), + Neon::index_3d(-1, 0, 1), + Neon::index_3d(0, -1, -1), + Neon::index_3d(0, -1, 1), + Neon::index_3d(0, 0, 0), + Neon::index_3d(1, 0, 0), + Neon::index_3d(0, 1, 0), + Neon::index_3d(0, 0, 1), + Neon::index_3d(1, 1, 0), + Neon::index_3d(1, -1, 0), + Neon::index_3d(1, 0, 1), + Neon::index_3d(1, 0, -1), + Neon::index_3d(0, 1, 1), + Neon::index_3d(0, 1, -1)}; + + static constexpr int center = 9; /** Position of direction {0,0,0} */ - { // Check correctness of opposite - for (int i = 0; i < static_cast(c_vect.size()); i++) { - auto point = c_vect[i]; - auto opposite = point * -1; - if (opposite != c_vect[opp_vect[i]]) { - Neon::NeonException exp(""); - exp << "Incompatible opposite"; - NEON_THROW(exp); + template + static constexpr auto getOpposite() + -> int + { + auto opposite3d = stencil[go] * -1; + for (int i = 0; i < Q; ++i) { + if (stencil[i] == opposite3d) { + return i; } } } - this->opp = backend.devSet().newMemSet( - Neon::DataUse::HOST_DEVICE, - 1, - Neon::MemoryOptions(), - backend.devSet().newDataSet([&](Neon::SetIdx const&, auto& val) { - val = opp_vect.size(); - })); - - - for (Neon::SetIdx i = 0; i < backend.devSet().setCardinality(); i++) { - for (size_t j = 0; j < opp_vect.size(); j++) { - this->opp.eRef(i, j, 0) = opp_vect[j]; - } - } + static constexpr std::array opposite{ + 10, 11, 12, 13, 14, 15, 16, 17, 18, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8}; - // The lattice weights. - t_vect = { + static constexpr std::array t{ 1. / 18. /*! 0 */, 1. / 18. /*! 1 */, 1. / 18. /*! 2 */, @@ -128,48 +87,195 @@ struct D3Q19Template 1. / 36. /*! 15 */, 1. / 36. /*! 16 */, 1. 
/ 36. /*! 17 */, - 1. / 36. /*! 18 */, + 1. / 36. /*! 18 */ }; - this->t = backend.devSet().newMemSet( - Neon::DataUse::HOST_DEVICE, - 1, - Neon::MemoryOptions(), - backend.devSet().newDataSet([&](Neon::SetIdx const&, auto&val) { - val= opp_vect.size(); - })); + static constexpr int fwdRegIdxListLen = (Q-1)/2; + static constexpr std::array fwdRegIdxList{0, 1, 2, 3, 4, 5, 6, 7, 8}; + template + static inline NEON_CUDA_HOST_DEVICE auto + getCk_u(std::array const& u) -> Compute + { + if constexpr (tegIdx == 0 || tegIdx == 9) { + return u[0]; + } + if constexpr (tegIdx == 1 || tegIdx == 10) { + return u[1]; + } + if constexpr (tegIdx == 2 || tegIdx == 11) { + return u[2]; + } + if constexpr (tegIdx == 3 || tegIdx == 12) { + return u[0] + u[1]; + } + if constexpr (tegIdx == 4 || tegIdx == 13) { + return u[0] - u[1]; + } + if constexpr (tegIdx == 5 || tegIdx == 14) { + return u[0] + u[2]; + } + if constexpr (tegIdx == 6 || tegIdx == 15) { - for (Neon::SetIdx i = 0; i < backend.devSet().setCardinality(); i++) { - for (size_t j = 0; j < t_vect.size(); j++) { - this->t.eRef(i, j, 0) = t_vect[j]; + return u[0] - u[2]; + } + if constexpr (tegIdx == 7 || tegIdx == 16) { + + return u[1] + u[2]; + } + if constexpr (tegIdx == 8 || tegIdx == 17) { + return u[1] - u[2]; } } + }; + + struct Memory + { + using Self = D3Q19::Memory; + + static constexpr std::array stencil{ + Neon::index_3d(-1, 0, 0), + Neon::index_3d(0, -1, 0), + Neon::index_3d(0, 0, -1), + Neon::index_3d(-1, -1, 0), + Neon::index_3d(-1, 1, 0), + Neon::index_3d(-1, 0, -1), + Neon::index_3d(-1, 0, 1), + Neon::index_3d(0, -1, -1), + Neon::index_3d(0, -1, 1), + Neon::index_3d(0, 0, 0), + Neon::index_3d(1, 0, 0), + Neon::index_3d(0, 1, 0), + Neon::index_3d(0, 0, 1), + Neon::index_3d(1, 1, 0), + Neon::index_3d(1, -1, 0), + Neon::index_3d(1, 0, 1), + Neon::index_3d(1, 0, -1), + Neon::index_3d(0, 1, 1), + Neon::index_3d(0, 1, -1)}; + + + static constexpr int center = 9; /** Position of direction {0,0,0} */ + + static 
constexpr std::array toRegisters{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; + + static constexpr std::array toMemory{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; - if (backend.runtime() == Neon::Runtime::stream) { - this->c.template update(backend.streamSet(0), Neon::DeviceType::CUDA); - this->opp.template update(backend.streamSet(0), Neon::DeviceType::CUDA); - this->t.template update(backend.streamSet(0), Neon::DeviceType::CUDA); + + template + NEON_CUDA_HOST_DEVICE static constexpr auto mapToRegisters() + -> int + { + return toRegisters[go]; } - } + template + NEON_CUDA_HOST_DEVICE static constexpr auto mapFromRegisters() + -> int + { + return toMemory[go]; + } - template - static constexpr auto getOpposite() - -> int - { - if constexpr (go == centerDirection) - return centerDirection; - if constexpr (go <= goRangeEnd) - return go + goBackOffset; - if constexpr (go <= goRangeEnd + goBackOffset) - return go - goBackOffset; - } + template + NEON_CUDA_HOST_DEVICE static constexpr auto getOpposite() + -> int + { + return opposite[go]; + } + + static constexpr std::array opposite{ + 10, 11, 12, 13, 14, 15, 16, 17, 18, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8}; + + template + static constexpr auto helpGetValueforT() + -> typename Precision::Storage + { + auto goInRegisterSpace = Self::template mapToRegisters(); + return Registers::t[goInRegisterSpace]; + } + + template + struct MemMapper + { + constexpr static int fwMemIdx = fwMemIdx_; + constexpr static int fwX = Memory::stencil[fwMemIdx].x; + constexpr static int fwY = Memory::stencil[fwMemIdx].y; + constexpr static int fwZ = Memory::stencil[fwMemIdx].z; + constexpr static int bkMemIdx = Memory::opposite[fwMemIdx]; + constexpr static int bkX = Memory::stencil[bkMemIdx].x; + constexpr static int bkY = Memory::stencil[bkMemIdx].y; + constexpr static int bkZ = Memory::stencil[bkMemIdx].z; - Neon::set::MemSet c; - Neon::set::MemSet opp; - Neon::set::MemSet t; - std::vector t_vect; 
- std::vector c_vect; + constexpr static int fwRegIdx = Memory::template mapToRegisters(); + constexpr static int centerRegIdx = Registers::center; + constexpr static int centerMemIdx = Memory::center; + }; + + template + struct RegMapper + { + constexpr static int fwRegIdx = fwRegIdx_; + constexpr static int bkRegIdx = Registers::opposite[fwRegIdx]; + constexpr static int fwMemIdx = Registers::template mapToMemory(); + constexpr static int bkMemIdx = Registers::template mapToMemory(); + constexpr static int centerRegIdx = Registers::center; + constexpr static int centerMemIdx = Memory::center; + }; + + static constexpr std::array t{ + 1. / 18. /*! 0 */, + 1. / 18. /*! 1 */, + 1. / 18. /*! 2 */, + 1. / 36. /*! 3 */, + 1. / 36. /*! 4 */, + 1. / 36. /*! 5 */, + 1. / 36. /*! 6 */, + 1. / 36. /*! 7 */, + 1. / 36. /*! 8 */, + 1. / 3. /*! 9 */, + 1. / 18. /*! 10 */, + 1. / 18. /*! 11 */, + 1. / 18. /*! 12 */, + 1. / 36. /*! 13 */, + 1. / 36. /*! 14 */, + 1. / 36. /*! 15 */, + 1. / 36. /*! 16 */, + 1. / 36. /*! 17 */, + 1. / 36. /*! 
18 */}; + + template + NEON_CUDA_HOST_DEVICE static constexpr auto getT() + -> typename Precision::Storage + { + return t[direction]; + } + template + NEON_CUDA_HOST_DEVICE static constexpr auto getDirection() + -> typename Neon::index_3d + { + return stencil[direction]; + } + }; + + + public: + template + static auto getDirectionAsVector() + -> std::vector + { + std::vector vec; + if constexpr (mappingType == RegisterMapping) { + for (auto const& a : Registers::stencil) { + vec.push_back(a); + } + } else if constexpr (mappingType == MemoryMapping) { + for (auto const& a : Memory::stencil) { + vec.push_back(a); + } + } + return vec; + } }; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q27.h b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q27.h new file mode 100644 index 00000000..9f2c7f95 --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q27.h @@ -0,0 +1,200 @@ +#pragma once + +#include "Neon/Neon.h" +#include "Neon/set/Backend.h" +#include "Neon/set/memory/memSet.h" +#include "Precision.h" + + +/** In each lattice we define two indexing schema + * - the first one works at the register level. For this one we relay on the same sequence of directions defined by the STLBM code. + * - the second one works at the memory (RAM) level and it defines how lattice direction are stored in Neon fields. + * + * We keep this two aspect separate to experiment on different order of directions in memory as the order will impact the number of halo update transitions. 
+ * + */ +template +struct D3Q27 +{ + public: + D3Q27() = delete; + + static constexpr int Q = 27; /** number of directions */ + static constexpr int D = 3; /** Space dimension */ + using Precision = Precision_; + using Self = D3Q27; + + static constexpr int RegisterMapping = 1; + static constexpr int MemoryMapping = 2; + + struct Registers + { + using Self = D3Q27::Registers; + static constexpr std::array stencil{ + Neon::index_3d(-1, 0, 0), + Neon::index_3d(0, -1, 0), + Neon::index_3d(0, 0, -1), + Neon::index_3d(-1, -1, 0), + Neon::index_3d(-1, 1, 0), + Neon::index_3d(-1, 0, -1), + Neon::index_3d(-1, 0, 1), + Neon::index_3d(0, -1, -1), + Neon::index_3d(0, -1, 1), + Neon::index_3d(-1, -1, -1), + Neon::index_3d(-1, -1, 1), + Neon::index_3d(-1, 1, -1), + Neon::index_3d(-1, 1, 1), + Neon::index_3d(0, 0, 0), + Neon::index_3d(1, 0, 0), + Neon::index_3d(0, 1, 0), + Neon::index_3d(0, 0, 1), + Neon::index_3d(1, 1, 0), + Neon::index_3d(1, -1, 0), + Neon::index_3d(1, 0, 1), + Neon::index_3d(1, 0, -1), + Neon::index_3d(0, 1, 1), + Neon::index_3d(0, 1, -1), + Neon::index_3d(1, 1, 1), + Neon::index_3d(1, 1, -1), + Neon::index_3d(1, -1, 1), + Neon::index_3d(1, -1, -1)}; + + static constexpr int center = 13; /** Position of direction {0,0,0} */ + + template + static constexpr auto getOpposite() + -> int + { + auto opposite3d = stencil[go] * -1; + for (int i = 0; i < Q; ++i) { + if (stencil[i] == opposite3d) { + return i; + } + } + } + + static constexpr std::array opposite{ + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 13, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 + }; + + static constexpr std::array t{ + 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., + 1. / 216., 1. / 216., 1. / 216., 1. / 216., + 8. / 27., + 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., + 1. / 216., 1. / 216., 1. / 216., 1. 
/ 216.}; + }; + + struct Memory + { + using Self = D3Q27::Memory; + static constexpr std::array stencil{ + Neon::index_3d(-1, 0, 0), + Neon::index_3d(0, -1, 0), + Neon::index_3d(0, 0, -1), + Neon::index_3d(-1, -1, 0), + Neon::index_3d(-1, 1, 0), + Neon::index_3d(-1, 0, -1), + Neon::index_3d(-1, 0, 1), + Neon::index_3d(0, -1, -1), + Neon::index_3d(0, -1, 1), + Neon::index_3d(-1, -1, -1), + Neon::index_3d(-1, -1, 1), + Neon::index_3d(-1, 1, -1), + Neon::index_3d(-1, 1, 1), + Neon::index_3d(0, 0, 0), + Neon::index_3d(1, 0, 0), + Neon::index_3d(0, 1, 0), + Neon::index_3d(0, 0, 1), + Neon::index_3d(1, 1, 0), + Neon::index_3d(1, -1, 0), + Neon::index_3d(1, 0, 1), + Neon::index_3d(1, 0, -1), + Neon::index_3d(0, 1, 1), + Neon::index_3d(0, 1, -1), + Neon::index_3d(1, 1, 1), + Neon::index_3d(1, 1, -1), + Neon::index_3d(1, -1, 1), + Neon::index_3d(1, -1, -1)}; + + + static constexpr int center = 13; /** Position of direction {0,0,0} */ + + template + static constexpr auto mapToRegisters() + -> int + { + auto direction = stencil[go]; + for (int i = 0; i < Q; ++i) { + if (Registers::stencil[i] == direction) { + return i; + } + } + } + + template + static constexpr auto mapFromRegisters() + -> int + { + auto direction = Registers::stencil[go]; + for (int i = 0; i < Q; ++i) { + if (Self::stencil[i] == direction) { + return i; + } + } + } + + template + static constexpr auto getOpposite() + -> int + { + auto opposite3d = stencil[go] * -1; + for (int i = 0; i < Q; ++i) { + if (stencil[i] == opposite3d) { + return i; + } + } + } + + static constexpr std::array opposite{ + 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 13, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 + }; + + template + static constexpr auto helpGetValueforT() + -> typename Precision::Storage + { + auto goInRegisterSpace = Self::template mapToRegisters(); + return Registers::t[goInRegisterSpace]; + } + + static constexpr std::array t{ + 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. 
/ 54., 1. / 54., + 1. / 216., 1. / 216., 1. / 216., 1. / 216., + 8. / 27., + 2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., + 1. / 216., 1. / 216., 1. / 216., 1. / 216.}; + }; + + public: + template + static auto getDirectionAsVector() + -> std::vector + { + std::vector vec; + if constexpr (mappingType == RegisterMapping) { + for (auto const& a : Registers::stencil) { + vec.push_back(a); + } + } else if constexpr (mappingType == MemoryMapping) { + for (auto const& a : Memory::stencil) { + vec.push_back(a); + } + } + return vec; + } +}; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h new file mode 100644 index 00000000..fff6f2b3 --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h @@ -0,0 +1,214 @@ +#pragma once +#include "CellType.h" +#include "D3Q19.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + +namespace pull { +template +struct DeviceD3Q19 +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + + static inline NEON_CUDA_HOST_DEVICE auto + pullStream(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopField::Partition const& fin, + NEON_OUT Storage popIn[Lattice::Q]) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto fwMemIdx) { + using M = typename Lattice::template MappersIdxSetWithFwdMem; + + if constexpr (fwMemIdx == Lattice::Memory::center) { + popIn[M::centerRegIdx] = fin(gidx, M::centerMemIdx); + } else { + if (CellType::isWall()) { + popIn[M::fwRegIdx] = fin(gidx, M::bkMemIdx) + + fin.template getNghData(gidx, M::bkMemIdx)(); + } 
else { + popIn[M::fwRegIdx] = fin.template getNghData(gidx, fwMemIdx)(); + } + } + }); + } +}; + +#undef CAST_TO_COMPUTE +} // namespace pull + +namespace push { +template +struct DeviceD3Q19 +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + + static inline NEON_CUDA_HOST_DEVICE auto + pushStream(Idx const& gidx, + const uint32_t& wallNghBitFlag, + NEON_OUT Storage pOut[Lattice::Q], + NEON_OUT typename PopField::Partition const& fOut) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto fwMemIdx_) { + using M = typename Lattice::template MappersIdxSetWithFwdMem; + + if constexpr (M::fwMemIdx == M::centerMemIdx) { + fOut(gidx, M::fwMemIdx) = pOut[M::fwRegIdx]; + } else { + if (CellType::isWall()) { + // fout(i, opp[k]) = + // pop_out + + // f(nb, k); + fOut(gidx, M::bkMemIdx) = + pOut[M::fwdRegIdx] + + fOut.template getNghData(gidx, M::fwMemIdx)(); + } else { + // fout(nb, k) = pop_out; + fOut.writeNgh(gidx, M::fwMemIdx, pOut[M::fwdRegIdx]); + } + } + }); + } + + static inline NEON_CUDA_HOST_DEVICE auto + localLoad(Idx const& gidx, + NEON_IN typename PopField::Partition const& fOut, + Storage NEON_RESTRICT pOut[Lattice::Q]) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto fwMemIdx_) { + using M = typename Lattice::template MappersIdxSetWithFwdMem; + pOut[M::fwdRegIdx] = fOut(gidx, M::fwMemIdx); + }); + } +}; +} // namespace push + + +namespace common { +template +struct DeviceD3Q19 +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; 
+ using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + + static inline NEON_CUDA_HOST_DEVICE auto + macroscopic(const Storage pop[Lattice::Q], + NEON_OUT Compute& rho, + NEON_OUT std::array& u) + -> void + { + +#define POP(IDX) static_cast(pop[IDX]) + const Compute X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6); + const Compute X_P1 = POP(10) + POP(13) + POP(14) + POP(15) + POP(16); + const Compute X_0 = POP(9) + POP(1) + POP(2) + POP(7) + POP(8) + POP(11) + POP(12) + POP(17) + POP(18); + + const Compute Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(14); + const Compute Y_P1 = POP(4) + POP(11) + POP(13) + POP(17) + POP(18); + + const Compute Z_M1 = POP(2) + POP(5) + POP(7) + POP(16) + POP(18); + const Compute Z_P1 = POP(6) + POP(8) + POP(12) + POP(15) + POP(17); +#undef POP + + rho = X_M1 + X_P1 + X_0; + u[0] = (X_P1 - X_M1) / rho; + u[1] = (Y_P1 - Y_M1) / rho; + u[2] = (Z_P1 - Z_M1) / rho; + } + + + static inline NEON_CUDA_HOST_DEVICE auto + collideBgkUnrolled(Idx const& i /*! Compute iterator */, + Compute const& rho /*! Density */, + std::array const& u /*! Velocity */, + Compute const& usqr /*! Usqr */, + Compute const& omega /*! Omega */, + NEON_IO Storage pop[Lattice::Q]) + + -> void + { + + constexpr Compute c1over18 = 1. / 18.; + constexpr Compute c1over36 = 1. / 36.; + constexpr Compute c4dot5 = 4.5; + constexpr Compute c3 = 3.; + constexpr Compute c1 = 1.; + constexpr Compute c6 = 6.; + + constexpr int regCenter = Lattice::Registers::center; + constexpr int regFir = Lattice::Registers::center; + + Neon::ConstexprFor<0, Lattice::Registers::fwdRegIdxListLen, 1>( + [&](auto fwdRegIdxListIdx) { + using M = typename Lattice::template RegMapper; + using T = typename Lattice::Registers; + + Compute eqFw; + Compute eqBk; + + const Compute ck_u = T::template getCk_u(u); + // double eq = rho * t[k] * + // (1. + + // 3. 
* ck_u + + // 4.5 * ck_u * ck_u - + // usqr); + eqFw = rho * T::t[M::fwRegIdx] * + (c1 + + c3 * ck_u + + c4dot5 * ck_u * ck_u - + usqr); + + // double eqopp = eq - 6.* rho * t[k] * ck_u; + eqBk = eqFw - + c6 * rho * c1over36 * T::t[M::fwRegIdx] * ck_u; + + // pop_out = (1. - omega) * fin(i, k) + omega * eq; + pop[M::fwRegIdx] = (c1 - omega) * static_cast(pop[M::fwRegIdx]) + omega * eqFw; + // pop_out_opp = (1. - omega) * fin(i, opp[k]) + omega * eqopp; + pop[M::bkRegIdx] = (c1 - omega) * static_cast(pop[M::bkRegIdx]) + omega * eqBk; + }); + { // Center; + using T = typename Lattice::Registers; + using M = typename Lattice::template RegMapper; + // eq = rho * t[k] * (1. - usqr); + const Compute eqCenter = rho * T::t[M::fwRegIdx] * (c1 - usqr); + // fout(i, k) = (1. - omega) * fin(i, k) + omega * eq; + pop[Lattice::Registers::center] = (c1 - omega) * static_cast(pop[M::fwRegIdx]) + omega * eqCenter; + } + } +}; +} // namespace common \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q27.h b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q27.h new file mode 100644 index 00000000..f977492b --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q27.h @@ -0,0 +1,217 @@ +#pragma once +#include "CellType.h" +#include "D3Q27.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + + +template +struct DeviceD3Q27 +{ + using Lattice = D3Q27; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + + static inline NEON_CUDA_HOST_DEVICE auto + pullStream(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopField::Partition const& fin, + NEON_OUT Storage popIn[Lattice::Q]) + { + + 
Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { + if constexpr (GOMemoryId == Lattice::Memory::center) { + popIn[Lattice::Registers::center] = fin(gidx, Lattice::Memory::center); + } else { + constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId]; + constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x; + constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y; + constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z; + constexpr int GORegistersId = Lattice::Memory::template mapToRegisters(); + + if (wallBitFlag & (uint32_t(1) << GOMemoryId)) { + popIn[GORegistersId] = + fin(gidx, BKMemoryId) + + fin.template getNghData(gidx, BKMemoryId)(); + } else { + popIn[GORegistersId] = + fin.template getNghData(gidx, GOMemoryId)(); + } + } + }); + } + + static inline NEON_CUDA_HOST_DEVICE auto + macroscopic(const Storage pop[Lattice::Q], + NEON_OUT Compute& rho, + NEON_OUT std::array& u) + -> void + { + +#define POP(IDX) static_cast(pop[IDX]) + const Compute X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6) + POP(9) + POP(10) + POP(11) + POP(12); + const Compute X_P1 = POP(14) + POP(17) + POP(18) + POP(19) + POP(20) + POP(23) + POP(24) + POP(25) + POP(26); + const Compute X_0 = POP(1) + POP(2) + POP(7) + POP(8) + POP(13) + POP(15) + POP(16) + POP(21) + POP(22); + + const Compute Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(9) + POP(10) + POP(18) + POP(25) + POP(26); + const Compute Y_P1 = POP(15) + POP(17) + POP(21) + POP(22) + POP(23) + POP(24) + POP(4) + POP(11) + POP(12); + + const Compute Z_M1 = POP(2) + POP(5) + POP(7) + POP(9) + POP(11) + POP(20) + POP(22) + POP(24) + POP(26); + const Compute Z_P1 = POP(16) + POP(19) + POP(21) + POP(23) + POP(25) + POP(6) + POP(8) + POP(10) + POP(12); +#undef POP + + rho = X_M1 + X_P1 + X_0; + u[0] = (X_P1 - X_M1) / rho; + u[0] = (Y_P1 - Y_M1) / rho; + u[0] = (Z_P1 - Z_M1) / rho; + } + + + static inline NEON_CUDA_HOST_DEVICE auto + collideBgkUnrolled(Idx const& i /*! 
Compute iterator */, + const Storage pop[Lattice::Q], + Compute const& rho /*! Density */, + std::array const& u /*! Velocity */, + Compute const& usqr /*! Usqr */, + Compute const& omega /*! Omega */, + typename PopField::Partition& fOut /*! Population */) + + -> void + { + const Compute cku1 = u[0] + u[1]; + const Compute cku2 = -u[0] + u[1]; + const Compute cku3 = u[0] + u[2]; + const Compute cku4 = -u[0] + u[2]; + const Compute cku5 = u[1] + u[2]; + const Compute cku6 = -u[1] + u[2]; + const Compute cku7 = u[0] + u[1] + u[2]; + const Compute cku8 = -u[0] + u[1] + u[2]; + const Compute cku9 = u[0] - u[1] + u[2]; + const Compute cku0 = u[0] + u[1] - u[2]; + + std::array feqRM; + + constexpr int F000 = 13; + constexpr int FM00 = 0; + constexpr int F0M0 = 1; + constexpr int F00M = 2; + constexpr int FMM0 = 3; + constexpr int FMP0 = 4; + constexpr int FM0M = 5; + constexpr int FM0P = 6; + constexpr int F0MM = 7; + constexpr int F0MP = 8; + constexpr int FMMM = 9; + constexpr int FMMP = 10; + constexpr int FMPM = 11; + constexpr int FMPP = 12; + constexpr int FP00 = 14; + constexpr int F0P0 = 15; + constexpr int F00P = 16; + constexpr int FPP0 = 17; + constexpr int FPM0 = 18; + constexpr int FP0P = 19; + constexpr int FP0M = 20; + constexpr int F0PP = 21; + constexpr int F0PM = 22; + constexpr int FPPP = 23; + constexpr int FPPM = 24; + constexpr int FPMP = 25; + constexpr int FPMM = 26; + + constexpr Compute c1over18 = 1. / 18.; + constexpr Compute c1over36 = 1. 
/ 36.; + constexpr Compute c4dot5 = 4.5; + constexpr Compute c3 = 3.; + constexpr Compute c1 = 1.; + constexpr Compute c6 = 6.; + + feqRM[F000] = rho * Lattice::Registers::t[F000] * (c1- usqr); + + feqRM[FM00] = rho * Lattice::Registers::t[FM00] * (c1- c3* u[0] + c4dot5* u[0] * u[0] - usqr); + feqRM[FP00] = rho * Lattice::Registers::t[FP00] * (c6 * u[0]) + feqRM[FM00]; + + feqRM[F0M0] = rho * Lattice::Registers::t[F0M0] * (c1- c3* u[1] + c4dot5* u[1] * u[1] - usqr); + feqRM[F0P0] = rho * Lattice::Registers::t[F0P0] * (c6 * u[1]) + feqRM[F0M0]; + + feqRM[F00M] = rho * Lattice::Registers::t[F00M] * (c1- c3* u[2] + c4dot5* u[2] * u[2] - usqr); + feqRM[F00P] = rho * Lattice::Registers::t[F00P] * (c6 * u[2]) + feqRM[F00M]; + + feqRM[FMM0] = rho * Lattice::Registers::t[FMM0] * (c1- c3* cku1 + c4dot5* cku1 * cku1 - usqr); + feqRM[FPP0] = rho * Lattice::Registers::t[FPP0] * (c6 * cku1) + feqRM[FMM0]; + feqRM[FPM0] = rho * Lattice::Registers::t[FPM0] * (c1- c3* cku2 + c4dot5* cku2 * cku2 - usqr); + feqRM[FMP0] = rho * Lattice::Registers::t[FMP0] * (c6 * cku2) + feqRM[FPM0]; + + feqRM[FM0M] = rho * Lattice::Registers::t[FM0M] * (c1- c3* cku3 + c4dot5* cku3 * cku3 - usqr); + feqRM[FP0P] = rho * Lattice::Registers::t[FP0P] * (c6 * cku3) + feqRM[FM0M]; + feqRM[FP0M] = rho * Lattice::Registers::t[FP0M] * (c1- c3* cku4 + c4dot5* cku4 * cku4 - usqr); + feqRM[FM0P] = rho * Lattice::Registers::t[FM0P] * (c6 * cku4) + feqRM[FP0M]; + + feqRM[F0MM] = rho * Lattice::Registers::t[F0MM] * (c1- c3* cku5 + c4dot5* cku5 * cku5 - usqr); + feqRM[F0PP] = rho * Lattice::Registers::t[F0PP] * (c6 * cku5) + feqRM[F0MM]; + feqRM[F0PM] = rho * Lattice::Registers::t[F0PM] * (c1- c3* cku6 + c4dot5* cku6 * cku6 - usqr); + feqRM[F0MP] = rho * Lattice::Registers::t[F0MP] * (c6 * cku6) + feqRM[F0PM]; + + feqRM[FMMM] = rho * Lattice::Registers::t[FMMM] * (c1- c3* cku7 + c4dot5* cku7 * cku7 - usqr); + feqRM[FPPP] = rho * Lattice::Registers::t[FPPP] * (c6 * cku7) + feqRM[FMMM]; + feqRM[FPMM] = rho * 
Lattice::Registers::t[FPMM] * (c1- c3* cku8 + c4dot5* cku8 * cku8 - usqr); + feqRM[FMPP] = rho * Lattice::Registers::t[FMPP] * (c6 * cku8) + feqRM[FPMM]; + feqRM[FMPM] = rho * Lattice::Registers::t[FMPM] * (c1- c3* cku9 + c4dot5* cku9 * cku9 - usqr); + feqRM[FPMP] = rho * Lattice::Registers::t[FPMP] * (c6 * cku9) + feqRM[FMPM]; + feqRM[FMMP] = rho * Lattice::Registers::t[FMMP] * (c1- c3* cku0 + c4dot5* cku0 * cku0 - usqr); + feqRM[FPPM] = rho * Lattice::Registers::t[FPPM] * (c6 * cku0) + feqRM[FMMP]; + + // BGK Collision based on the second-order equilibrium + std::array foutRM; + + foutRM[F000] = (c1- omega) * static_cast(pop[F000]) + omega * feqRM[F000]; + + foutRM[FP00] = (c1- omega) * static_cast(pop[FP00]) + omega * feqRM[FP00]; + foutRM[FM00] = (c1- omega) * static_cast(pop[FM00]) + omega * feqRM[FM00]; + + foutRM[F0P0] = (c1- omega) * static_cast(pop[F0P0]) + omega * feqRM[F0P0]; + foutRM[F0M0] = (c1- omega) * static_cast(pop[F0M0]) + omega * feqRM[F0M0]; + + foutRM[F00P] = (c1- omega) * static_cast(pop[F00P]) + omega * feqRM[F00P]; + foutRM[F00M] = (c1- omega) * static_cast(pop[F00M]) + omega * feqRM[F00M]; + + foutRM[FPP0] = (c1- omega) * static_cast(pop[FPP0]) + omega * feqRM[FPP0]; + foutRM[FMP0] = (c1- omega) * static_cast(pop[FMP0]) + omega * feqRM[FMP0]; + foutRM[FPM0] = (c1- omega) * static_cast(pop[FPM0]) + omega * feqRM[FPM0]; + foutRM[FMM0] = (c1- omega) * static_cast(pop[FMM0]) + omega * feqRM[FMM0]; + + foutRM[FP0P] = (c1- omega) * static_cast(pop[FP0P]) + omega * feqRM[FP0P]; + foutRM[FM0P] = (c1- omega) * static_cast(pop[FM0P]) + omega * feqRM[FM0P]; + foutRM[FP0M] = (c1- omega) * static_cast(pop[FP0M]) + omega * feqRM[FP0M]; + foutRM[FM0M] = (c1- omega) * static_cast(pop[FM0M]) + omega * feqRM[FM0M]; + + foutRM[F0PP] = (c1- omega) * static_cast(pop[F0PP]) + omega * feqRM[F0PP]; + foutRM[F0MP] = (c1- omega) * static_cast(pop[F0MP]) + omega * feqRM[F0MP]; + foutRM[F0PM] = (c1- omega) * static_cast(pop[F0PM]) + omega * feqRM[F0PM]; + 
foutRM[F0MM] = (c1- omega) * static_cast(pop[F0MM]) + omega * feqRM[F0MM]; + + foutRM[FPPP] = (c1- omega) * static_cast(pop[FPPP]) + omega * feqRM[FPPP]; + foutRM[FMPP] = (c1- omega) * static_cast(pop[FMPP]) + omega * feqRM[FMPP]; + foutRM[FPMP] = (c1- omega) * static_cast(pop[FPMP]) + omega * feqRM[FPMP]; + foutRM[FPPM] = (c1- omega) * static_cast(pop[FPPM]) + omega * feqRM[FPPM]; + foutRM[FMMP] = (c1- omega) * static_cast(pop[FMMP]) + omega * feqRM[FMMP]; + foutRM[FMPM] = (c1- omega) * static_cast(pop[FMPM]) + omega * feqRM[FMPM]; + foutRM[FPMM] = (c1- omega) * static_cast(pop[FPMM]) + omega * feqRM[FPMM]; + foutRM[FMMM] = (c1- omega) * static_cast(pop[FMMM]) + omega * feqRM[FMMM]; + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) { + fOut(i, GOMemoryId) = static_cast(foutRM[Lattice::Memory::template mapToRegisters()]); + }); + } +}; + diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmIteration.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmIteration.h deleted file mode 100644 index b92d9acc..00000000 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmIteration.h +++ /dev/null @@ -1,101 +0,0 @@ -#include "CellType.h" -#include "D3Q19.h" -#include "LbmTools.h" -#include "Neon/Neon.h" -#include "Neon/set/Backend.h" -#include "Neon/set/Containter.h" -#include "Neon/skeleton/Skeleton.h" - -template -struct LbmSkeleton -{ -}; - - -template -struct LbmIterationD3Q19 -{ - using LbmStoreType = typename PopulationField::Type; - using CellTypeField = typename PopulationField::Grid::template Field; - using D3Q19 = D3Q19Template; - using LbmTools = LbmContainers; - - - LbmIterationD3Q19(Neon::set::StencilSemantic stencilSemantic, - Neon::skeleton::Occ occ, - Neon::set::TransferMode transfer, - PopulationField& fIn /*! inpout population field */, - PopulationField& fOut, - CellTypeField& cellTypeField /*! Cell type field */, - LbmComputeType omega /*! 
LBM omega parameter */) - { - pop[0] = fIn; - pop[1] = fOut; - - setupSkeletons(0, stencilSemantic, occ, transfer, pop[0], pop[1], cellTypeField, omega); - setupSkeletons(1, stencilSemantic, occ, transfer, pop[1], pop[0], cellTypeField, omega); - - parity = 0; - } - auto getInput() - -> PopulationField& - { - return pop[parity]; - } - - auto getOutput() - -> PopulationField& - { - int other = parity == 0 ? 1 : 0; - return pop[other]; - } - - auto run() - -> void - { - lbmTwoPop[parity].run(); - updateParity(); - } - - auto sync() - -> void - { - pop[0].getBackend().syncAll(); - } - - private: - auto updateParity() - -> void - { - parity = parity == 0 ? 1 : 0; - } - - auto setupSkeletons(int target, - Neon::set::StencilSemantic stencilSemantic, - Neon::skeleton::Occ occ, - Neon::set::TransferMode transfer, - PopulationField& inField /*! inpout population field */, - PopulationField& outField, - CellTypeField& cellTypeField /*! Cell type field */, - LbmComputeType omega /*! LBM omega parameter */) - { - std::vector ops; - lbmTwoPop[target] = Neon::skeleton::Skeleton(inField.getBackend()); - Neon::skeleton::Options opt(occ, transfer); - ops.push_back(LbmTools::iteration(stencilSemantic, - inField, - cellTypeField, - omega, - outField)); - std::stringstream appName; - appName << "LBM_iteration_" << std::to_string(target); - lbmTwoPop[target].sequence(ops, appName.str(), opt); - } - - Neon::skeleton::Skeleton lbmTwoPop[2]; - PopulationField pop[2]; - int parity; -}; \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h new file mode 100644 index 00000000..22ae8177 --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h @@ -0,0 +1,117 @@ +#include "CellType.h" +#include "ContainerFactory.h" +#include "ContainersD3Q19.h" +#include "D3Q19.h" +#include "Neon/Neon.h" +#include "Neon/set/Backend.h" +#include "Neon/set/Containter.h" +#include 
"Neon/skeleton/Skeleton.h" + +template +struct LbmSkeleton +{ +}; + + +template +struct LbmSkeleton, + Grid_> +{ + using Lattice = D3Q19; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + using ContainerFactory = common::ContainerFactory; + + LbmSkeleton(Neon::set::StencilSemantic stencilSemantic, + Neon::skeleton::Occ occ, + Neon::set::TransferMode transfer, + PopField& fIn /*! inpout population field */, + PopField& fOut, + CellTypeField& cellTypeField /*! Cell type field */, + Compute omega /*! LBM omega parameter */) + { + pop[0] = fIn; + pop[1] = fOut; + + setupSkeletons(0, stencilSemantic, occ, transfer, pop[0], pop[1], cellTypeField, omega); + setupSkeletons(1, stencilSemantic, occ, transfer, pop[1], pop[0], cellTypeField, omega); + + parity = 0; + } + + auto getInput() + -> PopField& + { + return pop[parity]; + } + + auto getOutput() + -> PopField& + { + int other = parity == 0 ? 1 : 0; + return pop[other]; + } + + auto run() + -> void + { + lbmTwoPop[parity].run(); + updateParity(); + } + + auto sync() + -> void + { + pop[0].getBackend().syncAll(); + } + + private: + auto updateParity() + -> void + { + parity = parity == 0 ? 1 : 0; + } + + auto setupSkeletons(int target, + Neon::set::StencilSemantic stencilSemantic, + Neon::skeleton::Occ occ, + Neon::set::TransferMode transfer, + PopField& inField /*! inpout population field */, + PopField& outField, + CellTypeField& cellTypeField /*! Cell type field */, + Compute omega /*! 
LBM omega parameter */) + { + std::vector ops; + lbmTwoPop[target] = Neon::skeleton::Skeleton(inField.getBackend()); + Neon::skeleton::Options opt(occ, transfer); + ops.push_back(ContainerFactory::template iteration(stencilSemantic, + inField, + cellTypeField, + omega, + outField)); + std::stringstream appName; + appName << "LBM_iteration_" << std::to_string(target); + lbmTwoPop[target].sequence(ops, appName.str(), opt); + } + + Neon::skeleton::Skeleton lbmTwoPop[2]; + PopField pop[2]; + int parity; +}; \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h similarity index 97% rename from benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h rename to benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h index 5728a5d3..489b3782 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h @@ -8,7 +8,7 @@ template -struct LbmContainers +struct LbmContainersTemplateOnly { }; @@ -19,13 +19,13 @@ struct LbmContainers */ template -struct LbmContainers, - PopulationField, - LbmComputeType> +struct LbmContainersTemplateOnly, + PopulationField, + LbmComputeType> { using LbmStoreType = typename PopulationField::Type; using CellTypeField = typename PopulationField::Grid::template Field; - using Lattice = D3Q19Template; + using Lattice = D3Q19; using Idx = typename PopulationField::Idx; using Grid = typename PopulationField::Grid; using Rho = typename Grid::template Field; @@ -36,21 +36,21 @@ struct LbmContainers(gidx); \ } else { \ - popIn[GOid] = fin.template nghVal(i, GOid, 0.0).value; \ + popIn[GOid] = fin.template nghVal(gidx).value; \ } \ } \ { /*BK*/ \ if (wallBitFlag & (uint32_t(1) << BKid)) { \ - popIn[BKid] = fin(i, GOid); \ + popIn[BKid] = fin.template read(gidx); \ } else { \ - popIn[BKid] = fin.template nghVal(i, BKid, 0.0).value; \ + popIn[BKid] = fin.template 
nghVal(gidx).value; \ } \ } \ } static inline NEON_CUDA_HOST_DEVICE auto - loadPopulation(Idx const& i, + loadPopulation(Idx const& gidx, const uint32_t& wallBitFlag, typename PopulationField::Partition const& fin, NEON_OUT LbmStoreType popIn[19]) diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Methods.h b/benchmarks/lbm-lid-driven-cavity-flow/src/Methods.h new file mode 100644 index 00000000..4d3bf178 --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Methods.h @@ -0,0 +1,8 @@ +#pragma once + +enum class Method +{ + push, + pull, + aa +}; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Metrics.h b/benchmarks/lbm-lid-driven-cavity-flow/src/Metrics.h index be94ab76..7e6697ef 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/Metrics.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Metrics.h @@ -23,6 +23,12 @@ void recordBackend(Neon::Backend& bk, report.recordBk(bk); } +void recordGrid(Neon::domain::interface::GridBase& g, + Report& report) +{ + report.recordGrid(g); +} + } // namespace diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Precision.h b/benchmarks/lbm-lid-driven-cavity-flow/src/Precision.h new file mode 100644 index 00000000..a45ff69e --- /dev/null +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Precision.h @@ -0,0 +1,13 @@ +#pragma once + +#include "Neon/Neon.h" +#include "Neon/set/Backend.h" +#include "Neon/set/memory/memSet.h" + +template +struct Precision +{ + using Storage = StorageFP; + using Compute = ComputeFP; +}; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Repoert.h b/benchmarks/lbm-lid-driven-cavity-flow/src/Repoert.h index 565a9108..4ca0827b 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/Repoert.h +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Repoert.h @@ -3,7 +3,7 @@ #include #include #include "Config.h" - +#include "Neon/domain/interface/GridBase.h" struct Report { Neon::Report mReport; @@ -36,4 +36,5 @@ struct Report auto save() -> void; void recordBk(Neon::Backend& backend); + void 
recordGrid(Neon::domain::interface::GridBase& g); }; diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Report.cpp b/benchmarks/lbm-lid-driven-cavity-flow/src/Report.cpp index 2e88f907..049d1735 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/Report.cpp +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Report.cpp @@ -29,6 +29,7 @@ Report::Report(const Config& c) mReport.addMember("computeType", c.computeType); mReport.addMember("storeType", c.storeType); + mReport.addMember("spaceCurve", Neon::domain::tool::spaceCurves::EncoderTypeUtil::toString(c.spaceCurve)); mReport.addMember("occ", Neon::skeleton::OccUtils::toString(c.occ)); @@ -100,3 +101,8 @@ void Report::recordBk(Neon::Backend& backend) { backend.toReport(mReport); } + +void Report::recordGrid(Neon::domain::interface::GridBase& g) +{ + g.toReport(mReport, true); +} \ No newline at end of file diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu index c603415c..146f108f 100644 --- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu +++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu @@ -2,10 +2,11 @@ #include "D3Q19.h" #include "Neon/domain/bGrid.h" #include "Neon/domain/dGrid.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/eGrid.h" #include "CellType.h" -#include "LbmIteration.h" +#include "LbmSkeleton.h" #include "Metrics.h" #include "Repoert.h" @@ -14,15 +15,28 @@ namespace CavityTwoPop { int backendWasReported = false; namespace details { -template +template auto run(Config& config, Report& report) -> void { - using Lattice = D3Q19Template; - using PopulationField = typename Grid::template Field; + using Storage = Storage_; + using Compute = Compute_; + using Precision = Precision; + using Lattice = D3Q19; + using PopulationField = typename Grid::template Field; + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template 
Field; + + using Idx = typename PopField::Idx; + using RhoField = typename Grid::template Field; + using UField = typename Grid::template Field; + + using Skeleton = LbmSkeleton; + using ContainerFactory = ContainerFactory; Neon::Backend bk = [&] { if (config.deviceType == "cpu") { @@ -38,49 +52,50 @@ auto run(Config& config, NEON_THROW(exce); }(); - if (!backendWasReported) { - metrics::recordBackend(bk, report); - backendWasReported = true; - } Neon::double_3d ulid(1., 0., 0.); - Lattice lattice(bk); - // Neon Grid and Fields initialization auto [start, clock_iter] = metrics::restartClock(bk, true); Grid grid( bk, {config.N, config.N, config.N}, [](const Neon::index_3d&) { return true; }, - lattice.c_vect); + Lattice::template getDirectionAsVector(), + 1.0, 0.0, + config.spaceCurve); - PopulationField pop0 = grid.template newField("Population", Lattice::Q, StorageFP(0.0)); - PopulationField pop1 = grid.template newField("Population", Lattice::Q, StorageFP(0.0)); + if (!backendWasReported) { + metrics::recordBackend(bk, report); + metrics::recordGrid(grid, report); + backendWasReported = true; + } - typename Grid::template Field rho; - typename Grid::template Field u; + PopulationField pop0 = grid.template newField("Population", Lattice::Q, Storage(0.0)); + PopulationField pop1 = grid.template newField("Population", Lattice::Q, Storage(0.0)); + + typename Grid::template Field rho; + typename Grid::template Field u; if (!config.benchmark) { std::cout << "Allocating rho and u" << std::endl; - rho = grid.template newField("rho", 1, StorageFP(0.0)); - u = grid.template newField("u", 3, StorageFP(0.0)); + rho = grid.template newField("rho", 1, Storage(0.0)); + u = grid.template newField("u", 3, Storage(0.0)); } CellType defaultCelltype; auto flag = grid.template newField("Material", 1, defaultCelltype); - auto lbmParameters = config.getLbmParameters(); + auto lbmParameters = config.getLbmParameters(); - LbmIterationD3Q19 - iteration(config.stencilSemantic, - 
config.occ, - config.transferMode, - pop0, - pop1, - flag, - lbmParameters.omega); + Skeleton iteration(config.stencilSemantic, + config.occ, + config.transferMode, + pop0, + pop1, + flag, + lbmParameters.omega); auto exportRhoAndU = [&bk, &rho, &u, &iteration, &flag, &grid, &ulid](int iterationId) { - if ((iterationId) % 100 == 0) { + if ((iterationId) % 1 == 0) { auto& f = iteration.getInput(); { bk.syncAll(); @@ -91,7 +106,7 @@ auto run(Config& config, bk.syncAll(); } - auto container = LbmContainers::computeRhoAndU(f, flag, rho, u); + auto container = ContainerFactory::computeRhoAndU(f, flag, rho, u); container.run(Neon::Backend::mainStreamIdx); u.updateHostData(Neon::Backend::mainStreamIdx); rho.updateHostData(Neon::Backend::mainStreamIdx); @@ -105,7 +120,8 @@ auto run(Config& config, u.ioToVtk("u_" + iterIdStr, "u", false); rho.ioToVtk("rho_" + iterIdStr, "rho", false); // iteration.getInput().ioToVtk("pop_" + iterIdStr, "u", false); - // flag.ioToVtk("flag_" + iterIdStr, "u", false); + flag.template ioToVtk("flag_" + iterIdStr, "flag", false); + flag.template ioToVtk("flag_" + iterIdStr, "flag", false); std::vector> xPosVal; std::vector> yPosVal; @@ -162,71 +178,20 @@ auto run(Config& config, Neon::index_3d dim(config.N, config.N, config.N); - const auto& t = lattice.t_vect; - const auto& c = lattice.c_vect; - - inPop.forEachActiveCell([&c, &t, &dim, &flag, &ulid, &config](const Neon::index_3d& idx, - const int& k, - StorageFP& val) { - val = t.at(k); - - if (idx.x == 0 || idx.x == dim.x - 1 || - idx.y == 0 || idx.y == dim.y - 1 || - idx.z == 0 || idx.z == dim.z - 1) { - - if (idx.y == dim.y - 1) { - val = -6. 
* t.at(k) * config.ulb * - (c.at(k).v[0] * ulid.v[0] + - c.at(k).v[1] * ulid.v[1] + - c.at(k).v[2] * ulid.v[2]); - } else { - val = 0; - } - } - }); - - outPop.forEachActiveCell([&c, &t, &dim, &flag, &ulid, &config](const Neon::index_3d& idx, - const int& k, - StorageFP& val) { - val = t.at(k); - - if (idx.x == 0 || idx.x == dim.x - 1 || - idx.y == 0 || idx.y == dim.y - 1 || - idx.z == 0 || idx.z == dim.z - 1) { - - if (idx.y == dim.y - 1) { - val = -6. * t.at(k) * config.ulb * - (c.at(k).v[0] * ulid.v[0] + - c.at(k).v[1] * ulid.v[1] + - c.at(k).v[2] * ulid.v[2]); - } else { - val = 0; - } - } - }); - - flag.forEachActiveCell([&dim](const Neon::index_3d& idx, - const int&, - CellType& flagVal) { - flagVal.classification = CellType::bulk; - flagVal.wallNghBitflag = 0; - - if (idx.x == 0 || idx.x == dim.x - 1 || - idx.y == 0 || idx.y == dim.y - 1 || - idx.z == 0 || idx.z == dim.z - 1) { + // const auto& t = Lattice::Memory::t; + // const auto& c = Lattice::Memory::stencil; - flagVal.classification = CellType::bounceBack; - - if (idx.y == dim.y - 1) { - flagVal.classification = CellType::movingWall; - } - } - }); + ContainerFactory::problemSetup(inPop, + outPop, + flag, + ulid, + config.ulb) + .run(Neon::Backend::mainStreamIdx); - inPop.updateDeviceData(Neon::Backend::mainStreamIdx); - outPop.updateDeviceData(Neon::Backend::mainStreamIdx); - flag.updateDeviceData(Neon::Backend::mainStreamIdx); + inPop.updateHostData(Neon::Backend::mainStreamIdx); + outPop.updateHostData(Neon::Backend::mainStreamIdx); + flag.updateHostData(Neon::Backend::mainStreamIdx); { bk.syncAll(); flag.newHaloUpdate(Neon::set::StencilSemantic::standard /*semantic*/, @@ -236,7 +201,7 @@ auto run(Config& config, bk.syncAll(); } - auto container = LbmContainers::computeWallNghMask(flag, flag); + auto container = ContainerFactory::computeWallNghMask(flag, flag); container.run(Neon::Backend::mainStreamIdx); bk.syncAll(); } @@ -275,15 +240,15 @@ auto run(Config& config, metrics::recordMetrics(bk, 
config, report, start, clock_iter); } -template +template auto runFilterComputeType(Config& config, Report& report) -> void { if (config.computeType == "double") { - return run(config, report); - } - if (config.computeType == "float") { - return run(config, report); + return run(config, report); } +// if (config.computeType == "float") { +// return run(config, report); +// } NEON_DEV_UNDER_CONSTRUCTION(""); } @@ -295,23 +260,96 @@ auto runFilterStoreType(Config& config, if (config.storeType == "double") { return runFilterComputeType(config, report); } - if (config.storeType == "float") { - return runFilterComputeType(config, report); - } +// if (config.storeType == "float") { +// return runFilterComputeType(config, report); +// } + NEON_DEV_UNDER_CONSTRUCTION(""); } } // namespace details +#ifdef NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS +constexpr bool skipTest = false; +#else +constexpr bool skipTest = false; +#endif + auto run(Config& config, Report& report) -> void { if (config.gridType == "dGrid") { return details::runFilterStoreType(config, report); } - if (config.gridType == "eGrid") { - return details::runFilterStoreType(config, report); - } - if (config.gridType == "bGrid") { - return details::runFilterStoreType(config, report); - } +// if (config.gridType == "eGrid") { +// if constexpr (!skipTest) { +// return details::runFilterStoreType(config, report); +// } else { +// NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") +// } +// } +// if (config.gridType == "bGrid" || config.gridType == "bGrid_8_8_8") { +// return details::runFilterStoreType(config, report); +// } +// if (config.gridType == "bGrid_4_4_4") { +// if constexpr (!skipTest) { +// using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>; +// using Grid = Neon::domain::details::bGrid::bGrid; +// return details::runFilterStoreType(config, report); +// } else { +// NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") +// } +// } +// if (config.gridType == "bGrid_2_2_2") { +// if constexpr (!skipTest) { +// using Sblock = Neon::domain::details::bGrid::StaticBlock<2, 2, 2>; +// using Grid = Neon::domain::details::bGrid::bGrid; +// return details::runFilterStoreType(config, report); +// } else { +// NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") +// } +// } + // if (config.gridType == "bGrid_32_8_4") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid_32_8_4") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid_32_2_8") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 2, 8>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid_32_8_2") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 2>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "dGridSoA") { + // if constexpr (!skipTest) { + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + NEON_THROW_UNSUPPORTED_OPERATION("Unknown grid type: " + config.gridType); } } // namespace CavityTwoPop diff --git a/benchmarks/lbm/CMakeLists.txt b/benchmarks/lbm/CMakeLists.txt new file mode 100644 index 00000000..7f0c1415 --- /dev/null +++ b/benchmarks/lbm/CMakeLists.txt @@ -0,0 +1,33 @@ +cmake_minimum_required(VERSION 3.19 FATAL_ERROR) + +SET(APP "lbm") + +file(GLOB_RECURSE SrcFiles src/*.*) + +add_executable(${APP} ${SrcFiles}) + +target_link_libraries(${APP} + PUBLIC libNeonDomain + PUBLIC libNeonSkeleton) + +set_target_properties(${APP} PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +target_compile_options(${APP} INTERFACE + $<$:${NeonCXXFlags}> + $<$:${NeonCUDAFlags}> + ) + +add_custom_command( + TARGET ${APP} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_SOURCE_DIR}/${APP}.sh + ${CMAKE_BINARY_DIR}/bin/${APP}.sh) + +add_custom_command( + TARGET ${APP} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_SOURCE_DIR}/${APP}.py + ${CMAKE_BINARY_DIR}/bin/${APP}.py +) \ No newline at end of file diff --git a/benchmarks/lbm/lbm.py b/benchmarks/lbm/lbm.py new file mode 100644 index 00000000..730dd05c --- /dev/null +++ b/benchmarks/lbm/lbm.py @@ -0,0 +1,132 @@ +deviceType_LIST = 'cpu gpu'.split() +deviceIds_LIST= "0 1 2 3 4 5 6 7".split() +grid_LIST= "dGrid bGrid_4_4_4".split() +domainSize_LIST= "64 128 192 256 320 384 448 512".split() +computeFP_LIST= "double float".split() +storageFP_LIST= "double float".split() +occ_LIST="none".split() +transferMode_LIST= "get put".split() +stencilSemantic_LIST= "grid streaming".split() +spaceCurve_LIST= "sweep morton hilbert".split() +collision_LIST = "bgk kbc".split() +streamingMethod_LIST= "push pull aa".split() +lattice_LIST= "d3q19 d3q27".split() + +warmupIter_INT = 10 +repetitions_INT = 5 +maxIter_INT = 10000 + +import subprocess +import sys + + +def printProgressBar(value, 
label): + n_bar = 40 # size of progress bar + max = 100 + j = value / max + sys.stdout.write('\r') + bar = 'â–ˆ' * int(n_bar * j) + bar = bar + '-' * int(n_bar * (1 - j)) + + sys.stdout.write(f"{label.ljust(10)} | [{bar:{n_bar}s}] {int(100 * j)}% ") + sys.stdout.flush() + + +def countAll(): + counter = 0 + for DEVICE_TYPE in deviceType_LIST: + DEVICE_SET_LIST = [deviceIds_LIST[0]] + if DEVICE_TYPE == 'gpu': + for DEVICE in deviceIds_LIST[1:]: + DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) + for DEVICE_SET in DEVICE_SET_LIST: + for OCC in occ_LIST: + for DOMAIN_SIZE in domainSize_LIST: + for STORAGE_FP in storageFP_LIST: + for COMPUTE_FP in computeFP_LIST: + for GRID in grid_LIST: + for CURVE in spaceCurve_LIST: + for LATTICE in lattice_LIST: + for TRANSFERMODE in transferMode_LIST: + for STENCILSEMANTIC in stencilSemantic_LIST: + for COLLISION in collision_LIST: + if LATTICE != "d3q27" and COLLISION == 'kbc': + continue + for STREAMINGMETHOD in streamingMethod_LIST: + if STREAMINGMETHOD != 'pull' and len(DEVICE_SET_LIST) != 1: + continue + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + if STORAGE_FP == 'float' and COMPUTE_FP == 'double': + continue + + counter += 1 + return counter + + +SAMPLES = countAll() +counter = 0 +command = './lbm' +# command = 'echo' +with open(command + '.log', 'w') as fp: + for DEVICE_TYPE in deviceType_LIST: + DEVICE_SET_LIST = [deviceIds_LIST[0]] + if DEVICE_TYPE == 'gpu': + for DEVICE in deviceIds_LIST[1:]: + DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE) + for DEVICE_SET in DEVICE_SET_LIST: + for OCC in occ_LIST: + for DOMAIN_SIZE in domainSize_LIST: + for STORAGE_FP in storageFP_LIST: + for COMPUTE_FP in computeFP_LIST: + for GRID in grid_LIST: + for CURVE in spaceCurve_LIST: + for LATTICE in lattice_LIST: + for TRANSFERMODE in transferMode_LIST: + for STENCILSEMANTIC in stencilSemantic_LIST: + for COLLISION in collision_LIST: + if LATTICE != "d3q27" and COLLISION == 'kbc': + continue + 
for STREAMINGMETHOD in streamingMethod_LIST: + if STREAMINGMETHOD != 'pull' and len(DEVICE_SET_LIST) != 1: + continue + if STORAGE_FP == 'double' and COMPUTE_FP == 'float': + continue + if STORAGE_FP == 'float' and COMPUTE_FP == 'double': + continue + + parameters = [] + parameters.append('--deviceType ' + DEVICE_TYPE) + parameters.append('--deviceIds ' + DEVICE_SET) + parameters.append('--grid ' + GRID) + parameters.append('--domain-size ' + DOMAIN_SIZE) + parameters.append('--max-iter ' + str(maxIter_INT)) + parameters.append('--report-filename ' + 'lbm') + parameters.append('--computeFP ' + COMPUTE_FP) + parameters.append('--storageFP ' + STORAGE_FP) + parameters.append('--occ ' + OCC) + parameters.append('--transferMode ' + TRANSFERMODE) + parameters.append('--stencilSemantic ' + STENCILSEMANTIC) + parameters.append('--spaceCurve ' + CURVE) + parameters.append('--collision ' + COLLISION) + parameters.append('--streamingMethod ' + STREAMINGMETHOD) + parameters.append('--lattice ' + LATTICE) + parameters.append('--benchmark ') + parameters.append('--warmup-iter ' + str(warmupIter_INT)) + parameters.append('--repetitions ' + str(repetitions_INT)) + + commandList = [] + commandList.append(command) + for el in parameters: + for s in el.split(): + commandList.append(s) + + fp.write("\n-------------------------------------------\n") + fp.write(' '.join(commandList)) + fp.write("\n-------------------------------------------\n") + fp.flush() + print(' '.join(commandList)) + subprocess.run(commandList, text=True, stdout=fp) + + counter += 1 + printProgressBar(counter * 100.0 / SAMPLES, 'Progress') diff --git a/benchmarks/lbm/lbm.sh b/benchmarks/lbm/lbm.sh new file mode 100644 index 00000000..7cc5108c --- /dev/null +++ b/benchmarks/lbm/lbm.sh @@ -0,0 +1,30 @@ +set -x + +DOMAIN_SIZE_LIST="64 128 192 256 320 384 448 512" +GRID_LIST="dGrid bGrid eGrid" +STORAGE_FP_LIST="double float" +COMPUTE_FP_LIST="double float" +OCC="nOCC" + +for DOMAIN_SIZE in ${DOMAIN_SIZE_LIST}; do + 
for STORAGE_FP in ${STORAGE_FP_LIST}; do + for COMPUTE_FP in ${COMPUTE_FP_LIST}; do + for GRID in ${GRID_LIST}; do + + if [ "${STORAGE_FP}_${COMPUTE_FP}" = "double_float" ]; then + continue + fi + + echo ./lbm-lid-driven-cavity-flow \ + --deviceType gpu --deviceIds 0 \ + --grid "${GRID}" \ + --domain-size "${DOMAIN_SIZE}" \ + --warmup-iter 10 --max-iter 100 --repetitions 5 \ + --report-filename "lbm-lid-driven-cavity-flow_${DOMAIN_SIZE}_${GRID}_STORAGE_${STORAGE_FP}_COMPUTE_${COMPUTE_FP}" \ + --computeFP "${COMPUTE_FP}" \ + --storageFP "${STORAGE_FP}" \ + --${OCC} --benchmark + done + done + done +done diff --git a/benchmarks/lbm/src/CellType.h b/benchmarks/lbm/src/CellType.h new file mode 100644 index 00000000..47c0397b --- /dev/null +++ b/benchmarks/lbm/src/CellType.h @@ -0,0 +1,56 @@ +#pragma once + +struct CellType +{ + enum Classification : int + { + bounceBack, + movingWall, + bulk, + undefined + }; + + NEON_CUDA_HOST_DEVICE CellType(int dummy = 0) + { + (void)dummy; + classification = bulk; + wallNghBitflag = 0; + } + + NEON_CUDA_HOST_DEVICE explicit CellType(Classification c, uint32_t n) + { + classification = c; + wallNghBitflag = n; + } + + NEON_CUDA_HOST_DEVICE explicit CellType(Classification c) + { + classification = c; + wallNghBitflag = 0; + } + + // Converting to int to exportVti + operator int() const { return int(classification); } + + template + NEON_CUDA_HOST_DEVICE static auto isWall(const uint32_t& wallNghBitFlag) + -> bool + { + return wallNghBitFlag & (uint32_t(1) << fwdRegQ); + } + + NEON_CUDA_HOST_DEVICE auto setWall(int fwdRegIdx) + -> void + { + wallNghBitflag = wallNghBitflag | ((uint32_t(1) << fwdRegIdx)); + } + + uint32_t wallNghBitflag; + Classification classification; +}; + +std::ostream& operator<<(std::ostream& os, const CellType& dt) +{ + os << static_cast(dt.classification); + return os; +} \ No newline at end of file diff --git a/benchmarks/lbm/src/Collision.cpp b/benchmarks/lbm/src/Collision.cpp new file mode 100644 index 
00000000..3f7510cd --- /dev/null +++ b/benchmarks/lbm/src/Collision.cpp @@ -0,0 +1,127 @@ +#include "Collision.h" + + +auto CollisionUtils::toString(Collision occ) -> std::string +{ + switch (occ) { + case Collision::bgk: { + return "bgk"; + } + case Collision::kbc: { + return "kbc"; + } + } + NEON_THROW_UNSUPPORTED_OPTION(""); +} + +auto CollisionUtils::fromString(const std::string& occ) -> Collision +{ + std::array occs{Collision::bgk, Collision::kbc}; + for (auto a : occs) { + if (toString(a) == occ) { + return a; + } + } + NEON_THROW_UNSUPPORTED_OPTION(""); +} + +auto CollisionUtils::getOptions() -> std::array +{ + std::array opts = {Collision::bgk, Collision::kbc}; + return opts; +} + +CollisionUtils::Cli::Cli() +{ + mSet = false; +} + +CollisionUtils::Cli::Cli(std::string s) +{ + set(s); +} + +CollisionUtils::Cli::Cli(Collision model) +{ + mOption = model; + mSet = true; +} + +auto CollisionUtils::Cli::getOption() const -> Collision +{ + if (!mSet) { + std::stringstream errorMsg; + errorMsg << "Collision model was not set."; + NEON_ERROR(errorMsg.str()); + } + return mOption; +} + +auto CollisionUtils::Cli::getOptionStr() const -> std::string +{ + if (!mSet) { + std::stringstream errorMsg; + errorMsg << "Collision model was not set."; + NEON_ERROR(errorMsg.str()); + } + return CollisionUtils::toString(mOption); +} + +auto CollisionUtils::Cli::set(const std::string& opt) + -> void +{ + try { + mOption = CollisionUtils::fromString(opt); + } catch (...) 
{ + std::stringstream errorMsg; + errorMsg << "Collision: " << opt << " is not a valid option (valid options are {"; + auto options = CollisionUtils::getOptions(); + int i = 0; + for (auto o : options) { + if (i != 0) { + errorMsg << ", " << CollisionUtils::toString(o); + } + errorMsg << CollisionUtils::toString(o); + i = 1; + } + errorMsg << "})"; + NEON_ERROR(errorMsg.str()); + } + mSet = true; +} + +auto CollisionUtils::Cli::getAllOptionsStr() const -> std::string +{ + std::stringstream s; + auto options = CollisionUtils::getOptions(); + int i = 0; + for (auto o : options) { + if (i != 0) { + s << ", "; + } + s << CollisionUtils::toString(o); + i = 1; + } + std::string msg = s.str(); + return msg; +} + + +auto CollisionUtils::Cli::getDoc() const -> std::string +{ + std::stringstream s; + s << getAllOptionsStr(); + s << " default: " << CollisionUtils::toString(getOption()); + return s.str(); +} + +auto CollisionUtils::Cli::addToReport(Neon::Report& report) const -> void +{ + report.addMember("Collision", CollisionUtils::toString(this->getOption())); +} + +auto CollisionUtils::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void +{ + report.addMember("Collision", CollisionUtils::toString(this->getOption()), &subBlock); +} + diff --git a/benchmarks/lbm/src/Collision.h b/benchmarks/lbm/src/Collision.h new file mode 100644 index 00000000..c022a018 --- /dev/null +++ b/benchmarks/lbm/src/Collision.h @@ -0,0 +1,43 @@ +#pragma once +#include "Neon/Report.h" +#include "Neon/set/Backend.h" +#include "Neon/set/Containter.h" + + +enum class Collision +{ + bgk, + kbc +}; + +struct CollisionUtils +{ + static constexpr int nOptions = 2; + + static auto toString(Collision occ) -> std::string; + static auto fromString(const std::string& occ) -> Collision; + static auto getOptions() -> std::array; + + struct Cli + { + explicit Cli(std::string); + explicit Cli(Collision model); + Cli(); + + auto getOption() const -> Collision; + auto getOptionStr() 
const -> std::string; + + auto set(const std::string& opt) -> void; + auto getAllOptionsStr() const -> std::string; + auto getDoc() const -> std::string; + + auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void; + auto addToReport(Neon::Report& report) const -> void; + + private: + bool mSet = false; + Collision mOption; + }; +}; + + diff --git a/benchmarks/lbm/src/Config.cpp b/benchmarks/lbm/src/Config.cpp new file mode 100644 index 00000000..ae30c720 --- /dev/null +++ b/benchmarks/lbm/src/Config.cpp @@ -0,0 +1,151 @@ +#include "Config.h" +#include +#include + +auto Config::toString() const -> std::string +{ + std::stringstream s; + const Config& c = *this; + + auto vecToSting = [](const std::vector& v) { + std::stringstream s; + bool firstTime = true; + for (auto e : v) { + if (firstTime) { + firstTime = false; + } else { + s << " "; + } + s << std::to_string(e); + } + return s.str(); + }; + + s << "\n==>[Neon Runtime Parameters]" << std::endl; + s << ".......... deviceType " << c.deviceType << std::endl; + s << ".......... numDevices " << c.devices.size() << std::endl; + s << "............. devices " << vecToSting(c.devices) << std::endl; + s << ".......... reportFile " << c.reportFile << std::endl; + s << "............ gridType " << c.gridType << std::endl; + + s << ".......... spaceCurve " << c.spaceCurveCli.getStringOption() << std::endl; + s << "................. occ " << c.occCli.getStringOption() << std::endl; + s << "........ transferMode " << c.transferModeCli.getStringOption() << std::endl; + s << "..... stencilSemantic " << c.stencilSemanticCli.getStringOption() << std::endl; + + s << "\n==>[LBM Implementation]" << std::endl; + s << "............. lattice " << c.lattice << std::endl; + s << ".... streaming method " << c.streamingMethod << std::endl; + s << "........... collision " << c.collisionCli.getOptionStr() << std::endl; + s << "......... computeType " << c.computeTypeStr << std::endl; + s << "........... 
storeType " << c.storeTypeStr << std::endl; + + s << "\n==>[Physics Parameters]" << std::endl; + s << ".................. Re " << c.Re << std::endl; + s << "................. ulb " << c.ulb << std::endl; + s << "................... N " << c.N << std::endl; + s << "................. nu " << mLbmParameters.nu << std::endl; + s << ".............. omega " << mLbmParameters.omega << std::endl; + s << "................. dx " << mLbmParameters.dx << std::endl; + s << "................. dt " << mLbmParameters.dt << std::endl; + + s << "\n==>[Test Parameters]" << std::endl; + s << "........... benchmark " << c.benchmark << std::endl; + s << "............... max_t " << c.max_t << std::endl; + s << "................. vti " << c.vti << std::endl; + s << "........ benchIniIter " << c.benchIniIter << std::endl; + s << "........ benchMaxIter " << c.benchMaxIter << std::endl; + + + return s.str(); +} + +auto Config::parseArgs(const int argc, char* argv[]) + -> int +{ + auto& config = *this; + + auto cli = + ( + + clipp::required("--deviceType") & clipp::value("deviceType", config.deviceType) % "Device type (cpu or gpu)", + clipp::required("--deviceIds") & clipp::integers("ids", config.devices) % "Device ids", + + clipp::option("--grid") & clipp::value("grid", config.gridType) % Config::getOptionList(config.gridTypeOptions, config.gridType), + clipp::option("--domain-size") & clipp::integer("domain_size", config.N) % "Voxels along each dimension of the cube domain", + clipp::option("--max-iter") & clipp::integer("max_iter", config.benchMaxIter) % "Maximum solver iterations", + clipp::option("--report-filename ") & clipp::value("keeper_filename", config.reportFile) % "Output perf keeper filename", + + clipp::option("--computeFP") & clipp::value("computeFP", config.computeTypeStr) % Config::getOptionList(config.gridTypeOptions, config.gridType), + clipp::option("--storageFP") & clipp::value("storageFP", config.storeTypeStr) % "double, float", + + clipp::option("--occ") & 
clipp::value("occ")([&config](const std::string& s) { config.occCli.set(s); }) % config.occCli.getDoc(), + clipp::option("--transferMode") & clipp::value("transferMode")([&config](const std::string& s) { config.transferModeCli.set(s); }) % config.transferModeCli.getDoc(), + clipp::option("--stencilSemantic") & clipp::value("stencilSemantic")([&config](const std::string& s) { config.stencilSemanticCli.set(s); }) % config.stencilSemanticCli.getDoc(), + clipp::option("--spaceCurve") & clipp::value("spaceCurve")([&config](const std::string& s) { config.spaceCurveCli.set(s); }) % config.spaceCurveCli.getDoc(), + clipp::option("--collision") & clipp::value("collision")([&config](const std::string& s) { config.collisionCli.set(s); }) % config.collisionCli.getDoc(), + + clipp::option("--streamingMethod") & clipp::value("streamingMethod", config.streamingMethod) % Config::getOptionList(config.streamingMethodOption, config.streamingMethod), + clipp::option("--lattice") & clipp::value("lattice", config.lattice) % Config::getOptionList(config.latticeOptions, config.lattice), + ( + ( + clipp::option("--benchmark").set(config.benchmark, true) % "Run benchmark mode", + clipp::option("--warmup-iter") & clipp::integer("warmup_iter", config.benchIniIter) % "Number of iteration for warm up. max_iter = warmup_iter + timed_iters", + clipp::option("--repetitions") & clipp::integer("repetitions", config.repetitions) % "Number of times the benchmark is run." 
+ + ) | + (clipp::option("--vti") & clipp::integer("OutputFrequency", config.vti) % "Voxels along each dimension of the cube domain")) + + ); + + + if (!clipp::parse(argc, argv, cli)) { + auto fmt = clipp::doc_formatting{}.doc_column(31); + std::cout << make_man_page(cli, argv[0], fmt) << '\n'; + std::cout << '\n'; + std::cout << '\n'; + std::cout << "Export example" << '\n'; + std::cout << "./lbm --deviceType cpu --deviceIds 0 --grid dGrid --domain-size 100 --max-iter 2000 --nOCC --huGrid --vti 1" << '\n'; + std::cout << "Benchmark example " << '\n'; + std::cout << "./lbm --deviceType gpu --deviceIds 0 1 2 3 4 --grid dGrid --domain-size 100 --max-iter 2000 --computeFP double --storageFP double --nOCC --huGrid --benchmark --warmup-iter 10 --repetitions 5" << '\n'; + + std::cout <<" ./lbm --deviceType gpu\\\n" + " --deviceIds 0\\\n" + " --grid dGrid\\\n" + " --domain-size 100\\\n" + " --max-iter 1000\\\n" + " --computeFP float\\\n" + " --storageFP float\\\n" + " --occ none\\\n" + " --transferMode put\\\n" + " --stencilSemantic grid\\\n" + " --spaceCurve sweep\\\n" + " --collision bgk\\\n" + " --streamingMethod pull\\\n" + " --lattice d3q19\\\n" + " --vti 10"; + + return -1; + } + + helpSetLbmParameters(); + + std::stringstream s; + for (int i = 0; i < argc; i++) { + s << argv[i]; + if (i + 1 != argc) { + s << " "; + } + } + mArgv = s.str(); + + return 0; +} + +auto Config::helpSetLbmParameters() -> void +{ + mLbmParameters.nu = ulb * static_cast(N - 2) / Re; + mLbmParameters.omega = 1. / (3. * mLbmParameters.nu + 0.5); + mLbmParameters.dx = 1. 
/ static_cast(N - 2); + mLbmParameters.dt = mLbmParameters.dx * ulb; +} diff --git a/benchmarks/lbm/src/Config.h b/benchmarks/lbm/src/Config.h new file mode 100644 index 00000000..b5a1607a --- /dev/null +++ b/benchmarks/lbm/src/Config.h @@ -0,0 +1,105 @@ +#pragma once + +#include +#include +#include "Collision.h" +#include "Neon/core/tools/clipp.h" +#include "Neon/domain/tools/SpaceCurves.h" +#include "Neon/skeleton/Skeleton.h" + +template +struct LbmParameters +{ + ComputeType nu = 0; + ComputeType omega = 0; + ComputeType dx = 0; + ComputeType dt = 0; +}; + +struct Config +{ + double Re = 100.; // Reynolds number + double ulb = 0.04; // Velocity in lattice units + int N = 160; // Number of nodes in x-direction + bool benchmark = false; // Run in benchmark mode ? + double max_t = 10.0; // Non-benchmark mode: Total time in dim.less units + int benchIniIter = 1000; // Benchmark mode: Number of warmup iterations + int benchMaxIter = 2000; // Benchmark mode: Total number of iterations + int repetitions = 1; // Benchmark mode: number of time the test is run + + std::string deviceType = "gpu"; + std::vector devices = std::vector(0); // Devices for the execution + std::string reportFile = "lbm-lid-driven-cavity-flow"; // Report file name + + std::vector gridTypeOptions = {"dGrid", "eGrid", "bGrid"}; + std::string gridType = gridTypeOptions[0]; // Neon grid type + + Neon::skeleton::OccUtils::Cli occCli{Neon::skeleton::Occ::none}; // Neon OCC type + Neon::set::TransferModeUtils::Cli transferModeCli{Neon::set::TransferMode::get}; // Neon transfer mode for halo update + Neon::set::StencilSemanticUtils::Cli stencilSemanticCli{Neon::set::StencilSemantic::lattice}; + Neon::domain::tool::spaceCurves::EncoderTypeUtil::Cli spaceCurveCli{Neon::domain::tool::spaceCurves::EncoderType::sweep}; + CollisionUtils::Cli collisionCli{Collision::bgk}; + int vti = 0; // Export vti file + + std::vector computeTypeOptions = {"double", "float"}; + std::string computeTypeStr = 
computeTypeOptions[0]; + + std::vector storeTypeOptions = {"double", "float"}; + std::string storeTypeStr = storeTypeOptions[0]; + + + std::vector latticeOptions = {"d3q19", "d3q27"}; + std::string lattice = latticeOptions[0]; + + std::vector streamingMethodOption = {"push", "pull"}; + std::string streamingMethod = "push"; + + LbmParameters mLbmParameters; + + std::string mArgv; + + auto getOptionList(std::vector list, std::string defaultVal) -> std::string + { + std::stringstream s; + for (int i = 0; i < int(list.size()); i++) { + s << list[i]; + if (list[i] == defaultVal) { + s << " (default) "; + } + } + return s.str(); + } + + auto check(std::vector list, std::string userValue) -> bool + { + for (int i = 0; i < int(list.size()); i++) { + if (list[i] == userValue) { + return true; + } + } + return false; + } + + auto toString() + const -> std::string; + + auto parseArgs(int argc, char* argv[]) + -> int; + + template + auto getLbmParameters() + -> LbmParameters + { + LbmParameters output; + output.nu = static_cast(mLbmParameters.nu); + output.omega = static_cast(mLbmParameters.omega); + output.dx = static_cast(mLbmParameters.dx); + output.dt = static_cast(mLbmParameters.dt); + + return output; + } + + private: + auto helpSetLbmParameters() + -> void; +}; diff --git a/benchmarks/lbm/src/ContainersD3QXX.h b/benchmarks/lbm/src/ContainersD3QXX.h new file mode 100644 index 00000000..bb4adb0d --- /dev/null +++ b/benchmarks/lbm/src/ContainersD3QXX.h @@ -0,0 +1,616 @@ +#pragma once + +#include "./Methods.h" +#include "CellType.h" +#include "D3Q19.h" +#include "DeviceD3QXX.h" +#include "Methods.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" + +/** + * Specialization for D3Q19 + */ +template +struct ContainerFactoryD3QXX +{ + using Lattice = Lattice_; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using 
CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + // using PullFunctions = pull::DeviceD3Q19; + // using CommonFunctions = common::DeviceD3Q19; + using Device = DeviceD3QXX; + + struct AA + { + struct Even + { + // collide + + static auto + iteration(const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + NEON_IO PopField& fpopField /*! Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fpopField.getGrid().newContainer( + "D3Q19_TwoPop_Pull", + [&, omega](Neon::set::Loader& L) -> auto { + auto& popMem = L.load(fpopField); + const auto& cellInfoPartition = L.load(cellTypeField); + [[maybe_unused]] const Compute beta = omega * 0.5; + [[maybe_unused]] const Compute invBeta = 1.0 / beta; + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + [[maybe_unused]] const Compute capturedOmega = omega; + [[maybe_unused]] const Compute capturedInvBeta = invBeta; + + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popRegisters[Lattice::Q]; + Device::Common::localLoad(gidx, popMem, NEON_OUT popRegisters); + + Compute rho; + std::array u{.0, .0, .0}; + Device::Common::macroscopic(popRegisters, NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + if constexpr (CollisionId == Collision::bgk) { + Device::Common::collideBgkUnrolled(rho, u, + usqr, capturedOmega, + NEON_IO popRegisters); + } + if constexpr (CollisionId == Collision::kbc) { + Device::Common::collideKBCUnrolled(rho, u, + usqr, capturedOmega, + capturedInvBeta, + NEON_IO popRegisters); + } + Device::Common::localStoreOpposite(gidx, popRegisters, popMem); + } + }; + }); + return container; + } + + static auto + computeRhoAndU([[maybe_unused]] const PopField& 
fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! output Population field */) + + -> Neon::set::Container + { + return Push::computeRhoAndU(fInField, cellTypeField, rhoField, uField); + } + }; + struct Odd + { + // pullStream - collide - pushStream + + static auto + iteration(const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + NEON_IO PopField& fpopField /*! Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fpopField.getGrid().newContainer( + "D3Q19_TwoPop_Pull", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fpop = L.load(fpopField); + const auto& cellInfoPartition = L.load(cellTypeField); + [[maybe_unused]] const Compute beta = omega * 0.5; + [[maybe_unused]] const Compute invBeta = 1.0 / beta; + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + [[maybe_unused]] const Compute capturedOmega = omega; + [[maybe_unused]] const Compute capturedInvBeta = invBeta; + + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popRegisters[Lattice::Q]; + Device::AA::pullStream(gidx, cellInfo.wallNghBitflag, fpop, NEON_OUT popRegisters); + + Compute rho; + std::array u{.0, .0, .0}; + Device::Common::macroscopic(popRegisters, + NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + + if constexpr (CollisionId == Collision::bgk) { + Device::Common::collideBgkUnrolled(rho, u, + usqr, capturedOmega, + NEON_IO popRegisters); + } + if constexpr (CollisionId == Collision::kbc) { + Device::Common::collideKBCUnrolled(rho, u, + usqr, capturedOmega, + capturedInvBeta, + NEON_IO popRegisters); + } + Device::Push::pushStream(gidx, cellInfo.wallNghBitflag, popRegisters, NEON_OUT fpop); + } + }; + }); + return container; + } + + static 
auto + computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! output Population field */) + + -> Neon::set::Container + { + return Pull::computeRhoAndU(fInField, cellTypeField, rhoField, uField); + } + }; + }; + + struct Pull + { + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "D3Q19_TwoPop_Pull", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + [[maybe_unused]] const Compute beta = omega * 0.5; + [[maybe_unused]] const Compute invBeta = 1.0 / beta; + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + [[maybe_unused]] const Compute capturedOmega = omega; + [[maybe_unused]] const Compute capturedInvBeta = invBeta; + + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popRegisters[Lattice::Q]; + Device::Pull::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popRegisters); + + Compute rho; + std::array u{.0, .0, .0}; + Device::Common::macroscopic(popRegisters, NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + if constexpr (CollisionId == Collision::bgk) { + Device::Common::collideBgkUnrolled(rho, u, + usqr, capturedOmega, + NEON_IO popRegisters); + } + if constexpr (CollisionId == Collision::kbc) { + Device::Common::collideKBCUnrolled(rho, u, + usqr, 
capturedOmega, + capturedInvBeta, + NEON_IO popRegisters); + } + Device::Common::localStore(gidx, popRegisters, fOut); + } + }; + }); + return container; + } + + static auto + localCollide(const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "D3Q19_TwoPop_Pull", + [&, omega](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField); + auto& fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + [[maybe_unused]] const Compute beta = omega * 0.5; + [[maybe_unused]] const Compute invBeta = 1.0 / beta; + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + [[maybe_unused]] const Compute capturedOmega = omega; + [[maybe_unused]] const Compute capturedInvBeta = invBeta; + + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popRegisters[Lattice::Q]; + Device::Common::localLoad(gidx, fIn, NEON_OUT popRegisters); + + Compute rho; + std::array u{.0, .0, .0}; + Device::Common::macroscopic(popRegisters, NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + if constexpr (CollisionId == Collision::bgk) { + Device::Common::collideBgkUnrolled(rho, u, + usqr, capturedOmega, + NEON_IO popRegisters); + } + if constexpr (CollisionId == Collision::kbc) { + Device::Common::collideKBCUnrolled(rho, u, + usqr, capturedOmega, + capturedInvBeta, + NEON_IO popRegisters); + } + Device::Common::localStore(gidx, popRegisters, fOut); + } + }; + }); + return container; + } + + static auto + computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! 
output Population field */, + U& uField /*! output Population field */) + + -> Neon::set::Container + { + + Neon::set::Container container = + fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + Compute rho = 0; + std::array u{.0, .0, .0}; + + Storage popRegisters[Lattice::Q]; + + if (cellInfo.classification == CellType::bulk) { + Storage popRegisters[Lattice::Q]; + Device::Pull::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popRegisters); + Device::Common::macroscopic(popRegisters, NEON_OUT rho, NEON_OUT u); + } else { + Device::Common::localLoad(gidx, fIn, NEON_OUT popRegisters); + if (cellInfo.classification == CellType::movingWall) { + rho = 1.0; + u = std::array{static_cast(popRegisters[0]) / static_cast(6. * 1. / 18.), + static_cast(popRegisters[1]) / static_cast(6. * 1. / 18.), + static_cast(popRegisters[2]) / static_cast(6. * 1. / 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } + }; + struct Push + { + static auto + iteration(Neon::set::StencilSemantic stencilSemantic, + const PopField& fInField /*! Input population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + const Compute omega /*! LBM omega parameter */, + PopField& fOutField /*! 
Output Population field */) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM-iteration", + [=](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL, stencilSemantic); + auto fOut = L.load(fOutField); + const auto& cellInfoPartition = L.load(cellTypeField); + + [[maybe_unused]] const Compute beta = omega * 0.5; + [[maybe_unused]] const Compute invBeta = 1.0 / beta; + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + [[maybe_unused]] const Compute capturedOmega = omega; + [[maybe_unused]] const Compute capturedInvBeta = invBeta; + + CellType cellInfo = cellInfoPartition(gidx, 0); + if (cellInfo.classification == CellType::bulk) { + + Storage popRegisters[Lattice::Q]; + Device::Common::localLoad(gidx, fIn, NEON_OUT popRegisters); + + Compute rho; + std::array u{.0, .0, .0}; + Device::Common::macroscopic(popRegisters, + NEON_OUT rho, NEON_OUT u); + + Compute usqr = 1.5 * (u[0] * u[0] + + u[1] * u[1] + + u[2] * u[2]); + + + if constexpr (CollisionId == Collision::bgk) { + Device::Common::collideBgkUnrolled(rho, u, + usqr, capturedOmega, + NEON_IO popRegisters); + } + if constexpr (CollisionId == Collision::kbc) { + Device::Common::collideKBCUnrolled(rho, u, + usqr, capturedOmega, + capturedInvBeta, + NEON_IO popRegisters); + } + Device::Push::pushStream(gidx, cellInfo.wallNghBitflag, popRegisters, NEON_OUT fOut); + } + }; + }); + return container; + } + + static auto + computeRhoAndU([[maybe_unused]] const PopField& fInField /*! inpout population field */, + const CellTypeField& cellTypeField /*! Cell type field */, + Rho& rhoField /*! output Population field */, + U& uField /*! 
output Population field */) + + -> Neon::set::Container + { + + Neon::set::Container container = + fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, + Neon::Pattern::STENCIL); + auto& rhoXpu = L.load(rhoField); + auto& uXpu = L.load(uField); + + const auto& cellInfoPartition = L.load(cellTypeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellInfo = cellInfoPartition(gidx, 0); + Compute rho = 0; + std::array u{.0, .0, .0}; + + Storage popRegisters[Lattice::Q]; + Device::Common::localLoad(gidx, fIn, NEON_OUT popRegisters); + + if (cellInfo.classification == CellType::bulk) { + Device::Common::macroscopic(popRegisters, NEON_OUT rho, NEON_OUT u); + } else { + if (cellInfo.classification == CellType::movingWall) { + rho = 1.0; + u = std::array{static_cast(popRegisters[0]) / static_cast(6. * 1. / 18.), + static_cast(popRegisters[1]) / static_cast(6. * 1. / 18.), + static_cast(popRegisters[2]) / static_cast(6. * 1. 
/ 18.)}; + } + } + + rhoXpu(gidx, 0) = static_cast(rho); + uXpu(gidx, 0) = static_cast(u[0]); + uXpu(gidx, 1) = static_cast(u[1]); + uXpu(gidx, 2) = static_cast(u[2]); + }; + }); + return container; + } + }; + struct Common + { + + + + static auto + computeWallNghMask(const CellTypeField& infoInField, + CellTypeField& infoOutpeField) + + -> Neon::set::Container + { + Neon::set::Container container = infoInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto& infoIn = L.load(infoInField, Neon::Pattern::STENCIL); + auto& infoOut = L.load(infoOutpeField); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + CellType cellType = infoIn(gidx, 0); + cellType.wallNghBitflag = 0; + + if (cellType.classification == CellType::bulk) { + Neon::ConstexprFor<0, Lattice::Q, 1>([&, gidx](auto fwdRegIdx) { + using M = typename Lattice::template RegisterMapper; + if constexpr (M::centerMemQ != M::fwdMemQ) { + CellType nghCellType = infoIn.template getNghData(gidx, 0, CellType::undefined)(); + if (nghCellType.classification == CellType::bounceBack || + nghCellType.classification == CellType::movingWall) { + cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << M::fwdMemQ)); + } + } + }); + infoOut(gidx, 0) = cellType; + } + }; + }); + return container; + } + + + template + static auto + userSettingBc(UserLambda userLambda, + PopField& pField, + CellTypeField& cellTypeField /*! 
Cell type field */) + -> Neon::set::Container + { + Neon::set::Container container = pField.getGrid().newContainer( + "UserSettingBc", + [&](Neon::set::Loader& L) -> auto { + auto& p = L.load(pField, Neon::Pattern::MAP); + auto& flag = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + const auto globalIdx = p.getGlobalIndex(gidx); + Storage pValues[Lattice::Q]; + CellType::Classification cellClass; + userLambda(globalIdx, pValues, cellClass); + + CellType flagVal(cellClass); + flag(gidx, 0) = flagVal; + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + p(gidx, M::fwdMemQ) = pValues[M::fwdRegQ]; + }); + }; + }); + return container; + } + + static auto + copyPopulation(PopField& fInField, + PopField& foutField) + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&](Neon::set::Loader& L) -> auto { + auto const& pIn = L.load(fInField, Neon::Pattern::MAP); + auto& pOut = L.load(foutField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + pOut(gidx, q) = pIn(gidx, q); + }); + }; + }); + return container; + } + + + static auto + problemSetup(PopField& fInField /*! 
inpout population field */, + PopField& fOutField, + CellTypeField& cellTypeField, + Neon::double_3d ulid, + double ulb) + + -> Neon::set::Container + { + Neon::set::Container container = fInField.getGrid().newContainer( + "LBM_iteration", + [&, ulid, ulb](Neon::set::Loader& L) -> auto { + auto& fIn = L.load(fInField, Neon::Pattern::MAP); + auto& fOut = L.load(fOutField, Neon::Pattern::MAP); + auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + const auto globalIdx = fIn.getGlobalIndex(gidx); + const auto domainDim = fIn.getDomainSize(); + + CellType flagVal; + flagVal.classification = CellType::bulk; + flagVal.wallNghBitflag = 0; + + typename Lattice::Precision::Storage popVal = 0; + + if (globalIdx.x == 0 || globalIdx.x == domainDim.x - 1 || + globalIdx.y == 0 || globalIdx.y == domainDim.y - 1 || + globalIdx.z == 0 || globalIdx.z == domainDim.z - 1) { + flagVal.classification = CellType::bounceBack; + + if (globalIdx.y == domainDim.y - 1) { + flagVal.classification = CellType::movingWall; + } + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + if (globalIdx.y == domainDim.y - 1) { + popVal = -6. 
* Lattice::Memory::template getT() * ulb * + (Lattice::Memory::template getDirection().v[0] * ulid.v[0] + + Lattice::Memory::template getDirection().v[1] * ulid.v[1] + + Lattice::Memory::template getDirection().v[2] * ulid.v[2]); + } else { + popVal = 0; + } + fIn(gidx, q) = popVal; + fOut(gidx, q) = popVal; + }); + } else { + flagVal.classification = CellType::bulk; + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + fIn(gidx, q) = Lattice::Memory::template getT(); + fOut(gidx, q) = Lattice::Memory::template getT(); + }); + } + cellInfoPartition(gidx, 0) = flagVal; + }; + }); + return container; + } + + static auto + setToEquilibrium(PopField& fOutField, + CellTypeField& cellTypeField) + -> Neon::set::Container + { + Neon::set::Container container = fOutField.getGrid().newContainer( + "LBM_setToEquilibrium", + [&](Neon::set::Loader& L) -> auto { + auto& fOut = L.load(fOutField, Neon::Pattern::MAP); + auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable { + { // All pints are pre-set to bulk + CellType flagVal; + flagVal.classification = CellType::bulk; + cellInfoPartition(gidx, 0) = flagVal; + } + + { // All cells are pre-set to Equilibrium + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + fOut(gidx, M::fwdMemQ) = Lattice::Registers::template getT(); + }); + } + }; + }); + return container; + } + }; +}; \ No newline at end of file diff --git a/benchmarks/lbm/src/D3Q19.h b/benchmarks/lbm/src/D3Q19.h new file mode 100644 index 00000000..11a3408a --- /dev/null +++ b/benchmarks/lbm/src/D3Q19.h @@ -0,0 +1,385 @@ +#pragma once + +#include "Neon/Neon.h" +#include "Neon/set/Backend.h" +#include "Neon/set/memory/memSet.h" +#include "Precision.h" + + +/** In each lattice we define two indexing schema + * - the first one works at the register level. 
For this one we relay on the same sequence of directions defined by the STLBM code. + * - the second one works at the memory (RAM) level and it defines how lattice direction are stored in Neon fields. + * + * We keep this two aspect separate to experiment on different order of directions in memory as the order will impact the number of halo update transitions. + * + */ +template +struct D3Q19 +{ + public: + D3Q19() = delete; + + static constexpr int Q = 19; /** number of directions */ + static constexpr int D = 3; /** Space dimension */ + using Precision = Precision_; + using Self = D3Q19; + + static constexpr int RegisterMapping = 1; + static constexpr int MemoryMapping = 2; + + struct Registers + { + + using Self = D3Q19::Registers; + + static constexpr int center = 9; /** Position of direction {0,0,0} */ + + template + static constexpr auto getVelocityComponent() -> int + { + static_assert(myQ < Q); + static_assert(myXYZ < 3); + +#define ADD_COMPONENT(QQ, XXX, YYY, ZZZ) \ + if constexpr ((myQ) == (QQ)) { \ + if constexpr ((myXYZ) == 0) { \ + return XXX; \ + } \ + if constexpr ((myXYZ) == 1) { \ + return YYY; \ + } \ + if constexpr ((myXYZ) == 2) { \ + return ZZZ; \ + } \ + } + + ADD_COMPONENT(0, -1, 0, 0) + ADD_COMPONENT(1, 0, -1, 0) + ADD_COMPONENT(2, 0, 0, -1) + ADD_COMPONENT(3, -1, -1, 0) + ADD_COMPONENT(4, -1, 1, 0) + ADD_COMPONENT(5, -1, 0, -1) + ADD_COMPONENT(6, -1, 0, 1) + ADD_COMPONENT(7, 0, -1, -1) + ADD_COMPONENT(8, 0, -1, 1) + ADD_COMPONENT(9, 0, 0, 0) + ADD_COMPONENT(10, 1, 0, 0) + ADD_COMPONENT(11, 0, 1, 0) + ADD_COMPONENT(12, 0, 0, 1) + ADD_COMPONENT(13, 1, 1, 0) + ADD_COMPONENT(14, 1, -1, 0) + ADD_COMPONENT(15, 1, 0, 1) + ADD_COMPONENT(16, 1, 0, -1) + ADD_COMPONENT(17, 0, 1, 1) + ADD_COMPONENT(18, 0, 1, -1) + +#undef ADD_COMPONENT + } + + template + static constexpr auto getOpposite() -> int + { + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + ADD_COMPONENT(0, 10) + 
ADD_COMPONENT(1, 11) + ADD_COMPONENT(2, 12) + ADD_COMPONENT(3, 13) + ADD_COMPONENT(4, 14) + ADD_COMPONENT(5, 15) + ADD_COMPONENT(6, 16) + ADD_COMPONENT(7, 17) + ADD_COMPONENT(8, 18) + ADD_COMPONENT(9, 9) + ADD_COMPONENT(10, 0) + ADD_COMPONENT(11, 1) + ADD_COMPONENT(12, 2) + ADD_COMPONENT(13, 3) + ADD_COMPONENT(14, 4) + ADD_COMPONENT(15, 5) + ADD_COMPONENT(16, 6) + ADD_COMPONENT(17, 7) + ADD_COMPONENT(18, 8) +#undef ADD_COMPONENT + } + + template + static constexpr auto getT() -> typename Precision::Storage + { + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + + ADD_COMPONENT(0, 1. / 18.) + ADD_COMPONENT(1, 1. / 18.) + ADD_COMPONENT(2, 1. / 18.) + ADD_COMPONENT(3, 1. / 36.) + ADD_COMPONENT(4, 1. / 36.) + ADD_COMPONENT(5, 1. / 36.) + ADD_COMPONENT(6, 1. / 36.) + ADD_COMPONENT(7, 1. / 36.) + ADD_COMPONENT(8, 1. / 36.) + ADD_COMPONENT(9, 1. / 3.) + ADD_COMPONENT(10, 1. / 18.) + ADD_COMPONENT(11, 1. / 18.) + ADD_COMPONENT(12, 1. / 18.) + ADD_COMPONENT(13, 1. / 36.) + ADD_COMPONENT(14, 1. / 36.) + ADD_COMPONENT(15, 1. / 36.) + ADD_COMPONENT(16, 1. / 36.) + ADD_COMPONENT(17, 1. / 36.) + ADD_COMPONENT(18, 1. / 36.) + +#undef ADD_COMPONENT + } + + + template + static constexpr auto getVelocity() -> const typename Neon::index_3d + { + return Neon::index_3d(getVelocityComponent, + getVelocityComponent, + getVelocityComponent); + } + + // Identifying first half of the directions + // For each direction in the list, the opposite is not present. 
+ // Center is also removed + static constexpr int firstHalfQLen = (Q - 1) / 2; + static constexpr std::array firstHalfQList{0, 1, 2, 3, 4, 5, 6, 7, 8}; + + template + static inline NEON_CUDA_HOST_DEVICE auto + getCk_u(std::array const& u) + -> Compute + { + if constexpr (tegIdx == 0 || tegIdx == 10) { + return -u[0]; + } + if constexpr (tegIdx == 1 || tegIdx == 11) { + return -u[1]; + } + if constexpr (tegIdx == 2 || tegIdx == 12) { + return -u[2]; + } + if constexpr (tegIdx == 3 || tegIdx == 13) { + return -u[0] - u[1]; + } + if constexpr (tegIdx == 4 || tegIdx == 14) { + return -u[0] + u[1]; + } + if constexpr (tegIdx == 5 || tegIdx == 15) { + return -u[0] - u[2]; + } + if constexpr (tegIdx == 6 || tegIdx == 16) { + + return -u[0] + u[2]; + } + if constexpr (tegIdx == 7 || tegIdx == 17) { + + return -u[1] - u[2]; + } + if constexpr (tegIdx == 8 || tegIdx == 18) { + return -u[1] + u[2]; + } + } + }; + + struct Memory + { + using Self = D3Q19::Memory; + + template + static constexpr auto getVelocityComponent() -> int + { + static_assert(myQ < Q); + static_assert(myXYZ < 3); + +#define ADD_COMPONENT(QQ, XXX, YYY, ZZZ) \ + if constexpr ((myQ) == (QQ)) { \ + if constexpr ((myXYZ) == 0) { \ + return XXX; \ + } \ + if constexpr ((myXYZ) == 1) { \ + return YYY; \ + } \ + if constexpr ((myXYZ) == 2) { \ + return ZZZ; \ + } \ + } + ADD_COMPONENT(0, -1, 0, 0) + ADD_COMPONENT(1, 0, -1, 0) + ADD_COMPONENT(2, 0, 0, -1) + ADD_COMPONENT(3, -1, -1, 0) + ADD_COMPONENT(4, -1, 1, 0) + ADD_COMPONENT(5, -1, 0, -1) + ADD_COMPONENT(6, -1, 0, 1) + ADD_COMPONENT(7, 0, -1, -1) + ADD_COMPONENT(8, 0, -1, 1) + ADD_COMPONENT(9, 0, 0, 0) + ADD_COMPONENT(10, 1, 0, 0) + ADD_COMPONENT(11, 0, 1, 0) + ADD_COMPONENT(12, 0, 0, 1) + ADD_COMPONENT(13, 1, 1, 0) + ADD_COMPONENT(14, 1, -1, 0) + ADD_COMPONENT(15, 1, 0, 1) + ADD_COMPONENT(16, 1, 0, -1) + ADD_COMPONENT(17, 0, 1, 1) + ADD_COMPONENT(18, 0, 1, -1) + +#undef ADD_COMPONENT + } + + + static constexpr int center = 9; /** Position of direction 
{0,0,0} */ + + template + static constexpr auto mapToRegisters() + -> int + { + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + ADD_COMPONENT(0, 0) + ADD_COMPONENT(1, 1) + ADD_COMPONENT(2, 2) + ADD_COMPONENT(3, 3) + ADD_COMPONENT(4, 4) + ADD_COMPONENT(5, 5) + ADD_COMPONENT(6, 6) + ADD_COMPONENT(7, 7) + ADD_COMPONENT(8, 8) + ADD_COMPONENT(9, 9) + ADD_COMPONENT(10, 10) + ADD_COMPONENT(11, 11) + ADD_COMPONENT(12, 12) + ADD_COMPONENT(13, 13) + ADD_COMPONENT(14, 14) + ADD_COMPONENT(15, 15) + ADD_COMPONENT(16, 16) + ADD_COMPONENT(17, 17) + ADD_COMPONENT(18, 18) +#undef ADD_COMPONENT + } + + template + static constexpr auto mapToMemory() + -> int + { + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + ADD_COMPONENT(0, 0) + ADD_COMPONENT(1, 1) + ADD_COMPONENT(2, 2) + ADD_COMPONENT(3, 3) + ADD_COMPONENT(4, 4) + ADD_COMPONENT(5, 5) + ADD_COMPONENT(6, 6) + ADD_COMPONENT(7, 7) + ADD_COMPONENT(8, 8) + ADD_COMPONENT(9, 9) + ADD_COMPONENT(10, 10) + ADD_COMPONENT(11, 11) + ADD_COMPONENT(12, 12) + ADD_COMPONENT(13, 13) + ADD_COMPONENT(14, 14) + ADD_COMPONENT(15, 15) + ADD_COMPONENT(16, 16) + ADD_COMPONENT(17, 17) + ADD_COMPONENT(18, 18) +#undef ADD_COMPONENT + } + + template + static constexpr auto getOpposite() -> int + { + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + ADD_COMPONENT(0, 10) + ADD_COMPONENT(1, 11) + ADD_COMPONENT(2, 12) + ADD_COMPONENT(3, 13) + ADD_COMPONENT(4, 14) + ADD_COMPONENT(5, 15) + ADD_COMPONENT(6, 16) + ADD_COMPONENT(7, 17) + ADD_COMPONENT(8, 18) + ADD_COMPONENT(9, 9) + ADD_COMPONENT(10, 0) + ADD_COMPONENT(11, 1) + ADD_COMPONENT(12, 2) + ADD_COMPONENT(13, 3) + ADD_COMPONENT(14, 4) + ADD_COMPONENT(15, 5) + ADD_COMPONENT(16, 6) + ADD_COMPONENT(17, 7) + ADD_COMPONENT(18, 8) +#undef ADD_COMPONENT + } + }; + + + template + struct RegisterMapper + { + 
constexpr static int fwdRegQ = fwdRegIdx_; + constexpr static int bkwRegQ = Registers::template getOpposite(); + constexpr static int fwdMemQ = Memory::template mapToMemory(); + constexpr static int bkwMemQ = Memory::template mapToMemory(); + constexpr static int centerRegQ = Registers::center; + constexpr static int centerMemQ = Memory::center; + + constexpr static int fwdMemQX = Memory::template getVelocityComponent(); + constexpr static int fwdMemQY = Memory::template getVelocityComponent(); + constexpr static int fwdMemQZ = Memory::template getVelocityComponent(); + + constexpr static int bkwMemQX = Memory::template getVelocityComponent(); + constexpr static int bkwMemQY = Memory::template getVelocityComponent(); + constexpr static int bkwMemQZ = Memory::template getVelocityComponent(); + }; + + + public: + /** Returns the Q lattice directions as a vector of Neon::index_3d, in the order of the requested mapping (RegisterMapping or MemoryMapping). + * Any other mappingType value matches neither branch and yields an empty vector. */ + template + static auto getDirectionAsVector() + -> std::vector + { + std::vector vec; + /* Exactly Q elements are pushed below: allocate once instead of growing repeatedly. */ + vec.reserve(Q); + if constexpr (mappingType == RegisterMapping) { + Neon::ConstexprFor<0, Q, 1>( + [&vec](auto q) { + Neon::index_3d val(Registers::template getVelocityComponent(), + Registers::template getVelocityComponent(), + Registers::template getVelocityComponent()); + vec.push_back(val); + }); + } else if constexpr (mappingType == MemoryMapping) { + Neon::ConstexprFor<0, Q, 1>( + [&vec](auto q) { + Neon::index_3d val(Memory::template getVelocityComponent(), + Memory::template getVelocityComponent(), + Memory::template getVelocityComponent()); + vec.push_back(val); + }); + } + return vec; + } +}; \ No newline at end of file diff --git a/benchmarks/lbm/src/D3Q27.h b/benchmarks/lbm/src/D3Q27.h new file mode 100644 index 00000000..535dd2de --- /dev/null +++ b/benchmarks/lbm/src/D3Q27.h @@ -0,0 +1,476 @@ +#pragma once + +#include "Neon/Neon.h" +#include "Neon/set/Backend.h" +#include "Neon/set/memory/memSet.h" +#include "Precision.h" + + +/** In each lattice we define two indexing schema + * - the first one works at the register level. 
For this one we rely on the same sequence of directions defined by the STLBM code. + * - the second one works at the memory (RAM) level and it defines how lattice directions are stored in Neon fields. + * + * We keep these two aspects separate to experiment with different orders of directions in memory, as the order will impact the number of halo update transitions. + * + */ +template +struct D3Q27 +{ + public: + D3Q27() = delete; + + static constexpr int Q = 27; /** number of directions */ + static constexpr int D = 3; /** Space dimension */ + using Precision = Precision_; + using Self = D3Q27; + + static constexpr int RegisterMapping = 1; + static constexpr int MemoryMapping = 2; + + struct Registers + { + + using Self = D3Q27::Registers; + + static constexpr int center = 13; /** Position of direction {0,0,0} */ + // Identifying first half of the directions + // For each direction in the list, the opposite is not present. + // Center is also removed + static constexpr int firstHalfQLen = (Q - 1) / 2; + static constexpr std::array firstHalfQList{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + + template + static constexpr auto getVelocityComponent() -> int + { + static_assert(myQ < Q); + static_assert(myXYZ < 3); + +#define ADD_COMPONENT(QQ, XXX, YYY, ZZZ) \ + if constexpr ((myQ) == (QQ)) { \ + if constexpr ((myXYZ) == 0) { \ + return XXX; \ + } \ + if constexpr ((myXYZ) == 1) { \ + return YYY; \ + } \ + if constexpr ((myXYZ) == 2) { \ + return ZZZ; \ + } \ + } + + ADD_COMPONENT(0, -1, 0, 0) + ADD_COMPONENT(1, 0, -1, 0) + ADD_COMPONENT(2, 0, 0, -1) + ADD_COMPONENT(3, -1, -1, 0) + ADD_COMPONENT(4, -1, 1, 0) + ADD_COMPONENT(5, -1, 0, -1) + ADD_COMPONENT(6, -1, 0, 1) + ADD_COMPONENT(7, 0, -1, -1) + ADD_COMPONENT(8, 0, -1, 1) + ADD_COMPONENT(9, -1, -1, -1) + ADD_COMPONENT(10, -1, -1, 1) + ADD_COMPONENT(11, -1, 1, -1) + ADD_COMPONENT(12, -1, 1, 1) + ADD_COMPONENT(13, 0, 0, 0) + ADD_COMPONENT(14, 1, 0, 0) + ADD_COMPONENT(15, 0, 1, 0) + ADD_COMPONENT(16, 0, 0, 1) + 
ADD_COMPONENT(17, 1, 1, 0) + ADD_COMPONENT(18, 1, -1, 0) + ADD_COMPONENT(19, 1, 0, 1) + ADD_COMPONENT(20, 1, 0, -1) + ADD_COMPONENT(21, 0, 1, 1) + ADD_COMPONENT(22, 0, 1, -1) + ADD_COMPONENT(23, 1, 1, 1) + ADD_COMPONENT(24, 1, 1, -1) + ADD_COMPONENT(25, 1, -1, 1) + ADD_COMPONENT(26, 1, -1, -1) + +#undef ADD_COMPONENT + } + + template + static constexpr auto getOpposite() -> int + { + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + + + ADD_COMPONENT(0, 14) + ADD_COMPONENT(1, 15) + ADD_COMPONENT(2, 16) + ADD_COMPONENT(3, 17) + ADD_COMPONENT(4, 18) + ADD_COMPONENT(5, 19) + ADD_COMPONENT(6, 20) + ADD_COMPONENT(7, 21) + ADD_COMPONENT(8, 22) + ADD_COMPONENT(9, 23) + ADD_COMPONENT(10, 24) + ADD_COMPONENT(11, 25) + ADD_COMPONENT(12, 26) + ADD_COMPONENT(13, 13) + ADD_COMPONENT(14, 0) + ADD_COMPONENT(15, 1) + ADD_COMPONENT(16, 2) + ADD_COMPONENT(17, 3) + ADD_COMPONENT(18, 4) + ADD_COMPONENT(19, 5) + ADD_COMPONENT(20, 6) + ADD_COMPONENT(21, 7) + ADD_COMPONENT(22, 8) + ADD_COMPONENT(23, 9) + ADD_COMPONENT(24, 10) + ADD_COMPONENT(25, 11) + ADD_COMPONENT(26, 12) + + +#undef ADD_COMPONENT + } + + template + static constexpr auto getT() -> typename Precision::Storage + { + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + + ADD_COMPONENT(0, 2. / 27.) + ADD_COMPONENT(1, 2. / 27.) + ADD_COMPONENT(2, 2. / 27.) + ADD_COMPONENT(3, 1. / 54.) + ADD_COMPONENT(4, 1. / 54.) + ADD_COMPONENT(5, 1. / 54.) + ADD_COMPONENT(6, 1. / 54.) + ADD_COMPONENT(7, 1. / 54.) + ADD_COMPONENT(8, 1. / 54.) + ADD_COMPONENT(9, 1. / 216.) + ADD_COMPONENT(10, 1. / 216.) + ADD_COMPONENT(11, 1. / 216.) + ADD_COMPONENT(12, 1. / 216.) + ADD_COMPONENT(13, 8. / 27.) + ADD_COMPONENT(14, 2. / 27.) + ADD_COMPONENT(15, 2. / 27.) + ADD_COMPONENT(16, 2. / 27.) + ADD_COMPONENT(17, 1. / 54.) + ADD_COMPONENT(18, 1. / 54.) + ADD_COMPONENT(19, 1. / 54.) + ADD_COMPONENT(20, 1. / 54.) 
+ ADD_COMPONENT(21, 1. / 54.) + ADD_COMPONENT(22, 1. / 54.) + ADD_COMPONENT(23, 1. / 216.) + ADD_COMPONENT(24, 1. / 216.) + ADD_COMPONENT(25, 1. / 216.) + ADD_COMPONENT(26, 1. / 216.) + +#undef ADD_COMPONENT + } + + template + static constexpr auto getMomentumComponet() -> typename Precision::Storage + { + static_assert(myQ < Q); + static_assert(mementumID < 6); + +#define ADD_COMPONENT(QQ, AA, BB, CC, DD, EE, FF) \ + if constexpr ((myQ) == (QQ)) { \ + if constexpr ((mementumID) == 0) { \ + return AA; \ + } \ + if constexpr ((mementumID) == 1) { \ + return BB; \ + } \ + if constexpr ((mementumID) == 2) { \ + return CC; \ + } \ + if constexpr ((mementumID) == 3) { \ + return DD; \ + } \ + if constexpr ((mementumID) == 4) { \ + return EE; \ + } \ + if constexpr ((mementumID) == 5) { \ + return FF; \ + } \ + } + + ADD_COMPONENT(0, 1, 0, 0, 0, 0, 0) + ADD_COMPONENT(1, 0, 0, 0, 1, 0, 0) + ADD_COMPONENT(2, 0, 0, 0, 0, 0, 1) + ADD_COMPONENT(3, 1, 1, 0, 1, 0, 0) + ADD_COMPONENT(4, 1, -1, 0, 1, 0, 0) + ADD_COMPONENT(5, 1, 0, 1, 0, 0, 1) + ADD_COMPONENT(6, 1, 0, -1, 0, 0, 1) + ADD_COMPONENT(7, 0, 0, 0, 1, 1, 1) + ADD_COMPONENT(8, 0, 0, 0, 1, -1, 1) + ADD_COMPONENT(9, 1, 1, 1, 1, 1, 1) + ADD_COMPONENT(10, 1, 1, -1, 1, -1, 1) + ADD_COMPONENT(11, 1, -1, 1, 1, -1, 1) + ADD_COMPONENT(12, 1, -1, -1, 1, 1, 1) + ADD_COMPONENT(13, 0, 0, 0, 0, 0, 0) + ADD_COMPONENT(14, 1, 0, 0, 0, 0, 0) + ADD_COMPONENT(15, 0, 0, 0, 1, 0, 0) + ADD_COMPONENT(16, 0, 0, 0, 0, 0, 1) + ADD_COMPONENT(17, 1, 1, 0, 1, 0, 0) + ADD_COMPONENT(18, 1, -1, 0, 1, 0, 0) + ADD_COMPONENT(19, 1, 0, 1, 0, 0, 1) + ADD_COMPONENT(20, 1, 0, -1, 0, 0, 1) + ADD_COMPONENT(21, 0, 0, 0, 1, 1, 1) + ADD_COMPONENT(22, 0, 0, 0, 1, -1, 1) + ADD_COMPONENT(23, 1, 1, 1, 1, 1, 1) + ADD_COMPONENT(24, 1, 1, -1, 1, -1, 1) + ADD_COMPONENT(25, 1, -1, 1, 1, -1, 1) + ADD_COMPONENT(26, 1, -1, -1, 1, 1, 1) + +#undef ADD_COMPONENT + } + + + template + static constexpr auto getVelocity() -> const typename Neon::index_3d + { + return 
Neon::index_3d(getVelocityComponent, + getVelocityComponent, + getVelocityComponent); + } + + // // Identifying first half of the directions + // // For each direction in the list, the opposite is not present. + // // Center is also removed + // static constexpr int firstHalfQLen = (Q - 1) / 2; + // static constexpr std::array firstHalfQList{0, 1, 2, 3, 4, 5, 6, 7, 8}; + }; + + struct Memory + { + using Self = D3Q27::Memory; + + template + static constexpr auto getVelocityComponent() -> int + { + static_assert(myQ < Q); + static_assert(myXYZ < 3); + +#define ADD_COMPONENT(QQ, XXX, YYY, ZZZ) \ + if constexpr ((myQ) == (QQ)) { \ + if constexpr ((myXYZ) == 0) { \ + return XXX; \ + } \ + if constexpr ((myXYZ) == 1) { \ + return YYY; \ + } \ + if constexpr ((myXYZ) == 2) { \ + return ZZZ; \ + } \ + } + + ADD_COMPONENT(0, -1, 0, 0) + ADD_COMPONENT(1, 0, -1, 0) + ADD_COMPONENT(2, 0, 0, -1) + ADD_COMPONENT(3, -1, -1, 0) + ADD_COMPONENT(4, -1, 1, 0) + ADD_COMPONENT(5, -1, 0, -1) + ADD_COMPONENT(6, -1, 0, 1) + ADD_COMPONENT(7, 0, -1, -1) + ADD_COMPONENT(8, 0, -1, 1) + ADD_COMPONENT(9, -1, -1, -1) + ADD_COMPONENT(10, -1, -1, 1) + ADD_COMPONENT(11, -1, 1, -1) + ADD_COMPONENT(12, -1, 1, 1) + ADD_COMPONENT(13, 0, 0, 0) + ADD_COMPONENT(14, 1, 0, 0) + ADD_COMPONENT(15, 0, 1, 0) + ADD_COMPONENT(16, 0, 0, 1) + ADD_COMPONENT(17, 1, 1, 0) + ADD_COMPONENT(18, 1, -1, 0) + ADD_COMPONENT(19, 1, 0, 1) + ADD_COMPONENT(20, 1, 0, -1) + ADD_COMPONENT(21, 0, 1, 1) + ADD_COMPONENT(22, 0, 1, -1) + ADD_COMPONENT(23, 1, 1, 1) + ADD_COMPONENT(24, 1, 1, -1) + ADD_COMPONENT(25, 1, -1, 1) + ADD_COMPONENT(26, 1, -1, -1) + +#undef ADD_COMPONENT + } + + static constexpr int center = 13; /** Position of direction {0,0,0} */ + + template + static constexpr auto mapToRegisters() + -> int + { + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + ADD_COMPONENT(0, 0) + ADD_COMPONENT(1, 1) + ADD_COMPONENT(2, 2) + ADD_COMPONENT(3, 3) + 
ADD_COMPONENT(4, 4) + ADD_COMPONENT(5, 5) + ADD_COMPONENT(6, 6) + ADD_COMPONENT(7, 7) + ADD_COMPONENT(8, 8) + ADD_COMPONENT(9, 9) + ADD_COMPONENT(10, 10) + ADD_COMPONENT(11, 11) + ADD_COMPONENT(12, 12) + ADD_COMPONENT(13, 13) + ADD_COMPONENT(14, 14) + ADD_COMPONENT(15, 15) + ADD_COMPONENT(16, 16) + ADD_COMPONENT(17, 17) + ADD_COMPONENT(18, 18) + + ADD_COMPONENT(19, 19) + ADD_COMPONENT(20, 20) + ADD_COMPONENT(21, 21) + ADD_COMPONENT(22, 22) + ADD_COMPONENT(23, 23) + ADD_COMPONENT(24, 24) + ADD_COMPONENT(25, 25) + ADD_COMPONENT(26, 26) + +#undef ADD_COMPONENT + } + + template + static constexpr auto mapToMemory() + -> int + { + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + ADD_COMPONENT(0, 0) + ADD_COMPONENT(1, 1) + ADD_COMPONENT(2, 2) + ADD_COMPONENT(3, 3) + ADD_COMPONENT(4, 4) + ADD_COMPONENT(5, 5) + ADD_COMPONENT(6, 6) + ADD_COMPONENT(7, 7) + ADD_COMPONENT(8, 8) + ADD_COMPONENT(9, 9) + ADD_COMPONENT(10, 10) + ADD_COMPONENT(11, 11) + ADD_COMPONENT(12, 12) + ADD_COMPONENT(13, 13) + ADD_COMPONENT(14, 14) + ADD_COMPONENT(15, 15) + ADD_COMPONENT(16, 16) + ADD_COMPONENT(17, 17) + ADD_COMPONENT(18, 18) + + ADD_COMPONENT(19, 19) + ADD_COMPONENT(20, 20) + ADD_COMPONENT(21, 21) + ADD_COMPONENT(22, 22) + ADD_COMPONENT(23, 23) + ADD_COMPONENT(24, 24) + ADD_COMPONENT(25, 25) + ADD_COMPONENT(26, 26) +#undef ADD_COMPONENT + } + + template + static constexpr auto getOpposite() -> int + { + static_assert(myQ < Q); + +#define ADD_COMPONENT(QQ, XXX) \ + if constexpr ((myQ) == (QQ)) { \ + return XXX; \ + } + ADD_COMPONENT(0, 14) + ADD_COMPONENT(1, 15) + ADD_COMPONENT(2, 16) + ADD_COMPONENT(3, 17) + ADD_COMPONENT(4, 18) + ADD_COMPONENT(5, 19) + ADD_COMPONENT(6, 20) + ADD_COMPONENT(7, 21) + ADD_COMPONENT(8, 22) + ADD_COMPONENT(9, 23) + ADD_COMPONENT(10, 24) + ADD_COMPONENT(11, 25) + ADD_COMPONENT(12, 26) + ADD_COMPONENT(13, 13) + ADD_COMPONENT(14, 0) + ADD_COMPONENT(15, 1) + ADD_COMPONENT(16, 2) + ADD_COMPONENT(17, 
3) + ADD_COMPONENT(18, 4) + ADD_COMPONENT(19, 5) + ADD_COMPONENT(20, 6) + ADD_COMPONENT(21, 7) + ADD_COMPONENT(22, 8) + ADD_COMPONENT(23, 9) + ADD_COMPONENT(24, 10) + ADD_COMPONENT(25, 11) + ADD_COMPONENT(26, 12) +#undef ADD_COMPONENT + } + }; + + + template + struct RegisterMapper + { + constexpr static int fwdRegQ = fwdRegIdx_; + constexpr static int bkwRegQ = Registers::template getOpposite(); + constexpr static int fwdMemQ = Memory::template mapToMemory(); + constexpr static int bkwMemQ = Memory::template mapToMemory(); + constexpr static int centerRegQ = Registers::center; + constexpr static int centerMemQ = Memory::center; + + constexpr static int fwdMemQX = Memory::template getVelocityComponent(); + constexpr static int fwdMemQY = Memory::template getVelocityComponent(); + constexpr static int fwdMemQZ = Memory::template getVelocityComponent(); + + constexpr static int bkwMemQX = Memory::template getVelocityComponent(); + constexpr static int bkwMemQY = Memory::template getVelocityComponent(); + constexpr static int bkwMemQZ = Memory::template getVelocityComponent(); + }; + + + public: + /** Returns the Q lattice directions as a vector of Neon::index_3d, in the order of the requested mapping (RegisterMapping or MemoryMapping). + * Any other mappingType value matches neither branch and yields an empty vector. */ + template + static auto getDirectionAsVector() + -> std::vector + { + std::vector vec; + /* Exactly Q elements are pushed below: allocate once instead of growing repeatedly. */ + vec.reserve(Q); + if constexpr (mappingType == RegisterMapping) { + Neon::ConstexprFor<0, Q, 1>( + [&vec](auto q) { + Neon::index_3d val(Registers::template getVelocityComponent(), + Registers::template getVelocityComponent(), + Registers::template getVelocityComponent()); + vec.push_back(val); + }); + } else if constexpr (mappingType == MemoryMapping) { + Neon::ConstexprFor<0, Q, 1>( + [&vec](auto q) { + Neon::index_3d val(Memory::template getVelocityComponent(), + Memory::template getVelocityComponent(), + Memory::template getVelocityComponent()); + vec.push_back(val); + }); + } + return vec; + } +}; \ No newline at end of file diff --git a/benchmarks/lbm/src/DeviceD3QXX.h b/benchmarks/lbm/src/DeviceD3QXX.h new file mode 100644 index 00000000..d7d16550 --- /dev/null +++ b/benchmarks/lbm/src/DeviceD3QXX.h 
@@ -0,0 +1,363 @@ +#pragma once +#include "CellType.h" +#include "D3Q19.h" +#include "Neon/Neon.h" +#include "Neon/set/Containter.h" +template +struct DeviceD3QXX +{ + using Lattice = Lattice_; + using Precision = Precision_; + using Compute = typename Precision::Compute; + using Storage = typename Precision::Storage; + using Grid = Grid_; + + using PopField = typename Grid::template Field; + using CellTypeField = typename Grid::template Field; + + using Idx = typename PopField::Idx; + using Rho = typename Grid::template Field; + using U = typename Grid::template Field; + + struct Pull + { + /** Gathers the Lattice::Q populations of cell gidx from fin into the register array popIn. + * The center population is read from gidx itself; every other direction is read from a neighbour through getNghData. + * When CellType::isWall(wallBitFlag) holds, the opposite-direction population stored at gidx is added to the neighbour value. + * popIn is indexed with register indices (…RegQ), fin with memory-layout indices (…MemQ). */ + static inline NEON_CUDA_HOST_DEVICE auto + pullStream(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopField::Partition const& fin, + NEON_OUT Storage popIn[Lattice::Q]) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using QPullingReference = typename Lattice::template RegisterMapper; + + if constexpr (QPullingReference::fwdRegQ == QPullingReference::centerRegQ) { + popIn[QPullingReference::centerRegQ] = fin(gidx, QPullingReference::centerMemQ); + } else { + if (CellType::isWall(wallBitFlag)) { + /* The cell in the opposite direction of the pull is a wall. fin must be addressed with the memory index of the opposite direction (bkwMemQ), not its register index. */ + popIn[QPullingReference::fwdRegQ] = fin(gidx, QPullingReference::bkwMemQ) + + fin.template getNghData(gidx, QPullingReference::fwdMemQ)(); + } else { + popIn[QPullingReference::fwdRegQ] = fin.template getNghData(gidx, QPullingReference::fwdMemQ)(); + } + } + }); + } + }; + + struct AA + { + static inline NEON_CUDA_HOST_DEVICE auto + pullStream(Idx const& gidx, + const uint32_t& wallBitFlag, + typename PopField::Partition const& fin, + NEON_OUT Storage popIn[Lattice::Q]) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using QPullingReference = typename Lattice::template RegisterMapper; + + if constexpr (QPullingReference::fwdRegQ == QPullingReference::centerRegQ) { + popIn[QPullingReference::centerRegQ] = fin(gidx, QPullingReference::centerMemQ); + } else { + if 
(CellType::isWall(wallBitFlag)) { + // The cell in the opposite direction of the pull is a wall + popIn[QPullingReference::fwdRegQ] = fin(gidx, QPullingReference::fwdRegQ) + + fin.template getNghData(gidx, QPullingReference::bkwMemQ)(); + } else { + popIn[QPullingReference::fwdRegQ] = fin.template getNghData(gidx, QPullingReference::bkwMemQ)(); + } + } + }); + } + }; + + struct Push + { + static inline NEON_CUDA_HOST_DEVICE auto + pushStream(Idx const& gidx, + const uint32_t& wallNghBitFlag, + NEON_OUT Storage pOut[Lattice::Q], + NEON_OUT typename PopField::Partition& fOut) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + + if constexpr (M::fwdMemQ == M::centerMemQ) { + fOut(gidx, M::centerMemQ) = pOut[M::centerRegQ]; + } else { + if (CellType::isWall(wallNghBitFlag)) { + const auto pop_out = pOut[M::fwdRegQ]; + const auto f_nb_k = fOut.template getNghData(gidx, M::fwdMemQ)(); + + // fout(i, opp[k]) = + fOut(gidx, M::bkwMemQ) = + // pop_out + + pop_out + + // f(nb, k); + f_nb_k; + } else { + // fout(nb, + fOut.template writeNghData(gidx, + // k) + M::fwdMemQ, + // = pop_out; + pOut[M::fwdRegQ]); + } + } + }); + } + }; + + + struct Common + { + static inline NEON_CUDA_HOST_DEVICE auto + macroscopic(const Storage pop[Lattice::Q], + NEON_OUT Compute& rho, + NEON_OUT std::array& u) + -> void + { + if constexpr (Lattice::Q == 19) { +#define POP(IDX) static_cast(pop[IDX]) + const Compute X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6); + const Compute X_P1 = POP(10) + POP(13) + POP(14) + POP(15) + POP(16); + const Compute X_0 = POP(9) + POP(1) + POP(2) + POP(7) + POP(8) + POP(11) + POP(12) + POP(17) + POP(18); + + const Compute Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(14); + const Compute Y_P1 = POP(4) + POP(11) + POP(13) + POP(17) + POP(18); + + const Compute Z_M1 = POP(2) + POP(5) + POP(7) + POP(16) + POP(18); + const Compute Z_P1 = POP(6) + POP(8) + POP(12) + POP(15) + POP(17); +#undef POP + + 
rho = X_M1 + X_P1 + X_0; + u[0] = (X_P1 - X_M1) / rho; + u[1] = (Y_P1 - Y_M1) / rho; + u[2] = (Z_P1 - Z_M1) / rho; + return; + } + if constexpr (Lattice::Q == 27) { +#define POP(IDX) static_cast(pop[IDX]) + const Compute X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6) + POP(9) + POP(10) + POP(11) + POP(12); + const Compute X_P1 = POP(14) + POP(17) + POP(18) + POP(19) + POP(20) + POP(23) + POP(24) + POP(25) + POP(26); + const Compute X_0 = POP(1) + POP(2) + POP(7) + POP(8) + POP(13) + POP(15) + POP(16) + POP(21) + POP(22); + + const Compute Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(9) + POP(10) + POP(18) + POP(25) + POP(26); + const Compute Y_P1 = POP(15) + POP(17) + POP(21) + POP(22) + POP(23) + POP(24) + POP(4) + POP(11) + POP(12); + + const Compute Z_M1 = POP(2) + POP(5) + POP(7) + POP(9) + POP(11) + POP(20) + POP(22) + POP(24) + POP(26); + const Compute Z_P1 = POP(16) + POP(19) + POP(21) + POP(23) + POP(25) + POP(6) + POP(8) + POP(10) + POP(12); +#undef POP + rho = X_M1 + X_P1 + X_0; + u[0] = (X_P1 - X_M1) / rho; + u[1] = (Y_P1 - Y_M1) / rho; + u[2] = (Z_P1 - Z_M1) / rho; + return; + } + printf("Error: macroscopic function does not support the selected lattice.\n"); + } + + static inline NEON_CUDA_HOST_DEVICE auto + collideBgkUnrolled(Compute const& rho /*! Density */, + std::array const& u /*! Velocity */, + Compute const& usqr /*! Usqr */, + Compute const& omega /*! Omega */, + NEON_IO Storage pop[Lattice::Q]) + + -> void + { + + // constexpr Compute c1over18 = 1. 
/ 18.; + constexpr Compute c4dot5 = 4.5; + constexpr Compute c3 = 3.; + constexpr Compute c1 = 1.; + constexpr Compute c6 = 6.; + + // constexpr int regCenter = Lattice::Registers::center; + // constexpr int regFir = Lattice::Registers::center; + + Neon::ConstexprFor<0, Lattice::Registers::firstHalfQLen, 1>( + [&](auto q) { + using M = typename Lattice::template RegisterMapper; + using T = typename Lattice::Registers; + + Compute eqFw; + Compute eqBk; + + const Compute ck_u = u[0] * Lattice::Registers::template getVelocityComponent() + + u[1] * Lattice::Registers::template getVelocityComponent() + + u[2] * Lattice::Registers::template getVelocityComponent(); + + // double eq = rho * t[k] * + // (1. + + // 3. * ck_u + + // 4.5 * ck_u * ck_u - + // usqr); + eqFw = rho * T::template getT() * + (c1 + + c3 * ck_u + + c4dot5 * ck_u * ck_u - + usqr); + + // double eqopp = eq - 6.* rho * t[k] * ck_u; + eqBk = eqFw - c6 * rho * T::template getT() * ck_u; + + // pop_out = (1. - omega) * fin(i, k) + omega * eq; + pop[M::fwdRegQ] = (c1 - omega) * static_cast(pop[M::fwdRegQ]) + omega * eqFw; + // pop_out_opp = (1. - omega) * fin(i, opp[k]) + omega * eqopp; + pop[M::bkwRegQ] = (c1 - omega) * static_cast(pop[M::bkwRegQ]) + omega * eqBk; + }); + { // Center; + using T = typename Lattice::Registers; + using M = typename Lattice::template RegisterMapper; + // eq = rho * t[k] * (1. - usqr); + const Compute eqCenter = rho * T::template getT() * (c1 - usqr); + // fout(i, k) = (1. 
- omega) * fin(i, k) + omega * eq; + pop[M::centerRegQ] = (c1 - omega) * static_cast(pop[M::centerRegQ]) + omega * eqCenter; + } + } + + static inline NEON_CUDA_HOST_DEVICE auto + localLoad(Idx const& gidx, + NEON_IN typename PopField::Partition const& fOut, + Storage NEON_RESTRICT pOut[Lattice::Q]) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + pOut[M::fwdRegQ] = fOut(gidx, M::fwdMemQ); + }); + } + + static inline NEON_CUDA_HOST_DEVICE auto + localStore(Idx const& gidx, + Storage NEON_RESTRICT pOut[Lattice::Q], + NEON_OUT typename PopField::Partition& fOut) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + fOut(gidx, M::fwdMemQ) = pOut[M::fwdRegQ]; + }); + } + + static inline NEON_CUDA_HOST_DEVICE auto + localStoreOpposite(Idx const& gidx, + Storage NEON_RESTRICT pOut[Lattice::Q], + NEON_OUT typename PopField::Partition& fOut) + { + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + fOut(gidx, M::bkwMemQ) = pOut[M::fwdRegQ]; + }); + } + + static inline NEON_CUDA_HOST_DEVICE auto + collideKBCUnrolled(Compute const& rho /*! Density */, + std::array const& u /*! Velocity */, + Compute const& usqr /*! Usqr */, + Compute const& omega /*! Omega */, + Compute const& invBeta /*! 
invBeta */, + [[maybe_unused]] NEON_IO Storage pop[Lattice::Q]) + + -> void + { + if constexpr (Lattice::Q == 27) { + constexpr Compute tiny = Compute(1e-7); + + Compute Pi[6] = {0, 0, 0, 0, 0, 0}; + Compute e0 = 0; + Compute e1 = 0; + Compute deltaS[Lattice::Q]; + Compute fneq[Lattice::Q]; + Compute feq[Lattice::Q]; + const Compute beta = omega * 0.5; + + /* Per-direction shear term built from the moments Pi[0..5]; q is a D3Q27 register index. Axis-aligned directions combine Pi[0], Pi[3] and Pi[5]; directions with two non-zero components use Pi[1], Pi[2] or Pi[4]; every remaining index (the center and the directions with three non-zero components) returns 0. */ auto fdecompose_shear = [&](const int q) -> Compute { + const Compute Nxz = Pi[0] - Pi[5]; + const Compute Nyz = Pi[3] - Pi[5]; + if (q == 0 /* -1, 0, 0 */) { + return (2.0 * Nxz - Nyz) / 6.0; + } else if (q == 14 /* 1, 0, 0 */) { + return (2.0 * Nxz - Nyz) / 6.0; + } else if (q == 1 /* 0, -1, 0 */) { + return (-Nxz + 2.0 * Nyz) / 6.0; + } else if (q == 15 /* 0, 1, 0 */) { + return (-Nxz + 2.0 * Nyz) / 6.0; + } else if (q == 2 /* 0, 0, -1 */) { + return (-Nxz - Nyz) / 6.0; + } else if (q == 16 /* 0, 0, 1 */) { + return (-Nxz - Nyz) / 6.0; + } else if (q == 3 /* -1, -1, 0 */ || q == 17 /* 1, 1, 0 */) { + return Pi[1] / 4.0; + } else if (q == 18 /* 1, -1, 0 */ || q == 4 /* -1, 1, 0 */) { + return -Pi[1] / 4.0; + } else if (q == 5 /* -1, 0, -1 */ || q == 19 /* 1, 0, 1 */) { + return Pi[2] / 4.0; + } else if (q == 20 /* 1, 0, -1 */ || q == 6 /* -1, 0, 1 */) { + return -Pi[2] / 4.0; + } else if (q == 21 /* 0, 1, 1 */ || q == 7 /* 0, -1, -1 */) { + return Pi[4] / 4.0; + } else if (q == 22 /* 0, 1, -1 */ || q == 8 /* 0, -1, 1 */) { + return -Pi[4] / 4.0; + } else { + return Compute(0); + } + }; + + // equilibrium + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + const Compute cu = Compute(3) * + (u[0] * Lattice::Registers::template getVelocityComponent() + + u[1] * Lattice::Registers::template getVelocityComponent() + + u[2] * Lattice::Registers::template getVelocityComponent()); + + + feq[q] = rho * Lattice::Registers::template getT() * (1. 
+ cu + 0.5 * cu * cu - usqr); + fneq[q] = pop[q] - feq[q]; + }); + + // momentum_flux + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + // Neon::ConstexprFor<0, 6, 1>([&](auto i) { + // Pi[i] += fneq[q] * Lattice::Registers::template getMomentByDirection(); + // }); + Pi[0] += fneq[q] * Lattice::Registers::template getMomentumComponet(); + Pi[1] += fneq[q] * Lattice::Registers::template getMomentumComponet(); + Pi[2] += fneq[q] * Lattice::Registers::template getMomentumComponet(); + Pi[3] += fneq[q] * Lattice::Registers::template getMomentumComponet(); + Pi[4] += fneq[q] * Lattice::Registers::template getMomentumComponet(); + Pi[5] += fneq[q] * Lattice::Registers::template getMomentumComponet(); + }); + + // fdecompose_shear + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + deltaS[q] = rho * fdecompose_shear(q); + + Compute deltaH = fneq[q] - deltaS[q]; + + e0 += (deltaS[q] * deltaH / feq[q]); + e1 += (deltaH * deltaH / feq[q]); + }); + + // gamma + Compute gamma = invBeta - (2.0 - invBeta) * e0 / (tiny + e1); + + + // fout + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + Compute deltaH = fneq[q] - deltaS[q]; + pop[q] = pop[q] - beta * (2.0 * deltaS[q] + gamma * deltaH); + }); + } else { + printf("ERROR %d \n", Lattice::Q); + } + } + }; +}; \ No newline at end of file diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h new file mode 100644 index 00000000..ce465ce4 --- /dev/null +++ b/benchmarks/lbm/src/Lbm.h @@ -0,0 +1,475 @@ +#include "./Config.h" +#include "./Methods.h" +#include "./Metrics.h" +#include "./Repoert.h" +#include "CellType.h" +#include "ContainersD3QXX.h" +#include "D3Q19.h" +#include "Methods.h" +#include "Neon/Neon.h" +#include "Neon/set/Backend.h" +#include "Neon/set/Containter.h" +#include "Neon/skeleton/Skeleton.h" + +int backendWasReported = false; + +template +struct Lbm +{ + using Grid = Grid_; + using Lattice = Lattice_; + using Precision = Precision_; + + using PField = typename Grid::template Field; + using 
CField = typename Grid::template Field; + using RhoField = typename Grid::template Field; + using UField = typename Grid::template Field; + + // using CommonContainerFactory = common::ContainerFactory; + using ContainerFactory = ContainerFactoryD3QXX; + + template + Lbm(Config& config, + Report& report, + Lambda activeMask) + { + configurations = config; + reportPtr = &report; + + + // Setting the backend + Neon::Backend bk = [&] { + if (config.deviceType == "cpu") { + Neon::Backend bk(config.devices, Neon::Runtime::openmp); + return bk; + } + if (config.deviceType == "gpu") { + Neon::Backend bk(config.devices, Neon::Runtime::stream); + return bk; + } + Neon::NeonException exce("run"); + exce << config.deviceType << " is not a supported option as device type"; + NEON_THROW(exce); + }(); + + auto [gridInitClockStart, notcare] = metrics::restartClock(bk, true); + + // Setting the grid + grid = Grid( + bk, {config.N, config.N, config.N}, + [&](const Neon::index_3d& p) { return activeMask(p); }, + Lattice::template getDirectionAsVector(), + 1.0, 0.0, + config.spaceCurveCli.getOption()); + + // Allocating Populations + for (int i = 0; i < lbm::MethodUtils::getNumberOfPFields(); i++) { + std::stringstream name; + name << "PopField_0" << i; + using Storage = typename Precision::Storage; + std::cout << "Allocating population field (#" << std::to_string(i + 1) << std::endl; + auto field = grid.template newField(name.str(), + Lattice::Q, + Storage(0.0)); + pFieldList.push_back(field); + } + + // Allocating cell type field + CellType defaultCelltype; + cellFlagField = grid.template newField("cellFlags", 1, defaultCelltype); + + // Allocating rho and u + if (config.vti != 0) { + std::cout << "Allocating rho and u" << std::endl; + using Storage = typename Precision::Storage; + rho = grid.template newField("rho", 1, Storage(0.0)); + u = grid.template newField("u", 3, Storage(0.0)); + } + + { // Setting Equilibrium all population field + for (auto& pField : pFieldList) { + // Set 
all to eq + ContainerFactory::Common::setToEquilibrium(pField, cellFlagField).run(Neon::Backend::mainStreamIdx); + } + } + metrics::recordGridInitMetrics(bk, *reportPtr, gridInitClockStart); + } + + // Lambda = void(*)(Neon::Index3d) -> std::tuple> + template + auto setBC(Lambda bcSetFunction) -> void + { + auto [setBcClockStart, notcare] = metrics::restartClock(grid.getBackend(), true); + + std::cout << "Setting the problem's boundary." << std::endl; + grid.getBackend().sync(Neon::Backend::mainStreamIdx); + // Compute ngh mask + ContainerFactory::Common::userSettingBc(bcSetFunction, + pFieldList[0], + cellFlagField) + .run(Neon::Backend::mainStreamIdx); + + for (int i = 1; i < int(pFieldList.size()); i++) { + ContainerFactory::Common::copyPopulation(pFieldList[0], + pFieldList[i]) + .run(Neon::Backend::mainStreamIdx); + } + cellFlagField.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::get, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); + grid.getBackend().sync(Neon::Backend::mainStreamIdx); + ContainerFactory::Common::computeWallNghMask(cellFlagField, + cellFlagField) + .run(Neon::Backend::mainStreamIdx); + cellFlagField.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::get, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); + metrics::recordProblemSetupMetrics(grid.getBackend(), *reportPtr, setBcClockStart); + } + + auto helpPrep() -> void + { + grid.getBackend().sync(Neon::Backend::mainStreamIdx); + // One collide if 2Pop - pull + // One iteration if 2Pop = push + if constexpr (lbm::Method::pull == method) { + // For pull we set up the system in a way that it does one single collide as first operation + using Compute = typename Precision::Compute; + auto lbmParameters = configurations.template getLbmParameters(); + { + skeleton = std::vector(2); + for (int iteration : {0, 1}) { + iterationPhase.resetPhase(iteration); + int skIdx = iterationPhase.getSkeletonIdx(); + auto 
even = ContainerFactory::Pull::iteration( + configurations.stencilSemanticCli.getOption(), + pFieldList.at(iterationPhase.getInputIdx()), + cellFlagField, + lbmParameters.omega, + pFieldList.at(iterationPhase.getOutputIdx())); + + std::vector ops; + skeleton.at(skIdx) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); + Neon::skeleton::Options opt(configurations.occCli.getOption(), configurations.transferModeCli.getOption()); + ops.push_back(even); + std::stringstream appName; + + if (skIdx % 2 == 0) + appName << "LBM_pull_even"; + else + appName << "LBM_pull_odd"; + + skeleton.at(skIdx).sequence(ops, appName.str(), opt); + + if (skIdx % 2 == 0) + skeleton.at(skIdx).ioToDot("lbm-pull-even","lbm_pull_even",true); + else + skeleton.at(skIdx).ioToDot("lbm-pull-odd","lbm_pull_odd", true); /* graph name now matches the odd skeleton instead of repeating the even one */ + } + } + { + // Let's compute 1 collide operation to prepare the input of the first iteration + iterationPhase.resetPhase(0); + ContainerFactory::Pull::localCollide(pFieldList.at(iterationPhase.getInputIdx()), + cellFlagField, + lbmParameters.omega, + pFieldList.at(iterationPhase.getOutputIdx())) + .run(Neon::Backend::mainStreamIdx); + pFieldList[0].getBackend().syncAll(); + iterationPhase.updateIterationPhase(); + } + return; + } + if constexpr (lbm::Method::push == method) { + using Compute = typename Precision::Compute; + auto lbmParameters = configurations.template getLbmParameters(); + skeleton = std::vector(2); + for (int iteration : {0, 1}) { + iterationPhase.resetPhase(iteration); + int skIdx = iterationPhase.getSkeletonIdx(); + auto even = ContainerFactory::Push::iteration( + configurations.stencilSemanticCli.getOption(), + pFieldList.at(iterationPhase.getInputIdx()), + cellFlagField, + lbmParameters.omega, + pFieldList.at(iterationPhase.getOutputIdx())); + + std::vector ops; + skeleton.at(skIdx) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); + Neon::skeleton::Options opt(configurations.occCli.getOption(), configurations.transferModeCli.getOption()); +
ops.push_back(even); + std::stringstream appName; + if (iteration % 2 == 0) + appName << "LBM_push_even"; + else + appName << "LBM_push_odd"; + skeleton.at(skIdx).sequence(ops, appName.str(), opt); + } + + { + iterationPhase.resetPhase(0); + int skIdx = iterationPhase.getSkeletonIdx(); + skeleton.at(skIdx).run(); + iterationPhase.updateIterationPhase(); + } + return; + } + if constexpr (lbm::Method::aa == method) { + using Compute = typename Precision::Compute; + auto lbmParameters = configurations.template getLbmParameters(); + skeleton = std::vector(2); + for (int iteration : {0, 1}) { + iterationPhase.resetPhase(iteration); + int skIdx = iterationPhase.getSkeletonIdx(); + Neon::set::Container lbmIteration; + std::stringstream appName; + if (iterationPhase.getPhase() == IterationPhase::Phase::even) { + lbmIteration = ContainerFactory::AA::Even::iteration( + cellFlagField, + lbmParameters.omega, + pFieldList.at(0)); + appName << "LBM_aa_even"; + } else { + lbmIteration = ContainerFactory::AA::Odd::iteration( + cellFlagField, + lbmParameters.omega, + pFieldList.at(0)); + appName << "LBM_aa_odd"; /* odd phase gets its own skeleton name, mirroring the pull/push branches */ + } + std::vector ops; + skeleton.at(skIdx) = Neon::skeleton::Skeleton(pFieldList[0].getBackend()); + Neon::skeleton::Options opt(configurations.occCli.getOption(), configurations.transferModeCli.getOption()); + ops.push_back(lbmIteration); + skeleton.at(skIdx).sequence(ops, appName.str(), opt); + } + + { + iterationPhase.resetPhase(0); + int const skIdx = iterationPhase.getSkeletonIdx(); + skeleton.at(skIdx).run(); + iterationPhase.updateIterationPhase(); + } + return; + } + NEON_DEV_UNDER_CONSTRUCTION(""); + } + + auto iterate() -> void + { + helpPrep(); + // Iteration keep track of all iterations + // clock_iter keeps tracks of the iteration done after the last clock reset + std::cout << "Starting main LBM loop."
<< std::endl; + + auto& bk = grid.getBackend(); + auto [start, clock_iter] = metrics::restartClock(bk, true); + int time_iter = 0; + // Reset the clock, to be used when a benchmark simulation is executed. + tie(start, clock_iter) = metrics::restartClock(bk, true); + + for (time_iter = 0; time_iter < configurations.benchMaxIter; ++time_iter) { + if ((configurations.vti > 1) && ((time_iter % configurations.vti) == 0)) { + bk.syncAll(); + helpExportVti(); + } + + if (configurations.benchmark && time_iter == configurations.benchIniIter) { + std::cout << "Warm up completed (" << time_iter << " iterations ).\n" + << "Starting benchmark step (" + << configurations.benchMaxIter - configurations.benchIniIter << " iterations)." + << std::endl; + tie(start, clock_iter) = metrics::restartClock(bk, false); + } + + skeleton[iterationPhase.getSkeletonIdx()].run(); + + ++clock_iter; + iterationPhase.updateIterationPhase(); + } + std::cout << "Iterations completed." << std::endl; + metrics::recordMetrics(bk, configurations, *reportPtr, start, clock_iter); + } + + auto helpExportVti() -> void + { + grid.getBackend().syncAll(); + auto& pop = pFieldList.at(iterationPhase.getOutputIdx()); + bool done = false; + if constexpr (method == lbm::Method::push) { + auto computeRhoAndU = ContainerFactory::Push::computeRhoAndU(pop, cellFlagField, rho, u); + computeRhoAndU.run(Neon::Backend::mainStreamIdx); + done = true; + } + if constexpr (method == lbm::Method::pull) { + pop.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::get, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); + auto computeRhoAndU = ContainerFactory::Pull::computeRhoAndU(pop, cellFlagField, rho, u); + computeRhoAndU.run(Neon::Backend::mainStreamIdx); + done = true; + } + if constexpr (method == lbm::Method::aa) { + if (iterationPhase.getPhase() == IterationPhase::Phase::even) { + auto computeRhoAndU = ContainerFactory::AA::Even::computeRhoAndU(pop, cellFlagField, rho, u); + 
computeRhoAndU.run(Neon::Backend::mainStreamIdx); + } else { + auto computeRhoAndU = ContainerFactory::AA::Odd::computeRhoAndU(pop, cellFlagField, rho, u); + computeRhoAndU.run(Neon::Backend::mainStreamIdx); + } + done = true; + } + if (!done) { + NEON_DEV_UNDER_CONSTRUCTION("helpExportVti"); + } + u.updateHostData(Neon::Backend::mainStreamIdx); + rho.updateHostData(Neon::Backend::mainStreamIdx); + // pop.updateHostData(Neon::Backend::mainStreamIdx); + grid.getBackend().sync(Neon::Backend::mainStreamIdx); + + size_t numDigits = 5; + std::string iterIdStr = std::to_string(iterationPhase.getCounter()); + iterIdStr = std::string(numDigits - std::min(numDigits, iterIdStr.length()), '0') + iterIdStr; + + // pop.ioToVtk("pop_" + iterIdStr, "pop", false); + u.ioToVtk("u_" + iterIdStr, "u", false, Neon::IoFileType::BINARY); + rho.ioToVtk("rho_" + iterIdStr, "rho", false, Neon::IoFileType::BINARY); + cellFlagField.template ioToVtk("cellFlagField_" + iterIdStr, "flag", false); + +#if 0 + std::vector> xPosVal; + std::vector> yPosVal; + const double scale = 1.0 / ulid.v[0]; + + const Neon::index_3d grid_dim = grid.getDimension(); + u.forEachActiveCell([&](const Neon::index_3d& id, const int& card, auto& val) { + if (id.x == grid_dim.x / 2 && id.z == grid_dim.z / 2) { + if (card == 0) { + yPosVal.push_back({static_cast(id.v[1]) / static_cast(grid_dim.y), val * scale}); + } + } + + if (id.y == grid_dim.y / 2 && id.z == grid_dim.z / 2) { + if (card == 1) { + xPosVal.push_back({static_cast(id.v[0]) / static_cast(grid_dim.x), val * scale}); + } + } + }, + Neon::computeMode_t::seq); + + // sort the position so the linear interpolation works + std::sort(xPosVal.begin(), xPosVal.end(), [=](std::pair& a, std::pair& b) { + return a.first < b.first; + }); + + std::sort(yPosVal.begin(), yPosVal.end(), [=](std::pair& a, std::pair& b) { + return a.first < b.first; + }); + + auto writeToFile = [](const std::vector>& posVal, std::string filename) { + std::ofstream file; + file.open(filename); 
+ for (auto v : posVal) { + file << v.first << " " << v.second << "\n"; + } + file.close(); + }; + writeToFile(yPosVal, "NeonUniformLBM_" + iterIdStr + "_Y.dat"); + writeToFile(xPosVal, "NeonUniformLBM_" + iterIdStr + "_X.dat"); +#endif + } + + + struct IterationPhase + { + enum Phase + { + even, + odd, + }; + + private: + Phase state{Phase::even}; + + int counter = 0; + + public: + auto getCounter() const -> int + { + return counter; + } + + auto resetPhase(Phase newPhase) + { + state = newPhase; + counter = 0; + } + + auto resetPhase(int iteration) + { + if (iteration != 0 && iteration != 1) { + NEON_THROW_UNSUPPORTED_OPERATION(""); + } + state = iteration == 0 ? even : odd; + counter = 0; + } + + auto getPhase() const -> Phase + { + return state; + } + + auto updateIterationPhase() -> void + { + state = state == even ? odd : even; + counter++; + } + + auto getInputIdx() -> int + { + if constexpr (method == lbm::Method::pull || method == lbm::Method::push) { + return state == IterationPhase::even ? 0 : 1; + } + if constexpr (method == lbm::Method::aa) { + return 0; + } + NEON_THROW_UNSUPPORTED_OPERATION("helpGetInputIdx"); + } + auto getOutputIdx() -> int + { + if constexpr (method == lbm::Method::pull || method == lbm::Method::push) { + return state == IterationPhase::even ? 1 : 0; + } + if constexpr (method == lbm::Method::aa) { + return 0; + } + NEON_THROW_UNSUPPORTED_OPERATION("helpGetInputIdx"); + } + + auto getSkeletonIdx() -> int + { + if constexpr (method == lbm::Method::pull || method == lbm::Method::push || method == lbm::Method::aa) { + return state == IterationPhase::even ? 
0 : 1; + } + NEON_THROW_UNSUPPORTED_OPERATION("helpGetInputIdx"); + } + }; + + Config configurations; + IterationPhase iterationPhase; + bool prepDone = false; + Grid grid; + std::vector pFieldList; + CField cellFlagField; + RhoField rho; + UField u; + std::vector skeleton; + Report* reportPtr; +}; diff --git a/benchmarks/lbm/src/Methods.h b/benchmarks/lbm/src/Methods.h new file mode 100644 index 00000000..11a1da6f --- /dev/null +++ b/benchmarks/lbm/src/Methods.h @@ -0,0 +1,59 @@ +#pragma once +#include "Neon/core/core.h" + +namespace lbm { +enum class Method +{ + push = 0, + pull = 1, + aa = 2 +}; + +struct MethodUtils +{ + template + static auto getNumberOfPFields() -> int + { + switch (method) { + case Method::pull: + return 2; + case Method::push: + return 2; + case Method::aa: + return 1; + } + std::stringstream msg; + msg << "The following LBM method is not recognized" << lbm::MethodUtils::toString(method) << std::endl; + NEON_THROW_UNSUPPORTED_OPERATION(msg.str()); + } + + static auto toString(lbm::Method method) -> std::string + { + switch (method) { + case Method::pull: + return "pull"; + case Method::push: + return "push"; + case Method::aa: + return "aa"; + } + std::stringstream msg; + msg << "The following LBM method is not recognized" << lbm::MethodUtils::toString(method) << std::endl; + NEON_THROW_UNSUPPORTED_OPERATION(msg.str()); + } + + static auto formInt(int method) -> Method + { + if (method == int(Method::pull)) + return Method::pull; + if (method == int(Method::push)) + return Method::push; + if (method == int(Method::aa)) + return Method::aa; + + std::stringstream msg; + msg << "The following LBM method is not recognized" << method << std::endl; + NEON_THROW_UNSUPPORTED_OPERATION(msg.str()); + } +}; +} // namespace lbm \ No newline at end of file diff --git a/benchmarks/lbm/src/Metrics.h b/benchmarks/lbm/src/Metrics.h new file mode 100644 index 00000000..10356a4a --- /dev/null +++ b/benchmarks/lbm/src/Metrics.h @@ -0,0 +1,88 @@ +#pragma once 
+#include +#include "Config.h" +#include "Neon/Neon.h" +#include "Neon/set/Backend.h" +#include "Repoert.h" + +namespace metrics { +// Return a new clock for the current time, for benchmarking. +namespace { + +auto restartClock(Neon::Backend& bk, bool sync = true) +{ + if (sync) { + bk.syncAll(); + } + return std::make_pair(std::chrono::high_resolution_clock::now(), 0); +} + +void recordBackend(Neon::Backend& bk, + Report& report) +{ + report.recordBk(bk); +} + +void recordGrid(Neon::domain::interface::GridBase& g, + Report& report) +{ + report.recordGrid(g); +} + +} // namespace + + +// Compute the time elapsed since a starting point, and the corresponding +// benchmarks of the code in Mega Lattice site updates per second (MLups). +template +void recordMetrics(Neon::Backend& bk, + const Config& config, + Report& report, + TimePoint start, + int clock_iter) +{ + bk.syncAll(); + size_t nElements = size_t(config.N) * size_t(config.N) * size_t(config.N); /* widen each factor first so N^3 is not computed in a narrower type (config.N is presumably an int - confirm in Config.h) */ + auto stop = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(stop - start); + double mlups = static_cast(nElements * clock_iter) / duration.count(); + + report.recordLoopTime(duration.count(), "microseconds"); + report.recordMLUPS(mlups); + + std::cout << "Metrics: " << std::endl; + std::cout << "-- time: " << std::setprecision(4) << duration.count() << " microseconds" << std::endl; + std::cout << "-- MLUPS: " << std::setprecision(4) << mlups << " MLUPS" << std::endl; +} + +template +void recordGridInitMetrics(Neon::Backend& bk, + Report& report, + TimePoint start) +{ + bk.syncAll(); + auto stop = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(stop - start); + + report.recordNeonGridInitTime(duration.count(), "microseconds"); + + std::cout << "Metrics: " << std::endl; + std::cout << "- Grid Init: " << std::setprecision(4) << duration.count() << " microseconds" << std::endl; +} + + +template +void recordProblemSetupMetrics(Neon::Backend& bk, + Report& report, +
TimePoint start) +{ + bk.syncAll(); + auto stop = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(stop - start); + + report.recordProblemSetupTime(duration.count(), "microseconds"); + + std::cout << "Metrics: " << std::endl; + std::cout << " Problem Setup: " << std::setprecision(4) << duration.count() << " microseconds" << std::endl; +} +} // namespace metrics \ No newline at end of file diff --git a/benchmarks/lbm/src/Precision.h b/benchmarks/lbm/src/Precision.h new file mode 100644 index 00000000..a45ff69e --- /dev/null +++ b/benchmarks/lbm/src/Precision.h @@ -0,0 +1,13 @@ +#pragma once + +#include "Neon/Neon.h" +#include "Neon/set/Backend.h" +#include "Neon/set/memory/memSet.h" + +template +struct Precision +{ + using Storage = StorageFP; + using Compute = ComputeFP; +}; diff --git a/benchmarks/lbm/src/Repoert.h b/benchmarks/lbm/src/Repoert.h new file mode 100644 index 00000000..095bce9a --- /dev/null +++ b/benchmarks/lbm/src/Repoert.h @@ -0,0 +1,40 @@ +#pragma once + +#include +#include +#include "Config.h" +#include "Neon/domain/interface/GridBase.h" +struct Report +{ + Neon::Report mReport; + std::string mFname; + + std::vector mMLUPS; + std::vector mLoopTime; + std::vector mNeonGridInitTime; + std::vector mProblemSetupTime; + + std::string mtimeUnit = ""; + + explicit Report(const Config& c); + + auto recordMLUPS(double mlups) + -> void; + + auto recordLoopTime(double time, + const std::string& unit) + -> void; + + auto recordNeonGridInitTime(double time, + const std::string& unit) + -> void; + + auto recordProblemSetupTime(double time, + const std::string& unit) + -> void; + + auto save(std::stringstream & testCode) + -> void; + void recordBk(Neon::Backend& backend); + void recordGrid(Neon::domain::interface::GridBase& g); +}; diff --git a/benchmarks/lbm/src/Report.cpp b/benchmarks/lbm/src/Report.cpp new file mode 100644 index 00000000..e332de43 --- /dev/null +++ b/benchmarks/lbm/src/Report.cpp @@ -0,0 +1,113 @@ 
+#include +#include +#include "Repoert.h" + +Report::Report(const Config& c) + : mReport("lbm-lid-driven-cavity-flow") +{ + mFname = c.reportFile; + + mReport.addMember("argv", c.mArgv); + + mReport.addMember("Re", c.Re); + mReport.addMember("ulb", c.ulb); + mReport.addMember("N", c.N); + mReport.addMember("benchmark", c.benchmark); + mReport.addMember("max_t", c.max_t); + mReport.addMember("repetitions", c.repetitions); + mReport.addMember("vti", c.vti); + + + mReport.addMember("benchIniIter", c.benchIniIter); + mReport.addMember("benchMaxIter", c.benchMaxIter); + + mReport.addMember("deviceType", c.deviceType); + mReport.addMember("numDevices", c.devices.size()); + mReport.addMember("devices", c.devices); + mReport.addMember("reportFile", c.reportFile); + mReport.addMember("gridType", c.gridType); + + + + c.occCli.addToReport(mReport); + c.transferModeCli.addToReport(mReport); + c.stencilSemanticCli.addToReport(mReport); + c.spaceCurveCli.addToReport(mReport); + c.collisionCli.addToReport(mReport); + + mReport.addMember("computeTypeStr", c.computeTypeStr); + mReport.addMember("storeTypeStr", c.storeTypeStr); + mReport.addMember("streamingMethod", c.streamingMethod); + mReport.addMember("lattice", c.lattice); + + + mReport.addMember("nu", c.mLbmParameters.nu); + mReport.addMember("omega", c.mLbmParameters.omega); + mReport.addMember("dx", c.mLbmParameters.dx); + mReport.addMember("dt", c.mLbmParameters.dt); +} + +auto Report:: + recordMLUPS(double mlups) + -> void +{ + mMLUPS.push_back(mlups); +} + +auto Report:: + recordLoopTime(double time, + const std::string& unit) + -> void +{ + if (mtimeUnit.length() == 0) { + mtimeUnit = unit; + } + if (unit != mtimeUnit) { /* compare the unit text itself: "microseconds" and "milliseconds" have the same length */ + NEON_THROW_UNSUPPORTED_OPERATION("Time unit inconsistency"); + } + mLoopTime.push_back(time); +} + +auto Report::recordNeonGridInitTime(double time, const std::string& unit) -> void +{ + if (mtimeUnit.length() == 0) { + mtimeUnit = unit; + } + if (unit != mtimeUnit)
{ + NEON_THROW_UNSUPPORTED_OPERATION("Time unit inconsistency"); + } + mNeonGridInitTime.push_back(time); +} + +auto Report::recordProblemSetupTime(double time, const std::string& unit) -> void +{ + if (mtimeUnit.length() == 0) { + mtimeUnit = unit; + } + if (unit != mtimeUnit) { + NEON_THROW_UNSUPPORTED_OPERATION("Time unit inconsistency"); + } + mProblemSetupTime.push_back(time); +} + +auto Report:: + save(std::stringstream & testCode) + -> void +{ + mReport.addMember("MLUPS", mMLUPS); + mReport.addMember(std::string("Loop Time (") + mtimeUnit + ")", mLoopTime); + mReport.addMember(std::string("Problem Setup Time (") + mtimeUnit + ")", mProblemSetupTime); + mReport.addMember(std::string("Neon Grid Init Time (") + mtimeUnit + ")", mNeonGridInitTime); + + mReport.write(mFname + testCode.str(), true); +} + +void Report::recordBk(Neon::Backend& backend) +{ + backend.toReport(mReport); +} + +void Report::recordGrid(Neon::domain::interface::GridBase& g) +{ + g.toReport(mReport, true); +} \ No newline at end of file diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cu new file mode 100644 index 00000000..55d6bac5 --- /dev/null +++ b/benchmarks/lbm/src/RunCavityTwoPop.cu @@ -0,0 +1,307 @@ +#include "Config.h" + +#include "D3Q19.h" +#include "D3Q27.h" + +#include "Neon/domain/bGrid.h" +#include "Neon/domain/dGrid.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" +#include "Neon/domain/eGrid.h" + +#include "./Lbm.h" +#include "CellType.h" +#include "Metrics.h" +#include "Repoert.h" +namespace CavityTwoPop { + +int backendWasReported = false; +// #include +// #include "/usr/include/fenv.h" + +namespace details { +template +auto run(Config& config, + Report& report, + [[maybe_unused]] std::stringstream& code) -> void +{ + using Storage = Storage_; + using Compute = Compute_; + using Precision = Precision; + using Lattice = Lattice_; // D3Q27; + + code << "_" << config.deviceType << "_"; + for (auto const& id :
config.devices) { + code << id; + } + code << "_SS" << config.stencilSemanticCli.getStringOption(); + code << "_SF" << config.spaceCurveCli.getStringOption(); + code << "_TM" << config.transferModeCli.getStringOption(); + code << "_Occ" << config.occCli.getStringOption(); + code << "__"; + // using PopulationField = typename Grid::template Field; + + // using PopField = typename Grid::template Field; + // using CellTypeField = typename Grid::template Field; + + // using Idx = typename PopField::Idx; + // using RhoField = typename Grid::template Field; + // using UField = typename Grid::template Field; + + Neon::double_3d ulid(1., 0., 0.); + // Neon Grid and Fields initialization + Neon::index_3d domainDim(config.N, config.N, config.N); + + Lbm lbm(config, + report, + [](Neon::index_3d const&) { return true; }); + auto ulb = config.ulb; + lbm.setBC([=] NEON_CUDA_HOST_DEVICE(Neon::index_3d const& globalIdx, + NEON_OUT Storage p[Lattice::Q], + NEON_OUT CellType::Classification& cellClass) { + typename Lattice::Precision::Storage popVal = 0; + + if (globalIdx.x == 0 || globalIdx.x == domainDim.x - 1 || + globalIdx.y == 0 || globalIdx.y == domainDim.y - 1 || + globalIdx.z == 0 || globalIdx.z == domainDim.z - 1) { + cellClass = CellType::bounceBack; + + if (globalIdx.y == domainDim.y - 1) { + cellClass = CellType::movingWall; + } + + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + if (globalIdx.y == domainDim.y - 1) { + popVal = -6. 
* Lattice::Registers::template getT() * ulb * + (Lattice::Registers::template getVelocityComponent() * ulid.v[0] + + Lattice::Registers::template getVelocityComponent() * ulid.v[1] + + Lattice::Registers::template getVelocityComponent() * ulid.v[2]); + } else { + popVal = 0; + } + p[q] = popVal; + }); + } else { + cellClass = CellType::bulk; + Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) { + using M = typename Lattice::template RegisterMapper; + p[q] = Lattice::Registers::template getT(); + }); + } + }); + lbm.iterate(); +} + + +template +auto runFilterMethod(Config& config, + Report& report, + std::stringstream& testCode) -> void +{ + //feenableexcept(FE_ALL_EXCEPT & ~FE_INEXACT); // Enable all floating point exceptions but FE_INEXACT + if (config.streamingMethod == "push") { + if (config.devices.size() != 1) { + NEON_THROW_UNSUPPORTED_OPERATION("We only support PUSH in a single device configuration for now.") + } + testCode << "_push"; + return run(config, report, testCode); + } + if (config.streamingMethod == "pull") { + testCode << "_pull"; + return run(config, report, testCode); + } + if (config.streamingMethod == "aa") { + if (config.devices.size() != 1) { + NEON_THROW_UNSUPPORTED_OPERATION("We only support AA in a single device configuration for now.") + } + testCode << "_aa"; + return run(config, report, testCode); + } + NEON_DEV_UNDER_CONSTRUCTION(""); +} + +template +auto runFilterCollision(Config& config, + Report& report, + std::stringstream& testCode) -> void +{ + if (config.collisionCli.getOption() == Collision::bgk) { + testCode << "_bgk"; + return runFilterMethod(config, report, testCode); + } + if (config.collisionCli.getOption() == Collision::kbc) { + if (config.lattice != "d3q27" && config.lattice != "D3Q27") { + Neon::NeonException e("runFilterCollision"); + e << "LBM kbc collision model only supports d3q27 lattice"; + NEON_THROW(e); + } + testCode << "_kbc"; + using L = D3Q27>; + if constexpr (std::is_same_v) { + return 
runFilterMethod(config, report, testCode); + } + } + NEON_DEV_UNDER_CONSTRUCTION(""); +} + +template +auto runFilterLattice(Config& config, + Report& report, + std::stringstream& testCode) -> void +{ + using P = Precision; + + if (config.lattice == "d3q19" || config.lattice == "D3Q19") { + testCode << "_D3Q19"; + using L = D3Q19

; + return runFilterCollision(config, report, testCode); + } + if (config.lattice == "d3q27" || config.lattice == "D3Q27") { + testCode << "_D3Q27"; + using L = D3Q27

; + return runFilterCollision(config, report, testCode); + } + NEON_DEV_UNDER_CONSTRUCTION("Lattice type not supported. Available options: D3Q19 and D3Q27"); +} + + +template +auto runFilterComputeType(Config& config, + Report& report, + std::stringstream& testCode) +{ + if (config.computeTypeStr == "double") { + testCode << "_Cdouble"; /* C = compute precision; the tags were swapped with the storage filter below */ + return runFilterLattice(config, report, testCode); + } + if (config.computeTypeStr == "float") { + testCode << "_Cfloat"; + return runFilterLattice(config, report, testCode); + } + NEON_DEV_UNDER_CONSTRUCTION(""); +} + +template +auto runFilterStoreType(Config& config, + Report& report, + std::stringstream& testCode) + -> void +{ + if (config.storeTypeStr == "double") { + testCode << "_Sdouble"; /* S = storage precision */ + return runFilterComputeType(config, report, testCode); + } + if (config.storeTypeStr == "float") { + testCode << "_Sfloat"; + return runFilterComputeType(config, report, testCode); + } + NEON_DEV_UNDER_CONSTRUCTION(""); +} +} // namespace details + +#ifdef NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS +constexpr bool skipTest = false; +#else +constexpr bool skipTest = false; +#endif + +auto run(Config& config, + Report& report, + std::stringstream& testCode) -> void +{ + testCode << "___" << config.N << "_"; + testCode << "_numDevs_" << config.devices.size(); + + if (config.gridType == "dGrid") { + testCode << "_dGrid"; + return details::runFilterStoreType(config, report, testCode); + } + // if (config.gridType == "eGrid") { + // if constexpr (!skipTest) { + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables.
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid" || config.gridType == "bGrid_8_8_8") { + // return details::runFilterStoreType(config, report); + // } + if (config.gridType == "bGrid_4_4_4") { + if constexpr (!skipTest) { + testCode << "_bGrid_4_4_4"; + using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>; + using Grid = Neon::domain::details::bGrid::bGrid; + return details::runFilterStoreType(config, report, testCode); + } else { + NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + } + } + // if (config.gridType == "bGrid_8_8_8") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<8, 8, 8>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report, testCode); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid_2_2_2") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<2, 2, 2>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid_32_8_4") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid_32_8_4") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid_32_2_8") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 2, 8>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "bGrid_32_8_2") { + // if constexpr (!skipTest) { + // using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 2>; + // using Grid = Neon::domain::details::bGrid::bGrid; + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + // if (config.gridType == "dGridSoA") { + // if constexpr (!skipTest) { + // return details::runFilterStoreType(config, report); + // } else { + // NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. 
PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.") + // } + // } + NEON_THROW_UNSUPPORTED_OPERATION("Unknown grid type: " + config.gridType); +} +} // namespace CavityTwoPop diff --git a/benchmarks/lbm/src/RunCavityTwoPop.h b/benchmarks/lbm/src/RunCavityTwoPop.h new file mode 100644 index 00000000..0386d28e --- /dev/null +++ b/benchmarks/lbm/src/RunCavityTwoPop.h @@ -0,0 +1,13 @@ +#include "Config.h" +#include "D3Q19.h" +#include "Neon/domain/dGrid.h" + +#include "Metrics.h" +#include "Repoert.h" + +namespace CavityTwoPop { + +auto run(Config& config, + Report& report, + std::stringstream&) -> void; +} // namespace CavityTwoPop \ No newline at end of file diff --git a/benchmarks/lbm/src/app.cpp b/benchmarks/lbm/src/app.cpp new file mode 100644 index 00000000..8cbfc1cf --- /dev/null +++ b/benchmarks/lbm/src/app.cpp @@ -0,0 +1,48 @@ + +#include "Config.h" +#include "Repoert.h" +#include "RunCavityTwoPop.h" + +#include "Neon/Neon.h" +#include "Neon/core/tools/clipp.h" +#include "Neon/domain/dGrid.h" + +int main(int argc, char** argv) +{ + Config config; + Neon::init(); + + config.Re = 100.; // Reynolds number + config.ulb = 0.04; // Velocity in lattice units + config.N = 160; // Number of nodes in x-direction + config.benchmark = true; // Run in benchmark mode ? 
+ config.max_t = 10.0; // Non-benchmark mode: Total time in dim.less units + // config.out_freq = 20000000; // Non-benchmark mode: Frequency in LU for output of terminal message and profiles (use 0 for no messages) + // config.data_freq = 20000000; // Non-benchmark mode: Frequency in LU of full data dump (use 0 for no data dump) + // config.bench_ini_iter = 0; // Benchmark mode: Number of warmup iterations + // config.bench_max_iter = 10000; // Benchmark mode: Total number of iterations + // config.perKeeperFile = "perf"; + // config.devices = {0}; + // config.gridType = "dGrid"; + // config.occ = Neon::skeleton::Options_t::Occ::none + + + if (config.parseArgs(argc, argv) != 0) { + return -1; + } + + std::cout << "--------------- Parameters ---------------\n"; + std::cout << config.toString(); + std::cout << "-------------------------------------------\n"; + + Report report(config); + std::stringstream testCode; + for (int i = 0; i < config.repetitions; i++) { + testCode = std::stringstream(); + CavityTwoPop::run(config, report, testCode); + } + + report.save(testCode); + + return 0; +} diff --git a/libNeonCore/include/Neon/core/tools/metaprogramming.h b/libNeonCore/include/Neon/core/tools/metaprogramming.h index 53678ed6..ea004a43 100644 --- a/libNeonCore/include/Neon/core/tools/metaprogramming.h +++ b/libNeonCore/include/Neon/core/tools/metaprogramming.h @@ -4,3 +4,4 @@ #include "Neon/core/tools/metaprogramming/debugHelp.h" #include "Neon/core/tools/metaprogramming/extractTupleVecType.h" #include "Neon/core/tools/metaprogramming/tupleVecTable.h" +#include "Neon/core/tools/metaprogramming/ConstexprFor.h" \ No newline at end of file diff --git a/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h b/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h new file mode 100644 index 00000000..a6d8767e --- /dev/null +++ b/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h @@ -0,0 +1,31 @@ +#pragma once + +namespace Neon { + 
+/** + * Implementation of a constexpr for loop. + * Reference: https://artificial-mind.net/blog/2020/10/31/constexpr-for + * + * The loop is implemented as a recursive template function. + * It is equicalent to the following code: + * + * for(int i = Start; i < End; i += Inc) { + * f(i); + * // do something + * // ... + * // ... + * } + */ +template +constexpr void ConstexprFor(F&& f) +{ + if constexpr (Start < End) { + f(std::integral_constant()); + ConstexprFor(f); + } +} + +} // namespace Neon \ No newline at end of file diff --git a/libNeonCore/include/Neon/core/types/Macros.h b/libNeonCore/include/Neon/core/types/Macros.h index 5e909d3a..d9f47914 100644 --- a/libNeonCore/include/Neon/core/types/Macros.h +++ b/libNeonCore/include/Neon/core/types/Macros.h @@ -206,8 +206,12 @@ #define NEON_RESTRICT restrict #endif -#ifdef NEON_COMPILER_CUDA +#if defined(NEON_COMPILER_CUDA) +#if!defined(_WIN32) #define NEON_RESTRICT __restrict__ +#else +#define NEON_RESTRICT +#endif #endif #ifdef NEON_COMPILER_CLANG diff --git a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h index acdae410..e41c8f26 100644 --- a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h +++ b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h @@ -56,6 +56,10 @@ class Vec_3d num_axis = 3 }; + static constexpr int directionX = axis_e::x_axis; + static constexpr int directionY = axis_e::y_axis; + static constexpr int directionZ = axis_e::z_axis; + union { Integer v[axis_e::num_axis]{0, 0, 0}; @@ -120,10 +124,15 @@ class Vec_3d NEON_CUDA_HOST_DEVICE inline void constexpr set(Integer p[self_t::num_axis]); - NEON_CUDA_HOST_DEVICE inline void constexpr set(const self_t& other); + NEON_CUDA_HOST_DEVICE inline void constexpr set(const self_t& other); NEON_CUDA_HOST_DEVICE inline void constexpr set(const Integer& xyz); + template + NEON_CUDA_HOST_DEVICE inline constexpr Integer getComponent() const + { + return 
v[componentId]; + } //---- [REDUCE SECTION] -------------------------------------------------------------------------------------------- //---- [REDUCE SECTION] -------------------------------------------------------------------------------------------- @@ -324,10 +333,10 @@ class Vec_3d * @return Resulting point is C =(A.x / B.x, A.y / B.y, A.z / B.z) * */ template - NEON_CUDA_HOST_DEVICE inline self_t operator*(const Vec_3d& B) const; + NEON_CUDA_HOST_DEVICE inline constexpr self_t operator*(const Vec_3d& B) const; template - NEON_CUDA_HOST_DEVICE inline self_t operator*(const K_tt& alpha) const; + NEON_CUDA_HOST_DEVICE inline constexpr self_t operator*(const K_tt& alpha) const; /** * Compute the division between two points A and B, component by component (A.x/B.x, A.y/B.y, A.z/B.z). * Be careful!!! if the type is int, the division will be an integer division!!! @@ -364,15 +373,15 @@ class Vec_3d * @param[in] B: second point for the operation. * @return True if A.x <= B.x && A.y <= B.y && A.z <= B.z */ - NEON_CUDA_HOST_DEVICE inline bool operator==(const self_t& B) const; + NEON_CUDA_HOST_DEVICE inline constexpr bool operator==(const self_t& B) const; - NEON_CUDA_HOST_DEVICE inline bool operator==(const Integer other[self_t::num_axis]) const; + NEON_CUDA_HOST_DEVICE inline constexpr bool operator==(const Integer other[self_t::num_axis]) const; - NEON_CUDA_HOST_DEVICE inline bool operator==(const Integer otherScalar) const; + NEON_CUDA_HOST_DEVICE inline constexpr bool operator==(const Integer otherScalar) const; - NEON_CUDA_HOST_DEVICE inline bool operator!=(const self_t& B) const; + NEON_CUDA_HOST_DEVICE inline constexpr bool operator!=(const self_t& B) const; - NEON_CUDA_HOST_DEVICE inline bool operator!=(const Integer other[self_t::num_axis]) const; + NEON_CUDA_HOST_DEVICE inline constexpr bool operator!=(const Integer other[self_t::num_axis]) const; NEON_CUDA_HOST_DEVICE inline self_t operator-() const; diff --git 
a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.timp.h b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.timp.h index fe7222eb..c5ceea55 100644 --- a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.timp.h +++ b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.timp.h @@ -458,7 +458,7 @@ NEON_CUDA_HOST_DEVICE inline Vec_3d Vec_3d template -NEON_CUDA_HOST_DEVICE inline Vec_3d Vec_3d::operator*(const Vec_3d& B) const +NEON_CUDA_HOST_DEVICE inline constexpr Vec_3d Vec_3d::operator*(const Vec_3d& B) const { const Vec_3d& A = *this; // Vec_3d C((Integer)(A.x * B.x), (Integer)(A.y * B.y), (Integer)(A.z * B.z)); @@ -468,7 +468,7 @@ NEON_CUDA_HOST_DEVICE inline Vec_3d Vec_3d template -NEON_CUDA_HOST_DEVICE inline Vec_3d Vec_3d::operator*(const K_tt& alpha) const +NEON_CUDA_HOST_DEVICE inline constexpr Vec_3d Vec_3d::operator*(const K_tt& alpha) const { const Vec_3d& A = *this; const auto alpha_c = static_cast(alpha); @@ -526,35 +526,35 @@ NEON_CUDA_HOST_DEVICE inline bool Vec_3d::operator< template -NEON_CUDA_HOST_DEVICE inline bool Vec_3d::operator==(const Vec_3d& B) const +NEON_CUDA_HOST_DEVICE inline constexpr bool Vec_3d::operator==(const Vec_3d& B) const { const Vec_3d& A = *this; return A.x == B.x && A.y == B.y && A.z == B.z; } template -NEON_CUDA_HOST_DEVICE inline bool Vec_3d::operator==(const IntegerType_ta other[Vec_3d::num_axis]) const +NEON_CUDA_HOST_DEVICE inline constexpr bool Vec_3d::operator==(const IntegerType_ta other[Vec_3d::num_axis]) const { const Vec_3d& A = *this; return A.x == other[0] && A.y == other[1] && A.z == other[2]; } template -NEON_CUDA_HOST_DEVICE inline bool Vec_3d::operator==(const IntegerType_ta otherScalar) const +NEON_CUDA_HOST_DEVICE inline constexpr bool Vec_3d::operator==(const IntegerType_ta otherScalar) const { const Vec_3d& A = *this; return A.x == otherScalar && A.y == otherScalar && A.z == otherScalar; } template -NEON_CUDA_HOST_DEVICE inline bool Vec_3d::operator!=(const Vec_3d& B) const 
+NEON_CUDA_HOST_DEVICE inline constexpr bool Vec_3d::operator!=(const Vec_3d& B) const { const Vec_3d& A = *this; return !(A == B); } template -NEON_CUDA_HOST_DEVICE inline bool Vec_3d::operator!=(const IntegerType_ta other[Vec_3d::num_axis]) const +NEON_CUDA_HOST_DEVICE inline constexpr bool Vec_3d::operator!=(const IntegerType_ta other[Vec_3d::num_axis]) const { const Vec_3d& A = *this; return A.x != other[0] || A.y != other[1] || A.z != other[2]; diff --git a/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h b/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h index 788291a6..940c6d2c 100644 --- a/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h +++ b/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h @@ -58,6 +58,7 @@ template class Vec_4d { public: + using Integer = IntegerType_ta; using element_t = IntegerType_ta; using self_t = Vec_4d; diff --git a/libNeonDomain/include/Neon/domain/Grids.h b/libNeonDomain/include/Neon/domain/Grids.h index aad0cda5..7c899b98 100644 --- a/libNeonDomain/include/Neon/domain/Grids.h +++ b/libNeonDomain/include/Neon/domain/Grids.h @@ -3,3 +3,4 @@ #include "Neon/domain/aGrid.h" #include "Neon/domain/eGrid.h" #include "Neon/domain/bGrid.h" +#include "Neon/domain/dGridSoA.h" diff --git a/libNeonDomain/include/Neon/domain/dGridSoA.h b/libNeonDomain/include/Neon/domain/dGridSoA.h new file mode 100644 index 00000000..bdd63f25 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/dGridSoA.h @@ -0,0 +1,7 @@ +#pragma once +#include "Neon/domain/details/dGridSoA/dGridSoA.h" + + +namespace Neon { +using dGridSoA = Neon::domain::details::dGridSoA::dGridSoA; +} \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h index 1e62f883..1ae2bf1d 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h @@ -63,7 
+63,8 @@ bField::bField(const std::string& fieldUserName, blockConnectivity.mem(), bitmask.mem(), dataBlockOrigins.mem(), - mData->grid->helpGetStencilIdTo3dOffset().rawMem(execution, setIdx)); + mData->grid->helpGetStencilIdTo3dOffset().rawMem(execution, setIdx), + mData->grid->getDimension()); }); } @@ -311,8 +312,9 @@ auto bField::initHaloUpdateTable() -> void T* srcMem = blockViewPartitions[Data::EndPoints::src]->mem(); T* dstMem = blockViewPartitions[Data::EndPoints::dst]->mem(); - Neon::size_4d srcBoundaryBuff(boundaryZBeginIdx[Data::EndPoints::src][static_cast(byDirection)], 0, 0, 0); Neon::size_4d dstGhostBuff(ghostZBeginIdx[Data::EndPoints::dst][static_cast(ByDirectionUtils::invert(byDirection))], 0, 0, 0); + Neon::size_4d srcBoundaryBuff(boundaryZBeginIdx[Data::EndPoints::src][static_cast(byDirection)], 0, 0, 0); + size_t transferDataBlockCount = mData->grid->mData->partitioner1D.getSpanLayout().getBoundsBoundary(setIdxVec[Data::EndPoints::src], byDirection).count; // std::cout << "To " << dstGhostBuff << " prt " << blockViewPartitions[Data::EndPoints::dst]->prtID() << " From " << srcBoundaryBuff << " prt " << blockViewPartitions[Data::EndPoints::src]->prtID() << std::endl; diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h index 59131cd7..e1c7e55d 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h @@ -60,7 +60,8 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, const ActiveCellLambda activeCellLambda, const Neon::domain::Stencil& stencil, const double_3d& spacingData = double_3d(1, 1, 1), - const double_3d& origin = double_3d(0, 0, 0)); + const double_3d& origin = double_3d(0, 0, 0), + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); /** @@ -72,10 +73,10 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate, 
const ActiveCellLambda activeCellLambda /**< Function that identify the user domain inside the boxed Cartesian discretization */, const Neon::domain::Stencil& stencil /**< union of tall the stencil that will be used in the computation */, const int multiResDiscreteIdxSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing and (i+1)* voxelSpacing. - * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1 */ - , + * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1 */, const double_3d& spacingData /** Physical spacing between two consecutive data points in the Cartesian domain */, - const double_3d& origin /** Physical location in space of the origin of the Cartesian discretization */); + const double_3d& origin /** Physical location in space of the origin of the Cartesian discretization */, + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); /** * Returns some properties for a given cartesian in the Cartesian domain. 
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h index 607237c6..a375c64d 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h @@ -1,28 +1,31 @@ #include "Neon/domain/details/bGrid/bGrid.h" +#include "Neon/domain/tools/SpaceCurves.h" namespace Neon::domain::details::bGrid { template template -bGrid::bGrid(const Neon::Backend& backend, - const Neon::int32_3d& domainSize, - const ActiveCellLambda activeCellLambda, - const Neon::domain::Stencil& stencil, - const double_3d& spacingData, - const double_3d& origin) - : bGrid(backend, domainSize, activeCellLambda, stencil, 1, spacingData, origin) +bGrid::bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const double_3d& spacingData, + const double_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType encoderType) + : bGrid(backend, domainSize, activeCellLambda, stencil, 1, spacingData, origin, encoderType) { } template template -bGrid::bGrid(const Neon::Backend& backend, - const Neon::int32_3d& domainSize, - const ActiveCellLambda activeCellLambda, - const Neon::domain::Stencil& stencil, - const int multiResDiscreteIdxSpacing, - const double_3d& spacingData, - const double_3d& origin) +bGrid::bGrid(const Neon::Backend& backend, + const Neon::int32_3d& domainSize, + const ActiveCellLambda activeCellLambda, + const Neon::domain::Stencil& stencil, + const int multiResDiscreteIdxSpacing, + const double_3d& spacingData, + const double_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType encoderType) { @@ -35,18 +38,25 @@ bGrid::bGrid(const Neon::Backend& backend, SBlock::memBlockSizeY, SBlock::memBlockSizeZ); + std::stringstream gridName; + gridName << "bGrid_" << SBlock::memBlockSizeX << "_" + << SBlock::memBlockSizeY << "_" + << 
SBlock::memBlockSizeZ; { auto nElementsPerPartition = backend.devSet().template newDataSet(0); // We do an initialization with nElementsPerPartition to zero, // then we reset to the computed number. - bGrid::GridBase::init("bGrid", + + bGrid::GridBase::init(gridName.str(), backend, domainSize, stencil, nElementsPerPartition, defaultKernelBlockSize, multiResDiscreteIdxSpacing, - origin); + origin, + encoderType, + defaultKernelBlockSize); } { // Initialization of the partitioner @@ -58,6 +68,7 @@ bGrid::bGrid(const Neon::Backend& backend, SBlock::memBlockSize3D.template newType(), domainSize, Neon::domain::Stencil::s27_t(false), + encoderType, multiResDiscreteIdxSpacing); mData->mDataBlockOriginField = mData->partitioner1D.getGlobalMapping(); @@ -107,8 +118,8 @@ bGrid::bGrid(const Neon::Backend& backend, for (int j = 0; j < SBlock::memBlockSize3D.template newType().y; j++) { for (int i = 0; i < SBlock::memBlockSize3D.template newType().x; i++) { auto globalPosition = blockOrigin + Neon::int32_3d(i * this->mData->mMultiResDiscreteIdxSpacing, - j * this->mData->mMultiResDiscreteIdxSpacing, - k * this->mData->mMultiResDiscreteIdxSpacing); + j * this->mData->mMultiResDiscreteIdxSpacing, + k * this->mData->mMultiResDiscreteIdxSpacing); bool const isInDomain = globalPosition < domainSize * this->mData->mMultiResDiscreteIdxSpacing; bool const isActive = activeCellLambda(globalPosition); if (isActive && isInDomain) { @@ -155,8 +166,8 @@ bGrid::bGrid(const Neon::Backend& backend, BlockIdx blockNghIdx = Span::getInvalidBlockId(); typename decltype(blockConnectivity)::Idx nghIdx; Neon::int8_3d stencilPoint(i - int8_t(1), - j - int8_t(1), - k - int8_t(1)); + j - int8_t(1), + k - int8_t(1)); bool isValid = blockConnectivity.getNghIndex(idx, stencilPoint, nghIdx); if (isValid) { blockNghIdx = static_cast(nghIdx.helpGet()); @@ -220,14 +231,16 @@ bGrid::bGrid(const Neon::Backend& backend, mData->stencilIdTo3dOffset.updateDeviceData(backend, Neon::Backend::mainStreamIdx); } // Init 
the base grid - bGrid::GridBase::init("bGrid", + bGrid::GridBase::init(gridName.str(), backend, domainSize, Neon::domain::Stencil(), mData->mNumActiveVoxel, SBlock::memBlockSize3D.template newType(), spacingData, - origin); + origin, + encoderType, + defaultKernelBlockSize); { // setting launchParameters mData->launchParametersTable.forEachSeq([&](Neon::DataView dw, Neon::set::LaunchParameters& bLaunchParameters) { diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h index b12fa671..fc596898 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h @@ -36,7 +36,8 @@ class bPartition typename Idx::DataBlockIdx* mBlockConnectivity, typename SBlock::BitMask const* NEON_RESTRICT mMask, Neon::int32_3d* mOrigin, - NghIdx* mStencilNghIndex); + NghIdx* mStencilNghIndex, + Neon::int32_3d mDomainSize); /** * Retrieve the cardinality of the field. @@ -98,6 +99,27 @@ class bPartition T defaultValue) const -> NghData; + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid = nullptr) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void>; + + template + NEON_CUDA_HOST_DEVICE inline auto + writeNghData(const Idx& gidx, + int card, + T value) + -> bool; + /** * Gets the global coordinates of the cartesian point. */ @@ -109,6 +131,11 @@ class bPartition isActive(const Idx& cell, const typename SBlock::BitMask* mask = nullptr) const -> bool; + + NEON_CUDA_HOST_DEVICE inline auto + getDomainSize() + const -> Neon::index_3d; + /** * Gets the Idx for in the block view space. 
*/ @@ -116,7 +143,7 @@ class bPartition getBlockViewIdx(const Idx& cell) const -> BlockViewGridIdx; - + NEON_CUDA_HOST_DEVICE inline auto helpGetPitch(const Idx& cell, int card) const -> uint32_t; @@ -147,6 +174,7 @@ class bPartition helpGetNghIdx(const Idx& idx, const typename Idx::DataBlockIdx* blockConnectivity) const -> Idx; + int mCardinality; T* mMem; NghIdx const* NEON_RESTRICT mStencilNghIndex; @@ -154,6 +182,8 @@ class bPartition typename SBlock::BitMask const* NEON_RESTRICT mMask; Neon::int32_3d const* NEON_RESTRICT mOrigin; int mSetIdx; + int mMultiResDiscreteIdxSpacing = 1; + Neon::int32_3d mDomainSize; }; } // namespace Neon::domain::details::bGrid diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h index ec456913..75d2006b 100644 --- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h @@ -25,14 +25,16 @@ bPartition:: typename Idx::DataBlockIdx* blockConnectivity, typename SBlock::BitMask const* NEON_RESTRICT mask, Neon::int32_3d* origin, - NghIdx* stencilNghIndex) + NghIdx* stencilNghIndex, + Neon::int32_3d mDomainSize) : mCardinality(cardinality), mMem(mem), mStencilNghIndex(stencilNghIndex), mBlockConnectivity(blockConnectivity), mMask(mask), mOrigin(origin), - mSetIdx(setIdx) + mSetIdx(setIdx), + mDomainSize(mDomainSize) { } @@ -45,9 +47,20 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: location.x += gidx.mInDataBlockIdx.x; location.y += gidx.mInDataBlockIdx.y; location.z += gidx.mInDataBlockIdx.z; + if constexpr (SBlock::isMultiResMode) { + return location * mMultiResDiscreteIdxSpacing; + } return location; } +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getDomainSize() + const -> Neon::index_3d +{ + return mDomainSize; +} + template NEON_CUDA_HOST_DEVICE inline auto bPartition:: getBlockViewIdx(const Idx& gidx) @@ -68,7 +81,7 @@ inline 
NEON_CUDA_HOST_DEVICE auto bPartition:: template inline NEON_CUDA_HOST_DEVICE auto bPartition:: - operator()(const Idx& cell, +operator()(const Idx& cell, int card) -> T& { return mMem[helpGetPitch(cell, card)]; @@ -76,7 +89,7 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition:: template inline NEON_CUDA_HOST_DEVICE auto bPartition:: - operator()(const Idx& cell, +operator()(const Idx& cell, int card) const -> const T& { return mMem[helpGetPitch(cell, card)]; @@ -97,7 +110,7 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition:: helpGetValidIdxPitchExplicit(const Idx& idx, int card) const -> uint32_t { - uint32_t const blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; + uint32_t constexpr blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ; uint32_t const inBlockInCardPitch = idx.mInDataBlockIdx.x + SBlock::memBlockSizeX * idx.mInDataBlockIdx.y + (SBlock::memBlockSizeX * SBlock::memBlockSizeY) * idx.mInDataBlockIdx.z; @@ -371,6 +384,54 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition:: return result; } +template + +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void> +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(gidx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + + if (isValid) { + auto const& value = mMem[pitch]; + funIfValid(value); + return; + } + + if constexpr (!std::is_same_v) { + funIfNOTValid(); + } + return; +} + +template +template +NEON_CUDA_HOST_DEVICE inline auto bPartition:: + writeNghData(const Idx& gidx, + int card, + T value) + -> bool +{ + NghData result; + bIndex nghIdx = helpGetNghIdx(gidx); + auto [isValid, pitch] = helpNghPitch(nghIdx, card); + if (!isValid) { + return false; + } + mMem[pitch] = value; + return true; +} + template NEON_CUDA_HOST_DEVICE inline auto 
bPartition::isActive(const Idx& cell, diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dField_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dField_imp.h index 49f57dbd..11dda19e 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dField_imp.h @@ -26,7 +26,8 @@ dField::dField(const std::string& fieldUserName, T(0), dataUse, memoryOptions, - haloStatus) { + haloStatus) +{ // only works if dims in x and y direction for all partitions match for (int i = 0; i < dims.size() - 1; ++i) { @@ -88,7 +89,7 @@ dField::dField(const std::string& fieldUserName, { // Setting up partitions Neon::aGrid const& aGrid = mData->grid->helpFieldMemoryAllocator(); - mData->memoryField = aGrid.newField(fieldUserName + "-storage", cardinality, T(), dataUse, memoryOptions); + mData->memoryField = aGrid.newField(fieldUserName + "-storage", cardinality, T(), dataUse, memoryOptions); // const int setCardinality = mData->grid->getBackend().getDeviceCount(); mData->partitionTable.forEachConfiguration( [&](Neon::Execution execution, @@ -306,7 +307,7 @@ auto dField::operator()(const Neon::index_3d& idxGlobal, auto& partition = mData->partitionTable.getPartition(Neon::Execution::host, partitionIdx, Neon::DataView::STANDARD); - auto& span = mData->grid->getSpan(Neon::Execution::host,partitionIdx, Neon::DataView::STANDARD); + auto& span = mData->grid->getSpan(Neon::Execution::host, partitionIdx, Neon::DataView::STANDARD); Idx idx; bool isOk = span.setAndValidate(idx, localIDx.x, localIDx.y, localIDx.z); if (!isOk) { @@ -326,7 +327,7 @@ auto dField::getReference(const Neon::index_3d& idxGlobal, auto& partition = mData->partitionTable.getPartition(Neon::Execution::host, partitionIdx, Neon::DataView::STANDARD); - auto& span = mData->grid->getSpan(Neon::Execution::host,partitionIdx, Neon::DataView::STANDARD); + auto& span = mData->grid->getSpan(Neon::Execution::host, partitionIdx, Neon::DataView::STANDARD); Idx 
idx; bool isOk = span.setAndValidate(idx, localIDx.x, localIDx.y, localIDx.z); if (!isOk) { @@ -484,6 +485,81 @@ auto dField::initHaloUpdateTable() transfersVec.push_back(transfer); } }); + + mData->latticeHaloUpdateTable.forEachPutConfiguration( + bk, [&](Neon::SetIdx setIdxSrc, + Execution execution, + Neon::domain::tool::partitioning::ByDirection byDirection, + std::vector& transfersVec) { + { + using namespace Neon::domain::tool::partitioning; + + Neon::SetIdx setIdxDst = getNghSetIdx(setIdxSrc, byDirection); + + int r = grid.getStencil().getRadius(); + + std::array partitions; + std::array, Data::EndPointsUtils::nConfigs> ghostZBeginIdx; + std::array, Data::EndPointsUtils::nConfigs> boundaryZBeginIdx; + std::array memPhyDim; + + partitions[Data::EndPoints::dst] = &this->getPartition(execution, setIdxDst, Neon::DataView::STANDARD); + partitions[Data::EndPoints::src] = &this->getPartition(execution, setIdxSrc, Neon::DataView::STANDARD); + + for (auto endPoint : {Data::EndPoints::dst, Data::EndPoints::src}) { + ghostZBeginIdx[endPoint][static_cast(ByDirection::down)] = 0; + boundaryZBeginIdx[endPoint][static_cast(ByDirection::down)] = r; + boundaryZBeginIdx[endPoint][static_cast(ByDirection::up)] = partitions[endPoint]->dim().z; + ghostZBeginIdx[endPoint][static_cast(ByDirection::up)] = partitions[endPoint]->dim().z + r; + + memPhyDim[endPoint] = Neon::size_4d( + 1, + size_t(partitions[endPoint]->dim().x), + size_t(partitions[endPoint]->dim().x) * partitions[endPoint]->dim().y, + size_t(partitions[endPoint]->dim().x) * partitions[endPoint]->dim().y * (partitions[endPoint]->dim().z + 2 * r)); + } + + for (int j = 0; j < this->getCardinality(); j++) { + auto const& stencil = this->getGrid().getStencil(); + if (this->getCardinality() != stencil.nPoints()) { + continue; + } + T* srcMem = partitions[Data::EndPoints::src]->mem(); + T* dstMem = partitions[Data::EndPoints::dst]->mem(); + + Neon::size_4d srcBoundaryBuff(0, 0, 
boundaryZBeginIdx[Data::EndPoints::src][static_cast(byDirection)], j); + Neon::size_4d dstGhostBuff(0, 0, ghostZBeginIdx[Data::EndPoints::dst][static_cast(ByDirectionUtils::invert(byDirection))], j); + + // std::cout << "To " << dstGhostBuff << " prt " << partitions[Data::EndPoints::dst]->prtID() << " From " << srcBoundaryBuff << "(src dim" << partitions[Data::EndPoints::src]->dim() << ")" << std::endl; + // std::cout << "dst mem " << partitions[Data::EndPoints::dst]->mem() << " " << std::endl; + // std::cout << "dst pitch " << (dstGhostBuff * memPhyDim[Data::EndPoints::dst]).rSum() << " " << std::endl; + // std::cout << "dst dstGhostBuff " << dstGhostBuff << " " << std::endl; + // std::cout << "dst pitch all" << memPhyDim[Data::EndPoints::dst] << " " << std::endl; + + Neon::set::MemoryTransfer transfer({setIdxDst, dstMem + (dstGhostBuff * memPhyDim[Data::EndPoints::dst]).rSum(), dstGhostBuff}, + {setIdxSrc, srcMem + (srcBoundaryBuff * memPhyDim[Data::EndPoints::src]).rSum(), srcBoundaryBuff}, + sizeof(T) * + r * + partitions[Data::EndPoints::src]->dim().x * + partitions[Data::EndPoints::src]->dim().y); + if (ByDirection::up == byDirection && bk.isLastDevice(setIdxSrc)) { + return; + } + + if (ByDirection::down == byDirection && bk.isFirstDevice(setIdxSrc)) { + return; + } + if (ByDirection::up == byDirection && !(stencil.points()[j].z > 0)) { + continue; + } + if (ByDirection::down == byDirection && !(stencil.points()[j].z < 0)) { + continue; + } + // std::cout << transfer.toString() << std::endl; + transfersVec.push_back(transfer); + } + } + }); // // mData->latticeHaloUpdateTable.forEachPutConfiguration( // bk, [&](Neon::SetIdx setIdxSrc, @@ -608,7 +684,33 @@ auto dField:: execution); } } else { - NEON_DEV_UNDER_CONSTRUCTION(""); + auto transfers = bk.template newDataSet>(); + if (this->getMemoryOptions().getOrder() == Neon::MemoryLayout::structOfArrays) { + for (auto byDirection : {tool::partitioning::ByDirection::up, + tool::partitioning::ByDirection::down}) { 
+ + auto const& tableEntryByDir = mData->latticeHaloUpdateTable.get(transferMode, + execution, + byDirection); + + tableEntryByDir.forEachSeq([&](SetIdx setIdx, auto const& tableEntryByDirBySetIdx) { + transfers[setIdx].insert(std::end(transfers[setIdx]), + std::begin(tableEntryByDirBySetIdx), + std::end(tableEntryByDirBySetIdx)); + }); + } + dataTransferContainer = + Neon::set::Container::factoryDataTransfer( + *this, + transferMode, + stencilSemantic, + transfers, + execution); + + + } else { + NEON_DEV_UNDER_CONSTRUCTION(""); + } } Neon::set::Container SyncContainer = Neon::set::Container::factorySynchronization( diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dGrid.h b/libNeonDomain/include/Neon/domain/details/dGrid/dGrid.h index 226b0bb8..5d56e526 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dGrid.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dGrid.h @@ -20,7 +20,7 @@ #include "Neon/domain/interface/LaunchConfig.h" #include "Neon/domain/interface/Stencil.h" #include "Neon/domain/interface/common.h" - +#include "Neon/domain/tools/SpaceCurves.h" #include "Neon/domain/tools/SpanTable.h" #include "Neon/domain/patterns/PatternScalar.h" @@ -84,7 +84,8 @@ class dGrid : public Neon::domain::interface::GridBaseTemplate const SparsityPattern& activeCellLambda /**< InOrOutLambda({x,y,z}->{true, false}) */, const Neon::domain::Stencil& stencil /**< Stencil used by any computation on the grid */, const Vec_3d& spacing = Vec_3d(1, 1, 1) /**< Spacing, i.e. size of a voxel */, - const Vec_3d& origin = Vec_3d(0, 0, 0) /**< Origin */); + const Vec_3d& origin = Vec_3d(0, 0, 0) /**< Origin */, + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); /** * Returns a LaunchParameters configured for the specified inputs. 
diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h index 297971de..a263400a 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h @@ -8,12 +8,18 @@ template dGrid::dGrid(const Neon::Backend& backend, const Neon::int32_3d& dimension, const ActiveCellLambda& /*activeCellLambda*/, - const Neon::domain::Stencil& stencil, - const Vec_3d& spacing, - const Vec_3d& origin) + const Neon::domain::Stencil& stencil, + const Vec_3d& spacing, + const Vec_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType encoderType) { mData = std::make_shared(backend); const index_3d defaultBlockSize(256, 1, 1); + if (encoderType != Neon::domain::tool::spaceCurves::EncoderType::sweep) { + NeonException exce("dGrid"); + exce << "dGRid only supports sweep space filling curves"; + NEON_THROW(exce); + } { auto nElementsPerPartition = backend.devSet().template newDataSet(0); @@ -26,7 +32,9 @@ dGrid::dGrid(const Neon::Backend& backend, nElementsPerPartition, Neon::index_3d(256, 1, 1), spacing, - origin); + origin, + Neon::domain::tool::spaceCurves::EncoderType::sweep, + {0, 0, 0}); } const int32_t numDevices = getBackend().devSet().setCardinality(); @@ -83,15 +91,17 @@ dGrid::dGrid(const Neon::Backend& backend, Neon::DataView dw, dSpan& span) { span.mDataView = dw; - span.mZHaloRadius = setCardinality == 1 ? 0 : mData->halo.z; - span.mZBoundaryRadius = mData->halo.z; + span.mZghostRadius = setCardinality == 1 ? 0 : mData->halo.z; + span.mZboundaryRadius = mData->halo.z; + span.mMaxZInDomain = mData->partitionDims[setIdx].z; switch (dw) { case Neon::DataView::STANDARD: { // Only works z partitions. 
assert(mData->halo.x == 0 && mData->halo.y == 0); - span.mDim = mData->partitionDims[setIdx]; + span.mSpanDim = mData->partitionDims[setIdx]; + break; } case Neon::DataView::BOUNDARY: { @@ -99,8 +109,8 @@ dGrid::dGrid(const Neon::Backend& backend, // Only works z partitions. assert(mData->halo.x == 0 && mData->halo.y == 0); - span.mDim = mData->partitionDims[setIdx]; - span.mDim.z = span.mZBoundaryRadius * 2; + span.mSpanDim = mData->partitionDims[setIdx]; + span.mSpanDim.z = span.mZboundaryRadius * 2; break; } @@ -109,12 +119,12 @@ dGrid::dGrid(const Neon::Backend& backend, // Only works z partitions. assert(mData->halo.x == 0 && mData->halo.y == 0); - span.mDim = mData->partitionDims[setIdx]; - span.mDim.z = span.mDim.z - span.mZBoundaryRadius * 2; - if (span.mDim.z <= 0 && setCardinality > 1) { + span.mSpanDim = mData->partitionDims[setIdx]; + span.mSpanDim.z = span.mSpanDim.z - span.mZboundaryRadius * 2; + if (span.mSpanDim.z <= 0 && setCardinality > 1) { NeonException exp("dGrid"); exp << "The grid size is too small to support the data view model correctly \n"; - exp << span.mDim << " for setIdx " << setIdx << " and device " << getDevSet().devId(setIdx); + exp << span.mSpanDim << " for setIdx " << setIdx << " and device " << getDevSet().devId(setIdx); NEON_THROW(exp); } @@ -132,7 +142,7 @@ dGrid::dGrid(const Neon::Backend& backend, Neon::DataView dw, int& count) { if (Execution::host == execution) { - count = mData->spanTable.getSpan(Neon::Execution::host, setIdx, dw).mDim.rMul(); + count = mData->spanTable.getSpan(Neon::Execution::host, setIdx, dw).mSpanDim.rMul(); } }); } @@ -180,7 +190,9 @@ dGrid::dGrid(const Neon::Backend& backend, nElementsPerPartition, defaultBlockSize, spacing, - origin); + origin, + Neon::domain::tool::spaceCurves::EncoderType::sweep, + {0, 0, 0}); } } @@ -224,11 +236,11 @@ auto dGrid::newContainer(const std::string& name, { const Neon::index_3d& defaultBlockSize = getDefaultBlock(); Neon::set::Container c = 
Neon::set::Container::factory(name, - Neon::set::internal::ContainerAPI::DataViewSupport::on, - *this, - lambda, - defaultBlockSize, - [](const Neon::index_3d&) { return 0; }); + Neon::set::internal::ContainerAPI::DataViewSupport::on, + *this, + lambda, + defaultBlockSize, + [](const Neon::index_3d&) { return 0; }); return c; } @@ -242,11 +254,11 @@ auto dGrid::newContainer(const std::string& name, -> Neon::set::Container { Neon::set::Container c = Neon::set::Container::factory(name, - Neon::set::internal::ContainerAPI::DataViewSupport::on, - *this, - lambda, - blockSize, - [sharedMem](const Neon::index_3d&) { return sharedMem; }); + Neon::set::internal::ContainerAPI::DataViewSupport::on, + *this, + lambda, + blockSize, + [sharedMem](const Neon::index_3d&) { return sharedMem; }); return c; } diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h index 3291e622..a2c57cdb 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h @@ -37,9 +37,9 @@ struct dIndex NEON_CUDA_HOST_DEVICE inline explicit dIndex(const Location& location); - NEON_CUDA_HOST_DEVICE inline auto set() -> Location&; + NEON_CUDA_HOST_DEVICE inline auto setLocation() -> Location&; - NEON_CUDA_HOST_DEVICE inline auto get() const -> const Location&; + NEON_CUDA_HOST_DEVICE inline auto getLocation() const -> const Location&; }; } // namespace Neon::domain::details::dGrid diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h index 4389fb3f..6426e43a 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h @@ -16,11 +16,11 @@ NEON_CUDA_HOST_DEVICE inline dIndex::dIndex(const Location::Integer &x, mLocation.z = z; } -NEON_CUDA_HOST_DEVICE inline auto dIndex::set() -> Location& 
+NEON_CUDA_HOST_DEVICE inline auto dIndex::setLocation() -> Location& { return mLocation; } -NEON_CUDA_HOST_DEVICE inline auto dIndex::get() const -> const Location& +NEON_CUDA_HOST_DEVICE inline auto dIndex::getLocation() const -> const Location& { return mLocation; } diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h index 196f6b70..c1e17b0b 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h @@ -44,16 +44,16 @@ class dPartition int cardinality, Neon::index_3d fullGridSize, NghIdx* stencil = nullptr) - : m_dataView(dataView), - m_mem(mem), - m_dim(dim), - m_zHaloRadius(zHaloRadius), - m_zBoundaryRadius(zBoundaryRadius), - m_pitch(pitch), - m_prtID(prtID), - m_origin(origin), - m_cardinality(cardinality), - m_fullGridSize(fullGridSize), + : mDataView(dataView), + mMem(mem), + mDim(dim), + mZHaloRadius(zHaloRadius), + mZBoundaryRadius(zBoundaryRadius), + mPitch(pitch), + mPrtID(prtID), + mOrigin(origin), + mCardinality(cardinality), + mFullGridSize(fullGridSize), mPeriodicZ(false), mStencil(stencil) { @@ -70,21 +70,21 @@ class dPartition prtID() const -> int { - return m_prtID; + return mPrtID; } inline NEON_CUDA_HOST_DEVICE auto cardinality() const -> int { - return m_cardinality; + return mCardinality; } inline NEON_CUDA_HOST_DEVICE auto getPitchData() const -> const Pitch& { - return m_pitch; + return mPitch; } inline NEON_CUDA_HOST_DEVICE auto @@ -92,76 +92,76 @@ class dPartition int cardinalityIdx = 0) const -> int64_t { - return idx.get().x * int64_t(m_pitch.x) + - idx.get().y * int64_t(m_pitch.y) + - idx.get().z * int64_t(m_pitch.z) + - cardinalityIdx * int64_t(m_pitch.w); + return idx.getLocation().x * int64_t(mPitch.x) + + idx.getLocation().y * int64_t(mPitch.y) + + idx.getLocation().z * int64_t(mPitch.z) + + cardinalityIdx * int64_t(mPitch.w); } inline NEON_CUDA_HOST_DEVICE auto 
dim() const -> const Neon::index_3d { - return m_dim; + return mDim; } inline NEON_CUDA_HOST_DEVICE auto halo() const -> const Neon::index_3d { - return Neon::index_3d(0, 0, m_zHaloRadius); + return Neon::index_3d(0, 0, mZHaloRadius); } inline NEON_CUDA_HOST_DEVICE auto origin() const -> const Neon::index_3d { - return m_origin; + return mOrigin; } NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, NghIdx nghOffset, int card, const T& alternativeVal) const -> NghData { - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, nghOffset, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh); T val = alternativeVal; if (isValidNeighbour) { - val = operator()(cellNgh, card); + val = operator()(gidxNgh, card); } return NghData(val, isValidNeighbour); } NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, NghIdx nghOffset, int card) const -> NghData { - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, nghOffset, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh); T val; if (isValidNeighbour) { - val = operator()(cellNgh, card); + val = operator()(gidxNgh, card); } return NghData(val, isValidNeighbour); } - template + template NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, int card, LambdaVALID funIfValid, LambdaNOTValid funIfNOTValid = nullptr) - const -> std::enable_if_t , void> + const -> std::enable_if_t, void> { - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); if (isValidNeighbour) { - T val = this->operator()(cellNgh, card); + T val = this->operator()(gidxNgh, card); funIfValid(val); } if constexpr (!std::is_same_v) { @@ -171,129 +171,146 @@ class dPartition } } - template + template NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + 
getNghData(const Idx& gidx, int card) const -> NghData { - NghData res; - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); if (isValidNeighbour) { - T val = operator()(cellNgh, card); - res.set(val, true); - } else { - res.invalidate(); + T val = operator()(gidxNgh, card); + return NghData(val, isValidNeighbour); } - return res; + return NghData(); + } + + template + NEON_CUDA_HOST_DEVICE inline auto + writeNghData(const Idx& gidx, + int card, + T value) + -> bool + { + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); + if (isValidNeighbour) { + operator()(gidxNgh, card) = value; + } + return isValidNeighbour; } template NEON_CUDA_HOST_DEVICE inline auto - getNghData(const Idx& eId, + getNghData(const Idx& gidx, int card, T const& defaultValue) const -> NghData { NghData res(defaultValue, false); - Idx cellNgh; - const bool isValidNeighbour = nghIdx(eId, cellNgh); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); if (isValidNeighbour) { - T val = operator()(cellNgh, card); + T val = operator()(gidxNgh, card); res.set(val, true); } return res; } NEON_CUDA_HOST_DEVICE inline auto - nghVal(const Idx& eId, + nghVal(const Idx& gidx, uint8_t nghID, int card, const T& alternativeVal) const -> NghData { NghIdx nghOffset = mStencil[nghID]; - return getNghData(eId, nghOffset, card, alternativeVal); + return getNghData(gidx, nghOffset, card, alternativeVal); } /** * Get the index of the neighbor given the offset * @tparam dataView_ta - * @param[in] eId Index of the current element + * @param[in] gidx Index of the current element * @param[in] nghOffset Offset of the neighbor of interest from the current element * @param[in,out] neighbourIdx Index of the neighbor * @return Whether the neighbour is valid */ NEON_CUDA_HOST_DEVICE inline auto - nghIdx(const Idx& eId, - const NghIdx& nghOffset, - Idx& neighbourIdx) + helpGetNghIdx(const 
Idx& gidx, + const NghIdx& nghOffset, + Idx& neighbourIdx) const -> bool { - Idx cellNgh(eId.get().x + nghOffset.x, - eId.get().y + nghOffset.y, - eId.get().z + nghOffset.z); + Idx gidxNgh(gidx.getLocation().x + nghOffset.x, + gidx.getLocation().y + nghOffset.y, + gidx.getLocation().z + nghOffset.z); - const auto cellNghGlobal = getGlobalIndex(cellNgh); + const auto gidxNghGlobal = getGlobalIndex(gidxNgh); bool isValidNeighbour = true; - if (mPeriodicZ) { - printf("Error, periodic not implemented yet"); - assert(false); - } - - isValidNeighbour = (cellNghGlobal.x >= 0) && - (cellNghGlobal.y >= 0) && - (cellNghGlobal.z >= 0); - - // isValidNeighbour = (cellNgh.get().x < m_dim.x) && - // (cellNgh.get().y < m_dim.y) && - // (cellNgh.get().z < m_dim.z + 2 * m_zHaloRadius) && isValidNeighbour; + isValidNeighbour = (gidxNghGlobal.x >= 0) && + (gidxNghGlobal.y >= 0) && + (gidxNghGlobal.z >= 0); - isValidNeighbour = (cellNghGlobal.x < m_fullGridSize.x) && - (cellNghGlobal.y < m_fullGridSize.y) && - (cellNghGlobal.z < m_fullGridSize.z) && + isValidNeighbour = (gidxNghGlobal.x < mFullGridSize.x) && + (gidxNghGlobal.y < mFullGridSize.y) && + (gidxNghGlobal.z < mFullGridSize.z) && isValidNeighbour; if (isValidNeighbour) { - neighbourIdx = cellNgh; + neighbourIdx = gidxNgh; } return isValidNeighbour; } template NEON_CUDA_HOST_DEVICE inline auto - nghIdx(const Idx& eId, - Idx& cellNgh) + helpGetNghIdx(const Idx& gidx, + Idx& gidxNgh) const -> bool { - cellNgh = Idx(eId.get().x + xOff, - eId.get().y + yOff, - eId.get().z + zOff); - Idx cellNgh_global(cellNgh.get() + m_origin); - // const bool isValidNeighbour = (cellNgh_global >= 0 && cellNgh < (m_dim + m_halo) && cellNgh_global < m_fullGridSize); + // NghIdx offset(xOff, yOff, zOff); + // return helpGetNghIdx(gidx, offset, gidxNgh); + gidxNgh = Idx(gidx.getLocation().x + xOff, + gidx.getLocation().y + yOff, + gidx.getLocation().z + zOff); + bool isValidNeighbour = true; if constexpr (xOff > 0) { - isValidNeighbour = 
cellNgh.get().x < (m_dim.x) && isValidNeighbour; - isValidNeighbour = cellNgh_global.get().x <= m_fullGridSize.x && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionX; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; } if constexpr (xOff < 0) { - isValidNeighbour = cellNgh_global.get().x >= 0 && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionX; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; } if constexpr (yOff > 0) { - isValidNeighbour = cellNgh.get().y < (m_dim.y) && isValidNeighbour; - isValidNeighbour = cellNgh_global.get().y <= m_fullGridSize.y && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionY; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; } if constexpr (yOff < 0) { - isValidNeighbour = cellNgh_global.get().y >= 0 && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionY; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; } if constexpr (zOff > 0) { - isValidNeighbour = cellNgh.get().z < (m_dim.z + m_zHaloRadius * 2) && isValidNeighbour; - isValidNeighbour = cellNgh_global.get().z <= m_fullGridSize.z && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionZ; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; } if constexpr (zOff < 0) { - isValidNeighbour = cellNgh_global.get().z >= m_zHaloRadius && isValidNeighbour; + int constexpr direction = Neon::index_3d::directionZ; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); 
+ isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; } return isValidNeighbour; } @@ -303,7 +320,7 @@ class dPartition mem() -> T* { - return m_mem; + return mMem; } NEON_CUDA_HOST_DEVICE inline auto @@ -311,7 +328,7 @@ class dPartition const -> const T* { - return m_mem; + return mMem; } NEON_CUDA_HOST_DEVICE inline auto @@ -319,7 +336,7 @@ class dPartition int cardinalityIdx) -> T* { int64_t p = getPitch(cell, cardinalityIdx); - return m_mem[p]; + return &mMem[p]; /* declared return type is T*: hand back the element's address, not the element (mMem[p] is a T&) */ } NEON_CUDA_HOST_DEVICE inline auto @@ -327,7 +344,7 @@ class dPartition int cardinalityIdx) -> T& { int64_t p = getPitch(cell, cardinalityIdx); - return m_mem[p]; + return mMem[p]; } NEON_CUDA_HOST_DEVICE inline auto @@ -335,7 +352,7 @@ class dPartition int cardinalityIdx) const -> const T& { int64_t p = getPitch(cell, cardinalityIdx); - return m_mem[p]; + return mMem[p]; } template @@ -377,7 +394,8 @@ class dPartition } } - NEON_CUDA_HOST_DEVICE inline auto getGlobalIndex(const Idx& local) const -> Neon::index_3d + NEON_CUDA_HOST_DEVICE inline auto + getGlobalIndex(const Idx& local) const -> Neon::index_3d { // assert(local.mLocation.x >= 0 && // local.mLocation.y >= 0 && @@ -386,22 +404,35 @@ class dPartition // local.mLocation.y < m_dim.y && // local.mLocation.z < m_dim.z + m_zHaloRadius); - Neon::index_3d result = local.mLocation + m_origin; - result.z -= m_zHaloRadius; + Neon::index_3d result = local.mLocation; + result.z = result.z + mOrigin.z - mZHaloRadius; return result; } + template + NEON_CUDA_HOST_DEVICE inline auto getGlobalIndexByDirection(const Idx& local) + const -> int + { + if constexpr (Neon::index_3d::directionZ != direction) { + return local.mLocation.v[direction]; + } else { + return local.mLocation.v[Neon::index_3d::directionZ] + + mOrigin.v[Neon::index_3d::directionZ] - + mZHaloRadius; + } + } + NEON_CUDA_HOST_DEVICE inline auto getDomainSize() const -> Neon::index_3d { - return m_fullGridSize; + return mFullGridSize; } auto ioToVti(std::string const& fname, 
std::string const& fieldName) { - auto fnameCommplete = fname + "_" + std::to_string(m_prtID); - auto haloOrigin = Vec_3d(m_origin.x, m_origin.y, m_origin.z - m_zHaloRadius); - auto haloDim = m_dim + Neon::index_3d(0, 0, 2 * m_zHaloRadius) + 1; + auto fnameCommplete = fname + "_" + std::to_string(mPrtID); + auto haloOrigin = Vec_3d(mOrigin.x, mOrigin.y, mOrigin.z - mZHaloRadius); + auto haloDim = mDim + Neon::index_3d(0, 0, 2 * mZHaloRadius) + 1; IoToVTK io(fnameCommplete, haloDim, @@ -413,25 +444,37 @@ class dPartition io.addField([&](const Neon::index_3d& idx, int i) { return operator()(dIndex(idx), i); }, - m_cardinality, "Partition", ioToVTKns::VtiDataType_e::voxel); + mCardinality, "Partition", ioToVTKns::VtiDataType_e::voxel); io.flushAndClear(); return; } + auto getDataView() + const -> Neon::DataView + { + return mDataView; + } + + auto helpGetGlobalToLocalOffets() + const -> NghIdx* + { + return mStencil; + } + private: - Neon::DataView m_dataView; - T* m_mem; - Neon::index_3d m_dim; - int m_zHaloRadius; - int m_zBoundaryRadius; - Pitch m_pitch; - int m_prtID; - Neon::index_3d m_origin; - int m_cardinality; - Neon::index_3d m_fullGridSize; - bool mPeriodicZ; - NghIdx* mStencil; + Neon::DataView mDataView; + T* NEON_RESTRICT mMem; + Neon::index_3d mDim; + int mZHaloRadius; + int mZBoundaryRadius; + Pitch mPitch; + int mPrtID; + Neon::index_3d mOrigin; + int mCardinality; + Neon::index_3d mFullGridSize; + bool mPeriodicZ; + NghIdx* NEON_RESTRICT mStencil; }; diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan.h b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan.h index 74ab5ff3..c81baace 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan.h @@ -43,11 +43,12 @@ class dSpan private: Neon::DataView mDataView; - int mZHaloRadius; - int mZBoundaryRadius; - Neon::index_3d mDim /** Dimension of the span, its values depends on the mDataView*/; + int mZghostRadius; + int 
mZboundaryRadius; + int mMaxZInDomain; + Neon::index_3d mSpanDim /** Dimension of the span, its value depends on mDataView */; }; -} // namespace Neon::domain::details::dGrid +} // namespace Neon::domain::details::dGrid #include "dSpan_imp.h" \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h index 8f6f9fea..37bea7d7 100644 --- a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h +++ b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h @@ -10,29 +10,29 @@ dSpan::setAndValidate(Idx& idx, const -> bool { bool res = false; - idx.set().x = int(x); - idx.set().y = int(y); - idx.set().z = int(z); + idx.setLocation().x = int(x); + idx.setLocation().y = int(y); + idx.setLocation().z = int(z); - if (idx.get() < mDim) { + if (idx.getLocation() < mSpanDim) { res = true; } switch (mDataView) { case Neon::DataView::STANDARD: { - idx.set().z += mZHaloRadius; + idx.setLocation().z += mZghostRadius; return res; } case Neon::DataView::INTERNAL: { - idx.set().z += mZHaloRadius + mZBoundaryRadius; + idx.setLocation().z += mZghostRadius + mZboundaryRadius; return res; } case Neon::DataView::BOUNDARY: { - idx.set().z += idx.get().z < mZBoundaryRadius - ? 0 - : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); - idx.set().z += mZHaloRadius; + idx.setLocation().z += idx.getLocation().z < mZboundaryRadius + ? 
0 + : (mMaxZInDomain - 1) + (-1 * mZboundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); + idx.setLocation().z += mZghostRadius; return res; } @@ -51,19 +51,19 @@ NEON_CUDA_HOST_DEVICE inline auto dSpan::helpGetDataView() NEON_CUDA_HOST_DEVICE inline auto dSpan::helpGetZHaloRadius() const -> int const& { - return mZHaloRadius; + return mZghostRadius; } NEON_CUDA_HOST_DEVICE inline auto dSpan::helpGetZBoundaryRadius() const -> int const& { - return mZBoundaryRadius; + return mZboundaryRadius; } NEON_CUDA_HOST_DEVICE inline auto dSpan::helpGetDim() const -> Neon::index_3d const& { - return mDim; + return mSpanDim; } } // namespace Neon::domain::details::dGrid \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h new file mode 100644 index 00000000..7ce3e582 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h @@ -0,0 +1,98 @@ +#pragma once +#include + +#include "Neon/core/core.h" +#include "Neon/core/types/DataUse.h" +#include "Neon/core/types/Macros.h" + +#include "Neon/set/BlockConfig.h" +#include "Neon/set/Containter.h" +#include "Neon/set/DevSet.h" +#include "Neon/set/MemoryOptions.h" + +#include "Neon/sys/memory/MemDevice.h" + +#include "Neon/domain/aGrid.h" + +#include "Neon/domain/interface/GridBaseTemplate.h" +#include "Neon/domain/interface/GridConcept.h" +#include "Neon/domain/interface/KernelConfig.h" +#include "Neon/domain/interface/LaunchConfig.h" +#include "Neon/domain/interface/Stencil.h" +#include "Neon/domain/interface/common.h" + +#include "Neon/domain/tools/GridTransformer.h" +#include "Neon/domain/tools/SpanTable.h" + +#include "Neon/domain/details/eGrid/eGrid.h" +#include "Neon/domain/patterns/PatternScalar.h" + +#include "dPartitionSoA.h" +#include "dSpanSoA.h" + +namespace Neon::domain::details::dGridSoA { + +namespace details { +struct 
dGridSoATransformation +{ + using FoundationGrid = Neon::domain::details::dGrid::dGrid; + using Idx = dIndexSoA; + using Span = dSpanSoA; + template + using Partition = dPartitionSoA; + + static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on; + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = FoundationGrid::executionThreadSpan; + using ExecutionThreadSpanIndexType = int32_t; + + static auto getDefaultBlock(FoundationGrid& foundationGrid) -> Neon::index_3d const& + { + return foundationGrid.getDefaultBlock(); + } + + static auto initSpan(FoundationGrid& foundationGrid, Neon::domain::tool::SpanTable& spanTable) -> void + { + spanTable.forEachConfiguration([&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Span& span) { + span.helpInit(foundationGrid.getSpan(execution, setIdx, dw)); + }); + } + + static auto initLaunchParameters(FoundationGrid& foundationGrid, + Neon::DataView dataView, + const Neon::index_3d& blockSize, + const size_t& shareMem) -> Neon::set::LaunchParameters + { + return foundationGrid.getLaunchParameters(dataView, blockSize, shareMem); + } + + // static auto helpGetGridIdx(FoundationGrid&, + // Neon::SetIdx const&, + // FoundationGrid::Idx const& fgIdx) + // -> dGridSoATransformation::Idx + // { + // dGridSoATransformation::Idx tgIdx = fgIdx; + // return tgIdx; + // } + + template + static auto initFieldPartition(FoundationGrid::Field& foundationField, + Neon::domain::tool::PartitionTable>& partitionTable) -> void + { + partitionTable.forEachConfiguration( + [&](Neon::Execution execution, + Neon::SetIdx setIdx, + Neon::DataView dw, + Partition& partition) { + auto& foundationPartition = foundationField.getPartition(execution, setIdx, dw); + partition = Partition(foundationPartition); + }); + } +}; + +} // namespace details +using dGridSoA = Neon::domain::tool::GridTransformer::Grid; + +} // namespace 
Neon::domain::details::dGridSoA diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h new file mode 100644 index 00000000..2ed82d86 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h @@ -0,0 +1,53 @@ +#pragma once + +#include "Neon/core/core.h" +#include "Neon/domain/details/dGridSoA/dIndexSoA.h" + +namespace Neon::domain::details::dGridSoA { + +// Common forward declarations +class dSpanSoA; +template +class dPartitionSoA; + +struct dIndexSoA +{ + using OuterIdx = dIndexSoA; + + template + friend class dPartition; + friend dSpanSoA; + + template + friend class dField; + + // dGrid specific types + using Offset = int32_t; + using Location = index_3d; + using Count = int32_t; + + dIndexSoA() = default; + Location mLocation = 0; + Offset mOffset = 0; + + NEON_CUDA_HOST_DEVICE inline explicit dIndexSoA(Location const& location, + Offset const& offset); + + NEON_CUDA_HOST_DEVICE inline explicit dIndexSoA(Location::Integer const& x, + Location::Integer const& y, + Location::Integer const& z, + Offset const& offset); + + NEON_CUDA_HOST_DEVICE inline auto setLocation() -> Location&; + + NEON_CUDA_HOST_DEVICE inline auto setOffset() -> Offset&; + + NEON_CUDA_HOST_DEVICE inline auto getLocation() const -> const Location&; + + NEON_CUDA_HOST_DEVICE inline auto getOffset() const -> const Offset&; +}; + +} // namespace Neon::domain::details::dGridSoA + +#include "dIndexSoA_imp.h" diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h new file mode 100644 index 00000000..790608c7 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h @@ -0,0 +1,50 @@ +#pragma once +#include "Neon/core/core.h" + +namespace Neon::domain::details::dGridSoA { + +NEON_CUDA_HOST_DEVICE inline dIndexSoA:: + dIndexSoA(const Location& location, + Offset const& 
offset) +{ + mLocation = location; + mOffset = offset; +} + +NEON_CUDA_HOST_DEVICE inline dIndexSoA:: + dIndexSoA(const Location::Integer& x, + const Location::Integer& y, + const Location::Integer& z, + Offset const& offset) +{ + mLocation.x = x; + mLocation.y = y; + mLocation.z = z; + mOffset = offset; +} + +NEON_CUDA_HOST_DEVICE inline auto dIndexSoA:: + setLocation() -> Location& +{ + return mLocation; +} + +NEON_CUDA_HOST_DEVICE inline auto dIndexSoA:: + setOffset() -> Offset& +{ + return mOffset; +} + +NEON_CUDA_HOST_DEVICE inline auto dIndexSoA:: + getLocation() const -> const Location& +{ + return mLocation; +} + +NEON_CUDA_HOST_DEVICE inline auto dIndexSoA:: + getOffset() + const -> const Offset& +{ + return mOffset; +} +} // namespace Neon::domain::details::dGridSoA \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h new file mode 100644 index 00000000..15c914a3 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h @@ -0,0 +1,365 @@ +#pragma once +#include +#include "Neon/core/core.h" +#include "Neon/core/types/Macros.h" +#include "Neon/domain/details/dGrid/dGrid.h" +#include "Neon/domain/interface/NghData.h" +#include "Neon/set/DevSet.h" +#include "Neon/sys/memory/CudaIntrinsics.h" +#include "cuda_fp16.h" +#include "dIndexSoA.h" + +namespace Neon::domain::details::dGridSoA { + +template +class dPartitionSoA +{ + public: + using Idx = dIndexSoA; + using NghData = Neon::domain::NghData; + using Pitch = uint32_4d; + using NghIdx = int8_3d; + using Type = T; + + dPartitionSoA() + { + } + + dPartitionSoA(Neon::domain::details::dGrid::dPartition& dPartitionOriginal) + { + mDataView = dPartitionOriginal.getDataView(); + mMem = dPartitionOriginal.mem(); + mDim = dPartitionOriginal.dim(); + mZHaloRadius = dPartitionOriginal.halo().z; + mPitch = dPartitionOriginal.getPitchData().template newType(); + mPrtID = 
dPartitionOriginal.prtID(); + mOrigin = dPartitionOriginal.origin(); + mCardinality = dPartitionOriginal.cardinality(); + mFullGridSize = dPartitionOriginal.getDomainSize(); + mStencil = dPartitionOriginal.helpGetGlobalToLocalOffets(); + } + + inline NEON_CUDA_HOST_DEVICE auto + prtID() + const -> int + { + return mPrtID; + } + + inline NEON_CUDA_HOST_DEVICE auto + cardinality() + const -> int + { + return mCardinality; + } + + inline NEON_CUDA_HOST_DEVICE auto + getPitchData() + const -> const Pitch& + { + return mPitch; + } + + inline NEON_CUDA_HOST_DEVICE auto + getPitch(const Idx& idx, + int cardinality) + const -> Idx::Offset + { + return idx.getOffset() + cardinality * mPitch.w; + } + + inline NEON_CUDA_HOST_DEVICE auto + dim() + const -> const Neon::index_3d + { + return mDim; + } + + inline NEON_CUDA_HOST_DEVICE auto + halo() + const -> const Neon::index_3d + { + return Neon::index_3d(0, 0, mZHaloRadius); + } + + inline NEON_CUDA_HOST_DEVICE auto + origin() + const -> const Neon::index_3d + { + return mOrigin; + } + + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + NghIdx nghOffset, + int card, + const T& alternativeVal) + const -> NghData + { + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh); + T val = alternativeVal; + if (isValidNeighbour) { + val = operator()(gidxNgh, card); + } + return NghData(val, isValidNeighbour); + } + + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + NghIdx nghOffset, + int card) + const -> NghData + { + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh); + T val; + if (isValidNeighbour) { + val = operator()(gidxNgh, card); + } + return NghData(val, isValidNeighbour); + } + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid = nullptr) + const -> std::enable_if_t, void> + { + Idx gidxNgh; + const bool isValidNeighbour = 
helpGetNghIdx(gidx, gidxNgh); + if (isValidNeighbour) { + T val = this->operator()(gidxNgh, card); + funIfValid(val); + } + if constexpr (!std::is_same_v) { + if (!isValidNeighbour) { + funIfNOTValid(); + } + } + } + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card) + const -> NghData + { + NghData res; + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); + if (isValidNeighbour) { + T val = operator()(gidxNgh, card); + res.set(val, true); + } else { + res.invalidate(); + } + return res; + } + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + T const& defaultValue) + const -> NghData + { + NghData res(defaultValue, false); + Idx gidxNgh; + const bool isValidNeighbour = helpGetNghIdx(gidx, gidxNgh); + if (isValidNeighbour) { + T val = operator()(gidxNgh, card); + res.set(val, true); + } + return res; + } + + NEON_CUDA_HOST_DEVICE inline auto + nghVal(const Idx& gidx, + uint8_t nghID, + int card, + const T& alternativeVal) + const -> NghData + { + NghIdx nghOffset = mStencil[nghID]; + return getNghData(gidx, nghOffset, card, alternativeVal); + } + + /** + * Get the index of the neighbor given the offset + * @tparam dataView_ta + * @param[in] gidx Index of the current element + * @param[in] nghOffset Offset of the neighbor of interest from the current element + * @param[in,out] neighbourIdx Index of the neighbor + * @return Whether the neighbour is valid + */ + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& gidx, + const NghIdx& nghOffset, + Idx& neighbourIdx) + const -> bool + { + Neon::index_3d cartesian(gidx.getLocation().x + nghOffset.x, + gidx.getLocation().y + nghOffset.y, + gidx.getLocation().z + nghOffset.z); + + neighbourIdx = Idx(cartesian, gidx.getOffset() + + nghOffset.x * getPitchData().x + + nghOffset.y * getPitchData().y + + nghOffset.z * getPitchData().z); + + Neon::index_3d const nghCartesianIdx = getGlobalIndex(neighbourIdx); + + bool 
isValidNeighbour = true; + + isValidNeighbour = (nghCartesianIdx.x >= 0) && + (nghCartesianIdx.y >= 0) && + (nghCartesianIdx.z >= 0); + + isValidNeighbour = (nghCartesianIdx.x < mFullGridSize.x) && + (nghCartesianIdx.y < mFullGridSize.y) && + (nghCartesianIdx.z < mFullGridSize.z) && + isValidNeighbour; + + return isValidNeighbour; + } + + template + NEON_CUDA_HOST_DEVICE inline auto + helpGetNghIdx(const Idx& gidx, + Idx& gidxNgh) + const -> bool + { + { + Neon::index_3d cartesian(gidx.getLocation().x + xOff, + gidx.getLocation().y + yOff, + gidx.getLocation().z + zOff); + gidxNgh = Idx(cartesian, gidx.getOffset() + + xOff * static_cast(getPitchData().x) + + yOff * static_cast(getPitchData().y) + + zOff * static_cast(getPitchData().z)); + } + + bool isValidNeighbour = true; + if constexpr (xOff > 0) { + int constexpr direction = Neon::index_3d::directionX; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + } + if constexpr (xOff < 0) { + int constexpr direction = Neon::index_3d::directionX; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + } + if constexpr (yOff > 0) { + int constexpr direction = Neon::index_3d::directionY; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + } + if constexpr (yOff < 0) { + int constexpr direction = Neon::index_3d::directionY; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + } + if constexpr (zOff > 0) { + int constexpr direction = Neon::index_3d::directionZ; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour; + } + if 
constexpr (zOff < 0) { + int constexpr direction = Neon::index_3d::directionZ; + int const cartesianByDirection = getGlobalIndexByDirection(gidxNgh); + isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour; + } + return isValidNeighbour; + } + + NEON_CUDA_HOST_DEVICE inline auto + mem() + -> T* + { + return mMem; + } + + NEON_CUDA_HOST_DEVICE inline auto + mem() const + -> const T* + { + return mMem; + } + + NEON_CUDA_HOST_DEVICE inline auto + mem(const Idx& cell, + int cardinalityIdx) + -> T* + { + Idx::Offset p = getPitch(cell, cardinalityIdx); + return &mMem[p]; /* declared return type is T*: hand back the element's address, not the element (mMem[p] is a T&) */ + } + + NEON_CUDA_HOST_DEVICE inline auto + operator()(const Idx& cell, + int cardinalityIdx) + -> T& + { + Idx::Offset p = getPitch(cell, cardinalityIdx); + return mMem[p]; + } + + NEON_CUDA_HOST_DEVICE inline auto + operator()(const Idx& cell, + int cardinalityIdx) + const -> const T& + { + Idx::Offset p = getPitch(cell, cardinalityIdx); + return mMem[p]; + } + + NEON_CUDA_HOST_DEVICE inline auto getGlobalIndex(const Idx& local) + const -> Neon::index_3d + { + Neon::index_3d result = local.mLocation + mOrigin; + result.z -= mZHaloRadius; + return result; + } + + template + NEON_CUDA_HOST_DEVICE inline auto getGlobalIndexByDirection(const Idx& local) + const -> int + { + if constexpr (Neon::index_3d::directionZ != direction) { + return local.mLocation.v[direction] + + mOrigin.v[direction]; + } else { + return local.mLocation.v[Neon::index_3d::directionZ] + + mOrigin.v[Neon::index_3d::directionZ] - + mZHaloRadius; + } + } + + NEON_CUDA_HOST_DEVICE inline auto getDomainSize() + const -> Neon::index_3d + { + return mFullGridSize; + } + + Neon::DataView mDataView; + T* NEON_RESTRICT mMem; + Neon::index_3d mDim; + int mZHaloRadius; + Pitch mPitch; + int mPrtID; + Neon::index_3d mOrigin; + int mCardinality; + Neon::index_3d mFullGridSize; + NghIdx* NEON_RESTRICT mStencil; +}; + +} // namespace Neon::domain::details::dGridSoA diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h 
b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h new file mode 100644 index 00000000..3aee038c --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h @@ -0,0 +1,57 @@ +#pragma once +#include "Neon/set/DevSet.h" +#include "dIndexSoA.h" +#include "Neon/domain/details/dGrid/dSpan.h" + +namespace Neon::domain::details::dGridSoA { + +/** + * Abstraction that represents the cell space of a partition. + * This abstraction is used by the Neon lambda executor to + * run a container on the grid (NOTE(review): the original text said "aGrid", presumably copy-pasted; confirm). + */ +class dSpanSoA +{ + public: + using Idx = dIndexSoA; + + static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = Neon::set::details::ExecutionThreadSpan::d3; + using ExecutionThreadSpanIndexType = int32_t; + + /** Stores (x, y, z) in idx, sets its linear offset according to the current data view, and returns whether the position is smaller than the span dimension. */ + NEON_CUDA_HOST_DEVICE inline auto + setAndValidate(Idx& idx, + const uint32_t& x, + const uint32_t& y, + const uint32_t& z) const + -> bool; + /** Returns the data view this span was initialised with. */ + NEON_CUDA_HOST_DEVICE inline auto + helpGetDataView() + const -> Neon::DataView const&; + /** Returns the stored halo radius along z. */ + NEON_CUDA_HOST_DEVICE inline auto + helpGetZHaloRadius() + const -> int const&; + /** Returns the stored boundary radius along z. */ + NEON_CUDA_HOST_DEVICE inline auto + helpGetZBoundaryRadius() + const -> int const&; + /** Returns the stored span dimension. */ + NEON_CUDA_HOST_DEVICE inline auto + helpGetDim() + const -> Neon::index_3d const&; + /** Initialises this span by copying data view, radii and dimension from a dGrid dSpan. */ + NEON_CUDA_HOST_DEVICE inline auto + helpInit(Neon::domain::details::dGrid::dSpan const&) ->void; + + private: + Neon::DataView mDataView; + int mZHaloRadius; + int mZBoundaryRadius; + Neon::index_3d mDim /** Dimension of the span; its values depend on mDataView */; +}; + +} // namespace Neon::domain::details::dGridSoA + +#include "dSpanSoA_imp.h" \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h new file mode 100644 index 00000000..f760adb5 --- /dev/null +++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h @@ -0,0 +1,86 @@ +#pragma once + +namespace Neon::domain::details::dGridSoA
{ +/** Writes the thread coordinates (x, y, z) into idx, shifts z according to mDataView, stores the linear offset x + y * mDim.x + z * mDim.x * mDim.y in idx, and returns whether the unshifted coordinates are smaller than mDim. */ +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::setAndValidate(Idx& idx, + const uint32_t& x, + const uint32_t& y, + const uint32_t& z) + const -> bool +{ + idx.setLocation().x = int(x); + idx.setLocation().y = int(y); + idx.setLocation().z = int(z); + + bool isValid = idx.getLocation() < mDim; + /* Validity was evaluated above on the raw coordinates; the z shifts below do not affect it. */ + switch (mDataView) { + case Neon::DataView::STANDARD: { /* z is shifted by the halo radius only */ + idx.setLocation().z += mZHaloRadius; + idx.setOffset() = idx.getLocation().x + + idx.getLocation().y * mDim.x + + idx.getLocation().z * mDim.x * mDim.y; + break ; + } + case Neon::DataView::INTERNAL: { /* z is shifted by the halo radius plus the boundary radius */ + idx.setLocation().z += mZHaloRadius + mZBoundaryRadius; + idx.setOffset() = idx.getLocation().x + + idx.getLocation().y * mDim.x + + idx.getLocation().z * mDim.x * mDim.y; + break ; + } + case Neon::DataView::BOUNDARY: { /* the first mZBoundaryRadius slices keep their z; the others are moved by (mDim.z - 1) - mZBoundaryRadius; the halo shift is then applied to both */ + idx.setLocation().z += idx.getLocation().z < mZBoundaryRadius + ? 0 + : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */); + idx.setLocation().z += mZHaloRadius; + idx.setOffset() = idx.getLocation().x + + idx.getLocation().y * mDim.x + + idx.getLocation().z * mDim.x * mDim.y; + break ; + } + default: { /* any other data view: no z shift is applied and no offset is written */ + } + } + return isValid; +} +/** Returns a const reference to mDataView. */ +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::helpGetDataView() + const -> Neon::DataView const& +{ + return mDataView; +} +/** Returns a const reference to mZHaloRadius. */ +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::helpGetZHaloRadius() + const -> int const& +{ + return mZHaloRadius; +} +/** Returns a const reference to mZBoundaryRadius. */ +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::helpGetZBoundaryRadius() + const -> int const& +{ + return mZBoundaryRadius; +} +/** Returns a const reference to mDim. */ +NEON_CUDA_HOST_DEVICE inline auto +dSpanSoA::helpGetDim() + const -> Neon::index_3d const& +{ + return mDim; +} +/** Copies data view, halo radius, boundary radius and dimension from the given dGrid dSpan through its helpGet* accessors. */ +NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpInit(Neon::domain::details::dGrid::dSpan const& dspan) -> void +{ + mDataView = dspan.helpGetDataView(); + mZHaloRadius = dspan.helpGetZHaloRadius(); + mZBoundaryRadius = dspan.helpGetZBoundaryRadius(); + mDim = dspan.helpGetDim(); +} + + +} // namespace
Neon::domain::details::dGridSoA \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h index c89cfdc3..2427dc57 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h @@ -65,7 +65,8 @@ eField::eField(const std::string& fieldUserName, mData->grid->getConnectivityField().getPartition(execution, setIdx, Neon::DataView::STANDARD).mem(), mData->grid->getGlobalMappingField().getPartition(execution, setIdx, Neon::DataView::STANDARD).mem(), mData->grid->getStencil3dTo1dOffset().rawMem(execution, setIdx), - mData->grid->getStencil().getRadius()); + mData->grid->getStencil().getRadius(), + mData->grid->getDimension()); }); } diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/eGrid.h b/libNeonDomain/include/Neon/domain/details/eGrid/eGrid.h index 346c2121..8a6269eb 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/eGrid.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/eGrid.h @@ -84,7 +84,8 @@ class eGrid : public Neon::domain::interface::GridBaseTemplate const SparsityPattern& activeCellLambda /**< InOrOutLambda({x,y,z}->{true, false}) */, const Neon::domain::Stencil& stencil /**< Stencil used by any computation on the grid */, const Vec_3d& spacing = Vec_3d(1, 1, 1) /**< Spacing, i.e. 
size of a voxel */, - const Vec_3d& origin = Vec_3d(0, 0, 0) /**< Origin */); + const Vec_3d& origin = Vec_3d(0, 0, 0) /**< Origin */, + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); eGrid(const Neon::Backend& backend /**< Target for computation */, const Neon::int32_3d& dimension /**< Dimension of the bounding box containing the domain */, diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/eGrid_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/eGrid_imp.h index a12f87ce..1e5c444b 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/eGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/eGrid_imp.h @@ -10,7 +10,8 @@ eGrid::eGrid(const Neon::Backend& backend, const ActiveCellLambda& activeCellLambda, const Neon::domain::Stencil& stencil, const Vec_3d& spacing, - const Vec_3d& origin) + const Vec_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType spaceFillingCode ) { mData = std::make_shared(backend); mData->stencil = stencil; @@ -29,7 +30,9 @@ eGrid::eGrid(const Neon::Backend& backend, nElementsPerPartition, Neon::index_3d(256, 1, 1), spacing, - origin); + origin, + spaceFillingCode, + {1,1,1}); } @@ -40,6 +43,7 @@ eGrid::eGrid(const Neon::Backend& backend, 1, dimension, stencil, + spaceFillingCode, 1); @@ -124,7 +128,9 @@ eGrid::eGrid(const Neon::Backend& backend, nElementsPerPartition, defaultBlockSize, spacing, - origin); + origin, + spaceFillingCode, + {1,1,1}); } } diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h index 012a3588..4381a24c 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h +++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h @@ -59,7 +59,7 @@ class ePartition * | * | Connectivity table has the same layout of a field with cardinality equal to * | the number of neighbours and an SoA layout. 
Let's call this field nghField. - * | nghField(e, nghIdx) is the eIdx_t of the neighbour element as in a STANDARD + * | nghField(e, helpGetNghIdx) is the eIdx_t of the neighbour element as in a STANDARD * | view. * |--) */ @@ -186,8 +186,21 @@ class ePartition NEON_CUDA_HOST_DEVICE inline auto getNghData(Idx eId, int card, - T defaultValue) + T defaultValue) const -> NghData; + + template + NEON_CUDA_HOST_DEVICE inline auto + getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid = nullptr) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void>; + /** * Check is the * @tparam dataView_ta @@ -211,6 +224,10 @@ class ePartition getGlobalIndex(Idx Idx) const -> Neon::index_3d; + NEON_CUDA_HOST_DEVICE inline auto + getDomainSize() + const -> Neon::index_3d; + NEON_CUDA_HOST_DEVICE inline auto mem() const -> const T*; @@ -231,7 +248,8 @@ class ePartition Offset* connRaw, Neon::index_3d* toGlobal, int8_t* stencil3dTo1dOffset, - int32_t stencilRadius); + int32_t stencilRadius, + Neon::index_3d domainSize); /** * Returns a pointer to element eId with target cardinality cardinalityIdx @@ -256,11 +274,6 @@ class ePartition getOffset(Idx eId, int cardinalityIdx) const -> Offset; - /** - * Returns raw pointer of the field - * @tparam dataView_ta - * @return - */ protected: //-- [INTERNAL DATA] ---------------------------------------------------------------------------- @@ -278,6 +291,7 @@ class ePartition int8_t* mStencil3dTo1dOffset = {nullptr}; int32_t mStencilTableYPitch; int32_t mStencilRadius; // Shift to be applied to all 3d offset component to access mStencil3dTo1dOffset table + Neon::index_3d mDomainSize; }; } // namespace Neon::domain::details::eGrid diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h index 0063ee9e..29980a61 100644 --- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h +++ 
b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h @@ -37,43 +37,43 @@ ePartition::cardinality() const template NEON_CUDA_HOST_DEVICE inline auto -ePartition::operator()(eIndex eId, int cardinalityIdx) const +ePartition::operator()(eIndex gidx, int cardinalityIdx) const -> T { - Offset jump = getOffset(eId, cardinalityIdx); + Offset jump = getOffset(gidx, cardinalityIdx); return mMem[jump]; } template NEON_CUDA_HOST_DEVICE inline auto -ePartition::operator()(eIndex eId, int cardinalityIdx) -> T& +ePartition::operator()(eIndex gidx, int cardinalityIdx) -> T& { - Offset jump = getOffset(eId, cardinalityIdx); + Offset jump = getOffset(gidx, cardinalityIdx); return mMem[jump]; } template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghData(eIndex eId, +ePartition::getNghData(eIndex gidx, NghIdx nghIdx, int card) const -> NghData { - eIndex eIdxNgh; - const bool isValidNeighbour = isValidNgh(eId, nghIdx, eIdxNgh); + eIndex gidxxNgh; + const bool isValidNeighbour = isValidNgh(gidx, nghIdx, gidxxNgh); if (isValidNeighbour) { - T val = this->operator()(eIdxNgh, card); + T val = this->operator()(gidxxNgh, card); return NghData(val, isValidNeighbour); } - return NghData(isValidNeighbour); + return NghData(); } template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghData(eIndex eId, +ePartition::getNghData(eIndex gidx, const Neon::int8_3d& ngh3dIdx, int card) const -> NghData @@ -82,7 +82,7 @@ ePartition::getNghData(eIndex eId, (ngh3dIdx.y + mStencilRadius) * mStencilTableYPitch + (ngh3dIdx.z + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; - NghData res = getNghData(eId, nghIdx, card); + NghData res = getNghData(gidx, nghIdx, card); return res; } @@ -91,15 +91,15 @@ template template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghData(eIndex eId, - int card) +ePartition::getNghData(eIndex gidx, + int card) const -> NghData { int tablePithc = (xOff + mStencilRadius) + (yOff + 
mStencilRadius) * mStencilTableYPitch + (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; - NghData res = getNghData(eId, nghIdx, card); + NghData res = getNghData(gidx, nghIdx, card); return res; } @@ -108,37 +108,66 @@ template template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghData(eIndex eId, - int card, - T defaultVal) +ePartition::getNghData(eIndex gidx, + int card, + T defaultVal) const -> NghData { int tablePithc = (xOff + mStencilRadius) + (yOff + mStencilRadius) * mStencilTableYPitch + (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; - NghData res = getNghData(eId, nghIdx, card); + NghData res = getNghData(gidx, nghIdx, card); if (!res.isValid()) { res.set(defaultVal, false); } return res; } +template +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getNghData(const Idx& gidx, + int card, + LambdaVALID funIfValid, + LambdaNOTValid funIfNOTValid) + const -> std::enable_if_t && (std::is_invocable_v || std::is_same_v), void> +{ + int tablePithc = (xOff + mStencilRadius) + + (yOff + mStencilRadius) * mStencilTableYPitch + + (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; + NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; + NghData res = getNghData(gidx, nghIdx, card); + if (res.isValid()) { + funIfValid(res.getData()); + return; + } + if constexpr (!std::is_same_v) { + funIfNOTValid(); + } + return; +} + template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getNghIndex(eIndex eId, +ePartition::getNghIndex(eIndex gidx, const Neon::int8_3d& ngh3dIdx, - eIndex& eIdxNgh) const -> bool + eIndex& gidxxNgh) const -> bool { int tablePithc = (ngh3dIdx.x + mStencilRadius) + (ngh3dIdx.y + mStencilRadius) * mStencilTableYPitch + (ngh3dIdx.z + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch; NghIdx nghIdx = mStencil3dTo1dOffset[tablePithc]; eIndex tmpEIdxNgh; - const bool 
isValidNeighbour = isValidNgh(eId, nghIdx, tmpEIdxNgh); + const bool isValidNeighbour = isValidNgh(gidx, nghIdx, tmpEIdxNgh); if (isValidNeighbour) { - eIdxNgh = tmpEIdxNgh; + gidxxNgh = tmpEIdxNgh; } return isValidNeighbour; } @@ -146,17 +175,17 @@ ePartition::getNghIndex(eIndex eId, template NEON_CUDA_HOST_DEVICE inline auto -ePartition::isValidNgh(eIndex eId, +ePartition::isValidNgh(eIndex gidx, NghIdx nghIdx, eIndex& neighbourIdx) const -> bool { - const eIndex::Offset connectivityJumo = mCountAllocated * nghIdx + eId.helpGet(); + const eIndex::Offset connectivityJumo = mCountAllocated * nghIdx + gidx.helpGet(); neighbourIdx.helpSet() = NEON_CUDA_CONST_LOAD((mConnectivity + connectivityJumo)); const bool isValidNeighbour = (neighbourIdx.mIdx > -1); - // printf("(prtId %d) getNghData id %d eIdxNgh %d connectivityJumo %d\n", + // printf("(prtId %d) getNghData id %d gidxxNgh %d connectivityJumo %d\n", // mPrtID, - // eId.mIdx, neighbourIdx.mIdx, connectivityJumo); + // gidx.mIdx, neighbourIdx.mIdx, connectivityJumo); return isValidNeighbour; } @@ -181,7 +210,8 @@ ePartition::ePartition(int prtId, Offset* connRaw, Neon::index_3d* toGlobal, int8_t* stencil3dTo1dOffset, - int32_t stencilRadius) + int32_t stencilRadius, + Neon::index_3d domainSize) { mPrtID = prtId; mMem = mem; @@ -196,25 +226,26 @@ ePartition::ePartition(int prtId, mStencilTableYPitch = 2 * stencilRadius + 1; mStencilRadius = stencilRadius; + mDomainSize = domainSize; } template NEON_CUDA_HOST_DEVICE auto -ePartition::pointer(eIndex eId, int cardinalityIdx) const +ePartition::pointer(eIndex gidx, int cardinalityIdx) const -> const Type* { - Offset jump = getOffset(eId, cardinalityIdx); + Offset jump = getOffset(gidx, cardinalityIdx); return mMem + jump; } template NEON_CUDA_HOST_DEVICE inline auto -ePartition::getOffset(eIndex eId, int cardinalityIdx) const +ePartition::getOffset(eIndex gidx, int cardinalityIdx) const -> Offset { - return Offset(eId.helpGet() * mPitch.x + cardinalityIdx * mPitch.y); 
+ return Offset(gidx.helpGet() * mPitch.x + cardinalityIdx * mPitch.y); } template ::mem() const return mMem; } +template +NEON_CUDA_HOST_DEVICE inline auto +ePartition::getDomainSize() + const -> Neon::index_3d +{ + return mDomainSize; +} + } // namespace Neon::domain::details::eGrid diff --git a/libNeonDomain/include/Neon/domain/details/mGrid/mPartition_imp.h b/libNeonDomain/include/Neon/domain/details/mGrid/mPartition_imp.h index 9473ca55..4ab44988 100644 --- a/libNeonDomain/include/Neon/domain/details/mGrid/mPartition_imp.h +++ b/libNeonDomain/include/Neon/domain/details/mGrid/mPartition_imp.h @@ -33,7 +33,7 @@ mPartition::mPartition(int level, NghIdx* stencilNghIndex, int* refFactors, int* spacing) - : Neon::domain::details::bGrid::bPartition(0, cardinality, mem, neighbourBlocks, mask, origin, stencilNghIndex), + : Neon::domain::details::bGrid::bPartition(0, cardinality, mem, neighbourBlocks, mask, origin, stencilNghIndex, {0,0,0}), mLevel(level), mMemParent(memParent), mMemChild(memChild), diff --git a/libNeonDomain/include/Neon/domain/details/sGrid/sGrid_imp.h b/libNeonDomain/include/Neon/domain/details/sGrid/sGrid_imp.h index eed4c3bf..c76b2d42 100644 --- a/libNeonDomain/include/Neon/domain/details/sGrid/sGrid_imp.h +++ b/libNeonDomain/include/Neon/domain/details/sGrid/sGrid_imp.h @@ -41,7 +41,9 @@ sGrid::sGrid(const OuterGridT& outerGrid, nElementsPerPartition, defaultsBlockDim, outerGrid.getSpacing(), - outerGrid.getOrigin()); + outerGrid.getOrigin(), + outerGrid.getSpaceCurve(), + outerGrid.getMemoryBlock()); mStorage = std::make_shared(); mStorage->init(outerGrid); @@ -173,7 +175,9 @@ sGrid::sGrid(const OuterGridT& outerGrid, mStorage->getCount(Neon::DataView::STANDARD), defaultsBlockDim, outerGrid.getSpacing(), - outerGrid.getOrigin()); + outerGrid.getOrigin(), + outerGrid.getSpaceCurve(), + outerGrid.getMemoryBlock()); } template diff --git a/libNeonDomain/include/Neon/domain/interface/GridBase.h 
b/libNeonDomain/include/Neon/domain/interface/GridBase.h index daa5d697..04837435 100644 --- a/libNeonDomain/include/Neon/domain/interface/GridBase.h +++ b/libNeonDomain/include/Neon/domain/interface/GridBase.h @@ -9,8 +9,8 @@ #include "Neon/set/DevSet.h" #include "Neon/core/tools/io/ioToVti.h" +#include "Neon/domain/tools/SpaceCurves.h" #include "Stencil.h" - namespace Neon::domain::interface { /** @@ -66,13 +66,6 @@ class GridBase auto getNumActiveCellsPerPartition() const -> const Neon::set::DataSet&; - // /** - // * Return the number of cells stored per partition - // * @return - // */ - // auto getNumActiveCellsPerPartition() const - // -> const Neon::set::DataSet&; - /** * Creates a DataSet object compatible with the number of GPU used by the grid. */ @@ -123,6 +116,8 @@ class GridBase auto getGridUID() const -> size_t; + + /** * Add the grid information in a Report object */ @@ -136,31 +131,40 @@ class GridBase auto getDefaultBlock() const -> const Neon::index_3d&; + auto getMemoryBlock() const + -> Neon::index_3d; + + auto getSpaceCurve() const + -> Neon::domain::tool::spaceCurves::EncoderType; protected: /** * Protected constructor */ - GridBase(const std::string& gridImplementationName, - const Neon::Backend& backend, - const Neon::index_3d& dim, - const Neon::domain::Stencil& stencil, - const Neon::set::DataSet& nPartitionElements /**< Number of element per partition */, - const Neon::index_3d& defaultBlockSize, - const Vec_3d& spacingData = Vec_3d(1, 1, 1) /*! Spacing, i.e. size of a voxel */, - const Vec_3d& origin = Vec_3d(0, 0, 0) /*! Origin */); + GridBase(const std::string& gridImplementationName, + const Neon::Backend& backend, + const Neon::index_3d& dim, + const Neon::domain::Stencil& stencil, + const Neon::set::DataSet& nPartitionElements /**< Number of element per partition */, + const Neon::index_3d& defaultBlockSize, + const Vec_3d& spacingData /*! Spacing, i.e. size of a voxel */, + const Vec_3d& origin /*! 
Origin */, + Neon::domain::tool::spaceCurves::EncoderType spaceCurve, + Neon::index_3d memoryBlock); /** * Protected initialization function used by derived classes to set some parameters. */ - auto init(const std::string& gridImplementationName /**< Name of the implementation, for example dGrid eGrid etc */, - const Neon::Backend& backend /**< Backend used to create the grid */, - const Neon::index_3d& dimension /**< Dimension of the grid */, - const Neon::domain::Stencil& stencil /**< Union of all the stencil that will be used with the grid */, - const Neon::set::DataSet& nPartitionElements /**< Elements associated to each partition */, - const Neon::index_3d& defaultBlockSize /**< Default thread block size */, - const Vec_3d& spacingData /**< Grid spacing */, - const Vec_3d& origin /**< Position in space of the grid's origin */) -> void; + auto init(const std::string& gridImplementationName /**< Name of the implementation, for example dGrid eGrid etc */, + const Neon::Backend& backend /**< Backend used to create the grid */, + const Neon::index_3d& dimension /**< Dimension of the grid */, + const Neon::domain::Stencil& stencil /**< Union of all the stencil that will be used with the grid */, + const Neon::set::DataSet& nPartitionElements /**< Elements associated to each partition */, + const Neon::index_3d& defaultBlockSize /**< Default thread block size */, + const Vec_3d& spacingData /**< Grid spacing */, + const Vec_3d& origin /**< Position in space of the grid's origin */, + Neon::domain::tool::spaceCurves::EncoderType spaceCurve, + Neon::index_3d memoryBlock) -> void; /** * Protected method to set the default thread blocks size @@ -175,6 +179,7 @@ class GridBase -> Neon::set::LaunchParameters&; + private: struct Storage { @@ -187,14 +192,16 @@ class GridBase index_3d blockDim; }; - Neon::Backend backend /**< Backend used to create and run the grid. 
*/; - Neon::index_3d dimension /**< Dimension of the grid */; - Neon::domain::Stencil stencil /**< Stencil used for the grid initialization */; - Neon::set::DataSet nPartitionElements /**< Number of elements per partition */; - Vec_3d spacing /**< Spacing, i.e. size of a voxel */; - Vec_3d origin /**< Position in space of the grid's origin */; - Defaults_t defaults; - std::string gridImplementationName; + Neon::Backend backend /**< Backend used to create and run the grid. */; + Neon::index_3d dimension /**< Dimension of the grid */; + Neon::domain::Stencil stencil /**< Stencil used for the grid initialization */; + Neon::set::DataSet nPartitionElements /**< Number of elements per partition */; + Vec_3d spacing /**< Spacing, i.e. size of a voxel */; + Vec_3d origin /**< Position in space of the grid's origin */; + Defaults_t defaults; + std::string gridImplementationName; + Neon::domain::tool::spaceCurves::EncoderType spaceCurve; + Neon::index_3d memoryBlock; }; std::shared_ptr mStorage; diff --git a/libNeonDomain/include/Neon/domain/interface/NghData.h b/libNeonDomain/include/Neon/domain/interface/NghData.h index 487c8fd7..b7de2fca 100644 --- a/libNeonDomain/include/Neon/domain/interface/NghData.h +++ b/libNeonDomain/include/Neon/domain/interface/NghData.h @@ -10,7 +10,7 @@ struct NghData { Type mData; bool mIsValid; - NEON_CUDA_HOST_DEVICE NghData(bool status = false) + NEON_CUDA_HOST_DEVICE NghData() { this->mIsValid = false; } diff --git a/libNeonDomain/include/Neon/domain/tools/GridTransformer.h b/libNeonDomain/include/Neon/domain/tools/GridTransformer.h index 90556fb9..47518f7a 100644 --- a/libNeonDomain/include/Neon/domain/tools/GridTransformer.h +++ b/libNeonDomain/include/Neon/domain/tools/GridTransformer.h @@ -1,10 +1,10 @@ #pragma once +#include "Neon/domain/tools/PartitionTable.h" +#include "Neon/domain/tools/SpanTable.h" #include "Neon/domain/tools/gridTransformer/tField.h" #include "Neon/domain/tools/gridTransformer/tGrid.h" #include 
"Neon/domain/tools/gridTransformer/tGrid_ti.h" -#include "Neon/domain/tools/PartitionTable.h" -#include "Neon/domain/tools/SpanTable.h" namespace Neon::domain::tool { @@ -24,9 +24,10 @@ template class GridTransformer { public: + using Idx = typename GridTransformation::Idx; + using Span = typename GridTransformation::Span; template using Partition = typename GridTransformation::template Partition; - using Span = typename GridTransformation::Span; using FoundationGrid = typename GridTransformation::FoundationGrid; using Grid = details::tGrid; diff --git a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h index 67f7d9f7..6d110a1f 100644 --- a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h +++ b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h @@ -105,13 +105,14 @@ class Partitioner1D template - Partitioner1D(const Neon::Backend& backend, - const ActiveIndexLambda& activeIndexLambda, - const BcLambda& bcLambda, - const Neon::index_3d& dataBlockSize, - const Neon::int32_3d& domainSize, - const Neon::domain::Stencil stencil, - const int& multiResDiscreteIdxSpacing = 1) + Partitioner1D(const Neon::Backend& backend, + const ActiveIndexLambda& activeIndexLambda, + const BcLambda& bcLambda, + const Neon::index_3d& dataBlockSize, + const Neon::int32_3d& domainSize, + const Neon::domain::Stencil stencil, + Neon::domain::tool::spaceCurves::EncoderType spaceFillingType, + const int& multiResDiscreteIdxSpacing = 1) { mData = std::make_shared(); @@ -119,6 +120,7 @@ class Partitioner1D mData->mMultiResDiscreteIdxSpacing = multiResDiscreteIdxSpacing; mData->mStencil = stencil; mData->mDomainSize = domainSize; + mData->spaceCurve = spaceFillingType; // Block space interval (i.e. 
indexing space at the block granularity) @@ -164,6 +166,7 @@ class Partitioner1D domainSize, stencil, multiResDiscreteIdxSpacing, + spaceFillingType, mData->spanDecomposition); mData->mSpanLayout = std::make_shared( @@ -182,7 +185,12 @@ class Partitioner1D { return mData->block3DSpan; } - + + auto getSpaceCurve() const -> Neon::domain::tool::spaceCurves::EncoderType + { + return mData->spaceCurve; + } + auto getMemoryGrid() -> Neon::aGrid& { return mData->mTopologyWithGhost; @@ -288,7 +296,7 @@ class Partitioner1D auto getDenseMeta() -> const DenseMeta& { - //setDenseMeta(); + // setDenseMeta(); return *mData->mDenseMeta; } @@ -443,13 +451,14 @@ class Partitioner1D class Data { public: - Neon::index_3d mDataBlockSize = 0; - int mMultiResDiscreteIdxSpacing = 0; - Neon::domain::Stencil mStencil; - Neon::index_3d mDomainSize; - Neon::int32_3d block3DSpan; - bool globalMappingInit = false; - Neon::aGrid::Field globalMapping; + Neon::index_3d mDataBlockSize = 0; + int mMultiResDiscreteIdxSpacing = 0; + Neon::domain::Stencil mStencil; + Neon::index_3d mDomainSize; + Neon::int32_3d block3DSpan; + bool globalMappingInit = false; + Neon::aGrid::Field globalMapping; + Neon::domain::tool::spaceCurves::EncoderType spaceCurve; bool getStencil3dTo1dOffsetInit = false; Neon::set::MemSet stencil3dTo1dOffset; diff --git a/libNeonDomain/include/Neon/domain/tools/PointHashTable.h b/libNeonDomain/include/Neon/domain/tools/PointHashTable.h index 1b3e547e..d7bca923 100644 --- a/libNeonDomain/include/Neon/domain/tools/PointHashTable.h +++ b/libNeonDomain/include/Neon/domain/tools/PointHashTable.h @@ -61,6 +61,8 @@ class PointHashTable */ auto size() const -> size_t; + auto getBBox() const -> Point const&; + private: using Key = size_t; diff --git a/libNeonDomain/include/Neon/domain/tools/PointHashTable_imp.h b/libNeonDomain/include/Neon/domain/tools/PointHashTable_imp.h index 3a7375af..1c9abbef 100644 --- a/libNeonDomain/include/Neon/domain/tools/PointHashTable_imp.h +++ 
b/libNeonDomain/include/Neon/domain/tools/PointHashTable_imp.h @@ -105,4 +105,10 @@ auto PointHashTable::size() const -> size_t { return mMap.size(); } +/** Returns a const reference to the stored bounding box mBBox. */ +template +auto PointHashTable::getBBox() const -> Point const&{ + return mBBox; +} + } // namespace Neon::domain::tool \ No newline at end of file diff --git a/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h b/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h new file mode 100644 index 00000000..add3f51e --- /dev/null +++ b/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h @@ -0,0 +1,352 @@ +#pragma once +#include "Neon/Neon.h" +#include "Neon/Report.h" + +namespace Neon::domain::tool::spaceCurves { + +/** Identifiers of the space-curve encodings handled by Encoder::encode. */ +enum struct EncoderType +{ + sweep = 0, + morton = 1, + hilbert = 2, +}; + + +/** + * Set of utilities for EncoderType options. + */ +struct EncoderTypeUtil +{ + /** + * Number of configurations for the enum + */ + static const int nConfig{static_cast(3)}; + + /** + * Convert enum value to string + * + * @param encoderType + * @return + */ + static auto toString(EncoderType encoderType) -> std::string; + + /** + * Returns all valid configurations for EncoderType + * @return + */ + static auto getOptions() -> std::array; + + static auto fromInt(int val) -> EncoderType; + static auto fromString(const std::string& opt) -> EncoderType; + static auto toInt(EncoderType encoderType) -> int; + /** Holder of an EncoderType option that can be set from a string and added to a Neon::Report (presumably for command-line parsing, given the name; confirm against the implementation). */ + struct Cli + { + explicit Cli(std::string); + explicit Cli(EncoderType model); + Cli(); + + auto getOption() const -> EncoderType; + auto set(const std::string& opt) -> void; + auto getStringOptions() const -> std::string; + auto getStringOption() const -> std::string; + auto getDoc() const -> std::string; + + auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void; + auto addToReport(Neon::Report& report) const -> void; + + private: + bool mSet = false; + EncoderType mOption; + }; +}; + + +/** + * operator<< + * + * @param os + * @param m + * @return + */ +std::ostream& operator<<(std::ostream&
os, Neon::DataView const& m); +/* NOTE(review): the operator<< above is declared for Neon::DataView although the rest of this header concerns EncoderType; presumably a copy-paste, confirm against its definition. */ /** Static helpers that map a 3D index to a 64-bit code following a sweep, Morton or Hilbert ordering. */ +class Encoder +{ + private: + static constexpr uint8_t mortonToHilbertTable[] = { /* lookup table consumed by transformCurve: the low 3 bits of an entry are the output digit, the remaining bits select the rows used for the next digit */ + 48, + 33, + 27, + 34, + 47, + 78, + 28, + 77, + 66, + 29, + 51, + 52, + 65, + 30, + 72, + 63, + 76, + 95, + 75, + 24, + 53, + 54, + 82, + 81, + 18, + 3, + 17, + 80, + 61, + 4, + 62, + 15, + 0, + 59, + 71, + 60, + 49, + 50, + 86, + 85, + 84, + 83, + 5, + 90, + 79, + 56, + 6, + 89, + 32, + 23, + 1, + 94, + 11, + 12, + 2, + 93, + 42, + 41, + 13, + 14, + 35, + 88, + 36, + 31, + 92, + 37, + 87, + 38, + 91, + 74, + 8, + 73, + 46, + 45, + 9, + 10, + 7, + 20, + 64, + 19, + 70, + 25, + 39, + 16, + 69, + 26, + 44, + 43, + 22, + 55, + 21, + 68, + 57, + 40, + 58, + 67, + }; + + static constexpr uint8_t hilbertToMortonTable[] = { /* same layout as mortonToHilbertTable, used for the opposite conversion */ + 48, + 33, + 35, + 26, + 30, + 79, + 77, + 44, + 78, + 68, + 64, + 50, + 51, + 25, + 29, + 63, + 27, + 87, + 86, + 74, + 72, + 52, + 53, + 89, + 83, + 18, + 16, + 1, + 5, + 60, + 62, + 15, + 0, + 52, + 53, + 57, + 59, + 87, + 86, + 66, + 61, + 95, + 91, + 81, + 80, + 2, + 6, + 76, + 32, + 2, + 6, + 12, + 13, + 95, + 91, + 17, + 93, + 41, + 40, + 36, + 38, + 10, + 11, + 31, + 14, + 79, + 77, + 92, + 88, + 33, + 35, + 82, + 70, + 10, + 11, + 23, + 21, + 41, + 40, + 4, + 19, + 25, + 29, + 47, + 46, + 68, + 64, + 34, + 45, + 60, + 62, + 71, + 67, + 18, + 16, + 49, + }; + /** Table-driven conversion of a code made of `bits` 3-bit digits: digits of `in` are consumed from the most significant one; for each, the table entry indexed by (state | digit) supplies the output digit in its low 3 bits and the state for the next step in its remaining bits. NOTE(review): for bits == 0 the unsigned `bits - 1` wraps; the loop then appears to be skipped after the int conversion, confirm. */ + static inline auto transformCurve(uint64_t in, uint64_t bits, const uint8_t* lookupTable) + { + uint64_t transform = 0; + uint64_t out = 0; + + for (int32_t i = int(3 * (bits - 1)); i >= 0; i -= 3) { + transform = lookupTable[transform | ((in >> i) & 7)]; + out = (out << 3) | (transform & 7); + transform &= ~7; + } + + return out; + } + /** Converts a Morton code to a Hilbert code by running transformCurve with mortonToHilbertTable. */ + static inline auto mortonToHilbert3D(uint64_t mortonIndex, uint64_t bits) + { + return transformCurve(mortonIndex, bits, mortonToHilbertTable); + } + /** Converts a Hilbert code to a Morton code by running transformCurve with hilbertToMortonTable. */ + static inline auto hilbertToMorton3D(uint64_t hilbertIndex, uint64_t bits) + { + return transformCurve(hilbertIndex, bits, hilbertToMortonTable); + } + + /** Spreads the low 21 bits of `a` so that input bit i ends up at bit 3 * i, leaving two zero bits between consecutive input bits. */ + static inline auto
splitBy3(uint64_t a) + { + uint64_t x = a & 0x1fffff; // we only care about 21 bits + x = (x | x << 32) & 0x1f00000000ffff; // shift left 32 bits, mask out bits 21-31 + x = (x | x << 16) & 0x1f0000ff0000ff; // shift left 16 bits, mask out bits 11-20, 43-52 + x = (x | x << 8) & 0x100f00f00f00f00f; // shift left 8 bits, mask out bits 5-10, 21-26, 37-42, 53-58 + x = (x | x << 4) & 0x10c30c30c30c30c3; // shift left 4 bits, mask out bits 3-4, 11-12, 19-20, 27-28, 35-36, 43-44, 51-52, 59-60 + x = (x | x << 2) & 0x1249249249249249; // shift left 2 bits, mask out bits 2, 6-7, 10, 14-15, 18, 22-23, 26, 30-31, 34, 38-39, 42, 46-47, 50, 54-55, 58 + return x; + } + /** Morton code of idx: the spread bits of x, y and z are interleaved with x in the lowest position; `dim` is unused. */ + public: + static inline auto mortonEncode([[maybe_unused]] Neon::index_3d dim, Neon::index_3d idx) + -> uint64_t + { + auto idxU64 = idx.newType(); + return splitBy3(idxU64.x) | (splitBy3(idxU64.y) << 1) | (splitBy3(idxU64.z) << 2); + } + /** Hilbert code of idx: the Morton code is converted with mortonToHilbert3D, using ceil(log2(dim.rMax())) digits (rMax presumably yields the largest component of dim; confirm). */ + static inline auto encodeHilbert(Neon::index_3d dim, Neon::index_3d idx) + -> uint64_t + { + uint64_t mortonEncoded = mortonEncode(dim, idx); + uint64_t bits = uint64_t(std::ceil(std::log2(dim.newType().rMax()))); + return mortonToHilbert3D(mortonEncoded, bits); + } + /** Sweep code of idx: x + y * dim.x + z * dim.x * dim.y, computed on the converted 64-bit values. */ + static inline auto encodeSweep(Neon::index_3d dim, Neon::index_3d idx) + -> uint64_t + { + auto idxU64 = idx.newType(); + auto dimU64 = dim.newType(); + + uint64_t res = idxU64.x + idxU64.y * dimU64.x + idxU64.z * dimU64.x * dimU64.y; + return res; + } + /** Dispatches to mortonEncode, encodeHilbert or encodeSweep according to `type`; any other value goes through NEON_THROW_UNSUPPORTED_OPERATION. */ + static inline auto encode(EncoderType type, Neon::index_3d dim, Neon::index_3d idx) + { + switch (type) { + case EncoderType::morton: + return mortonEncode(dim, idx); + case EncoderType::hilbert: + return encodeHilbert(dim, idx); + case EncoderType::sweep: + return encodeSweep(dim, idx); + default: + NEON_THROW_UNSUPPORTED_OPERATION("Encoder type not supported"); + } + } +}; +} // namespace Neon::domain::tool::spaceCurves diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h
b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h index c9ca59b9..a1b4c90d 100644 --- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h +++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h @@ -26,6 +26,7 @@ class tField : public Neon::domain::interface::FieldBaseTemplate; using Idx = typename Partition::Idx; using NghIdx = typename Partition::NghIdx; // for compatibility with eGrid + using NghData = typename Partition::NghData; // for compatibility with eGrid private: using FoundationGrid = typename GridTransformation::FoundationGrid; diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h index d6d98be1..ac98983c 100644 --- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h +++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h @@ -8,8 +8,8 @@ #include "Neon/domain/interface/Stencil.h" #include "Neon/domain/interface/common.h" #include "Neon/domain/patterns/PatternScalar.h" +#include "Neon/domain/tools/SpaceCurves.h" #include "Neon/domain/tools/SpanTable.h" - /** * template * GridTransformation { @@ -54,6 +54,16 @@ class tGrid : public Neon::domain::interface::GridBaseTemplate + tGrid(const Neon::Backend& backend /**< Target for computation */, + const Neon::int32_3d& dimension /**< Dimension of the bounding box containing the domain */, + const SparsityPattern& activeCellLambda /**< InOrOutLambda({x,y,z}->{true, false}) */, + const Neon::domain::Stencil& stencil /**< Stencil used by any computation on the grid */, + const Vec_3d& spacing = Vec_3d(1, 1, 1) /**< Spacing, i.e. 
size of a voxel */, + const Vec_3d& origin = Vec_3d(0, 0, 0) /**< Origin */, + Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep); + tGrid(const tGrid& other); // copy constructor tGrid(tGrid&& other) noexcept; // move constructor tGrid& operator=(const tGrid& other); // copy assignment @@ -109,7 +119,7 @@ class tGrid : public Neon::domain::interface::GridBaseTemplate(bk); } diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h index 4ba1403d..b01b8718 100644 --- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h +++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h @@ -27,7 +27,41 @@ tGrid::tGrid(FoundationGrid& foundationGrid) foundationGrid.getNumActiveCellsPerPartition(), foundationGrid.getDefaultBlock(), foundationGrid.getSpacing(), - foundationGrid.getOrigin()); + foundationGrid.getOrigin(), + foundationGrid.getSpaceCurve(), + foundationGrid.getMemoryBlock()); +} + +template +template +tGrid::tGrid(const Neon::Backend& bk, + const Neon::int32_3d& dimension, + const SparsityPattern& activeCellLambda, + const Neon::domain::Stencil& stencil, + const Vec_3d& spacing, + const Vec_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType encoderType) +{ + mData = std::make_shared(bk); + mData->foundationGrid = FoundationGrid(bk, + dimension, + activeCellLambda, + stencil, + spacing, + origin, + encoderType); + GridTransformation::initSpan(mData->foundationGrid, + NEON_OUT mData->spanTable); + tGrid::GridBase::init("tGrid", + bk, + mData->foundationGrid.getDimension(), + mData->foundationGrid.getStencil(), + mData->foundationGrid.getNumActiveCellsPerPartition(), + mData->foundationGrid.getDefaultBlock(), + mData->foundationGrid.getSpacing(), + mData->foundationGrid.getOrigin(), + encoderType, + mData->foundationGrid.getMemoryBlock()); } template diff --git 
a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h index 8833af7a..7cf442c6 100644 --- a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h +++ b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h @@ -2,12 +2,95 @@ #include "Neon/core/core.h" +#include #include "Cassifications.h" #include "Neon/domain/tools/PointHashTable.h" +#include "Neon/domain/tools/SpaceCurves.h" #include "Neon/domain/tools/partitioning/SpanDecomposition.h" - namespace Neon::domain::tool::partitioning { +struct Hash +{ + std::vector id1dTo3d; + Neon::domain::tool::PointHashTable id3dTo1d; + + auto reHash(Neon::domain::tool::spaceCurves::EncoderType encoderType) -> void + { + // std::cout << "BEFORE Cartesian "; + // for (int i = 0; i < int(id1dTo3d.size()); i++) { + // std::cout << id1dTo3d[i] << " "; + // } + // std::cout << std::endl + // << " ID "; + // for (int i = 0; i < int(id1dTo3d.size()); i++) { + // std::cout << *id3dTo1d.getMetadata(id1dTo3d[i]) << " "; + // } + // std::cout << std::endl + // << " CODE "; + // for (int i = 0; i < int(id1dTo3d.size()); i++) { + // std::cout << Neon::domain::tool::spaceCurves::Encoder::encode(spaceCurve, id3dTo1d.getBBox(), id1dTo3d[i]) << " "; + // } + // std::cout << std::endl; + // std::cout << " BOX " << id3dTo1d.getBBox(); + // + // std::cout << std::endl; + + // Encoding all points w.r.t the encoder type + std::vector code; + for (auto const& point : id1dTo3d) { + code.push_back(Neon::domain::tool::spaceCurves::Encoder::encode(encoderType, id3dTo1d.getBBox(), point)); + } + // Sort id1dTo3d w.r.t. 
the codes + std::vector permutation = getSortedPermutation(code, [](uint64_t a, uint64_t b) { + return a < b; + }); + id1dTo3d = applyPermutation(id1dTo3d, permutation); + for (uint64_t i = 0; i < id1dTo3d.size(); i++) { + *(id3dTo1d.getMetadata(id1dTo3d[i])) = i; + } +// +// std::cout << "AFTER Cartesian "; +// for (int i = 0; i < int(id1dTo3d.size()); i++) { +// std::cout << id1dTo3d[i] << " "; +// } +// std::cout << std::endl +// << " ID "; +// for (int i = 0; i < int(id1dTo3d.size()); i++) { +// std::cout << *id3dTo1d.getMetadata(id1dTo3d[i]) << " "; +// } +// std::cout << std::endl +// << " CODE "; +// for (int i = 0; i < int(id1dTo3d.size()); i++) { +// std::cout << Neon::domain::tool::spaceCurves::Encoder::encode(spaceCurve, id3dTo1d.getBBox(), id1dTo3d[i]) << " "; +// } +// std::cout << std::endl; + } + + private: + template + std::vector getSortedPermutation( + const std::vector& vec, + Compare const& compare) + { + std::vector p(vec.size()); + std::iota(p.begin(), p.end(), 0); + std::sort(p.begin(), p.end(), + [&](std::size_t i, std::size_t j) { return compare(vec[i], vec[j]); }); + return p; + } + + template + std::vector applyPermutation( + const std::vector& vec, + const std::vector& p) + { + std::vector sorted_vec(vec.size()); + std::transform(p.begin(), p.end(), sorted_vec.begin(), + [&](std::size_t i) { return vec[i]; }); + return sorted_vec; + } +}; + class SpanClassifier { public: @@ -27,6 +110,7 @@ class SpanClassifier const Neon::int32_3d& domainSize, const Neon::domain::Stencil stencil, const int& discreteVoxelSpacing, + Neon::domain::tool::spaceCurves::EncoderType encoderType, std::shared_ptr sp); @@ -48,7 +132,7 @@ class SpanClassifier ByPartition, ByDirection, ByDomain) const - -> const Neon::domain::tool::PointHashTable&; + -> const Neon::domain::tool::PointHashTable&; [[nodiscard]] auto countInternal(Neon::SetIdx setIdx, ByDomain byDomain) const -> int; @@ -72,7 +156,7 @@ class SpanClassifier ByPartition, ByDirection, ByDomain) - -> 
Neon::domain::tool::PointHashTable&; + -> Neon::domain::tool::PointHashTable&; private: auto addPoint(Neon::SetIdx const& setIdx, @@ -82,13 +166,7 @@ class SpanClassifier ByDomain byDomain) -> void; - struct Info - { - std::vector id1dTo3d; - Neon::domain::tool::PointHashTable id3dTo1d; - }; - - using Leve0_Info = Info; + using Leve0_Info = Hash; using Leve1_ByDomain = std::array; using Leve2_ByDirection = std::array; using Leve3_ByPartition = std::array; @@ -103,17 +181,18 @@ template -SpanClassifier::SpanClassifier(const Neon::Backend& backend, - const ActiveCellLambda& activeCellLambda, - const BcLambda& bcLambda, - const Block3dIdxToBlockOrigin& block3dIdxToBlockOrigin, - const GetVoxelAbsolute3DIdx& getVoxelAbsolute3DIdx, - const Neon::int32_3d& block3DSpan, - const Neon::int32_3d& dataBlockSize3D, - const Neon::int32_3d& domainSize, - const Neon::domain::Stencil stencil, - const int& discreteVoxelSpacing, - std::shared_ptr spanDecompositionNoUse) +SpanClassifier::SpanClassifier(const Neon::Backend& backend, + const ActiveCellLambda& activeCellLambda, + const BcLambda& bcLambda, + const Block3dIdxToBlockOrigin& block3dIdxToBlockOrigin, + const GetVoxelAbsolute3DIdx& getVoxelAbsolute3DIdx, + const Neon::int32_3d& block3DSpan, + const Neon::int32_3d& dataBlockSize3D, + const Neon::int32_3d& domainSize, + const Neon::domain::Stencil stencil, + const int& discreteVoxelSpacing, + Neon::domain::tool::spaceCurves::EncoderType spaceFillingType, + std::shared_ptr spanDecompositionNoUse) { mData = backend.devSet().newDataSet(); mSpanDecomposition = spanDecompositionNoUse; @@ -129,7 +208,7 @@ SpanClassifier::SpanClassifier(const Neon::Backend& backend, for (auto& level2 : leve3ByPartition) { for (auto& level1 : level2) { for (auto& level0 : level1) { - level0.id3dTo1d = Neon::domain::tool::PointHashTable(block3DSpan); + level0.id3dTo1d = Neon::domain::tool::PointHashTable(block3DSpan); } } } @@ -236,5 +315,20 @@ SpanClassifier::SpanClassifier(const Neon::Backend& 
backend, } } }); + + mData.forEachSeq([&](SetIdx, auto& leve3ByPartition) { + // using Leve0_Info = Info; + // using Leve1_ByDomain = std::array; + // using Leve2_ByDirection = std::array; + // using Leve3_ByPartition = std::array; + // using Data = Neon::set::DataSet; + for (auto& level2 : leve3ByPartition) { + for (auto& level1 : level2) { + for (auto& level0 : level1) { + level0.reHash(spaceFillingType); + } + } + } + }); } } // namespace Neon::domain::tool::partitioning diff --git a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanLayout.h b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanLayout.h index a7e86f7c..4a01dd16 100644 --- a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanLayout.h +++ b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanLayout.h @@ -1,7 +1,7 @@ #pragma once #include "Neon/core/core.h" +#include "Neon/domain/tools/SpaceCurves.h" #include "Neon/domain/tools/partitioning/SpanClassifier.h" - namespace Neon::domain::tool::partitioning { class SpanLayout @@ -30,6 +30,10 @@ class SpanLayout std::shared_ptr spanPartitionerPtr, std::shared_ptr spanClassifierPtr); + auto sort(Neon::domain::tool::spaceCurves::EncoderType encoderType, + SpanClassifier& spanClassifier) + -> void; + auto getCount() -> Neon::set::DataSet; diff --git a/libNeonDomain/src/domain/details/aGrid/aGrid.cpp b/libNeonDomain/src/domain/details/aGrid/aGrid.cpp index be36fd4c..87942976 100644 --- a/libNeonDomain/src/domain/details/aGrid/aGrid.cpp +++ b/libNeonDomain/src/domain/details/aGrid/aGrid.cpp @@ -61,7 +61,9 @@ auto aGrid::init(const Neon::Backend& backend, lenghts, blockDim, spacingData, - origin); + origin, + Neon::domain::tool::spaceCurves::EncoderType::sweep, + {0, 0, 0}); mStorage = std::make_shared(); diff --git a/libNeonDomain/src/domain/details/dGrid/dGrid.cpp b/libNeonDomain/src/domain/details/dGrid/dGrid.cpp index 890642b3..ec8b24d8 100644 --- a/libNeonDomain/src/domain/details/dGrid/dGrid.cpp +++ 
b/libNeonDomain/src/domain/details/dGrid/dGrid.cpp @@ -59,7 +59,7 @@ auto dGrid::getLaunchParameters(const Neon::DataView dataView, auto dimsByDataView = getBackend().devSet().newDataSet([&](Neon::SetIdx const& setIdx, auto& value) { - value = getSpan(Neon::Execution::host, setIdx, dataView).mDim; + value = getSpan(Neon::Execution::host, setIdx, dataView).mSpanDim; }); ret.set(Neon::sys::GpuLaunchInfo::domainGridMode, diff --git a/libNeonDomain/src/domain/details/eGrid/eGrid.cpp b/libNeonDomain/src/domain/details/eGrid/eGrid.cpp index 164ae3b1..d4e12b7f 100644 --- a/libNeonDomain/src/domain/details/eGrid/eGrid.cpp +++ b/libNeonDomain/src/domain/details/eGrid/eGrid.cpp @@ -25,7 +25,9 @@ eGrid::eGrid(const Backend& backend, nElementsPerPartition, Neon::index_3d(256, 1, 1), spacing, - origin); + origin, + partitioner.getSpaceCurve(), + {1,1,1}); } @@ -35,7 +37,7 @@ eGrid::eGrid(const Backend& backend, mData->mGlobalMappingAField = mData->partitioner1D.getGlobalMapping(); mData->mStencil3dTo1dOffset = mData->partitioner1D.getStencil3dTo1dOffset(); mData->memoryGrid = mData->partitioner1D.getMemoryGrid(); - //mData->partitioner1D.getDenseMeta(mData->denseMeta); + // mData->partitioner1D.getDenseMeta(mData->denseMeta); const int32_t numDevices = getBackend().devSet().setCardinality(); @@ -109,7 +111,9 @@ eGrid::eGrid(const Backend& backend, nElementsPerPartition, defaultBlockSize, spacing, - origin); + origin, + partitioner.getSpaceCurve(), + {1,1,1}); } } @@ -200,7 +204,7 @@ auto eGrid::convertToNghIdx(Neon::index_3d const& offset) auto eGrid::isInsideDomain(const index_3d& idx) const -> bool { - //auto const& metaInfo = mData->denseMeta.get(idx); + // auto const& metaInfo = mData->denseMeta.get(idx); auto const& metaInfo = mData->partitioner1D.getDenseMeta().get(idx); return metaInfo.isValid(); } @@ -225,7 +229,7 @@ auto eGrid::getProperties(const index_3d& idx) const -> GridBaseTemplate::CellPr if (this->getDevSet().setCardinality() == 1) { cellProperties.init(0, 
DataView::INTERNAL); } else { - //auto const& metaInfo = mData->denseMeta.get(idx); + // auto const& metaInfo = mData->denseMeta.get(idx); auto const& metaInfo = mData->partitioner1D.getDenseMeta().get(idx); cellProperties.init(metaInfo.setIdx, metaInfo.dw); } @@ -262,7 +266,7 @@ auto eGrid::helpGetSetIdxAndGridIdx(Neon::index_3d idx) const -> std::tupledenseMeta.get(idx); + // auto const& meta = mData->denseMeta.get(idx); auto const& meta = mData->partitioner1D.getDenseMeta().get(idx); if (meta.isValid()) { auto const& span = getSpan(Execution::host, meta.setIdx, Neon::DataView::STANDARD); diff --git a/libNeonDomain/src/domain/interface/GridBase.cpp b/libNeonDomain/src/domain/interface/GridBase.cpp index 81663239..3bfd8a21 100644 --- a/libNeonDomain/src/domain/interface/GridBase.cpp +++ b/libNeonDomain/src/domain/interface/GridBase.cpp @@ -3,14 +3,16 @@ namespace Neon::domain::interface { -auto GridBase::init(const std::string& gridImplementationName, - const Neon::Backend& backend, - const Neon::index_3d& dimension, - const Neon::domain::Stencil& stencil, - const Neon::set::DataSet& nPartitionElements, - const Neon::index_3d& defaultBlockSize, - const Vec_3d& spacingData, - const Vec_3d& origin) -> void +auto GridBase::init(const std::string& gridImplementationName, + const Neon::Backend& backend, + const Neon::index_3d& dimension, + const Neon::domain::Stencil& stencil, + const Neon::set::DataSet& nPartitionElements, + const Neon::index_3d& defaultBlockSize, + const Vec_3d& spacingData, + const Vec_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType spaceCurve, + Neon::index_3d memoryBlock) -> void { mStorage->backend = backend; mStorage->dimension = dimension; @@ -24,6 +26,8 @@ auto GridBase::init(const std::string& gridImplementationName, mStorage->defaults.launchParameters[DataViewUtil::toInt(dw)] = backend.devSet().newLaunchParameters(); } mStorage->defaults.blockDim = defaultBlockSize; + mStorage->spaceCurve = spaceCurve; + mStorage->memoryBlock = 
memoryBlock; } GridBase::GridBase() @@ -31,14 +35,16 @@ GridBase::GridBase() { } -GridBase::GridBase(const std::string& gridImplementationName, - const Neon::Backend& backend, - const Neon::index_3d& dimension, - const Neon::domain::Stencil& stencil, - const Neon::set::DataSet& nPartitionElements, - const Neon::index_3d& defaultBlockSize, - const Vec_3d& spacingData, - const Vec_3d& origin) +GridBase::GridBase(const std::string& gridImplementationName, + const Neon::Backend& backend, + const Neon::index_3d& dimension, + const Neon::domain::Stencil& stencil, + const Neon::set::DataSet& nPartitionElements, + const Neon::index_3d& defaultBlockSize, + const Vec_3d& spacingData, + const Vec_3d& origin, + Neon::domain::tool::spaceCurves::EncoderType spaceCurve, + Neon::index_3d memoryBlock) : mStorage(std::make_shared()) { init(gridImplementationName, @@ -48,7 +54,9 @@ GridBase::GridBase(const std::string& gridImplementationName, nPartitionElements, defaultBlockSize, spacingData, - origin); + origin, + spaceCurve, + memoryBlock); } auto GridBase::getDimension() const -> const Neon::index_3d& @@ -161,7 +169,8 @@ auto GridBase::toString() const -> std::string return tmp.str(); }() << "}" - << ", [Backend]:{" << getBackend().toString() << "}"; + << ", [Backend]:{" << getBackend().toString() << "}" + << ", [Memory]:{" << Neon::domain::tool::spaceCurves::EncoderTypeUtil::toString(mStorage->spaceCurve) << ", " << this->mStorage->memoryBlock << "}"; return s.str(); } @@ -232,10 +241,41 @@ auto GridBase::toReport(Neon::Report& report, }(), &subdoc); + report.addMember( + "MemoryBlock", + [&] { + std::stringstream list; + list << "["; + list << getMemoryBlock().x << " " + << getMemoryBlock().y << " " + << getMemoryBlock().z << "]"; + return list.str(); + }(), + &subdoc); + + report.addMember( + "SpaceCurve", + [&] { + std::stringstream list; + list << Neon::domain::tool::spaceCurves::EncoderTypeUtil::toString(mStorage->spaceCurve); + return list.str(); + }(), + &subdoc); + if 
(includeBackendInfo) getBackend().toReport(report, &subdoc); report.addSubdoc("Grid", subdoc); } +auto GridBase::getMemoryBlock() const -> Neon::index_3d +{ + return mStorage->memoryBlock; +} + +auto GridBase::getSpaceCurve() const -> Neon::domain::tool::spaceCurves::EncoderType +{ + return mStorage->spaceCurve; +} + } // namespace Neon::domain::interface \ No newline at end of file diff --git a/libNeonDomain/src/domain/tools/SpaceCurves.cpp b/libNeonDomain/src/domain/tools/SpaceCurves.cpp new file mode 100644 index 00000000..cca20e19 --- /dev/null +++ b/libNeonDomain/src/domain/tools/SpaceCurves.cpp @@ -0,0 +1,164 @@ +#include "Neon/domain/tools/SpaceCurves.h" +#include "Neon/core/types/Exceptions.h" + +namespace Neon::domain::tool::spaceCurves { + +auto EncoderTypeUtil::getOptions() -> std::array +{ + std::array options = {EncoderType::sweep, + EncoderType::morton, + EncoderType::hilbert}; + return options; +} + +auto EncoderTypeUtil::toString(EncoderType e) -> std::string +{ + switch (e) { + case EncoderType::sweep: { + return "sweep"; + } + case EncoderType::morton: { + return "morton"; + } + case EncoderType::hilbert: { + return "hilbert"; + } + default: { + NEON_THROW_UNSUPPORTED_OPTION("EncoderTypeUtil"); + } + } +} + +auto EncoderTypeUtil::fromInt(int val) -> EncoderType +{ + switch (val) { + case static_cast(EncoderType::sweep): { + return EncoderType::sweep; + } + case static_cast(EncoderType::morton): { + return EncoderType::morton; + } + case static_cast(EncoderType::hilbert): { + return EncoderType::hilbert; + } + default: { + NEON_THROW_UNSUPPORTED_OPTION("EncoderTypeUtil"); + } + } +} + +auto EncoderTypeUtil::fromString(const std::string& occ) -> EncoderType +{ + std::array opts = getOptions(); + for (auto a : opts) { + if (toString(a) == occ) { + return a; + } + } + NEON_THROW_UNSUPPORTED_OPTION(""); +} + +auto EncoderTypeUtil::toInt(EncoderType dataView) -> int +{ + return static_cast(dataView); +} + +std::ostream& operator<<(std::ostream& os, 
EncoderType const& m) +{ + return os << std::string(EncoderTypeUtil::toString(m)); +} + + +EncoderTypeUtil::Cli::Cli() +{ + mSet = false; +} + +EncoderTypeUtil::Cli::Cli(std::string s) +{ + set(s); +} + +EncoderTypeUtil::Cli::Cli(EncoderType model) +{ + mOption = model; +} + +auto EncoderTypeUtil::Cli::getOption() const -> EncoderType +{ + if (!mSet) { + std::stringstream errorMsg; + errorMsg << "TransferSemantic was not set."; + NEON_ERROR(errorMsg.str()); + } + return mOption; +} + +auto EncoderTypeUtil::Cli::set(const std::string& opt) + -> void +{ + try { + mOption = EncoderTypeUtil::fromString(opt); + } catch (...) { + std::stringstream errorMsg; + errorMsg << "TransferSemantic: " << opt << " is not a valid option (valid options are {"; + auto options = EncoderTypeUtil::getOptions(); + int i = 0; + for (auto o : options) { + if (i != 0) { + errorMsg << ", " << EncoderTypeUtil::toString(o); + } + errorMsg << EncoderTypeUtil::toString(o); + i = 1; + } + errorMsg << "})"; + NEON_ERROR(errorMsg.str()); + } + mSet = true; +} + +auto EncoderTypeUtil::Cli::getStringOptions() const -> std::string +{ + std::stringstream s; + auto options = EncoderTypeUtil::getOptions(); + int i = 0; + for (auto o : options) { + if (i != 0) { + s << ", "; + } + s << EncoderTypeUtil::toString(o); + i = 1; + } + std::string msg = s.str(); + return msg; +} + +auto EncoderTypeUtil::Cli::getStringOption() const -> std::string +{ + if (!mSet) { + std::stringstream errorMsg; + errorMsg << "TransferSemantic was not set."; + NEON_ERROR(errorMsg.str()); + } + return EncoderTypeUtil::toString(mOption); +} + +auto EncoderTypeUtil::Cli::getDoc() const -> std::string +{ + std::stringstream s; + s << getStringOptions(); + s << " default: " << getStringOptions(); + return s.str(); +} + +auto EncoderTypeUtil::Cli::addToReport(Neon::Report& report) const -> void +{ + report.addMember("EncoderType", EncoderTypeUtil::toString(this->getOption())); +} + +auto EncoderTypeUtil::Cli::addToReport(Neon::Report& 
report, Neon::Report::SubBlock& subBlock) const -> void +{ + report.addMember("EncoderType", EncoderTypeUtil::toString(this->getOption()), &subBlock); +} + +} // namespace Neon::domain::tool::spaceCurves diff --git a/libNeonDomain/src/domain/tools/partitioning/SpanClassifier.cpp b/libNeonDomain/src/domain/tools/partitioning/SpanClassifier.cpp index e9c62754..4b372fc9 100644 --- a/libNeonDomain/src/domain/tools/partitioning/SpanClassifier.cpp +++ b/libNeonDomain/src/domain/tools/partitioning/SpanClassifier.cpp @@ -37,7 +37,7 @@ auto SpanClassifier::getMapper1Dto3D(const SetIdx& setIdx, auto SpanClassifier::getMapper3Dto1D(const SetIdx& setIdx, ByPartition byPartition, ByDirection byDirection, - ByDomain byDomain) const -> const Neon::domain::tool::PointHashTable& + ByDomain byDomain) const -> const Neon::domain::tool::PointHashTable& { return mData[setIdx] [static_cast(byPartition)] @@ -63,7 +63,7 @@ auto SpanClassifier::getMapper3Dto1D(const SetIdx& setIdx, ByPartition byPartition, ByDirection byDirection, ByDomain byDomain) - -> Neon::domain::tool::PointHashTable& + -> Neon::domain::tool::PointHashTable& { return mData[setIdx] [static_cast(byPartition)] diff --git a/libNeonDomain/src/domain/tools/partitioning/SpanLayout.cpp b/libNeonDomain/src/domain/tools/partitioning/SpanLayout.cpp index 591bd07f..9a81de6b 100644 --- a/libNeonDomain/src/domain/tools/partitioning/SpanLayout.cpp +++ b/libNeonDomain/src/domain/tools/partitioning/SpanLayout.cpp @@ -207,7 +207,7 @@ auto SpanLayout::findPossiblyLocalPointOffset( byDomain); auto const infoPtr = mapper.getMetadata(point); if (infoPtr != nullptr) { - return {true, *infoPtr, byPartition, byDirection, byDomain}; + return {true, int32_t(*infoPtr), byPartition, byDirection, byDomain}; } } } diff --git a/libNeonDomain/tests/CMakeLists.txt b/libNeonDomain/tests/CMakeLists.txt index 3f76cb4e..874e58fc 100644 --- a/libNeonDomain/tests/CMakeLists.txt +++ b/libNeonDomain/tests/CMakeLists.txt @@ -7,6 +7,7 @@ 
add_subdirectory("domain-neighbour-globalIdx") add_subdirectory("domain-halos") add_subdirectory("domain-stencil") add_subdirectory("domain-bGrid-tray") +add_subdirectory("domain-space-filling-curves") add_subdirectory("domainUt_sGrid") add_subdirectory("domain-unit-test-eGrid") diff --git a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu index 158d3e05..1b94b566 100644 --- a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu +++ b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu @@ -1,5 +1,6 @@ #include #include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" #include "TestInformation.h" @@ -27,18 +28,18 @@ auto defContainer(int streamIdx, return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable { // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val); Neon::index_3d globalPoint = a.getGlobalIndex(e); - a(e, 0) = globalPoint.x ; + a(e, 0) = globalPoint.x; b(e, 0) = globalPoint.y; c(e, 0) = globalPoint.z; -// if constexpr (std::is_same_v) { -// printf("Block %d Th %d %d %d Loc %d %d %d\n", e.mDataBlockIdx, -// e.mInDataBlockIdx.x, -// e.mInDataBlockIdx.y, -// e.mInDataBlockIdx.z, -// globalPoint.x, -// globalPoint.y, -// globalPoint.z); -// } + // if constexpr (std::is_same_v) { + // printf("Block %d Th %d %d %d Loc %d %d %d\n", e.mDataBlockIdx, + // e.mInDataBlockIdx.x, + // e.mInDataBlockIdx.y, + // e.mInDataBlockIdx.z, + // globalPoint.x, + // globalPoint.y, + // globalPoint.z); + // } }; }); } @@ -98,5 +99,6 @@ auto run(TestData& data) -> void template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; } // namespace globalIdx \ No newline at end of file diff --git a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h index 0a3b87eb..c766f7ca 100644 --- 
a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h +++ b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h @@ -3,9 +3,9 @@ #include #include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" - namespace globalIdx { using namespace Neon::domain::tool::testing; @@ -15,6 +15,7 @@ auto run(TestData& data) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; -} // namespace map +} // namespace globalIdx diff --git a/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp index 783830ca..f0ecce78 100644 --- a/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp +++ b/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp @@ -4,7 +4,7 @@ #include "globalIdx.h" #include "runHelper.h" -TEST(domain_unit_test_globalIdx, dGrid) +TEST(domain_globalIdx, dGrid) { int nGpus = 3; using Type = int64_t; @@ -13,7 +13,7 @@ TEST(domain_unit_test_globalIdx, dGrid) 1); } -TEST(domain_unit_test_globalIdx, eGrid) +TEST(domain_globalIdx, eGrid) { int nGpus = 3; using Type = int64_t; @@ -22,7 +22,7 @@ TEST(domain_unit_test_globalIdx, eGrid) 1); } -TEST(domain_unit_test_globalIdx, bGrid) +TEST(domain_globalIdx, bGrid) { int nGpus = 3; using Type = int64_t; @@ -31,6 +31,15 @@ TEST(domain_unit_test_globalIdx, bGrid) 1); } +TEST(domain_globalIdx, dGridSoA) +{ + int nGpus = 3; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::run), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-map/src/gtests.cpp b/libNeonDomain/tests/domain-map/src/gtests.cpp index d0d43b60..c48511b7 100644 --- a/libNeonDomain/tests/domain-map/src/gtests.cpp +++ b/libNeonDomain/tests/domain-map/src/gtests.cpp @@ -13,6 +13,15 @@ TEST(domain_map, dGrid) 1); } 
+TEST(domain_map_dataView, dGrid) +{ + int nGpus = 2; + using Type = int64_t; + runAllTestConfiguration(std::function(map::dataView::run), + nGpus, + 2); +} + TEST(domain_map, eGrid) { int nGpus = 3; @@ -31,6 +40,15 @@ TEST(domain_map, bGrid) 1); } +TEST(domain_map, dGridSoA) +{ + int nGpus = 1; + using Type = int64_t; + runAllTestConfiguration(std::function(map::run), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-map/src/map.cu b/libNeonDomain/tests/domain-map/src/map.cu index bd25f178..2ed92ddb 100644 --- a/libNeonDomain/tests/domain-map/src/map.cu +++ b/libNeonDomain/tests/domain-map/src/map.cu @@ -1,6 +1,7 @@ #include #include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" #include "TestInformation.h" #include "gtest/gtest.h" @@ -31,6 +32,27 @@ auto mapContainer_axpy(int streamIdx, }); } +template +auto mapContainer_add(int streamIdx, + typename Field::Type& val, + Field& fieldB) + -> Neon::set::Container +{ + const auto& grid = fieldB.getGrid(); + return grid.newContainer( + "mapContainer_axpy", + [&, val](Neon::set::Loader& loader) { + auto b = loader.load(fieldB); + + return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable { + for (int i = 0; i < b.cardinality(); i++) { + // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val); + b(e, i) += val; + } + }; + }); +} + using namespace Neon::domain::tool::testing; template @@ -75,6 +97,55 @@ auto run(TestData& data) -> void template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; + +namespace dataView { +template +auto run(TestData& data) -> void +{ + + using Type = typename TestData::Type; + auto& grid = data.getGrid(); + const std::string appName = TestInformation::fullName(grid.getImplementationName()); + + data.resetValuesToLinear(1, 100); + 
T val = T(33); + + { // NEON + const Neon::index_3d dim = grid.getDimension(); + std::vector elements; + + auto& X = data.getField(FieldNames::X); + auto& Y = data.getField(FieldNames::Y); + + + mapContainer_axpy(Neon::Backend::mainStreamIdx, + val, X, Y) + .run(0, Neon::DataView::BOUNDARY); + + mapContainer_axpy(Neon::Backend::mainStreamIdx, + val, X, Y) + .run(0, Neon::DataView::INTERNAL); + + X.updateHostData(0); + Y.updateHostData(0); + data.getBackend().sync(0); + } + + { // Golden data + auto& X = data.getIODomain(FieldNames::X); + auto& Y = data.getIODomain(FieldNames::Y); + data.axpy(&val, X, Y); + } + + bool isOk = data.compare(FieldNames::Y); + ASSERT_TRUE(isOk); +} +template auto run(TestData&) -> void; +template auto run(TestData&) -> void; +template auto run(TestData&) -> void; +template auto run(TestData&) -> void; +} // namespace dataView } // namespace map \ No newline at end of file diff --git a/libNeonDomain/tests/domain-map/src/map.h b/libNeonDomain/tests/domain-map/src/map.h index 611f2046..99864a3f 100644 --- a/libNeonDomain/tests/domain-map/src/map.h +++ b/libNeonDomain/tests/domain-map/src/map.h @@ -3,6 +3,7 @@ #include #include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/tools/TestData.h" @@ -14,6 +15,19 @@ auto run(TestData& data) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; +namespace dataView { + +template +auto run(TestData& data) -> void; + +extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; + +} // namespace dataView } // namespace map diff --git a/libNeonDomain/tests/domain-map/src/runHelper.h b/libNeonDomain/tests/domain-map/src/runHelper.h index 53ea8681..593e31c2 100644 --- 
a/libNeonDomain/tests/domain-map/src/runHelper.h +++ b/libNeonDomain/tests/domain-map/src/runHelper.h @@ -31,7 +31,7 @@ void runAllTestConfiguration( nGpuTest.push_back(i); } // std::vector nGpuTest{2,4,6,8}; - std::vector cardinalityTest{1}; + std::vector cardinalityTest{1,3,19}; std::vector dimTest{{10, 17, 13}, {1, 1, 100}, {17, 1, 77}}; std::vector runtimeE{Neon::Runtime::openmp}; @@ -95,6 +95,7 @@ void runAllTestConfiguration( } } +#if 0 template void runOneTestConfiguration(const std::string& gname, @@ -144,3 +145,4 @@ void runOneTestConfiguration(const std::string& gname, } } } +#endif \ No newline at end of file diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp index feba5a9b..21bba9b5 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp @@ -1,10 +1,10 @@ +#include "./testsAndContainers.h" #include "Neon/Neon.h" #include "gtest/gtest.h" -#include "./testsAndContainers.h" #include "runHelper.h" -TEST(domain_unit_test_globalIdx, dGrid) +TEST(domain_neighbour_globalIdx, dGrid) { int nGpus = 5; using Type = int64_t; @@ -13,7 +13,7 @@ TEST(domain_unit_test_globalIdx, dGrid) 1); } -TEST(domain_unit_test_globalIdx, eGrid) +TEST(domain_neighbour_globalIdx, eGrid) { int nGpus = 5; using Type = int64_t; @@ -22,7 +22,7 @@ TEST(domain_unit_test_globalIdx, eGrid) 1); } -TEST(domain_unit_test_globalIdx, bGrid) +TEST(domain_neighbour_globalIdx, bGrid) { int nGpus = 5; using Type = int64_t; @@ -31,6 +31,53 @@ TEST(domain_unit_test_globalIdx, bGrid) 1); } +TEST(domain_neighbour_globalIdx, dGridSoA) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::run), + nGpus, + 1); +} + +/////////////////////////////////////////// + +TEST(domain_neighbour_globalIdx, dGrid_template) +{ + int nGpus = 5; + using Type = int64_t; + 
runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + +TEST(domain_neighbour_globalIdx, eGrid_template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + +TEST(domain_neighbour_globalIdx, bGrid_template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + +TEST(domain_neighbour_globalIdx, dGridSoA_template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(globalIdx::runTemplate), + nGpus, + 1); +} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h index 0014594c..d74db246 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h @@ -8,6 +8,7 @@ #include "Neon/core/types/DeviceType.h" #include "Neon/domain/dGrid.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include "Neon/domain/eGrid.h" #include "Neon/domain/tools/Geometries.h" #include "Neon/domain/tools/TestData.h" @@ -82,8 +83,8 @@ void runAllTestConfiguration( if (dim.z < 8 * ngpu * 3) { dim.z = ngpu * 3 * 8; } - if(memoryLayout == Neon::MemoryLayout::arrayOfStructs){ - continue ; + if (memoryLayout == Neon::MemoryLayout::arrayOfStructs) { + continue; } } diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu index 49dd3bd2..7b2c3fef 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu @@ -1,5 +1,6 @@ #include #include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" #include 
"Neon/domain/tools/TestData.h" #include "TestInformation.h" @@ -61,15 +62,15 @@ auto checkNeighbourData(Field const& filedA, Field const& filedB, Field const& filedC, Neon::index_3d testDirection, - Field const& checkFlatA, - Field const& checkFlatB, - Field const& checkFlatC) + Field& checkFlatA, + Field& checkFlatB, + Field& checkFlatC) -> Neon::set::Container { const auto& grid = filedA.getGrid(); return grid.newContainer( "defContainer", - [&](Neon::set::Loader& loader) { + [&, testDirection](Neon::set::Loader& loader) { auto a = loader.load(filedA, Neon::Pattern::STENCIL); auto b = loader.load(filedB, Neon::Pattern::STENCIL); auto c = loader.load(filedC, Neon::Pattern::STENCIL); @@ -102,6 +103,58 @@ auto checkNeighbourData(Field const& filedA, }); } +template +auto checkNeighbourDataTemplate(Field const& filedA, + Field const& filedB, + Field const& filedC, + Field& checkFlatA, + Field& checkFlatB, + Field& checkFlatC) + -> Neon::set::Container +{ + const auto& grid = filedA.getGrid(); + return grid.newContainer( + "defContainer", + [&](Neon::set::Loader& loader) { + auto a = loader.load(filedA, Neon::Pattern::STENCIL); + auto b = loader.load(filedB, Neon::Pattern::STENCIL); + auto c = loader.load(filedC, Neon::Pattern::STENCIL); + + auto resA = loader.load(checkFlatA, Neon::Pattern::MAP); + auto resB = loader.load(checkFlatB, Neon::Pattern::MAP); + auto resC = loader.load(checkFlatC, Neon::Pattern::MAP); + + return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable { + constexpr Neon::index_3d testDirection(xOff, yOff, zOff); + + // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val); + Neon::index_3d globalPoint = a.getGlobalIndex(e); + auto ngh = globalPoint + testDirection; + + decltype(a)* nghInfo[3] = {&a, &b, &c}; + decltype(a)* results[3] = {&resA, &resB, &resC}; + + for (int i = 0; i < 3; i++) { + auto d = nghInfo[i]->template getNghData(e, 0); + // auto d = nghInfo[i]->getNghData(e, testDirection.newType(), 0); + + if 
(d.isValid()) { + results[i]->operator()(e, 0) = d.getData() == ngh.v[i] ? +1 : -1; + if (d.getData() != ngh.v[i]) { + printf("ERROR: %d %d %d %d %d %d\n", globalPoint.x, globalPoint.y, globalPoint.z, ngh.v[0], ngh.v[1], ngh.v[2]); + d = nghInfo[i]->getNghData(e, testDirection.newType(), 0); + } + } else { + results[i]->operator()(e, 0) = 0; + } + } + }; + }); +} + using namespace Neon::domain::tool::testing; template @@ -165,15 +218,15 @@ auto run(TestData& data) -> void X, Y, Z); }; - // constexpr std::array - // stencil{Ngh3DIdx(1, 0, 0), - // Ngh3DIdx(-1, 0, 0), - // Ngh3DIdx(0, 1, 0), - // Ngh3DIdx(0, -1, 0), - // Ngh3DIdx(0, 0, 1), - // Ngh3DIdx(0, 0, -1)}; - constexpr std::array - stencil{Ngh3DIdx(0, 0, -1)}; + constexpr std::array + stencil{Ngh3DIdx(1, 0, 0), + Ngh3DIdx(-1, 0, 0), + Ngh3DIdx(0, 1, 0), + Ngh3DIdx(0, -1, 0), + Ngh3DIdx(0, 0, 1), + Ngh3DIdx(0, 0, -1)}; + // constexpr std::array + // stencil{Ngh3DIdx(0, 0, -1)}; for (auto const& direction : stencil) { reset(aField, bField, cField).run(Neon::Backend::mainStreamIdx); @@ -214,8 +267,149 @@ auto run(TestData& data) -> void } } +template +auto runTemplate(TestData& data) -> void +{ + + using Type = typename TestData::Type; + auto& grid = data.getGrid(); + const std::string appName = TestInformation::fullName(grid.getImplementationName()); + + data.resetValuesToLinear(1, 100); + + auto aField = grid.template newField("a", 1, 0); + auto bField = grid.template newField("a", 1, 0); + auto cField = grid.template newField("a", 1, 0); + + auto& X = data.getField(FieldNames::X); + auto& Y = data.getField(FieldNames::Y); + auto& Z = data.getField(FieldNames::Z); + + const Neon::index_3d dim = grid.getDimension(); + auto bk = grid.getBackend(); + + { // NEON + { + initData(aField, bField, cField).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, 
Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + } + } + using Ngh3DIdx = Neon::int32_3d; + + auto setGolden = [&](Ngh3DIdx const& direction) { // Golden data + auto& X = data.getIODomain(FieldNames::X); + auto& Y = data.getIODomain(FieldNames::Y); + auto& Z = data.getIODomain(FieldNames::Z); + + data.forEachActiveIODomain([&](const Neon::index_3d& idx, + int cardinality, + Type& a, + Type& b, + Type& c) { + a = 1; + b = 1; + c = 1; + auto ngh = direction + idx; + if (!(ngh >= 0)) { + a = 0; + b = 0; + c = 0; + } + if (!(dim > ngh)) { + a = 0; + b = 0; + c = 0; + } + }, + X, Y, Z); + }; + + constexpr std::array + stencil{Ngh3DIdx(1, 0, 0), + Ngh3DIdx(-1, 0, 0), + Ngh3DIdx(0, 1, 0), + Ngh3DIdx(0, -1, 0), + Ngh3DIdx(0, 0, 1), + Ngh3DIdx(0, 0, -1)}; + // constexpr std::array + // stencil{Ngh3DIdx(0, 0, -1)}; + + for (auto const& direction : stencil) { + reset(aField, bField, cField).run(Neon::Backend::mainStreamIdx); + reset(X, Y, Z).run(Neon::Backend::mainStreamIdx); + { // Updating halo with wrong data + bk.sync(Neon::Backend::mainStreamIdx); + aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + } + { + initData(aField, bField, cField).run(Neon::Backend::mainStreamIdx); + 
bk.sync(Neon::Backend::mainStreamIdx); + aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx); + bk.sync(Neon::Backend::mainStreamIdx); + } + + + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + + if (direction == Neon::index_3d(1, 0, 0)) { + checkNeighbourDataTemplate<1, 0, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(-1, 0, 0)) { + checkNeighbourDataTemplate<-1, 0, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(0, 1, 0)) { + checkNeighbourDataTemplate<0, 1, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(0, -1, 0)) { + checkNeighbourDataTemplate<0, -1, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(0, 0, 1)) { + checkNeighbourDataTemplate<0, 0, 1>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else if (direction == Neon::index_3d(0, 0, 
-1)) { + checkNeighbourDataTemplate<0, 0, -1>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx); + // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx); + } else { + std::cout << "Direction not implemented " << direction << std::endl; + exit(99); + } + setGolden(direction); + + bk.sync(Neon::Backend::mainStreamIdx); + bool isOk = data.compare(FieldNames::X); + isOk = isOk && data.compare(FieldNames::Y); + isOk = isOk && data.compare(FieldNames::Z); + + if (!isOk) { + std::cout << "Direction with errors " << direction << std::endl; + data.getField(FieldNames::X).ioToVtk(grid.getImplementationName() + "X", "X", true); + data.getField(FieldNames::Y).ioToVtk(grid.getImplementationName() + "Y", "Y", true); + data.getField(FieldNames::Z).ioToVtk(grid.getImplementationName() + "Z", "Z", true); + exit(77); + ASSERT_TRUE(isOk); + } + } +} + + template auto run(TestData&) -> void; template auto run(TestData&) -> void; template auto run(TestData&) -> void; +template auto run(TestData&) -> void; + + +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; } // namespace globalIdx \ No newline at end of file diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h index 0a3b87eb..bcf503f2 100644 --- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h +++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h @@ -4,6 +4,7 @@ #include "Neon/domain/Grids.h" #include "Neon/domain/tools/TestData.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" namespace globalIdx { @@ -12,9 +13,17 @@ using namespace Neon::domain::tool::testing; template auto run(TestData& data) -> void; +template +auto runTemplate(TestData& data) -> void; + extern template auto 
run(TestData&) -> void; extern template auto run(TestData&) -> void; extern template auto run(TestData&) -> void; +extern template auto run(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; } // namespace map diff --git a/libNeonDomain/tests/domain-space-filling-curves/CMakeLists.txt b/libNeonDomain/tests/domain-space-filling-curves/CMakeLists.txt new file mode 100644 index 00000000..76af1689 --- /dev/null +++ b/libNeonDomain/tests/domain-space-filling-curves/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 3.19 FATAL_ERROR) + +set(APP_NAME domain-space-filling-curves) +file(GLOB_RECURSE SrcFiles src/*.*) + +add_executable(${APP_NAME} ${SrcFiles}) + +target_link_libraries(${APP_NAME} + PUBLIC libNeonDomain + PUBLIC gtest_main) + +set_target_properties(${APP_NAME} PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON) + +set_target_properties(${APP_NAME} PROPERTIES FOLDER "libNeonDomain") +source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "${APP_NAME}" FILES ${SrcFiles}) + +add_test(NAME ${APP_NAME} COMMAND ${APP_NAME}) \ No newline at end of file diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/TestInformation.h b/libNeonDomain/tests/domain-space-filling-curves/src/TestInformation.h new file mode 100644 index 00000000..3ac50ecd --- /dev/null +++ b/libNeonDomain/tests/domain-space-filling-curves/src/TestInformation.h @@ -0,0 +1,17 @@ +#pragma once +namespace { +struct TestInformation +{ + static auto prefix() + -> std::string + { + return "domain-unit-test-map"; + } + + static auto fullName(const std::string& gridName) + -> std::string + { + return prefix() + "-" + gridName; + } +}; +} // namespace \ No newline at end of file diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.cu 
b/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.cu new file mode 100644 index 00000000..b43ca7f4 --- /dev/null +++ b/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.cu @@ -0,0 +1,74 @@ +#include +#include +#include +#include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" +#include "Neon/domain/tools/SpaceCurves.h" +#include "Neon/domain/tools/TestData.h" +#include "TestInformation.h" +#include "gtest/gtest.h" + +#include +#include + +namespace space_filling_curves { + +template +auto defHostContainer(Field& filedSweep, + Field& filedMorton, + Field& filedHilbert) + -> Neon::set::Container +{ + const auto& grid = filedSweep.getGrid(); + return grid.template newContainer( + "defContainer", + [&](Neon::set::Loader& loader) { + auto sweep = loader.load(filedSweep); + auto morton = loader.load(filedMorton); + auto hilbert = loader.load(filedHilbert); + + return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& gidx) mutable { + Neon::index_3d p = sweep.getGlobalIndex(gidx); + Neon::index_3d dim = sweep.getDomainSize(); + using namespace Neon::domain::tool::spaceCurves; + sweep(gidx, 0) = Encoder::encode(EncoderType::sweep, dim, p); + morton(gidx, 0) = Encoder::encode(EncoderType::morton, dim, p); + hilbert(gidx, 0) = Encoder::encode(EncoderType::hilbert, dim, p); + }; + }); +} + + +using namespace Neon::domain::tool::testing; + +template +auto run(TestData& data) -> void +{ + + using Type = typename TestData::Type; + auto& grid = data.getGrid(); + const std::string appName = TestInformation::fullName(grid.getImplementationName()); + + data.resetValuesToLinear(1, 100); + + { // NEON + const Neon::index_3d dim = grid.getDimension(); + std::vector elements; + + auto& X = data.getField(FieldNames::X); + auto& Y = data.getField(FieldNames::Y); + auto& Z = data.getField(FieldNames::Z); + + defHostContainer(X, Y, Z).run(0); + data.getBackend().sync(0); + + 
data.getField(FieldNames::X).ioToVtk("spaceCurveSweep", "code", false); + data.getField(FieldNames::Y).ioToVtk("spaceCurveMorton", "code", false); + data.getField(FieldNames::Z).ioToVtk("spaceCurveHilbert", "code", false); + } +} + +template auto run(TestData&) -> void; + + +} // namespace space_filling_curves \ No newline at end of file diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.h b/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.h new file mode 100644 index 00000000..a5b9fd3a --- /dev/null +++ b/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.h @@ -0,0 +1,18 @@ + +#pragma once +#include + +#include "Neon/domain/Grids.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" +#include "Neon/domain/tools/TestData.h" + +namespace space_filling_curves { +using namespace Neon::domain::tool::testing; + +template +auto run(TestData& data) -> void; + +extern template auto run(TestData&) -> void; + + +} // namespace globalIdx diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/goldenEncoding.h b/libNeonDomain/tests/domain-space-filling-curves/src/goldenEncoding.h new file mode 100644 index 00000000..d6292c4b --- /dev/null +++ b/libNeonDomain/tests/domain-space-filling-curves/src/goldenEncoding.h @@ -0,0 +1,11 @@ + +#include "Neon/Neon.h" +#include "domain-space-filling-curves.h" +#include "gtest/gtest.h" +#include "runHelper.h" + +uint64_t morton_grid_16_16_16[16 * 16* 16] = { + 0, 4, 32, 36, 256, 260, 288, 292, 2048, 2052, 2080, 2084, 2304, 2308, 2336, 2340, 2, 6, 34, 38, 258, 262, 290, 294, 2050, 2054, 2082, 2086, 2306, 2310, 2338, 2342, 16, 20, 48, 52, 272, 276, 304, 308, 2064, 2068, 2096, 2100, 2320, 2324, 2352, 2356, 18, 22, 50, 54, 274, 278, 306, 310, 2066, 2070, 2098, 2102, 2322, 2326, 2354, 2358, 128, 132, 160, 164, 384, 388, 416, 420, 2176, 2180, 2208, 2212, 2432, 2436, 2464, 2468, 130, 134, 162, 166, 386, 390, 418, 422, 2178, 2182, 2210, 
2214, 2434, 2438, 2466, 2470, 144, 148, 176, 180, 400, 404, 432, 436, 2192, 2196, 2224, 2228, 2448, 2452, 2480, 2484, 146, 150, 178, 182, 402, 406, 434, 438, 2194, 2198, 2226, 2230, 2450, 2454, 2482, 2486, 1024, 1028, 1056, 1060, 1280, 1284, 1312, 1316, 3072, 3076, 3104, 3108, 3328, 3332, 3360, 3364, 1026, 1030, 1058, 1062, 1282, 1286, 1314, 1318, 3074, 3078, 3106, 3110, 3330, 3334, 3362, 3366, 1040, 1044, 1072, 1076, 1296, 1300, 1328, 1332, 3088, 3092, 3120, 3124, 3344, 3348, 3376, 3380, 1042, 1046, 1074, 1078, 1298, 1302, 1330, 1334, 3090, 3094, 3122, 3126, 3346, 3350, 3378, 3382, 1152, 1156, 1184, 1188, 1408, 1412, 1440, 1444, 3200, 3204, 3232, 3236, 3456, 3460, 3488, 3492, 1154, 1158, 1186, 1190, 1410, 1414, 1442, 1446, 3202, 3206, 3234, 3238, 3458, 3462, 3490, 3494, 1168, 1172, 1200, 1204, 1424, 1428, 1456, 1460, 3216, 3220, 3248, 3252, 3472, 3476, 3504, 3508, 1170, 1174, 1202, 1206, 1426, 1430, 1458, 1462, 3218, 3222, 3250, 3254, 3474, 3478, 3506, 3510, 1, 5, 33, 37, 257, 261, 289, 293, 2049, 2053, 2081, 2085, 2305, 2309, 2337, 2341, 3, 7, 35, 39, 259, 263, 291, 295, 2051, 2055, 2083, 2087, 2307, 2311, 2339, 2343, 17, 21, 49, 53, 273, 277, 305, 309, 2065, 2069, 2097, 2101, 2321, 2325, 2353, 2357, 19, 23, 51, 55, 275, 279, 307, 311, 2067, 2071, 2099, 2103, 2323, 2327, 2355, 2359, 129, 133, 161, 165, 385, 389, 417, 421, 2177, 2181, 2209, 2213, 2433, 2437, 2465, 2469, 131, 135, 163, 167, 387, 391, 419, 423, 2179, 2183, 2211, 2215, 2435, 2439, 2467, 2471, 145, 149, 177, 181, 401, 405, 433, 437, 2193, 2197, 2225, 2229, 2449, 2453, 2481, 2485, 147, 151, 179, 183, 403, 407, 435, 439, 2195, 2199, 2227, 2231, 2451, 2455, 2483, 2487, 1025, 1029, 1057, 1061, 1281, 1285, 1313, 1317, 3073, 3077, 3105, 3109, 3329, 3333, 3361, 3365, 1027, 1031, 1059, 1063, 1283, 1287, 1315, 1319, 3075, 3079, 3107, 3111, 3331, 3335, 3363, 3367, 1041, 1045, 1073, 1077, 1297, 1301, 1329, 1333, 3089, 3093, 3121, 3125, 3345, 3349, 3377, 3381, 1043, 1047, 1075, 1079, 1299, 1303, 1331, 1335, 3091, 
3095, 3123, 3127, 3347, 3351, 3379, 3383, 1153, 1157, 1185, 1189, 1409, 1413, 1441, 1445, 3201, 3205, 3233, 3237, 3457, 3461, 3489, 3493, 1155, 1159, 1187, 1191, 1411, 1415, 1443, 1447, 3203, 3207, 3235, 3239, 3459, 3463, 3491, 3495, 1169, 1173, 1201, 1205, 1425, 1429, 1457, 1461, 3217, 3221, 3249, 3253, 3473, 3477, 3505, 3509, 1171, 1175, 1203, 1207, 1427, 1431, 1459, 1463, 3219, 3223, 3251, 3255, 3475, 3479, 3507, 3511, 8, 12, 40, 44, 264, 268, 296, 300, 2056, 2060, 2088, 2092, 2312, 2316, 2344, 2348, 10, 14, 42, 46, 266, 270, 298, 302, 2058, 2062, 2090, 2094, 2314, 2318, 2346, 2350, 24, 28, 56, 60, 280, 284, 312, 316, 2072, 2076, 2104, 2108, 2328, 2332, 2360, 2364, 26, 30, 58, 62, 282, 286, 314, 318, 2074, 2078, 2106, 2110, 2330, 2334, 2362, 2366, 136, 140, 168, 172, 392, 396, 424, 428, 2184, 2188, 2216, 2220, 2440, 2444, 2472, 2476, 138, 142, 170, 174, 394, 398, 426, 430, 2186, 2190, 2218, 2222, 2442, 2446, 2474, 2478, 152, 156, 184, 188, 408, 412, 440, 444, 2200, 2204, 2232, 2236, 2456, 2460, 2488, 2492, 154, 158, 186, 190, 410, 414, 442, 446, 2202, 2206, 2234, 2238, 2458, 2462, 2490, 2494, 1032, 1036, 1064, 1068, 1288, 1292, 1320, 1324, 3080, 3084, 3112, 3116, 3336, 3340, 3368, 3372, 1034, 1038, 1066, 1070, 1290, 1294, 1322, 1326, 3082, 3086, 3114, 3118, 3338, 3342, 3370, 3374, 1048, 1052, 1080, 1084, 1304, 1308, 1336, 1340, 3096, 3100, 3128, 3132, 3352, 3356, 3384, 3388, 1050, 1054, 1082, 1086, 1306, 1310, 1338, 1342, 3098, 3102, 3130, 3134, 3354, 3358, 3386, 3390, 1160, 1164, 1192, 1196, 1416, 1420, 1448, 1452, 3208, 3212, 3240, 3244, 3464, 3468, 3496, 3500, 1162, 1166, 1194, 1198, 1418, 1422, 1450, 1454, 3210, 3214, 3242, 3246, 3466, 3470, 3498, 3502, 1176, 1180, 1208, 1212, 1432, 1436, 1464, 1468, 3224, 3228, 3256, 3260, 3480, 3484, 3512, 3516, 1178, 1182, 1210, 1214, 1434, 1438, 1466, 1470, 3226, 3230, 3258, 3262, 3482, 3486, 3514, 3518, 9, 13, 41, 45, 265, 269, 297, 301, 2057, 2061, 2089, 2093, 2313, 2317, 2345, 2349, 11, 15, 43, 47, 267, 271, 299, 303, 
2059, 2063, 2091, 2095, 2315, 2319, 2347, 2351, 25, 29, 57, 61, 281, 285, 313, 317, 2073, 2077, 2105, 2109, 2329, 2333, 2361, 2365, 27, 31, 59, 63, 283, 287, 315, 319, 2075, 2079, 2107, 2111, 2331, 2335, 2363, 2367, 137, 141, 169, 173, 393, 397, 425, 429, 2185, 2189, 2217, 2221, 2441, 2445, 2473, 2477, 139, 143, 171, 175, 395, 399, 427, 431, 2187, 2191, 2219, 2223, 2443, 2447, 2475, 2479, 153, 157, 185, 189, 409, 413, 441, 445, 2201, 2205, 2233, 2237, 2457, 2461, 2489, 2493, 155, 159, 187, 191, 411, 415, 443, 447, 2203, 2207, 2235, 2239, 2459, 2463, 2491, 2495, 1033, 1037, 1065, 1069, 1289, 1293, 1321, 1325, 3081, 3085, 3113, 3117, 3337, 3341, 3369, 3373, 1035, 1039, 1067, 1071, 1291, 1295, 1323, 1327, 3083, 3087, 3115, 3119, 3339, 3343, 3371, 3375, 1049, 1053, 1081, 1085, 1305, 1309, 1337, 1341, 3097, 3101, 3129, 3133, 3353, 3357, 3385, 3389, 1051, 1055, 1083, 1087, 1307, 1311, 1339, 1343, 3099, 3103, 3131, 3135, 3355, 3359, 3387, 3391, 1161, 1165, 1193, 1197, 1417, 1421, 1449, 1453, 3209, 3213, 3241, 3245, 3465, 3469, 3497, 3501, 1163, 1167, 1195, 1199, 1419, 1423, 1451, 1455, 3211, 3215, 3243, 3247, 3467, 3471, 3499, 3503, 1177, 1181, 1209, 1213, 1433, 1437, 1465, 1469, 3225, 3229, 3257, 3261, 3481, 3485, 3513, 3517, 1179, 1183, 1211, 1215, 1435, 1439, 1467, 1471, 3227, 3231, 3259, 3263, 3483, 3487, 3515, 3519, 64, 68, 96, 100, 320, 324, 352, 356, 2112, 2116, 2144, 2148, 2368, 2372, 2400, 2404, 66, 70, 98, 102, 322, 326, 354, 358, 2114, 2118, 2146, 2150, 2370, 2374, 2402, 2406, 80, 84, 112, 116, 336, 340, 368, 372, 2128, 2132, 2160, 2164, 2384, 2388, 2416, 2420, 82, 86, 114, 118, 338, 342, 370, 374, 2130, 2134, 2162, 2166, 2386, 2390, 2418, 2422, 192, 196, 224, 228, 448, 452, 480, 484, 2240, 2244, 2272, 2276, 2496, 2500, 2528, 2532, 194, 198, 226, 230, 450, 454, 482, 486, 2242, 2246, 2274, 2278, 2498, 2502, 2530, 2534, 208, 212, 240, 244, 464, 468, 496, 500, 2256, 2260, 2288, 2292, 2512, 2516, 2544, 2548, 210, 214, 242, 246, 466, 470, 498, 502, 2258, 2262, 2290, 
2294, 2514, 2518, 2546, 2550, 1088, 1092, 1120, 1124, 1344, 1348, 1376, 1380, 3136, 3140, 3168, 3172, 3392, 3396, 3424, 3428, 1090, 1094, 1122, 1126, 1346, 1350, 1378, 1382, 3138, 3142, 3170, 3174, 3394, 3398, 3426, 3430, 1104, 1108, 1136, 1140, 1360, 1364, 1392, 1396, 3152, 3156, 3184, 3188, 3408, 3412, 3440, 3444, 1106, 1110, 1138, 1142, 1362, 1366, 1394, 1398, 3154, 3158, 3186, 3190, 3410, 3414, 3442, 3446, 1216, 1220, 1248, 1252, 1472, 1476, 1504, 1508, 3264, 3268, 3296, 3300, 3520, 3524, 3552, 3556, 1218, 1222, 1250, 1254, 1474, 1478, 1506, 1510, 3266, 3270, 3298, 3302, 3522, 3526, 3554, 3558, 1232, 1236, 1264, 1268, 1488, 1492, 1520, 1524, 3280, 3284, 3312, 3316, 3536, 3540, 3568, 3572, 1234, 1238, 1266, 1270, 1490, 1494, 1522, 1526, 3282, 3286, 3314, 3318, 3538, 3542, 3570, 3574, 65, 69, 97, 101, 321, 325, 353, 357, 2113, 2117, 2145, 2149, 2369, 2373, 2401, 2405, 67, 71, 99, 103, 323, 327, 355, 359, 2115, 2119, 2147, 2151, 2371, 2375, 2403, 2407, 81, 85, 113, 117, 337, 341, 369, 373, 2129, 2133, 2161, 2165, 2385, 2389, 2417, 2421, 83, 87, 115, 119, 339, 343, 371, 375, 2131, 2135, 2163, 2167, 2387, 2391, 2419, 2423, 193, 197, 225, 229, 449, 453, 481, 485, 2241, 2245, 2273, 2277, 2497, 2501, 2529, 2533, 195, 199, 227, 231, 451, 455, 483, 487, 2243, 2247, 2275, 2279, 2499, 2503, 2531, 2535, 209, 213, 241, 245, 465, 469, 497, 501, 2257, 2261, 2289, 2293, 2513, 2517, 2545, 2549, 211, 215, 243, 247, 467, 471, 499, 503, 2259, 2263, 2291, 2295, 2515, 2519, 2547, 2551, 1089, 1093, 1121, 1125, 1345, 1349, 1377, 1381, 3137, 3141, 3169, 3173, 3393, 3397, 3425, 3429, 1091, 1095, 1123, 1127, 1347, 1351, 1379, 1383, 3139, 3143, 3171, 3175, 3395, 3399, 3427, 3431, 1105, 1109, 1137, 1141, 1361, 1365, 1393, 1397, 3153, 3157, 3185, 3189, 3409, 3413, 3441, 3445, 1107, 1111, 1139, 1143, 1363, 1367, 1395, 1399, 3155, 3159, 3187, 3191, 3411, 3415, 3443, 3447, 1217, 1221, 1249, 1253, 1473, 1477, 1505, 1509, 3265, 3269, 3297, 3301, 3521, 3525, 3553, 3557, 1219, 1223, 1251, 1255, 
1475, 1479, 1507, 1511, 3267, 3271, 3299, 3303, 3523, 3527, 3555, 3559, 1233, 1237, 1265, 1269, 1489, 1493, 1521, 1525, 3281, 3285, 3313, 3317, 3537, 3541, 3569, 3573, 1235, 1239, 1267, 1271, 1491, 1495, 1523, 1527, 3283, 3287, 3315, 3319, 3539, 3543, 3571, 3575, 72, 76, 104, 108, 328, 332, 360, 364, 2120, 2124, 2152, 2156, 2376, 2380, 2408, 2412, 74, 78, 106, 110, 330, 334, 362, 366, 2122, 2126, 2154, 2158, 2378, 2382, 2410, 2414, 88, 92, 120, 124, 344, 348, 376, 380, 2136, 2140, 2168, 2172, 2392, 2396, 2424, 2428, 90, 94, 122, 126, 346, 350, 378, 382, 2138, 2142, 2170, 2174, 2394, 2398, 2426, 2430, 200, 204, 232, 236, 456, 460, 488, 492, 2248, 2252, 2280, 2284, 2504, 2508, 2536, 2540, 202, 206, 234, 238, 458, 462, 490, 494, 2250, 2254, 2282, 2286, 2506, 2510, 2538, 2542, 216, 220, 248, 252, 472, 476, 504, 508, 2264, 2268, 2296, 2300, 2520, 2524, 2552, 2556, 218, 222, 250, 254, 474, 478, 506, 510, 2266, 2270, 2298, 2302, 2522, 2526, 2554, 2558, 1096, 1100, 1128, 1132, 1352, 1356, 1384, 1388, 3144, 3148, 3176, 3180, 3400, 3404, 3432, 3436, 1098, 1102, 1130, 1134, 1354, 1358, 1386, 1390, 3146, 3150, 3178, 3182, 3402, 3406, 3434, 3438, 1112, 1116, 1144, 1148, 1368, 1372, 1400, 1404, 3160, 3164, 3192, 3196, 3416, 3420, 3448, 3452, 1114, 1118, 1146, 1150, 1370, 1374, 1402, 1406, 3162, 3166, 3194, 3198, 3418, 3422, 3450, 3454, 1224, 1228, 1256, 1260, 1480, 1484, 1512, 1516, 3272, 3276, 3304, 3308, 3528, 3532, 3560, 3564, 1226, 1230, 1258, 1262, 1482, 1486, 1514, 1518, 3274, 3278, 3306, 3310, 3530, 3534, 3562, 3566, 1240, 1244, 1272, 1276, 1496, 1500, 1528, 1532, 3288, 3292, 3320, 3324, 3544, 3548, 3576, 3580, 1242, 1246, 1274, 1278, 1498, 1502, 1530, 1534, 3290, 3294, 3322, 3326, 3546, 3550, 3578, 3582, 73, 77, 105, 109, 329, 333, 361, 365, 2121, 2125, 2153, 2157, 2377, 2381, 2409, 2413, 75, 79, 107, 111, 331, 335, 363, 367, 2123, 2127, 2155, 2159, 2379, 2383, 2411, 2415, 89, 93, 121, 125, 345, 349, 377, 381, 2137, 2141, 2169, 2173, 2393, 2397, 2425, 2429, 91, 95, 123, 
127, 347, 351, 379, 383, 2139, 2143, 2171, 2175, 2395, 2399, 2427, 2431, 201, 205, 233, 237, 457, 461, 489, 493, 2249, 2253, 2281, 2285, 2505, 2509, 2537, 2541, 203, 207, 235, 239, 459, 463, 491, 495, 2251, 2255, 2283, 2287, 2507, 2511, 2539, 2543, 217, 221, 249, 253, 473, 477, 505, 509, 2265, 2269, 2297, 2301, 2521, 2525, 2553, 2557, 219, 223, 251, 255, 475, 479, 507, 511, 2267, 2271, 2299, 2303, 2523, 2527, 2555, 2559, 1097, 1101, 1129, 1133, 1353, 1357, 1385, 1389, 3145, 3149, 3177, 3181, 3401, 3405, 3433, 3437, 1099, 1103, 1131, 1135, 1355, 1359, 1387, 1391, 3147, 3151, 3179, 3183, 3403, 3407, 3435, 3439, 1113, 1117, 1145, 1149, 1369, 1373, 1401, 1405, 3161, 3165, 3193, 3197, 3417, 3421, 3449, 3453, 1115, 1119, 1147, 1151, 1371, 1375, 1403, 1407, 3163, 3167, 3195, 3199, 3419, 3423, 3451, 3455, 1225, 1229, 1257, 1261, 1481, 1485, 1513, 1517, 3273, 3277, 3305, 3309, 3529, 3533, 3561, 3565, 1227, 1231, 1259, 1263, 1483, 1487, 1515, 1519, 3275, 3279, 3307, 3311, 3531, 3535, 3563, 3567, 1241, 1245, 1273, 1277, 1497, 1501, 1529, 1533, 3289, 3293, 3321, 3325, 3545, 3549, 3577, 3581, 1243, 1247, 1275, 1279, 1499, 1503, 1531, 1535, 3291, 3295, 3323, 3327, 3547, 3551, 3579, 3583, 512, 516, 544, 548, 768, 772, 800, 804, 2560, 2564, 2592, 2596, 2816, 2820, 2848, 2852, 514, 518, 546, 550, 770, 774, 802, 806, 2562, 2566, 2594, 2598, 2818, 2822, 2850, 2854, 528, 532, 560, 564, 784, 788, 816, 820, 2576, 2580, 2608, 2612, 2832, 2836, 2864, 2868, 530, 534, 562, 566, 786, 790, 818, 822, 2578, 2582, 2610, 2614, 2834, 2838, 2866, 2870, 640, 644, 672, 676, 896, 900, 928, 932, 2688, 2692, 2720, 2724, 2944, 2948, 2976, 2980, 642, 646, 674, 678, 898, 902, 930, 934, 2690, 2694, 2722, 2726, 2946, 2950, 2978, 2982, 656, 660, 688, 692, 912, 916, 944, 948, 2704, 2708, 2736, 2740, 2960, 2964, 2992, 2996, 658, 662, 690, 694, 914, 918, 946, 950, 2706, 2710, 2738, 2742, 2962, 2966, 2994, 2998, 1536, 1540, 1568, 1572, 1792, 1796, 1824, 1828, 3584, 3588, 3616, 3620, 3840, 3844, 3872, 3876, 1538, 
1542, 1570, 1574, 1794, 1798, 1826, 1830, 3586, 3590, 3618, 3622, 3842, 3846, 3874, 3878, 1552, 1556, 1584, 1588, 1808, 1812, 1840, 1844, 3600, 3604, 3632, 3636, 3856, 3860, 3888, 3892, 1554, 1558, 1586, 1590, 1810, 1814, 1842, 1846, 3602, 3606, 3634, 3638, 3858, 3862, 3890, 3894, 1664, 1668, 1696, 1700, 1920, 1924, 1952, 1956, 3712, 3716, 3744, 3748, 3968, 3972, 4000, 4004, 1666, 1670, 1698, 1702, 1922, 1926, 1954, 1958, 3714, 3718, 3746, 3750, 3970, 3974, 4002, 4006, 1680, 1684, 1712, 1716, 1936, 1940, 1968, 1972, 3728, 3732, 3760, 3764, 3984, 3988, 4016, 4020, 1682, 1686, 1714, 1718, 1938, 1942, 1970, 1974, 3730, 3734, 3762, 3766, 3986, 3990, 4018, 4022, 513, 517, 545, 549, 769, 773, 801, 805, 2561, 2565, 2593, 2597, 2817, 2821, 2849, 2853, 515, 519, 547, 551, 771, 775, 803, 807, 2563, 2567, 2595, 2599, 2819, 2823, 2851, 2855, 529, 533, 561, 565, 785, 789, 817, 821, 2577, 2581, 2609, 2613, 2833, 2837, 2865, 2869, 531, 535, 563, 567, 787, 791, 819, 823, 2579, 2583, 2611, 2615, 2835, 2839, 2867, 2871, 641, 645, 673, 677, 897, 901, 929, 933, 2689, 2693, 2721, 2725, 2945, 2949, 2977, 2981, 643, 647, 675, 679, 899, 903, 931, 935, 2691, 2695, 2723, 2727, 2947, 2951, 2979, 2983, 657, 661, 689, 693, 913, 917, 945, 949, 2705, 2709, 2737, 2741, 2961, 2965, 2993, 2997, 659, 663, 691, 695, 915, 919, 947, 951, 2707, 2711, 2739, 2743, 2963, 2967, 2995, 2999, 1537, 1541, 1569, 1573, 1793, 1797, 1825, 1829, 3585, 3589, 3617, 3621, 3841, 3845, 3873, 3877, 1539, 1543, 1571, 1575, 1795, 1799, 1827, 1831, 3587, 3591, 3619, 3623, 3843, 3847, 3875, 3879, 1553, 1557, 1585, 1589, 1809, 1813, 1841, 1845, 3601, 3605, 3633, 3637, 3857, 3861, 3889, 3893, 1555, 1559, 1587, 1591, 1811, 1815, 1843, 1847, 3603, 3607, 3635, 3639, 3859, 3863, 3891, 3895, 1665, 1669, 1697, 1701, 1921, 1925, 1953, 1957, 3713, 3717, 3745, 3749, 3969, 3973, 4001, 4005, 1667, 1671, 1699, 1703, 1923, 1927, 1955, 1959, 3715, 3719, 3747, 3751, 3971, 3975, 4003, 4007, 1681, 1685, 1713, 1717, 1937, 1941, 1969, 1973, 3729, 
3733, 3761, 3765, 3985, 3989, 4017, 4021, 1683, 1687, 1715, 1719, 1939, 1943, 1971, 1975, 3731, 3735, 3763, 3767, 3987, 3991, 4019, 4023, 520, 524, 552, 556, 776, 780, 808, 812, 2568, 2572, 2600, 2604, 2824, 2828, 2856, 2860, 522, 526, 554, 558, 778, 782, 810, 814, 2570, 2574, 2602, 2606, 2826, 2830, 2858, 2862, 536, 540, 568, 572, 792, 796, 824, 828, 2584, 2588, 2616, 2620, 2840, 2844, 2872, 2876, 538, 542, 570, 574, 794, 798, 826, 830, 2586, 2590, 2618, 2622, 2842, 2846, 2874, 2878, 648, 652, 680, 684, 904, 908, 936, 940, 2696, 2700, 2728, 2732, 2952, 2956, 2984, 2988, 650, 654, 682, 686, 906, 910, 938, 942, 2698, 2702, 2730, 2734, 2954, 2958, 2986, 2990, 664, 668, 696, 700, 920, 924, 952, 956, 2712, 2716, 2744, 2748, 2968, 2972, 3000, 3004, 666, 670, 698, 702, 922, 926, 954, 958, 2714, 2718, 2746, 2750, 2970, 2974, 3002, 3006, 1544, 1548, 1576, 1580, 1800, 1804, 1832, 1836, 3592, 3596, 3624, 3628, 3848, 3852, 3880, 3884, 1546, 1550, 1578, 1582, 1802, 1806, 1834, 1838, 3594, 3598, 3626, 3630, 3850, 3854, 3882, 3886, 1560, 1564, 1592, 1596, 1816, 1820, 1848, 1852, 3608, 3612, 3640, 3644, 3864, 3868, 3896, 3900, 1562, 1566, 1594, 1598, 1818, 1822, 1850, 1854, 3610, 3614, 3642, 3646, 3866, 3870, 3898, 3902, 1672, 1676, 1704, 1708, 1928, 1932, 1960, 1964, 3720, 3724, 3752, 3756, 3976, 3980, 4008, 4012, 1674, 1678, 1706, 1710, 1930, 1934, 1962, 1966, 3722, 3726, 3754, 3758, 3978, 3982, 4010, 4014, 1688, 1692, 1720, 1724, 1944, 1948, 1976, 1980, 3736, 3740, 3768, 3772, 3992, 3996, 4024, 4028, 1690, 1694, 1722, 1726, 1946, 1950, 1978, 1982, 3738, 3742, 3770, 3774, 3994, 3998, 4026, 4030, 521, 525, 553, 557, 777, 781, 809, 813, 2569, 2573, 2601, 2605, 2825, 2829, 2857, 2861, 523, 527, 555, 559, 779, 783, 811, 815, 2571, 2575, 2603, 2607, 2827, 2831, 2859, 2863, 537, 541, 569, 573, 793, 797, 825, 829, 2585, 2589, 2617, 2621, 2841, 2845, 2873, 2877, 539, 543, 571, 575, 795, 799, 827, 831, 2587, 2591, 2619, 2623, 2843, 2847, 2875, 2879, 649, 653, 681, 685, 905, 909, 937, 
941, 2697, 2701, 2729, 2733, 2953, 2957, 2985, 2989, 651, 655, 683, 687, 907, 911, 939, 943, 2699, 2703, 2731, 2735, 2955, 2959, 2987, 2991, 665, 669, 697, 701, 921, 925, 953, 957, 2713, 2717, 2745, 2749, 2969, 2973, 3001, 3005, 667, 671, 699, 703, 923, 927, 955, 959, 2715, 2719, 2747, 2751, 2971, 2975, 3003, 3007, 1545, 1549, 1577, 1581, 1801, 1805, 1833, 1837, 3593, 3597, 3625, 3629, 3849, 3853, 3881, 3885, 1547, 1551, 1579, 1583, 1803, 1807, 1835, 1839, 3595, 3599, 3627, 3631, 3851, 3855, 3883, 3887, 1561, 1565, 1593, 1597, 1817, 1821, 1849, 1853, 3609, 3613, 3641, 3645, 3865, 3869, 3897, 3901, 1563, 1567, 1595, 1599, 1819, 1823, 1851, 1855, 3611, 3615, 3643, 3647, 3867, 3871, 3899, 3903, 1673, 1677, 1705, 1709, 1929, 1933, 1961, 1965, 3721, 3725, 3753, 3757, 3977, 3981, 4009, 4013, 1675, 1679, 1707, 1711, 1931, 1935, 1963, 1967, 3723, 3727, 3755, 3759, 3979, 3983, 4011, 4015, 1689, 1693, 1721, 1725, 1945, 1949, 1977, 1981, 3737, 3741, 3769, 3773, 3993, 3997, 4025, 4029, 1691, 1695, 1723, 1727, 1947, 1951, 1979, 1983, 3739, 3743, 3771, 3775, 3995, 3999, 4027, 4031, 576, 580, 608, 612, 832, 836, 864, 868, 2624, 2628, 2656, 2660, 2880, 2884, 2912, 2916, 578, 582, 610, 614, 834, 838, 866, 870, 2626, 2630, 2658, 2662, 2882, 2886, 2914, 2918, 592, 596, 624, 628, 848, 852, 880, 884, 2640, 2644, 2672, 2676, 2896, 2900, 2928, 2932, 594, 598, 626, 630, 850, 854, 882, 886, 2642, 2646, 2674, 2678, 2898, 2902, 2930, 2934, 704, 708, 736, 740, 960, 964, 992, 996, 2752, 2756, 2784, 2788, 3008, 3012, 3040, 3044, 706, 710, 738, 742, 962, 966, 994, 998, 2754, 2758, 2786, 2790, 3010, 3014, 3042, 3046, 720, 724, 752, 756, 976, 980, 1008, 1012, 2768, 2772, 2800, 2804, 3024, 3028, 3056, 3060, 722, 726, 754, 758, 978, 982, 1010, 1014, 2770, 2774, 2802, 2806, 3026, 3030, 3058, 3062, 1600, 1604, 1632, 1636, 1856, 1860, 1888, 1892, 3648, 3652, 3680, 3684, 3904, 3908, 3936, 3940, 1602, 1606, 1634, 1638, 1858, 1862, 1890, 1894, 3650, 3654, 3682, 3686, 3906, 3910, 3938, 3942, 1616, 1620, 
1648, 1652, 1872, 1876, 1904, 1908, 3664, 3668, 3696, 3700, 3920, 3924, 3952, 3956, 1618, 1622, 1650, 1654, 1874, 1878, 1906, 1910, 3666, 3670, 3698, 3702, 3922, 3926, 3954, 3958, 1728, 1732, 1760, 1764, 1984, 1988, 2016, 2020, 3776, 3780, 3808, 3812, 4032, 4036, 4064, 4068, 1730, 1734, 1762, 1766, 1986, 1990, 2018, 2022, 3778, 3782, 3810, 3814, 4034, 4038, 4066, 4070, 1744, 1748, 1776, 1780, 2000, 2004, 2032, 2036, 3792, 3796, 3824, 3828, 4048, 4052, 4080, 4084, 1746, 1750, 1778, 1782, 2002, 2006, 2034, 2038, 3794, 3798, 3826, 3830, 4050, 4054, 4082, 4086, 577, 581, 609, 613, 833, 837, 865, 869, 2625, 2629, 2657, 2661, 2881, 2885, 2913, 2917, 579, 583, 611, 615, 835, 839, 867, 871, 2627, 2631, 2659, 2663, 2883, 2887, 2915, 2919, 593, 597, 625, 629, 849, 853, 881, 885, 2641, 2645, 2673, 2677, 2897, 2901, 2929, 2933, 595, 599, 627, 631, 851, 855, 883, 887, 2643, 2647, 2675, 2679, 2899, 2903, 2931, 2935, 705, 709, 737, 741, 961, 965, 993, 997, 2753, 2757, 2785, 2789, 3009, 3013, 3041, 3045, 707, 711, 739, 743, 963, 967, 995, 999, 2755, 2759, 2787, 2791, 3011, 3015, 3043, 3047, 721, 725, 753, 757, 977, 981, 1009, 1013, 2769, 2773, 2801, 2805, 3025, 3029, 3057, 3061, 723, 727, 755, 759, 979, 983, 1011, 1015, 2771, 2775, 2803, 2807, 3027, 3031, 3059, 3063, 1601, 1605, 1633, 1637, 1857, 1861, 1889, 1893, 3649, 3653, 3681, 3685, 3905, 3909, 3937, 3941, 1603, 1607, 1635, 1639, 1859, 1863, 1891, 1895, 3651, 3655, 3683, 3687, 3907, 3911, 3939, 3943, 1617, 1621, 1649, 1653, 1873, 1877, 1905, 1909, 3665, 3669, 3697, 3701, 3921, 3925, 3953, 3957, 1619, 1623, 1651, 1655, 1875, 1879, 1907, 1911, 3667, 3671, 3699, 3703, 3923, 3927, 3955, 3959, 1729, 1733, 1761, 1765, 1985, 1989, 2017, 2021, 3777, 3781, 3809, 3813, 4033, 4037, 4065, 4069, 1731, 1735, 1763, 1767, 1987, 1991, 2019, 2023, 3779, 3783, 3811, 3815, 4035, 4039, 4067, 4071, 1745, 1749, 1777, 1781, 2001, 2005, 2033, 2037, 3793, 3797, 3825, 3829, 4049, 4053, 4081, 4085, 1747, 1751, 1779, 1783, 2003, 2007, 2035, 2039, 3795, 
3799, 3827, 3831, 4051, 4055, 4083, 4087, 584, 588, 616, 620, 840, 844, 872, 876, 2632, 2636, 2664, 2668, 2888, 2892, 2920, 2924, 586, 590, 618, 622, 842, 846, 874, 878, 2634, 2638, 2666, 2670, 2890, 2894, 2922, 2926, 600, 604, 632, 636, 856, 860, 888, 892, 2648, 2652, 2680, 2684, 2904, 2908, 2936, 2940, 602, 606, 634, 638, 858, 862, 890, 894, 2650, 2654, 2682, 2686, 2906, 2910, 2938, 2942, 712, 716, 744, 748, 968, 972, 1000, 1004, 2760, 2764, 2792, 2796, 3016, 3020, 3048, 3052, 714, 718, 746, 750, 970, 974, 1002, 1006, 2762, 2766, 2794, 2798, 3018, 3022, 3050, 3054, 728, 732, 760, 764, 984, 988, 1016, 1020, 2776, 2780, 2808, 2812, 3032, 3036, 3064, 3068, 730, 734, 762, 766, 986, 990, 1018, 1022, 2778, 2782, 2810, 2814, 3034, 3038, 3066, 3070, 1608, 1612, 1640, 1644, 1864, 1868, 1896, 1900, 3656, 3660, 3688, 3692, 3912, 3916, 3944, 3948, 1610, 1614, 1642, 1646, 1866, 1870, 1898, 1902, 3658, 3662, 3690, 3694, 3914, 3918, 3946, 3950, 1624, 1628, 1656, 1660, 1880, 1884, 1912, 1916, 3672, 3676, 3704, 3708, 3928, 3932, 3960, 3964, 1626, 1630, 1658, 1662, 1882, 1886, 1914, 1918, 3674, 3678, 3706, 3710, 3930, 3934, 3962, 3966, 1736, 1740, 1768, 1772, 1992, 1996, 2024, 2028, 3784, 3788, 3816, 3820, 4040, 4044, 4072, 4076, 1738, 1742, 1770, 1774, 1994, 1998, 2026, 2030, 3786, 3790, 3818, 3822, 4042, 4046, 4074, 4078, 1752, 1756, 1784, 1788, 2008, 2012, 2040, 2044, 3800, 3804, 3832, 3836, 4056, 4060, 4088, 4092, 1754, 1758, 1786, 1790, 2010, 2014, 2042, 2046, 3802, 3806, 3834, 3838, 4058, 4062, 4090, 4094, 585, 589, 617, 621, 841, 845, 873, 877, 2633, 2637, 2665, 2669, 2889, 2893, 2921, 2925, 587, 591, 619, 623, 843, 847, 875, 879, 2635, 2639, 2667, 2671, 2891, 2895, 2923, 2927, 601, 605, 633, 637, 857, 861, 889, 893, 2649, 2653, 2681, 2685, 2905, 2909, 2937, 2941, 603, 607, 635, 639, 859, 863, 891, 895, 2651, 2655, 2683, 2687, 2907, 2911, 2939, 2943, 713, 717, 745, 749, 969, 973, 1001, 1005, 2761, 2765, 2793, 2797, 3017, 3021, 3049, 3053, 715, 719, 747, 751, 971, 975, 1003, 
1007, 2763, 2767, 2795, 2799, 3019, 3023, 3051, 3055, 729, 733, 761, 765, 985, 989, 1017, 1021, 2777, 2781, 2809, 2813, 3033, 3037, 3065, 3069, 731, 735, 763, 767, 987, 991, 1019, 1023, 2779, 2783, 2811, 2815, 3035, 3039, 3067, 3071, 1609, 1613, 1641, 1645, 1865, 1869, 1897, 1901, 3657, 3661, 3689, 3693, 3913, 3917, 3945, 3949, 1611, 1615, 1643, 1647, 1867, 1871, 1899, 1903, 3659, 3663, 3691, 3695, 3915, 3919, 3947, 3951, 1625, 1629, 1657, 1661, 1881, 1885, 1913, 1917, 3673, 3677, 3705, 3709, 3929, 3933, 3961, 3965, 1627, 1631, 1659, 1663, 1883, 1887, 1915, 1919, 3675, 3679, 3707, 3711, 3931, 3935, 3963, 3967, 1737, 1741, 1769, 1773, 1993, 1997, 2025, 2029, 3785, 3789, 3817, 3821, 4041, 4045, 4073, 4077, 1739, 1743, 1771, 1775, 1995, 1999, 2027, 2031, 3787, 3791, 3819, 3823, 4043, 4047, 4075, 4079, 1753, 1757, 1785, 1789, 2009, 2013, 2041, 2045, 3801, 3805, 3833, 3837, 4057, 4061, 4089, 4093, 1755, 1759, 1787, 1791, 2011, 2015, 2043, 2047, 3803, 3807, 3835, 3839, 4059, 4063, 4091, 4095}; + +uint64_t hilbert_grid_16_16_16[16 * 16* 16] = { + 0, 7, 8, 11, 212, 211, 204, 203, 3892, 3891, 3884, 3883, 4084, 4087, 4088, 4095, 3, 4, 9, 10, 215, 208, 207, 200, 3895, 3888, 3887, 3880, 4085, 4086, 4091, 4092, 60, 59, 54, 53, 216, 219, 198, 199, 3896, 3897, 3876, 3879, 4042, 4041, 4036, 4035, 63, 56, 55, 52, 217, 218, 193, 192, 3903, 3902, 3877, 3878, 4043, 4040, 4039, 4032, 64, 67, 124, 127, 128, 131, 188, 191, 3904, 3907, 3964, 3967, 3968, 3971, 4028, 4031, 65, 66, 125, 126, 129, 130, 189, 190, 3905, 3906, 3965, 3966, 3969, 3970, 4029, 4030, 90, 93, 98, 101, 154, 157, 162, 165, 3930, 3933, 3938, 3941, 3994, 3997, 4002, 4005, 89, 94, 97, 102, 153, 158, 161, 166, 3929, 3934, 3937, 3942, 3993, 3998, 4001, 4006, 1702, 1703, 1704, 1707, 1876, 1879, 1880, 1881, 2214, 2215, 2216, 2219, 2388, 2391, 2392, 2393, 1697, 1696, 1705, 1706, 1877, 1878, 1887, 1886, 2209, 2208, 2217, 2218, 2389, 2390, 2399, 2398, 1694, 1695, 1686, 1685, 1898, 1897, 1888, 1889, 2206, 2207, 2198, 2197, 2410, 
2409, 2400, 2401, 1689, 1688, 1687, 1684, 1899, 1896, 1895, 1894, 2201, 2200, 2199, 2196, 2411, 2408, 2407, 2406, 1638, 1639, 1640, 1643, 1940, 1943, 1944, 1945, 2150, 2151, 2152, 2155, 2452, 2455, 2456, 2457, 1633, 1632, 1641, 1642, 1941, 1942, 1951, 1950, 2145, 2144, 2153, 2154, 2453, 2454, 2463, 2462, 1630, 1631, 1622, 1621, 1962, 1961, 1952, 1953, 2142, 2143, 2134, 2133, 2474, 2473, 2464, 2465, 1625, 1624, 1623, 1620, 1963, 1960, 1959, 1958, 2137, 2136, 2135, 2132, 2475, 2472, 2471, 2470, 1, 6, 15, 12, 213, 210, 205, 202, 3893, 3890, 3885, 3882, 4083, 4080, 4089, 4094, 2, 5, 14, 13, 214, 209, 206, 201, 3894, 3889, 3886, 3881, 4082, 4081, 4090, 4093, 61, 58, 49, 50, 223, 220, 197, 196, 3899, 3898, 3875, 3872, 4045, 4046, 4037, 4034, 62, 57, 48, 51, 222, 221, 194, 195, 3900, 3901, 3874, 3873, 4044, 4047, 4038, 4033, 71, 68, 123, 120, 135, 132, 187, 184, 3911, 3908, 3963, 3960, 3975, 3972, 4027, 4024, 70, 69, 122, 121, 134, 133, 186, 185, 3910, 3909, 3962, 3961, 3974, 3973, 4026, 4025, 91, 92, 99, 100, 155, 156, 163, 164, 3931, 3932, 3939, 3940, 3995, 3996, 4003, 4004, 88, 95, 96, 103, 152, 159, 160, 167, 3928, 3935, 3936, 3943, 3992, 3999, 4000, 4007, 1701, 1700, 1711, 1708, 1875, 1872, 1883, 1882, 2213, 2212, 2223, 2220, 2387, 2384, 2395, 2394, 1698, 1699, 1710, 1709, 1874, 1873, 1884, 1885, 2210, 2211, 2222, 2221, 2386, 2385, 2396, 2397, 1693, 1692, 1681, 1682, 1901, 1902, 1891, 1890, 2205, 2204, 2193, 2194, 2413, 2414, 2403, 2402, 1690, 1691, 1680, 1683, 1900, 1903, 1892, 1893, 2202, 2203, 2192, 2195, 2412, 2415, 2404, 2405, 1637, 1636, 1647, 1644, 1939, 1936, 1947, 1946, 2149, 2148, 2159, 2156, 2451, 2448, 2459, 2458, 1634, 1635, 1646, 1645, 1938, 1937, 1948, 1949, 2146, 2147, 2158, 2157, 2450, 2449, 2460, 2461, 1629, 1628, 1617, 1618, 1965, 1966, 1955, 1954, 2141, 2140, 2129, 2130, 2477, 2478, 2467, 2466, 1626, 1627, 1616, 1619, 1964, 1967, 1956, 1957, 2138, 2139, 2128, 2131, 2476, 2479, 2468, 2469, 26, 27, 16, 19, 234, 237, 242, 245, 3850, 3853, 3858, 3861, 
4076, 4079, 4068, 4069, 29, 28, 17, 18, 233, 238, 241, 246, 3849, 3854, 3857, 3862, 4077, 4078, 4067, 4066, 34, 35, 46, 45, 224, 227, 250, 251, 3844, 3845, 3868, 3871, 4050, 4049, 4060, 4061, 37, 36, 47, 44, 225, 226, 253, 252, 3843, 3842, 3869, 3870, 4051, 4048, 4059, 4058, 72, 73, 118, 119, 136, 137, 182, 183, 3912, 3913, 3958, 3959, 3976, 3977, 4022, 4023, 79, 78, 113, 112, 143, 142, 177, 176, 3919, 3918, 3953, 3952, 3983, 3982, 4017, 4016, 80, 81, 110, 111, 144, 145, 174, 175, 3920, 3921, 3950, 3951, 3984, 3985, 4014, 4015, 87, 86, 105, 104, 151, 150, 169, 168, 3927, 3926, 3945, 3944, 3991, 3990, 4009, 4008, 1726, 1721, 1712, 1715, 1868, 1871, 1862, 1857, 2238, 2233, 2224, 2227, 2380, 2383, 2374, 2369, 1725, 1722, 1713, 1714, 1869, 1870, 1861, 1858, 2237, 2234, 2225, 2226, 2381, 2382, 2373, 2370, 1666, 1669, 1678, 1677, 1906, 1905, 1914, 1917, 2178, 2181, 2190, 2189, 2418, 2417, 2426, 2429, 1665, 1670, 1679, 1676, 1907, 1904, 1913, 1918, 2177, 2182, 2191, 2188, 2419, 2416, 2425, 2430, 1662, 1657, 1648, 1651, 1932, 1935, 1926, 1921, 2174, 2169, 2160, 2163, 2444, 2447, 2438, 2433, 1661, 1658, 1649, 1650, 1933, 1934, 1925, 1922, 2173, 2170, 2161, 2162, 2445, 2446, 2437, 2434, 1602, 1605, 1614, 1613, 1970, 1969, 1978, 1981, 2114, 2117, 2126, 2125, 2482, 2481, 2490, 2493, 1601, 1606, 1615, 1612, 1971, 1968, 1977, 1982, 2113, 2118, 2127, 2124, 2483, 2480, 2489, 2494, 25, 24, 23, 20, 235, 236, 243, 244, 3851, 3852, 3859, 3860, 4075, 4072, 4071, 4070, 30, 31, 22, 21, 232, 239, 240, 247, 3848, 3855, 3856, 3863, 4074, 4073, 4064, 4065, 33, 32, 41, 42, 231, 228, 249, 248, 3847, 3846, 3867, 3864, 4053, 4054, 4063, 4062, 38, 39, 40, 43, 230, 229, 254, 255, 3840, 3841, 3866, 3865, 4052, 4055, 4056, 4057, 75, 74, 117, 116, 139, 138, 181, 180, 3915, 3914, 3957, 3956, 3979, 3978, 4021, 4020, 76, 77, 114, 115, 140, 141, 178, 179, 3916, 3917, 3954, 3955, 3980, 3981, 4018, 4019, 83, 82, 109, 108, 147, 146, 173, 172, 3923, 3922, 3949, 3948, 3987, 3986, 4013, 4012, 84, 85, 106, 107, 
148, 149, 170, 171, 3924, 3925, 3946, 3947, 3988, 3989, 4010, 4011, 1727, 1720, 1719, 1716, 1867, 1864, 1863, 1856, 2239, 2232, 2231, 2228, 2379, 2376, 2375, 2368, 1724, 1723, 1718, 1717, 1866, 1865, 1860, 1859, 2236, 2235, 2230, 2229, 2378, 2377, 2372, 2371, 1667, 1668, 1673, 1674, 1909, 1910, 1915, 1916, 2179, 2180, 2185, 2186, 2421, 2422, 2427, 2428, 1664, 1671, 1672, 1675, 1908, 1911, 1912, 1919, 2176, 2183, 2184, 2187, 2420, 2423, 2424, 2431, 1663, 1656, 1655, 1652, 1931, 1928, 1927, 1920, 2175, 2168, 2167, 2164, 2443, 2440, 2439, 2432, 1660, 1659, 1654, 1653, 1930, 1929, 1924, 1923, 2172, 2171, 2166, 2165, 2442, 2441, 2436, 2435, 1603, 1604, 1609, 1610, 1973, 1974, 1979, 1980, 2115, 2116, 2121, 2122, 2485, 2486, 2491, 2492, 1600, 1607, 1608, 1611, 1972, 1975, 1976, 1983, 2112, 2119, 2120, 2123, 2484, 2487, 2488, 2495, 486, 487, 488, 491, 276, 275, 268, 267, 3828, 3827, 3820, 3819, 3604, 3607, 3608, 3609, 481, 480, 489, 490, 279, 272, 271, 264, 3831, 3824, 3823, 3816, 3605, 3606, 3615, 3614, 478, 479, 470, 469, 280, 283, 262, 263, 3832, 3833, 3812, 3815, 3626, 3625, 3616, 3617, 473, 472, 471, 468, 281, 282, 257, 256, 3839, 3838, 3813, 3814, 3627, 3624, 3623, 3622, 436, 437, 394, 395, 372, 373, 330, 331, 3764, 3765, 3722, 3723, 3700, 3701, 3658, 3659, 435, 434, 397, 396, 371, 370, 333, 332, 3763, 3762, 3725, 3724, 3699, 3698, 3661, 3660, 428, 429, 402, 403, 364, 365, 338, 339, 3756, 3757, 3730, 3731, 3692, 3693, 3666, 3667, 427, 426, 405, 404, 363, 362, 341, 340, 3755, 3754, 3733, 3732, 3691, 3690, 3669, 3668, 1728, 1731, 1788, 1791, 1792, 1795, 1852, 1855, 2240, 2243, 2300, 2303, 2304, 2307, 2364, 2367, 1729, 1730, 1789, 1790, 1793, 1794, 1853, 1854, 2241, 2242, 2301, 2302, 2305, 2306, 2365, 2366, 1754, 1757, 1762, 1765, 1818, 1821, 1826, 1829, 2266, 2269, 2274, 2277, 2330, 2333, 2338, 2341, 1753, 1758, 1761, 1766, 1817, 1822, 1825, 1830, 2265, 2270, 2273, 2278, 2329, 2334, 2337, 2342, 1588, 1587, 1580, 1579, 2004, 2003, 1996, 1995, 2100, 2099, 2092, 2091, 
2516, 2515, 2508, 2507, 1591, 1584, 1583, 1576, 2007, 2000, 1999, 1992, 2103, 2096, 2095, 2088, 2519, 2512, 2511, 2504, 1592, 1593, 1572, 1575, 2008, 2011, 1990, 1991, 2104, 2105, 2084, 2087, 2520, 2523, 2502, 2503, 1599, 1598, 1573, 1574, 2009, 2010, 1985, 1984, 2111, 2110, 2085, 2086, 2521, 2522, 2497, 2496, 485, 484, 495, 492, 277, 274, 269, 266, 3829, 3826, 3821, 3818, 3603, 3600, 3611, 3610, 482, 483, 494, 493, 278, 273, 270, 265, 3830, 3825, 3822, 3817, 3602, 3601, 3612, 3613, 477, 476, 465, 466, 287, 284, 261, 260, 3835, 3834, 3811, 3808, 3629, 3630, 3619, 3618, 474, 475, 464, 467, 286, 285, 258, 259, 3836, 3837, 3810, 3809, 3628, 3631, 3620, 3621, 439, 438, 393, 392, 375, 374, 329, 328, 3767, 3766, 3721, 3720, 3703, 3702, 3657, 3656, 432, 433, 398, 399, 368, 369, 334, 335, 3760, 3761, 3726, 3727, 3696, 3697, 3662, 3663, 431, 430, 401, 400, 367, 366, 337, 336, 3759, 3758, 3729, 3728, 3695, 3694, 3665, 3664, 424, 425, 406, 407, 360, 361, 342, 343, 3752, 3753, 3734, 3735, 3688, 3689, 3670, 3671, 1735, 1732, 1787, 1784, 1799, 1796, 1851, 1848, 2247, 2244, 2299, 2296, 2311, 2308, 2363, 2360, 1734, 1733, 1786, 1785, 1798, 1797, 1850, 1849, 2246, 2245, 2298, 2297, 2310, 2309, 2362, 2361, 1755, 1756, 1763, 1764, 1819, 1820, 1827, 1828, 2267, 2268, 2275, 2276, 2331, 2332, 2339, 2340, 1752, 1759, 1760, 1767, 1816, 1823, 1824, 1831, 2264, 2271, 2272, 2279, 2328, 2335, 2336, 2343, 1589, 1586, 1581, 1578, 2005, 2002, 1997, 1994, 2101, 2098, 2093, 2090, 2517, 2514, 2509, 2506, 1590, 1585, 1582, 1577, 2006, 2001, 1998, 1993, 2102, 2097, 2094, 2089, 2518, 2513, 2510, 2505, 1595, 1594, 1571, 1568, 2015, 2012, 1989, 1988, 2107, 2106, 2083, 2080, 2527, 2524, 2501, 2500, 1596, 1597, 1570, 1569, 2014, 2013, 1986, 1987, 2108, 2109, 2082, 2081, 2526, 2525, 2498, 2499, 510, 505, 496, 499, 298, 301, 306, 309, 3786, 3789, 3794, 3797, 3596, 3599, 3590, 3585, 509, 506, 497, 498, 297, 302, 305, 310, 3785, 3790, 3793, 3798, 3597, 3598, 3589, 3586, 450, 453, 462, 461, 288, 291, 314, 315, 
3780, 3781, 3804, 3807, 3634, 3633, 3642, 3645, 449, 454, 463, 460, 289, 290, 317, 316, 3779, 3778, 3805, 3806, 3635, 3632, 3641, 3646, 440, 443, 388, 391, 376, 379, 324, 327, 3768, 3771, 3716, 3719, 3704, 3707, 3652, 3655, 441, 442, 389, 390, 377, 378, 325, 326, 3769, 3770, 3717, 3718, 3705, 3706, 3653, 3654, 420, 419, 412, 411, 356, 355, 348, 347, 3748, 3747, 3740, 3739, 3684, 3683, 3676, 3675, 423, 416, 415, 408, 359, 352, 351, 344, 3751, 3744, 3743, 3736, 3687, 3680, 3679, 3672, 1736, 1737, 1782, 1783, 1800, 1801, 1846, 1847, 2248, 2249, 2294, 2295, 2312, 2313, 2358, 2359, 1743, 1742, 1777, 1776, 1807, 1806, 1841, 1840, 2255, 2254, 2289, 2288, 2319, 2318, 2353, 2352, 1744, 1745, 1774, 1775, 1808, 1809, 1838, 1839, 2256, 2257, 2286, 2287, 2320, 2321, 2350, 2351, 1751, 1750, 1769, 1768, 1815, 1814, 1833, 1832, 2263, 2262, 2281, 2280, 2327, 2326, 2345, 2344, 1546, 1549, 1554, 1557, 2026, 2029, 2034, 2037, 2058, 2061, 2066, 2069, 2538, 2541, 2546, 2549, 1545, 1550, 1553, 1558, 2025, 2030, 2033, 2038, 2057, 2062, 2065, 2070, 2537, 2542, 2545, 2550, 1540, 1541, 1564, 1567, 2016, 2019, 2042, 2043, 2052, 2053, 2076, 2079, 2528, 2531, 2554, 2555, 1539, 1538, 1565, 1566, 2017, 2018, 2045, 2044, 2051, 2050, 2077, 2078, 2529, 2530, 2557, 2556, 511, 504, 503, 500, 299, 300, 307, 308, 3787, 3788, 3795, 3796, 3595, 3592, 3591, 3584, 508, 507, 502, 501, 296, 303, 304, 311, 3784, 3791, 3792, 3799, 3594, 3593, 3588, 3587, 451, 452, 457, 458, 295, 292, 313, 312, 3783, 3782, 3803, 3800, 3637, 3638, 3643, 3644, 448, 455, 456, 459, 294, 293, 318, 319, 3776, 3777, 3802, 3801, 3636, 3639, 3640, 3647, 447, 444, 387, 384, 383, 380, 323, 320, 3775, 3772, 3715, 3712, 3711, 3708, 3651, 3648, 446, 445, 386, 385, 382, 381, 322, 321, 3774, 3773, 3714, 3713, 3710, 3709, 3650, 3649, 421, 418, 413, 410, 357, 354, 349, 346, 3749, 3746, 3741, 3738, 3685, 3682, 3677, 3674, 422, 417, 414, 409, 358, 353, 350, 345, 3750, 3745, 3742, 3737, 3686, 3681, 3678, 3673, 1739, 1738, 1781, 1780, 1803, 1802, 
1845, 1844, 2251, 2250, 2293, 2292, 2315, 2314, 2357, 2356, 1740, 1741, 1778, 1779, 1804, 1805, 1842, 1843, 2252, 2253, 2290, 2291, 2316, 2317, 2354, 2355, 1747, 1746, 1773, 1772, 1811, 1810, 1837, 1836, 2259, 2258, 2285, 2284, 2323, 2322, 2349, 2348, 1748, 1749, 1770, 1771, 1812, 1813, 1834, 1835, 2260, 2261, 2282, 2283, 2324, 2325, 2346, 2347, 1547, 1548, 1555, 1556, 2027, 2028, 2035, 2036, 2059, 2060, 2067, 2068, 2539, 2540, 2547, 2548, 1544, 1551, 1552, 1559, 2024, 2031, 2032, 2039, 2056, 2063, 2064, 2071, 2536, 2543, 2544, 2551, 1543, 1542, 1563, 1560, 2023, 2020, 2041, 2040, 2055, 2054, 2075, 2072, 2535, 2532, 2553, 2552, 1536, 1537, 1562, 1561, 2022, 2021, 2046, 2047, 2048, 2049, 2074, 2073, 2534, 2533, 2558, 2559, 512, 515, 572, 575, 576, 577, 602, 601, 3494, 3493, 3518, 3519, 3520, 3523, 3580, 3583, 513, 514, 573, 574, 583, 582, 603, 600, 3495, 3492, 3513, 3512, 3521, 3522, 3581, 3582, 538, 541, 546, 549, 584, 591, 592, 599, 3496, 3503, 3504, 3511, 3546, 3549, 3554, 3557, 537, 542, 545, 550, 587, 588, 595, 596, 3499, 3500, 3507, 3508, 3545, 3550, 3553, 3558, 998, 993, 990, 985, 948, 947, 940, 939, 3156, 3155, 3148, 3147, 3110, 3105, 3102, 3097, 997, 994, 989, 986, 951, 944, 943, 936, 3159, 3152, 3151, 3144, 3109, 3106, 3101, 3098, 1022, 1021, 962, 961, 952, 953, 932, 935, 3160, 3163, 3142, 3143, 3134, 3133, 3074, 3073, 1023, 1020, 963, 960, 959, 958, 933, 934, 3161, 3162, 3137, 3136, 3135, 3132, 3075, 3072, 1024, 1027, 1084, 1087, 1088, 1089, 1114, 1113, 2982, 2981, 3006, 3007, 3008, 3011, 3068, 3071, 1025, 1026, 1085, 1086, 1095, 1094, 1115, 1112, 2983, 2980, 3001, 3000, 3009, 3010, 3069, 3070, 1050, 1053, 1058, 1061, 1096, 1103, 1104, 1111, 2984, 2991, 2992, 2999, 3034, 3037, 3042, 3045, 1049, 1054, 1057, 1062, 1099, 1100, 1107, 1108, 2987, 2988, 2995, 2996, 3033, 3038, 3041, 3046, 1510, 1505, 1502, 1497, 1460, 1459, 1452, 1451, 2644, 2643, 2636, 2635, 2598, 2593, 2590, 2585, 1509, 1506, 1501, 1498, 1463, 1456, 1455, 1448, 2647, 2640, 2639, 2632, 2597, 
2594, 2589, 2586, 1534, 1533, 1474, 1473, 1464, 1465, 1444, 1447, 2648, 2651, 2630, 2631, 2622, 2621, 2562, 2561, 1535, 1532, 1475, 1472, 1471, 1470, 1445, 1446, 2649, 2650, 2625, 2624, 2623, 2620, 2563, 2560, 519, 516, 571, 568, 579, 578, 605, 606, 3489, 3490, 3517, 3516, 3527, 3524, 3579, 3576, 518, 517, 570, 569, 580, 581, 604, 607, 3488, 3491, 3514, 3515, 3526, 3525, 3578, 3577, 539, 540, 547, 548, 585, 590, 593, 598, 3497, 3502, 3505, 3510, 3547, 3548, 3555, 3556, 536, 543, 544, 551, 586, 589, 594, 597, 3498, 3501, 3506, 3509, 3544, 3551, 3552, 3559, 999, 992, 991, 984, 949, 946, 941, 938, 3157, 3154, 3149, 3146, 3111, 3104, 3103, 3096, 996, 995, 988, 987, 950, 945, 942, 937, 3158, 3153, 3150, 3145, 3108, 3107, 3100, 3099, 1017, 1018, 965, 966, 955, 954, 931, 928, 3167, 3164, 3141, 3140, 3129, 3130, 3077, 3078, 1016, 1019, 964, 967, 956, 957, 930, 929, 3166, 3165, 3138, 3139, 3128, 3131, 3076, 3079, 1031, 1028, 1083, 1080, 1091, 1090, 1117, 1118, 2977, 2978, 3005, 3004, 3015, 3012, 3067, 3064, 1030, 1029, 1082, 1081, 1092, 1093, 1116, 1119, 2976, 2979, 3002, 3003, 3014, 3013, 3066, 3065, 1051, 1052, 1059, 1060, 1097, 1102, 1105, 1110, 2985, 2990, 2993, 2998, 3035, 3036, 3043, 3044, 1048, 1055, 1056, 1063, 1098, 1101, 1106, 1109, 2986, 2989, 2994, 2997, 3032, 3039, 3040, 3047, 1511, 1504, 1503, 1496, 1461, 1458, 1453, 1450, 2645, 2642, 2637, 2634, 2599, 2592, 2591, 2584, 1508, 1507, 1500, 1499, 1462, 1457, 1454, 1449, 2646, 2641, 2638, 2633, 2596, 2595, 2588, 2587, 1529, 1530, 1477, 1478, 1467, 1466, 1443, 1440, 2655, 2652, 2629, 2628, 2617, 2618, 2565, 2566, 1528, 1531, 1476, 1479, 1468, 1469, 1442, 1441, 2654, 2653, 2626, 2627, 2616, 2619, 2564, 2567, 520, 521, 566, 567, 636, 637, 610, 609, 3486, 3485, 3458, 3459, 3528, 3529, 3574, 3575, 527, 526, 561, 560, 635, 634, 611, 608, 3487, 3484, 3461, 3460, 3535, 3534, 3569, 3568, 528, 529, 558, 559, 630, 625, 622, 617, 3478, 3473, 3470, 3465, 3536, 3537, 3566, 3567, 535, 534, 553, 552, 629, 626, 621, 618, 3477, 
3474, 3469, 3466, 3543, 3542, 3561, 3560, 1000, 1001, 982, 983, 906, 909, 914, 917, 3178, 3181, 3186, 3189, 3112, 3113, 3094, 3095, 1007, 1006, 977, 976, 905, 910, 913, 918, 3177, 3182, 3185, 3190, 3119, 3118, 3089, 3088, 1008, 1009, 974, 975, 900, 901, 924, 927, 3168, 3171, 3194, 3195, 3120, 3121, 3086, 3087, 1015, 1014, 969, 968, 899, 898, 925, 926, 3169, 3170, 3197, 3196, 3127, 3126, 3081, 3080, 1032, 1033, 1078, 1079, 1148, 1149, 1122, 1121, 2974, 2973, 2946, 2947, 3016, 3017, 3062, 3063, 1039, 1038, 1073, 1072, 1147, 1146, 1123, 1120, 2975, 2972, 2949, 2948, 3023, 3022, 3057, 3056, 1040, 1041, 1070, 1071, 1142, 1137, 1134, 1129, 2966, 2961, 2958, 2953, 3024, 3025, 3054, 3055, 1047, 1046, 1065, 1064, 1141, 1138, 1133, 1130, 2965, 2962, 2957, 2954, 3031, 3030, 3049, 3048, 1512, 1513, 1494, 1495, 1418, 1421, 1426, 1429, 2666, 2669, 2674, 2677, 2600, 2601, 2582, 2583, 1519, 1518, 1489, 1488, 1417, 1422, 1425, 1430, 2665, 2670, 2673, 2678, 2607, 2606, 2577, 2576, 1520, 1521, 1486, 1487, 1412, 1413, 1436, 1439, 2656, 2659, 2682, 2683, 2608, 2609, 2574, 2575, 1527, 1526, 1481, 1480, 1411, 1410, 1437, 1438, 2657, 2658, 2685, 2684, 2615, 2614, 2569, 2568, 523, 522, 565, 564, 639, 638, 613, 614, 3481, 3482, 3457, 3456, 3531, 3530, 3573, 3572, 524, 525, 562, 563, 632, 633, 612, 615, 3480, 3483, 3462, 3463, 3532, 3533, 3570, 3571, 531, 530, 557, 556, 631, 624, 623, 616, 3479, 3472, 3471, 3464, 3539, 3538, 3565, 3564, 532, 533, 554, 555, 628, 627, 620, 619, 3476, 3475, 3468, 3467, 3540, 3541, 3562, 3563, 1003, 1002, 981, 980, 907, 908, 915, 916, 3179, 3180, 3187, 3188, 3115, 3114, 3093, 3092, 1004, 1005, 978, 979, 904, 911, 912, 919, 3176, 3183, 3184, 3191, 3116, 3117, 3090, 3091, 1011, 1010, 973, 972, 903, 902, 923, 920, 3175, 3172, 3193, 3192, 3123, 3122, 3085, 3084, 1012, 1013, 970, 971, 896, 897, 922, 921, 3174, 3173, 3198, 3199, 3124, 3125, 3082, 3083, 1035, 1034, 1077, 1076, 1151, 1150, 1125, 1126, 2969, 2970, 2945, 2944, 3019, 3018, 3061, 3060, 1036, 1037, 1074, 
1075, 1144, 1145, 1124, 1127, 2968, 2971, 2950, 2951, 3020, 3021, 3058, 3059, 1043, 1042, 1069, 1068, 1143, 1136, 1135, 1128, 2967, 2960, 2959, 2952, 3027, 3026, 3053, 3052, 1044, 1045, 1066, 1067, 1140, 1139, 1132, 1131, 2964, 2963, 2956, 2955, 3028, 3029, 3050, 3051, 1515, 1514, 1493, 1492, 1419, 1420, 1427, 1428, 2667, 2668, 2675, 2676, 2603, 2602, 2581, 2580, 1516, 1517, 1490, 1491, 1416, 1423, 1424, 1431, 2664, 2671, 2672, 2679, 2604, 2605, 2578, 2579, 1523, 1522, 1485, 1484, 1415, 1414, 1435, 1432, 2663, 2660, 2681, 2680, 2611, 2610, 2573, 2572, 1524, 1525, 1482, 1483, 1408, 1409, 1434, 1433, 2662, 2661, 2686, 2687, 2612, 2613, 2570, 2571, 724, 727, 728, 729, 640, 641, 666, 665, 3430, 3429, 3454, 3455, 3366, 3367, 3368, 3371, 725, 726, 735, 734, 647, 646, 667, 664, 3431, 3428, 3449, 3448, 3361, 3360, 3369, 3370, 746, 745, 736, 737, 648, 655, 656, 663, 3432, 3439, 3440, 3447, 3358, 3359, 3350, 3349, 747, 744, 743, 742, 651, 652, 659, 660, 3435, 3436, 3443, 3444, 3353, 3352, 3351, 3348, 788, 791, 792, 793, 884, 883, 876, 875, 3220, 3219, 3212, 3211, 3302, 3303, 3304, 3307, 789, 790, 799, 798, 887, 880, 879, 872, 3223, 3216, 3215, 3208, 3297, 3296, 3305, 3306, 810, 809, 800, 801, 888, 889, 868, 871, 3224, 3227, 3206, 3207, 3294, 3295, 3286, 3285, 811, 808, 807, 806, 895, 894, 869, 870, 3225, 3226, 3201, 3200, 3289, 3288, 3287, 3284, 1236, 1239, 1240, 1241, 1152, 1153, 1178, 1177, 2918, 2917, 2942, 2943, 2854, 2855, 2856, 2859, 1237, 1238, 1247, 1246, 1159, 1158, 1179, 1176, 2919, 2916, 2937, 2936, 2849, 2848, 2857, 2858, 1258, 1257, 1248, 1249, 1160, 1167, 1168, 1175, 2920, 2927, 2928, 2935, 2846, 2847, 2838, 2837, 1259, 1256, 1255, 1254, 1163, 1164, 1171, 1172, 2923, 2924, 2931, 2932, 2841, 2840, 2839, 2836, 1300, 1303, 1304, 1305, 1396, 1395, 1388, 1387, 2708, 2707, 2700, 2699, 2790, 2791, 2792, 2795, 1301, 1302, 1311, 1310, 1399, 1392, 1391, 1384, 2711, 2704, 2703, 2696, 2785, 2784, 2793, 2794, 1322, 1321, 1312, 1313, 1400, 1401, 1380, 1383, 2712, 2715, 2694, 
2695, 2782, 2783, 2774, 2773, 1323, 1320, 1319, 1318, 1407, 1406, 1381, 1382, 2713, 2714, 2689, 2688, 2777, 2776, 2775, 2772, 723, 720, 731, 730, 643, 642, 669, 670, 3425, 3426, 3453, 3452, 3365, 3364, 3375, 3372, 722, 721, 732, 733, 644, 645, 668, 671, 3424, 3427, 3450, 3451, 3362, 3363, 3374, 3373, 749, 750, 739, 738, 649, 654, 657, 662, 3433, 3438, 3441, 3446, 3357, 3356, 3345, 3346, 748, 751, 740, 741, 650, 653, 658, 661, 3434, 3437, 3442, 3445, 3354, 3355, 3344, 3347, 787, 784, 795, 794, 885, 882, 877, 874, 3221, 3218, 3213, 3210, 3301, 3300, 3311, 3308, 786, 785, 796, 797, 886, 881, 878, 873, 3222, 3217, 3214, 3209, 3298, 3299, 3310, 3309, 813, 814, 803, 802, 891, 890, 867, 864, 3231, 3228, 3205, 3204, 3293, 3292, 3281, 3282, 812, 815, 804, 805, 892, 893, 866, 865, 3230, 3229, 3202, 3203, 3290, 3291, 3280, 3283, 1235, 1232, 1243, 1242, 1155, 1154, 1181, 1182, 2913, 2914, 2941, 2940, 2853, 2852, 2863, 2860, 1234, 1233, 1244, 1245, 1156, 1157, 1180, 1183, 2912, 2915, 2938, 2939, 2850, 2851, 2862, 2861, 1261, 1262, 1251, 1250, 1161, 1166, 1169, 1174, 2921, 2926, 2929, 2934, 2845, 2844, 2833, 2834, 1260, 1263, 1252, 1253, 1162, 1165, 1170, 1173, 2922, 2925, 2930, 2933, 2842, 2843, 2832, 2835, 1299, 1296, 1307, 1306, 1397, 1394, 1389, 1386, 2709, 2706, 2701, 2698, 2789, 2788, 2799, 2796, 1298, 1297, 1308, 1309, 1398, 1393, 1390, 1385, 2710, 2705, 2702, 2697, 2786, 2787, 2798, 2797, 1325, 1326, 1315, 1314, 1403, 1402, 1379, 1376, 2719, 2716, 2693, 2692, 2781, 2780, 2769, 2770, 1324, 1327, 1316, 1317, 1404, 1405, 1378, 1377, 2718, 2717, 2690, 2691, 2778, 2779, 2768, 2771, 716, 719, 710, 705, 700, 701, 674, 673, 3422, 3421, 3394, 3395, 3390, 3385, 3376, 3379, 717, 718, 709, 706, 699, 698, 675, 672, 3423, 3420, 3397, 3396, 3389, 3386, 3377, 3378, 754, 753, 762, 765, 694, 689, 686, 681, 3414, 3409, 3406, 3401, 3330, 3333, 3342, 3341, 755, 752, 761, 766, 693, 690, 685, 682, 3413, 3410, 3405, 3402, 3329, 3334, 3343, 3340, 780, 783, 774, 769, 842, 845, 850, 853, 3242, 
3245, 3250, 3253, 3326, 3321, 3312, 3315, 781, 782, 773, 770, 841, 846, 849, 854, 3241, 3246, 3249, 3254, 3325, 3322, 3313, 3314, 818, 817, 826, 829, 836, 837, 860, 863, 3232, 3235, 3258, 3259, 3266, 3269, 3278, 3277, 819, 816, 825, 830, 835, 834, 861, 862, 3233, 3234, 3261, 3260, 3265, 3270, 3279, 3276, 1228, 1231, 1222, 1217, 1212, 1213, 1186, 1185, 2910, 2909, 2882, 2883, 2878, 2873, 2864, 2867, 1229, 1230, 1221, 1218, 1211, 1210, 1187, 1184, 2911, 2908, 2885, 2884, 2877, 2874, 2865, 2866, 1266, 1265, 1274, 1277, 1206, 1201, 1198, 1193, 2902, 2897, 2894, 2889, 2818, 2821, 2830, 2829, 1267, 1264, 1273, 1278, 1205, 1202, 1197, 1194, 2901, 2898, 2893, 2890, 2817, 2822, 2831, 2828, 1292, 1295, 1286, 1281, 1354, 1357, 1362, 1365, 2730, 2733, 2738, 2741, 2814, 2809, 2800, 2803, 1293, 1294, 1285, 1282, 1353, 1358, 1361, 1366, 2729, 2734, 2737, 2742, 2813, 2810, 2801, 2802, 1330, 1329, 1338, 1341, 1348, 1349, 1372, 1375, 2720, 2723, 2746, 2747, 2754, 2757, 2766, 2765, 1331, 1328, 1337, 1342, 1347, 1346, 1373, 1374, 2721, 2722, 2749, 2748, 2753, 2758, 2767, 2764, 715, 712, 711, 704, 703, 702, 677, 678, 3417, 3418, 3393, 3392, 3391, 3384, 3383, 3380, 714, 713, 708, 707, 696, 697, 676, 679, 3416, 3419, 3398, 3399, 3388, 3387, 3382, 3381, 757, 758, 763, 764, 695, 688, 687, 680, 3415, 3408, 3407, 3400, 3331, 3332, 3337, 3338, 756, 759, 760, 767, 692, 691, 684, 683, 3412, 3411, 3404, 3403, 3328, 3335, 3336, 3339, 779, 776, 775, 768, 843, 844, 851, 852, 3243, 3244, 3251, 3252, 3327, 3320, 3319, 3316, 778, 777, 772, 771, 840, 847, 848, 855, 3240, 3247, 3248, 3255, 3324, 3323, 3318, 3317, 821, 822, 827, 828, 839, 838, 859, 856, 3239, 3236, 3257, 3256, 3267, 3268, 3273, 3274, 820, 823, 824, 831, 832, 833, 858, 857, 3238, 3237, 3262, 3263, 3264, 3271, 3272, 3275, 1227, 1224, 1223, 1216, 1215, 1214, 1189, 1190, 2905, 2906, 2881, 2880, 2879, 2872, 2871, 2868, 1226, 1225, 1220, 1219, 1208, 1209, 1188, 1191, 2904, 2907, 2886, 2887, 2876, 2875, 2870, 2869, 1269, 1270, 1275, 1276, 1207, 
1200, 1199, 1192, 2903, 2896, 2895, 2888, 2819, 2820, 2825, 2826, 1268, 1271, 1272, 1279, 1204, 1203, 1196, 1195, 2900, 2899, 2892, 2891, 2816, 2823, 2824, 2827, 1291, 1288, 1287, 1280, 1355, 1356, 1363, 1364, 2731, 2732, 2739, 2740, 2815, 2808, 2807, 2804, 1290, 1289, 1284, 1283, 1352, 1359, 1360, 1367, 2728, 2735, 2736, 2743, 2812, 2811, 2806, 2805, 1333, 1334, 1339, 1340, 1351, 1350, 1371, 1368, 2727, 2724, 2745, 2744, 2755, 2756, 2761, 2762, 1332, 1335, 1336, 1343, 1344, 1345, 1370, 1369, 2726, 2725, 2750, 2751, 2752, 2759, 2760, 2763}; \ No newline at end of file diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/gtests.cpp b/libNeonDomain/tests/domain-space-filling-curves/src/gtests.cpp new file mode 100644 index 00000000..954d4ecd --- /dev/null +++ b/libNeonDomain/tests/domain-space-filling-curves/src/gtests.cpp @@ -0,0 +1,116 @@ + +#include "Neon/Neon.h" +#include "Neon/domain/tools/SpaceCurves.h" +#include "domain-space-filling-curves.h" +#include "goldenEncoding.h" +#include "gtest/gtest.h" +#include "runHelper.h" + +TEST(domain_space_filling_curves, morton) +{ + Neon::int32_3d dim = {16, 16, 16}; + for (int x = 0; x < dim.x; x++) { + for (int y = 0; y < dim.y; y++) { + for (int z = 0; z < dim.z; z++) { + using namespace Neon::domain::tool::spaceCurves; + Neon::int32_3d idx = {x, y, z}; + auto morton = Encoder::encode(EncoderType::morton, dim, idx); + auto sweep = Encoder::encode(EncoderType::sweep, dim, {z, y, x}); + + ASSERT_EQ(morton_grid_16_16_16[sweep], morton) << dim << " " << idx << " " << morton; + } + } + } +} + +TEST(domain_space_filling_curves, hilbert) +{ + Neon::int32_3d dim = {16, 16, 16}; + for (int x = 0; x < dim.x; x++) { + for (int y = 0; y < dim.y; y++) { + for (int z = 0; z < dim.z; z++) { + + using namespace Neon::domain::tool::spaceCurves; + Neon::int32_3d idx = {x, y, z}; + auto hilbert = Encoder::encode(EncoderType::hilbert, dim, idx); + auto sweep = Encoder::encode(EncoderType::sweep, dim, {z, y, x}); + + 
ASSERT_EQ(hilbert_grid_16_16_16[sweep], hilbert) << dim << " " << idx << " " << hilbert; + } + } + } +} + +TEST(domain_space_filling_curves, hilbert_hilbert) +{ + auto run = [](Neon::domain::tool::spaceCurves::EncoderType encodingType, int dimEdge) { + // Step 1 -> Neon backend: choosing the hardware for the computation + Neon::init(); + // auto runtime = Neon::Runtime::openmp; + auto runtime = Neon::Runtime::openmp; + // We are overbooking GPU 0 three times + std::vector devIds{0}; + Neon::Backend backend(devIds, runtime); + + // Step 2 -> Neon grid: setting up a dense cartesian domain + Neon::index_3d dim(dimEdge, dimEdge, dimEdge); // Size of the domain + + using Grid = Neon::eGrid; // Selecting one of the grid provided by Neon + Neon::domain::Stencil gradStencil([] { + // We use a center difference scheme to compute the grad + // The order of the points is important, + // as we'll leverage the specific order when computing the grad. + // First positive direction on x, y and z, + // then negative direction on x, y, z respectively. + return std::vector{ + {1, 0, 0}, + {0, 1, 0}, + {0, 0, 1}, + {-1, 0, 0}, + {0, -1, 0}, + {0, 0, -1}}; + }()); + // Actual Neon grid allocation + Grid grid( + backend, + dim, + [&](const Neon::index_3d&) -> bool { + return true; + }, // <- defining the active cells. 
+ gradStencil, + 1.0, + 0.0, encodingType); + + auto field = grid.newField("spaceCode", 1, 0); + + grid.newContainer("DecoceFromId", + [&](Neon::set::Loader& l) { + auto f = l.load(field); + return [=] NEON_CUDA_HOST_DEVICE(const Grid::Idx& gidx) mutable { + auto internalId = gidx.helpGet(); + auto global = f.getGlobalIndex(gidx); +#pragma omp critical + { + using namespace Neon::domain::tool::spaceCurves; + auto encoded = Encoder::encode(encodingType, dim, global); + // std::cout << global << " -> internal " << internalId << " code " << encoded << std::endl; + EXPECT_EQ(internalId, encoded); + } + f(gidx, 0) = internalId; + }; + }) + .run(Neon::Backend::mainStreamIdx); + field.ioToVtk("DecoceFromId", "grad"); + printf("DONE\n"); + }; + run(Neon::domain::tool::spaceCurves::EncoderType::sweep, 32); + run(Neon::domain::tool::spaceCurves::EncoderType::morton,32); + run(Neon::domain::tool::spaceCurves::EncoderType::hilbert,32); +} + +int main(int argc, char** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + Neon::init(); + return RUN_ALL_TESTS(); +} diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/runHelper.h b/libNeonDomain/tests/domain-space-filling-curves/src/runHelper.h new file mode 100644 index 00000000..993bce70 --- /dev/null +++ b/libNeonDomain/tests/domain-space-filling-curves/src/runHelper.h @@ -0,0 +1,100 @@ +#pragma once +#include +#include "gtest/gtest.h" + +#include "Neon/core/core.h" +#include "Neon/core/tools/io/ioToVti.h" +#include "Neon/core/types/DataUse.h" +#include "Neon/core/types/DeviceType.h" + +#include "Neon/domain/dGrid.h" +#include "Neon/domain/details/dGridSoA/dGridSoA.h" +#include "Neon/domain/eGrid.h" +#include "Neon/domain/tools/Geometries.h" +#include "Neon/domain/tools/TestData.h" + +#include "gtest/gtest.h" + +using namespace Neon; +using namespace Neon::domain; + +using namespace Neon::domain::tool::testing; +using namespace Neon::domain::tool; + +template +void runAllTestConfigurations(std::function&)> f) +{ + 
std::vector nGpuTest; + nGpuTest.push_back(1); + std::vector cardinalityTest{1}; + + std::vector dimTest{{32,32,32}}; + std::vector runtimeE; + + runtimeE.push_back(Neon::Runtime::openmp); + + + std::vector geos; + std::vector memoryLayoutOptions{Neon::MemoryLayout::structOfArrays}; + + if constexpr (std::is_same_v) { + geos = std::vector{ + Geometry::FullDomain, + }; + } else { + geos = std::vector{ + Geometry::FullDomain, + // Geometry::Sphere, + // Geometry::HollowSphere, + + }; + } + + for (auto dim : dimTest) { + for (const auto& card : cardinalityTest) { + for (auto& geo : geos) { + for (const auto& ngpu : nGpuTest) { + for (const auto& runtime : runtimeE) { + for (const auto& memoryLayout : memoryLayoutOptions) { + + int maxnGPUs = [] { + if (Neon::sys::globalSpace::gpuSysObjStorage.numDevs() > 0) { + return Neon::set::DevSet::maxSet().setCardinality(); + } + return 1; + }(); + + std::vector ids; + for (int i = 0; i < ngpu; i++) { + ids.push_back(i % maxnGPUs); + } + + Neon::Backend backend(ids, runtime); + Neon::MemoryOptions memoryOptions = backend.getMemoryOptions(); + memoryOptions.setOrder(memoryLayout); + + if constexpr (std::is_same_v) { + if (dim.z < 8 * ngpu * 3) { + dim.z = ngpu * 3 * 8; + } + if (memoryLayout == Neon::MemoryLayout::arrayOfStructs) { + continue; + } + } + + assert(card == 1); + TestData testData(backend, + dim, + card, + memoryOptions, + geo); + + NEON_INFO(testData.toString()); + f(testData); + } + } + } + } + } + } +} diff --git a/libNeonDomain/tests/domain-stencil/src/gtests.cpp b/libNeonDomain/tests/domain-stencil/src/gtests.cpp index ec6f892a..9fed3354 100644 --- a/libNeonDomain/tests/domain-stencil/src/gtests.cpp +++ b/libNeonDomain/tests/domain-stencil/src/gtests.cpp @@ -4,29 +4,74 @@ #include "runHelper.h" #include "stencil.h" -TEST(domain_stencil, dGrid) +TEST(domain_stencil, dGrid_NoTemplate) { int nGpus = 3; using Type = int64_t; - runAllTestConfiguration(std::function(map::run), + 
runAllTestConfiguration(std::function(map::runNoTemplate), nGpus, 1); } -TEST(domain_stencil, eGrid) +TEST(domain_stencil, eGrid_NoTemplate) { int nGpus = 3; using Type = int64_t; - runAllTestConfiguration(std::function(map::run), + runAllTestConfiguration(std::function(map::runNoTemplate), nGpus, 1); } -TEST(domain_stencil, bGri ) +TEST(domain_stencil, bGri_NoTemplate) { int nGpus = 5; using Type = int64_t; - runAllTestConfiguration(std::function(map::run), + runAllTestConfiguration(std::function(map::runNoTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, dGridSoA_NoTemplate) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runNoTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, dGrid_Template) +{ + int nGpus = 3; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, eGrid_Template) +{ + int nGpus = 3; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, bGri_Template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runTemplate), + nGpus, + 1); +} + +TEST(domain_stencil, dGridSoA_Template) +{ + int nGpus = 5; + using Type = int64_t; + runAllTestConfiguration(std::function(map::runTemplate), nGpus, 1); } diff --git a/libNeonDomain/tests/domain-stencil/src/runHelper.h b/libNeonDomain/tests/domain-stencil/src/runHelper.h index e8f286ae..16cefb0f 100644 --- a/libNeonDomain/tests/domain-stencil/src/runHelper.h +++ b/libNeonDomain/tests/domain-stencil/src/runHelper.h @@ -33,7 +33,7 @@ void runAllTestConfiguration( // std::vector nGpuTest{2,4,6,8}; std::vector cardinalityTest{1}; - std::vector dimTest{{10, 17, 13}, {1, 1, 100}, {17, 1, 77}}; + std::vector dimTest{{10, 17, 90}, {1, 1, 100}, {17, 1, 77}}; std::vector runtimeE{Neon::Runtime::openmp}; if (Neon::sys::globalSpace::gpuSysObjStorage.numDevs() > 0) { 
runtimeE.push_back(Neon::Runtime::stream); diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu index a86f1def..6cd4f6ff 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.cu +++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu @@ -9,8 +9,8 @@ namespace map { template -auto stencilContainer_laplace(const Field& filedA, - Field& fieldB) +auto laplaceNoTemplate(const Field& filedA, + Field& fieldB) -> Neon::set::Container { const auto& grid = filedA.getGrid(); @@ -59,20 +59,37 @@ static constexpr std::array stencil{ Ngh3DIdx(0, 0, 1), Ngh3DIdx(0, 0, -1)}; -template -inline auto viaTemplate (const IDX& idx, int i, const Field& a, int& partial, int& count){ - a.template getNghData(idx, i, - [&](typename Field::Type const& val) { - partial += val; - count++; - }); +template +NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Partition& a, Partial& partial, int& count) +{ + // Neon::index_3d direction(X, Y, Z); + // auto nghData = a.getNghData(idx, direction.newType(), i); + // if (nghData.isValid()) { + // partial += nghData.getData(); + // count++; + // } + a.template getNghData(idx, i, + [&](typename Partition::Type const& val) { + partial += val; + count++; + }); }; + +//template +//constexpr void constexpr_for(F&& f) +//{ +// if constexpr (Start < End) { +// f(std::integral_constant()); +// constexpr_for(f); +// } +//} + template -auto stencilContainerLaplaceTemplate(const Field& filedA, - Field& fieldB) +auto laplaceTemplate(const Field& filedA, + Field& fieldB) -> Neon::set::Container { const auto& grid = filedA.getGrid(); @@ -88,35 +105,18 @@ auto stencilContainerLaplaceTemplate(const Field& filedA, // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val); typename Field::Type partial = 0; int count = 0; + using Ngh3DIdx = Neon::int8_3d; - constexpr std::array stencil{ - Ngh3DIdx(1, 0, 0), - Ngh3DIdx(-1, 0, 0), - Ngh3DIdx(0, 1, 0), - Ngh3DIdx(0, -1, 0), - 
Ngh3DIdx(0, 0, 1), - Ngh3DIdx(0, 0, -1)}; - -#if 0 - auto viaTemplate = [&]() { - if constexpr (std::is_same_v) { - a.template getNghData(idx, i, - [&](Field::Type const& val) { - partial += val; - count++; - }); - } - }; -#endif - viaTemplate<0>(idx, i, a, partial, count); - viaTemplate<1>(idx, i, a, partial, count); - viaTemplate<2>(idx, i, a, partial, count); - viaTemplate<3>(idx, i, a, partial, count); - viaTemplate<4>(idx, i, a, partial, count); - viaTemplate<5>(idx, i, a, partial, count); - + Neon::ConstexprFor<0, 6, 1>([&](auto sIdx) { + a.template getNghData(idx, i, + [&](auto const& val) { + partial += val; + count++; + }); + }); + b(idx, i) = a(idx, i) - count * partial; } }; @@ -126,7 +126,82 @@ auto stencilContainerLaplaceTemplate(const Field& filedA, using namespace Neon::domain::tool::testing; template -auto run(TestData& data) -> void +auto runNoTemplate(TestData& data) -> void +{ + + using Type = typename TestData::Type; + auto& grid = data.getGrid(); + const std::string appName = TestInformation::fullName(grid.getImplementationName()); + const int maxIters = 1; + + NEON_INFO(grid.toString()); + + // data.resetValuesToLinear(1, 100); + data.resetValuesToMasked(1); + + { // NEON + const Neon::index_3d dim = grid.getDimension(); + std::vector elements; + auto bk = grid.getBackend(); + auto& X = data.getField(FieldNames::X); + auto& Y = data.getField(FieldNames::Y); + for (int iter = maxIters; iter > 0; iter--) { + bk.sync(Neon::Backend::mainStreamIdx); + X.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::put, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); + + bk.sync(Neon::Backend::mainStreamIdx); + laplaceNoTemplate(X, Y).run(Neon::Backend::mainStreamIdx); + + bk.sync(Neon::Backend::mainStreamIdx); + Y.newHaloUpdate(Neon::set::StencilSemantic::standard, + Neon::set::TransferMode::get, + Neon::Execution::device) + .run(Neon::Backend::mainStreamIdx); + + bk.sync(Neon::Backend::mainStreamIdx); + 
laplaceNoTemplate(Y, X).run(Neon::Backend::mainStreamIdx); + } + data.getBackend().sync(0); + } + + { // Golden data + auto& X = data.getIODomain(FieldNames::X); + auto& Y = data.getIODomain(FieldNames::Y); + for (int iter = maxIters; iter > 0; iter--) { + data.laplace(X, Y); + data.laplace(Y, X); + } + } + + data.updateHostData(); + + data.getField(FieldNames::X).ioToVtk("X", "X", true); + // data.getField(FieldNames::Y).ioToVtk("Y", "Y", false); + // data.getField(FieldNames::Z).ioToVtk("Z", "Z", false); + // + data.getIODomain(FieldNames::X).ioToVti("X_", "X_"); + // data.getField(FieldNames::Y).ioVtiAllocator("Y_"); + // data.getField(FieldNames::Z).ioVtiAllocator("Z_"); + + bool isOk = data.compare(FieldNames::X); + isOk = data.compare(FieldNames::Y); + if (!isOk) { + auto flagField = data.compareAndGetField(FieldNames::X); + flagField.ioToVti("X_diffFlag", "X_diffFlag"); + flagField = data.compareAndGetField(FieldNames::Y); + flagField.ioToVti("Y_diffFlag", "Y_diffFlag"); + } + ASSERT_TRUE(isOk); + if (!isOk) { + exit(99); + } +} + +template +auto runTemplate(TestData& data) -> void { using Type = typename TestData::Type; @@ -153,7 +228,7 @@ auto run(TestData& data) -> void .run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); - stencilContainer_laplace(X, Y).run(Neon::Backend::mainStreamIdx); + laplaceTemplate(X, Y).run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); Y.newHaloUpdate(Neon::set::StencilSemantic::standard, @@ -162,7 +237,7 @@ auto run(TestData& data) -> void .run(Neon::Backend::mainStreamIdx); bk.sync(Neon::Backend::mainStreamIdx); - stencilContainer_laplace(Y, X).run(Neon::Backend::mainStreamIdx); + laplaceTemplate(Y, X).run(Neon::Backend::mainStreamIdx); } data.getBackend().sync(0); } @@ -200,9 +275,14 @@ auto run(TestData& data) -> void } } -template auto run(TestData&) -> void; -template auto run(TestData&) -> void; -template auto run(TestData&) -> void; +template auto runNoTemplate(TestData&) -> 
void; +template auto runNoTemplate(TestData&) -> void; +template auto runNoTemplate(TestData&) -> void; +template auto runNoTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; +template auto runTemplate(TestData&) -> void; } // namespace map \ No newline at end of file diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.h b/libNeonDomain/tests/domain-stencil/src/stencil.h index a35d8011..456f5f01 100644 --- a/libNeonDomain/tests/domain-stencil/src/stencil.h +++ b/libNeonDomain/tests/domain-stencil/src/stencil.h @@ -11,9 +11,20 @@ namespace map { using namespace Neon::domain::tool::testing; template -auto run(TestData& data) -> void; +auto runNoTemplate(TestData& data) -> void; -extern template auto run(TestData&) -> void; -extern template auto run(TestData&) -> void; +template +auto runTemplate(TestData& data) -> void; + + +extern template auto runNoTemplate(TestData&) -> void; +extern template auto runNoTemplate(TestData&) -> void; +extern template auto runNoTemplate(TestData&) -> void; +extern template auto runNoTemplate(TestData&) -> void; + +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; +extern template auto runTemplate(TestData&) -> void; } // namespace map diff --git a/libNeonSet/include/Neon/set/DevSet.h b/libNeonSet/include/Neon/set/DevSet.h index 5ac38250..5e8b03b7 100644 --- a/libNeonSet/include/Neon/set/DevSet.h +++ b/libNeonSet/include/Neon/set/DevSet.h @@ -20,6 +20,7 @@ #include "Neon/set/LambdaExecutor.h" #include "Neon/set/LaunchParameters.h" #include "Neon/set/Transfer.h" +#include "Neon/set/container/CudaLaunchCompileTimeHints.h" #include "Neon/set/memory/memDevSet.h" #include "Neon/set/memory/memSet.h" #include "Neon/sys/global/GpuSysGlobal.h" @@ -222,7 +223,9 @@ class DevSet auto newLaunchParameters() const -> 
LaunchParameters; - template + template inline auto launchLambdaOnSpan( Neon::Execution execution, const Neon::set::KernelConfig& kernelConfig, @@ -236,9 +239,11 @@ class DevSet switch (mode) { case Neon::Runtime::stream: { if (execution == Neon::Execution::device) { - this->template helpLaunchLambdaOnSpanCUDA(kernelConfig, - dataSetContainer, - lambdaHolder); + this->template helpLaunchLambdaOnSpanCUDA(kernelConfig, + dataSetContainer, + lambdaHolder); return; } #if defined(NEON_OS_LINUX) || defined(NEON_OS_MAC) @@ -352,7 +357,9 @@ class DevSet } } - template + template inline auto helpLaunchLambdaOnSpanCUDA([[maybe_unused]] const Neon::set::KernelConfig& kernelConfig, [[maybe_unused]] DataSetContainer& dataSetContainer, [[maybe_unused]] std::function; - } else { - executor = (void*)Neon::set::details::blockSpan::launchLambdaOnSpanCUDA; + if constexpr (!CudaLaunchCompilerTimeHints::initialized) { + if constexpr (!details::ExecutionThreadSpanUtils::isBlockSpan(DataSetContainer::executionThreadSpan)) { + executor = (void*)Neon::set::details::denseSpan::launchLambdaOnSpanCUDA; + } else { + executor = (void*)Neon::set::details::blockSpan::launchLambdaOnSpanCUDA; + } + } + + if constexpr (CudaLaunchCompilerTimeHints::initialized) { + if constexpr (!details::ExecutionThreadSpanUtils::isBlockSpan(DataSetContainer::executionThreadSpan)) { + executor = (void*)Neon::set::details::denseSpan::launchLambdaOnSpanCUDAWithCompilerHints; + } else { + executor = (void*)Neon::set::details::blockSpan::launchLambdaOnSpanCUDAWithCompilerHints; + } } dev.kernel.template cudaLaunchKernel(gpuStreamSet[setIdx.idx()], launchInfoSet[setIdx.idx()], diff --git a/libNeonSet/include/Neon/set/LambdaExecutor.h b/libNeonSet/include/Neon/set/LambdaExecutor.h index 4ffe2501..825e86a7 100644 --- a/libNeonSet/include/Neon/set/LambdaExecutor.h +++ b/libNeonSet/include/Neon/set/LambdaExecutor.h @@ -36,6 +36,38 @@ NEON_CUDA_KERNEL auto launchLambdaOnSpanCUDA(typename DataSetContainer::Span spa } } } + 
+template +__launch_bounds__(CudaLaunchCompilerTimeHints::maxThreadsPerBlock) + NEON_CUDA_KERNEL auto launchLambdaOnSpanCUDAWithCompilerHints(typename DataSetContainer::Span span, + UserLambda userLambdaTa) + -> void +{ + typename DataSetContainer::Idx e; + if constexpr (DataSetContainer::executionThreadSpan == ExecutionThreadSpan::d1) { + if (span.setAndValidate(e, + threadIdx.x + blockIdx.x * blockDim.x)) { + userLambdaTa(e); + } + } + if constexpr (DataSetContainer::executionThreadSpan == ExecutionThreadSpan::d2) { + if (span.setAndValidate(e, + threadIdx.x + blockIdx.x * blockDim.x, + threadIdx.y + blockIdx.y * blockDim.y)) { + userLambdaTa(e); + } + } + if constexpr (DataSetContainer::executionThreadSpan == ExecutionThreadSpan::d3) { + if (span.setAndValidate(e, + threadIdx.x + blockIdx.x * blockDim.x, + threadIdx.y + blockIdx.y * blockDim.y, + threadIdx.z + blockIdx.z * blockDim.z)) { + userLambdaTa(e); + } + } +} #endif @@ -48,9 +80,9 @@ void launchLambdaOnSpanOMP(Neon::Integer_3d const& gridDim, { if constexpr (DataSetContainer::executionThreadSpan == ExecutionThreadSpan::d1) { #ifdef NEON_OS_WINDOWS -//#pragma omp parallel for default(shared) +// #pragma omp parallel for default(shared) #else - #pragma omp parallel for simd default(shared) +#pragma omp parallel for simd default(shared) #endif for (IndexType x = 0; x < gridDim.x; x++) { typename DataSetContainer::Idx e; @@ -65,7 +97,7 @@ void launchLambdaOnSpanOMP(Neon::Integer_3d const& gridDim, #ifdef NEON_OS_WINDOWS #pragma omp parallel for default(shared) #else -// #pragma omp parallel for simd collapse(2) default(shared) + // #pragma omp parallel for simd collapse(2) default(shared) #endif for (IndexType y = 0; y < gridDim.y; y++) { for (IndexType x = 0; x < gridDim.x; x++) { @@ -81,7 +113,7 @@ void launchLambdaOnSpanOMP(Neon::Integer_3d const& gridDim, #ifdef NEON_OS_WINDOWS #pragma omp parallel for default(shared) #else -// #pragma omp parallel for simd collapse(1) default(shared) schedule(guided) + 
// #pragma omp parallel for simd collapse(1) default(shared) schedule(guided) #endif for (IndexType z = 0; z < gridDim.z; z++) { for (IndexType y = 0; y < gridDim.y; y++) { @@ -113,6 +145,21 @@ NEON_CUDA_KERNEL auto launchLambdaOnSpanCUDA(typename DataSetContainer::Span spa } } } + +template +NEON_CUDA_KERNEL auto launchLambdaOnSpanCUDAWithCompilerHints(typename DataSetContainer::Span span, + UserLambda userLambdaTa) + -> void +{ + typename DataSetContainer::Idx e; + if constexpr (DataSetContainer::executionThreadSpan == ExecutionThreadSpan::d1b3) { + if (span.setAndValidateGPUDevice(e)) { + userLambdaTa(e); + } + } +} #endif diff --git a/libNeonSet/include/Neon/set/StencilSemantic.h b/libNeonSet/include/Neon/set/StencilSemantic.h index cd512ae7..28b596dc 100644 --- a/libNeonSet/include/Neon/set/StencilSemantic.h +++ b/libNeonSet/include/Neon/set/StencilSemantic.h @@ -2,6 +2,7 @@ #include #include +#include "Neon/Report.h" #include "Neon/core/core.h" namespace Neon::set { @@ -9,7 +10,7 @@ namespace Neon::set { enum struct StencilSemantic { standard = 0 /*< Transfer for halo update on grid structure */, - streaming = 1 /*< Transfer for halo update on lattice structure */ + lattice = 1 /*< Transfer for halo update on lattice structure */ }; @@ -20,19 +21,24 @@ struct StencilSemanticUtils static auto toString(StencilSemantic opt) -> std::string; static auto fromString(const std::string& opt) -> StencilSemantic; static auto getOptions() -> std::array; - + struct Cli { explicit Cli(std::string); explicit Cli(StencilSemantic model); Cli(); - auto getOption() -> StencilSemantic; + auto getOption() const -> StencilSemantic; auto set(const std::string& opt) -> void; - auto getStringOptions() -> std::string; + auto getStringOptions() const -> std::string; + auto getStringOption() const -> std::string; + auto getDoc() const -> std::string; + + auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void; + auto addToReport(Neon::Report& report) const 
-> void; private: - bool mSet = false; + bool mSet = false; StencilSemantic mOption; }; }; diff --git a/libNeonSet/include/Neon/set/TransferMode.h b/libNeonSet/include/Neon/set/TransferMode.h index b6f4ec86..a335f5da 100644 --- a/libNeonSet/include/Neon/set/TransferMode.h +++ b/libNeonSet/include/Neon/set/TransferMode.h @@ -3,6 +3,7 @@ #include #include "Neon/core/core.h" +#include "Neon/Report.h" namespace Neon::set { @@ -26,9 +27,14 @@ class TransferModeUtils explicit Cli(TransferMode model); Cli(); - auto getOption() -> TransferMode; + auto getOption() const -> TransferMode; auto set(const std::string& opt) -> void; - auto getStringOptions() -> std::string; + auto getStringOptions() const -> std::string; + auto getStringOption() const -> std::string; + auto getDoc () const -> std::string; + + auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const ->void; + auto addToReport(Neon::Report& report) const ->void; private: bool mSet = false; diff --git a/libNeonSet/include/Neon/set/container/CudaLaunchCompileTimeHints.h b/libNeonSet/include/Neon/set/container/CudaLaunchCompileTimeHints.h new file mode 100644 index 00000000..84fee176 --- /dev/null +++ b/libNeonSet/include/Neon/set/container/CudaLaunchCompileTimeHints.h @@ -0,0 +1,21 @@ +#pragma once + +#include "Neon/core/core.h" + + +namespace Neon::set::container { + +template +struct CudaLaunchCompileTimeHint +{ + public: + static constexpr bool initialized = inited__; + static constexpr int maxThreadsPerBlock = maxThreadsPerBlock__; + static constexpr int minBlocksPerMultiprocessor = minBlocksPerMultiprocessor__; + static constexpr int maxBlocksPerCluster = maxBlocksPerCluster__; +}; + +} // namespace Neon::set::container diff --git a/libNeonSet/include/Neon/set/container/DeviceContainer.h b/libNeonSet/include/Neon/set/container/DeviceContainer.h index 6f729894..ae3bf957 100644 --- a/libNeonSet/include/Neon/set/container/DeviceContainer.h +++ 
b/libNeonSet/include/Neon/set/container/DeviceContainer.h @@ -6,8 +6,8 @@ namespace Neon::set::internal { -template +template < typename DataIteratorContainerT, + typename UserComputeLambdaT, typename CudaLaunchCompileTimeHintT = Neon::set::container::CudaLaunchCompileTimeHint> struct DeviceContainer : ContainerAPI { public: @@ -93,7 +93,7 @@ struct DeviceContainer : ContainerAPI Neon::set::KernelConfig kernelConfig(dataView, bk, streamIdx, this->getLaunchParameters(dataView)); if (ContainerExecutionType::device == this->getContainerExecutionType()) { - bk.devSet().template launchLambdaOnSpan( + bk.devSet().template launchLambdaOnSpan( mExecution, kernelConfig, m_dataIteratorContainer, diff --git a/libNeonSet/include/Neon/set/container/Loader_imp.h b/libNeonSet/include/Neon/set/container/Loader_imp.h index e134effe..c9682ff9 100644 --- a/libNeonSet/include/Neon/set/container/Loader_imp.h +++ b/libNeonSet/include/Neon/set/container/Loader_imp.h @@ -115,7 +115,7 @@ auto Loader:: if (compute == Neon::Pattern::STENCIL && (stencilSemantic == StencilSemantic::standard || - stencilSemantic == StencilSemantic::streaming)) { + stencilSemantic == StencilSemantic::lattice)) { Neon::NeonException exp("Loader"); exp << "Loading a non const field for a stencil operation is not supported in Neon"; NEON_THROW(exp); diff --git a/libNeonSet/src/set/StencilSemantic.cpp b/libNeonSet/src/set/StencilSemantic.cpp index 560b687a..0e6b2114 100644 --- a/libNeonSet/src/set/StencilSemantic.cpp +++ b/libNeonSet/src/set/StencilSemantic.cpp @@ -5,11 +5,11 @@ namespace Neon::set { auto StencilSemanticUtils::toString(StencilSemantic option) -> std::string { switch (option) { - case StencilSemantic::streaming: { - return "streaming"; + case StencilSemantic::lattice: { + return "lattice"; } case StencilSemantic::standard: { - return "grid"; + return "standard"; } } NEON_THROW_UNSUPPORTED_OPTION(""); @@ -17,7 +17,7 @@ auto StencilSemanticUtils::toString(StencilSemantic option) -> std::string auto 
StencilSemanticUtils::fromString(const std::string& occ) -> StencilSemantic { - std::array opts{StencilSemantic::standard, StencilSemantic::streaming}; + std::array opts{StencilSemantic::standard, StencilSemantic::lattice}; for (auto a : opts) { if (toString(a) == occ) { return a; @@ -28,7 +28,7 @@ auto StencilSemanticUtils::fromString(const std::string& occ) -> StencilSemantic auto StencilSemanticUtils::getOptions() -> std::array { - std::array opts = {StencilSemantic::standard, StencilSemantic::streaming}; + std::array opts = {StencilSemantic::standard, StencilSemantic::lattice}; return opts; } @@ -47,7 +47,7 @@ StencilSemanticUtils::Cli::Cli(StencilSemantic model) mOption = model; } -auto StencilSemanticUtils::Cli::getOption() -> StencilSemantic +auto StencilSemanticUtils::Cli::getOption() const -> StencilSemantic { if (!mSet) { std::stringstream errorMsg; @@ -66,13 +66,13 @@ auto StencilSemanticUtils::Cli::set(const std::string& opt) std::stringstream errorMsg; errorMsg << "TransferSemantic: " << opt << " is not a valid option (valid options are {"; auto options = StencilSemanticUtils::getOptions(); - int i = 0; + int i = 0; for (auto o : options) { - if(i!=0){ - errorMsg << ", "<< StencilSemanticUtils::toString(o) ; + if (i != 0) { + errorMsg << ", " << StencilSemanticUtils::toString(o); } errorMsg << StencilSemanticUtils::toString(o); - i=1; + i = 1; } errorMsg << "})"; NEON_ERROR(errorMsg.str()); @@ -80,19 +80,48 @@ auto StencilSemanticUtils::Cli::set(const std::string& opt) mSet = true; } -auto StencilSemanticUtils::Cli::getStringOptions() -> std::string +auto StencilSemanticUtils::Cli::getStringOptions() const -> std::string { std::stringstream s; auto options = StencilSemanticUtils::getOptions(); int i = 0; for (auto o : options) { if (i != 0) { - s << ", " ; + s << ", "; } s << StencilSemanticUtils::toString(o); i = 1; } - std::string msg= s.str(); + std::string msg = s.str(); return msg; } -} // namespace Neon + +auto 
StencilSemanticUtils::Cli::getStringOption() const -> std::string +{ + if (!mSet) { + std::stringstream errorMsg; + errorMsg << "TransferSemantic was not set."; + NEON_ERROR(errorMsg.str()); + } + return StencilSemanticUtils::toString(mOption); +} + +auto StencilSemanticUtils::Cli::getDoc() const-> std::string +{ + std::stringstream s; + s << getStringOptions(); + s << " default: " << getStringOptions(); + return s.str(); +} + + +auto StencilSemanticUtils::Cli::addToReport(Neon::Report& report) const -> void +{ + report.addMember("StencilSemantic", StencilSemanticUtils::toString(this->getOption())); +} + +auto StencilSemanticUtils::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void +{ + report.addMember("StencilSemantic", StencilSemanticUtils::toString(this->getOption()), &subBlock); +} +} // namespace Neon::set diff --git a/libNeonSet/src/set/TransferMode.cpp b/libNeonSet/src/set/TransferMode.cpp index 9ef657eb..c2a30ab2 100644 --- a/libNeonSet/src/set/TransferMode.cpp +++ b/libNeonSet/src/set/TransferMode.cpp @@ -47,7 +47,7 @@ TransferModeUtils::Cli::Cli(TransferMode model) mOption = model; } -auto TransferModeUtils::Cli::getOption() -> TransferMode +auto TransferModeUtils::Cli::getOption() const -> TransferMode { if (!mSet) { std::stringstream errorMsg; @@ -66,13 +66,13 @@ auto TransferModeUtils::Cli::set(const std::string& opt) std::stringstream errorMsg; errorMsg << "Transfer: " << opt << " is not a valid option (valid options are {"; auto options = TransferModeUtils::getOptions(); - int i = 0; + int i = 0; for (auto o : options) { - if(i!=0){ - errorMsg << ", "<< TransferModeUtils::toString(o) ; + if (i != 0) { + errorMsg << ", " << TransferModeUtils::toString(o); } errorMsg << TransferModeUtils::toString(o); - i=1; + i = 1; } errorMsg << "})"; NEON_ERROR(errorMsg.str()); @@ -80,19 +80,47 @@ auto TransferModeUtils::Cli::set(const std::string& opt) mSet = true; } -auto TransferModeUtils::Cli::getStringOptions() -> 
std::string +auto TransferModeUtils::Cli::getStringOptions() const -> std::string { std::stringstream s; auto options = TransferModeUtils::getOptions(); int i = 0; for (auto o : options) { if (i != 0) { - s << ", " ; + s << ", "; } s << TransferModeUtils::toString(o); i = 1; } - std::string msg= s.str(); + std::string msg = s.str(); return msg; } -} // namespace Neon + +auto TransferModeUtils::Cli::getStringOption() const -> std::string +{ + if (!mSet) { + std::stringstream errorMsg; + errorMsg << "TransferMode was not set."; + NEON_ERROR(errorMsg.str()); + } + return TransferModeUtils::toString(mOption); +} + +auto TransferModeUtils::Cli::getDoc() const -> std::string +{ + std::stringstream s; + s << getStringOptions(); + s << " default: " << getStringOptions(); + return s.str(); +} + +auto TransferModeUtils::Cli::addToReport(Neon::Report& report) const -> void +{ + report.addMember("TransferMode", TransferModeUtils::toString(this->getOption())); +} + +auto TransferModeUtils::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void +{ + report.addMember("TransferMode", TransferModeUtils::toString(this->getOption()), &subBlock); +} +} // namespace Neon::set diff --git a/libNeonSkeleton/include/Neon/skeleton/Occ.h b/libNeonSkeleton/include/Neon/skeleton/Occ.h index a54f799a..041d178f 100644 --- a/libNeonSkeleton/include/Neon/skeleton/Occ.h +++ b/libNeonSkeleton/include/Neon/skeleton/Occ.h @@ -27,12 +27,15 @@ struct OccUtils explicit Cli(Occ model); Cli(); - auto getOption() -> Occ; + auto getOption() const -> Occ; auto set(const std::string& opt) -> void; - auto getStringOptions() -> std::string; + auto getStringOptions() const -> std::string; + auto getDoc() const -> std::string; - auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock)->void; - auto addToReport(Neon::Report& report)->void; + auto getStringOption() const -> std::string; + + auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const 
-> void; + auto addToReport(Neon::Report& report) const -> void; private: bool mSet = false; @@ -41,4 +44,4 @@ struct OccUtils }; -} // namespace Neon::skeleton \ No newline at end of file +} // namespace Neon::skeleton diff --git a/libNeonSkeleton/src/skeleton/Occ.cpp b/libNeonSkeleton/src/skeleton/Occ.cpp index 44ac9155..44ba2cd9 100644 --- a/libNeonSkeleton/src/skeleton/Occ.cpp +++ b/libNeonSkeleton/src/skeleton/Occ.cpp @@ -48,12 +48,23 @@ OccUtils::Cli::Cli(std::string s) set(s); } +auto OccUtils::Cli::getStringOption() const -> std::string +{ + if (!mSet) { + std::stringstream errorMsg; + errorMsg << "Occ was not set."; + NEON_ERROR(errorMsg.str()); + } + return OccUtils::toString(mOption); +} + OccUtils::Cli::Cli(Occ model) { mOption = model; + mSet = true; } -auto OccUtils::Cli::getOption() -> Occ +auto OccUtils::Cli::getOption() const -> Occ { if (!mSet) { std::stringstream errorMsg; @@ -86,7 +97,7 @@ auto OccUtils::Cli::set(const std::string& opt) mSet = true; } -auto OccUtils::Cli::getStringOptions() -> std::string +auto OccUtils::Cli::getStringOptions() const -> std::string { std::stringstream s; auto options = OccUtils::getOptions(); @@ -102,14 +113,22 @@ auto OccUtils::Cli::getStringOptions() -> std::string return msg; } -auto OccUtils::Cli::addToReport(Neon::Report& report) -> void +auto OccUtils::Cli::getDoc() const -> std::string +{ + std::stringstream s; + s << getStringOptions(); + s << " default: " << OccUtils::toString(getOption()); + return s.str(); +} + +auto OccUtils::Cli::addToReport(Neon::Report& report) const -> void { report.addMember("Occ", OccUtils::toString(this->getOption())); } -auto OccUtils::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) -> void +auto OccUtils::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void { report.addMember("Occ", OccUtils::toString(this->getOption()), &subBlock); } -} // namespace Neon::skeleton \ No newline at end of file +} // namespace 
Neon::skeleton diff --git a/libNeonSkeleton/tests/unit/sUt_skeletonOnStreams/src/sUt_skeleton.Stencil.cu b/libNeonSkeleton/tests/unit/sUt_skeletonOnStreams/src/sUt_skeleton.Stencil.cu index 0170936c..2e2a2929 100644 --- a/libNeonSkeleton/tests/unit/sUt_skeletonOnStreams/src/sUt_skeleton.Stencil.cu +++ b/libNeonSkeleton/tests/unit/sUt_skeletonOnStreams/src/sUt_skeleton.Stencil.cu @@ -160,7 +160,7 @@ void SingleStencil(TestData& data, } template -void SingleStencilOCC(TestData& data) +void SingleStencilStandardOCC(TestData& data) { SingleStencil(data, Neon::skeleton::Occ::standard, Neon::set::TransferMode::get); } @@ -208,4 +208,14 @@ TEST(SingleStencil_NoOCC, bGrid) // using Grid = Neon::dGrid; using Type = int32_t; runAllTestConfiguration("bGrid_t", SingleStencilNoOCC, nGpus, 1); +} + +TEST(SingleStencil_StandardOCC, bGrid) +{ + int nGpus = 1; + using Grid = Neon::bGrid; + // using Grid = Neon::domain::eGrid; + // using Grid = Neon::dGrid; + using Type = int32_t; + runAllTestConfiguration("bGrid_t", SingleStencilStandardOCC, nGpus, 1); } \ No newline at end of file diff --git a/libNeonSkeleton/tests/unit/skeleton-stencil/src/runHelper.h b/libNeonSkeleton/tests/unit/skeleton-stencil/src/runHelper.h index 4858b819..8cd53082 100644 --- a/libNeonSkeleton/tests/unit/skeleton-stencil/src/runHelper.h +++ b/libNeonSkeleton/tests/unit/skeleton-stencil/src/runHelper.h @@ -22,10 +22,11 @@ using namespace Neon::domain::tool::testing; using namespace Neon::domain::tool; template -void runAllTestConfiguration(const std::string& gname, - std::function&)> f, - int nGpus, - int minNumGpus) +void runAllTestConfiguration(const std::string& gname, + std::function&, Neon::skeleton::Occ)> f, + Neon::skeleton::Occ occ, + int nGpus, + int minNumGpus) { if (Neon::sys::globalSpace::gpuSysObjStorage.numDevs() > 0) { std::vector nGpuTest; @@ -69,7 +70,7 @@ void runAllTestConfiguration(const std::string& gname, NEON_INFO(testData.toString()); - f(testData); + f(gname, testData, occ); } } } 
diff --git a/libNeonSkeleton/tests/unit/skeleton-stencil/src/stencil.cu b/libNeonSkeleton/tests/unit/skeleton-stencil/src/stencil.cu index 0e88980a..095959f9 100644 --- a/libNeonSkeleton/tests/unit/skeleton-stencil/src/stencil.cu +++ b/libNeonSkeleton/tests/unit/skeleton-stencil/src/stencil.cu @@ -59,7 +59,9 @@ auto laplaceOnIntegers(const Field& filedA, template -void singleStencil(TestData& data) +void singleStencil(std::string testName, + TestData& data, + Neon::skeleton::Occ occ) { using Type = typename TestData::Type; @@ -82,7 +84,9 @@ void singleStencil(TestData& data) ops.push_back(laplaceOnIntegers(Y, X)); Neon::skeleton::Skeleton skl(data.getBackend()); - skl.sequence(ops, "sUt_dGridStencil"); + Neon::skeleton::Options opt(occ, Neon::set::TransferMode::get); + skl.sequence(ops, testName, opt); + skl.ioToDot(testName, testName, true); for (int j = 0; j < nIterations; j++) { skl.run(); @@ -108,20 +112,29 @@ void singleStencil(TestData& data) ASSERT_TRUE(isOk); } -TEST(singleStencil, dGrid) +TEST(skeleton_stencil_occ_none, dGrid) { int nGpus = 1; using Grid = Neon::dGrid; using Type = int32_t; constexpr int C = 0; - runAllTestConfiguration("dGrid", singleStencil, nGpus, 1); + runAllTestConfiguration("skeleton_stencil_occ_none_dGrid", singleStencil, Neon::skeleton::Occ::none, nGpus, 1); } -TEST(singleStencil, bGridSingleGpu) +TEST(skeleton_stencil_occ_standard, dGrid) +{ + int nGpus = 1; + using Grid = Neon::dGrid; + using Type = int32_t; + constexpr int C = 0; + runAllTestConfiguration("skeleton_stencil_occ_standard_dGrid", singleStencil, Neon::skeleton::Occ::standard, nGpus, 1); +} + +TEST(skeleton_stencil, bGridSingleGpu) { int nGpus = 1; using Grid = Neon::bGrid; using Type = int32_t; constexpr int C = 0; - runAllTestConfiguration("bGrid", singleStencil, nGpus, 1); + runAllTestConfiguration("bGrid", singleStencil, Neon::skeleton::Occ::none, nGpus, 1); } \ No newline at end of file