diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 10c30fea..efb267c6 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -1,4 +1,5 @@
 cmake_minimum_required(VERSION 3.19 FATAL_ERROR)
 
-add_subdirectory("lbm-lid-driven-cavity-flow")
+add_subdirectory(lbm)
+# add_subdirectory("lbm-lid-driven-cavity-flow")
 # add_subdirectory("lbm-flow-over-sphere")
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
index 5aebe104..2ce5dcd3 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
+++ b/benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
@@ -4,9 +4,11 @@
 GRID_LIST = "dGrid bGrid eGrid".split()
 STORAGE_FP_LIST = "double float".split()
 COMPUTE_FP_LIST = "double float".split()
-OCC_LIST = "nOCC".split()
+OCC_LIST = "nOCC sOCC".split()
+HU_LIST = "huGrid huLattice".split()
+CURVE_LIST = "sweep morton hilbert".split()
 WARM_UP_ITER = 10
-MAX_ITER = 100
+MAX_ITER = 10000
 REPETITIONS = 5
 
 import subprocess
@@ -38,60 +40,79 @@ def countAll():
                     for COMPUTE_FP in COMPUTE_FP_LIST:
                         for DEVICE_SET in DEVICE_SET_LIST:
                             for GRID in GRID_LIST:
-                                if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
-                                    continue
+                                for HU in HU_LIST:
+                                    for CURVE in CURVE_LIST:
+                                        if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
+                                            continue
+                                        if STORAGE_FP == 'float' and COMPUTE_FP == 'double':
+                                            continue
 
-                                counter += 1
+                                        counter += 1
     return counter
 
 
 SAMPLES = countAll()
 counter = 0
 command = './lbm-lid-driven-cavity-flow'
+# command = 'echo'  # debug aid: swap in to echo each argument list into the log instead of running the benchmark
 with open(command + '.log', 'w') as fp:
     for DEVICE_TYPE in DEVICE_TYPE_LIST:
         DEVICE_SET_LIST = [DEVICE_ID_LIST[0]]
         if DEVICE_TYPE == 'gpu':
             for DEVICE in DEVICE_ID_LIST[1:]:
                 DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE)
-        for OCC in OCC_LIST:
-            for DOMAIN_SIZE in DOMAIN_SIZE_LIST:
-                for STORAGE_FP in STORAGE_FP_LIST:
-                    for COMPUTE_FP in COMPUTE_FP_LIST:
-                        for DEVICE_SET in DEVICE_SET_LIST:
+        for DEVICE_SET in DEVICE_SET_LIST:
+            for OCC in OCC_LIST:
+                for DOMAIN_SIZE in DOMAIN_SIZE_LIST:
+                    for STORAGE_FP in STORAGE_FP_LIST:
+                        for COMPUTE_FP in COMPUTE_FP_LIST:
                             for GRID in GRID_LIST:
-                                if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
-                                    continue
+                                for HU in HU_LIST:
+                                    for CURVE in CURVE_LIST:
+
+                                        if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
+                                            continue
+                                        if STORAGE_FP == 'float' and COMPUTE_FP == 'double':
+                                            continue
+
+                                        parameters = []
+                                        parameters.append('--deviceType ' + DEVICE_TYPE)
+                                        parameters.append('--deviceIds ' + DEVICE_SET)
+                                        parameters.append('--grid ' + GRID)
+                                        parameters.append('--domain-size ' + DOMAIN_SIZE)
+                                        parameters.append('--warmup-iter ' + str(WARM_UP_ITER))
+                                        parameters.append('--repetitions ' + str(REPETITIONS))
+                                        parameters.append('--max-iter ' + str(MAX_ITER))
+                                        parameters.append(
+                                            '--report-filename ' + 'lbm-lid-driven-cavity-flow___' +
+                                            DEVICE_TYPE + '_' +
+                                            DEVICE_SET.replace(' ', '_') + '-' +
+                                            GRID + '_' +
+                                            DOMAIN_SIZE + '-' +
+                                            STORAGE_FP + '-' + COMPUTE_FP + '-' +
+                                            OCC + '-' +
+                                            HU + '-' +
+                                            CURVE)
+                                        parameters.append('--computeFP ' + COMPUTE_FP)
+                                        parameters.append('--storageFP ' + STORAGE_FP)
+                                        parameters.append('--curve ' + CURVE)
 
-                                parameters = []
-                                parameters.append('--deviceType ' + DEVICE_TYPE)
-                                parameters.append('--deviceIds ' + DEVICE_SET)
-                                parameters.append('--grid ' + GRID)
-                                parameters.append('--domain-size ' + DOMAIN_SIZE)
-                                parameters.append('--warmup-iter ' + str(WARM_UP_ITER))
-                                parameters.append('--repetitions ' + str(REPETITIONS))
-                                parameters.append('--max-iter ' + str(MAX_ITER))
-                                parameters.append(
-                                    '--report-filename ' + 'lbm-lid-driven-cavity-flow___' +
-                                    DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' +
-                                    STORAGE_FP + '_' + COMPUTE_FP + '_' +
-                                    DEVICE_SET.replace(' ', '_') + '_' + OCC)
-                                parameters.append('--computeFP ' + COMPUTE_FP)
-                                parameters.append('--storageFP ' + STORAGE_FP)
-                                parameters.append('--benchmark')
-                                parameters.append('--' + OCC)
+                                        parameters.append('--benchmark')
+                                        parameters.append('--' + OCC)
+                                        parameters.append('--' + HU)
 
-                                commandList = []
-                                commandList.append(command)
-                                for el in parameters:
-                                    for s in el.split():
-                                        commandList.append(s)
+                                        commandList = []
+                                        commandList.append(command)
+                                        for el in parameters:
+                                            for s in el.split():
+                                                commandList.append(s)
 
-                                fp.write("\n-------------------------------------------\n")
-                                fp.write(' '.join(commandList))
-                                fp.write("\n-------------------------------------------\n")
-                                fp.flush()
-                                subprocess.run(commandList, text=True, stdout=fp)
+                                        fp.write("\n-------------------------------------------\n")
+                                        fp.write(' '.join(commandList))
+                                        fp.write("\n-------------------------------------------\n")
+                                        fp.flush()
+                                        print(' '.join(commandList))
+                                        subprocess.run(commandList, text=True, stdout=fp)
 
-                                counter += 1
-                                printProgressBar(counter * 100.0 / SAMPLES, 'Progress')
+                                        counter += 1
+                                        printProgressBar(counter * 100.0 / SAMPLES, 'Progress')
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/CellType.h b/benchmarks/lbm-lid-driven-cavity-flow/src/CellType.h
index 7037b6ae..1ca70c6f 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/src/CellType.h
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/CellType.h
@@ -22,13 +22,28 @@ struct CellType
         classification = c;
         wallNghBitflag = n;
     }
+
     NEON_CUDA_HOST_DEVICE explicit CellType(Classification c)
     {
         classification = c;
         wallNghBitflag = 0;
     }
 
+    // Implicit conversion to int yielding the classification value (needed to export the field with exportVti).
+    operator int() const { return int(classification); }
+    // Returns true when bit `fwdRegIdx` of `wallNghBitFlag` is set; qualified like the constructors above.
+    template <int fwdRegIdx>
+    NEON_CUDA_HOST_DEVICE static auto isWall(const uint32_t& wallNghBitFlag)
+        -> bool
+    {
+        return (wallNghBitFlag & (uint32_t(1) << fwdRegIdx)) != 0;
+    }
 
+    NEON_CUDA_HOST_DEVICE auto setWall(int fwdRegIdx)  // Sets bit `fwdRegIdx` of this cell's wallNghBitflag.
+        -> void
+    {
+        wallNghBitflag |= uint32_t(1) << fwdRegIdx;
+    }
 
     uint32_t       wallNghBitflag;
     Classification classification;
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Config.cpp b/benchmarks/lbm-lid-driven-cavity-flow/src/Config.cpp
index 165dcff5..115125bd 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/src/Config.cpp
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Config.cpp
@@ -41,6 +41,7 @@ auto Config::toString() const -> std::string
 
     s << "......... computeType " << c.computeType << std::endl;
     s << "........... storeType " << c.storeType << std::endl;
+    s << "............... curve " << c.curve << std::endl;
 
     s << ". ............... occ " << Neon::skeleton::OccUtils::toString(c.occ) << std::endl;
     s << "....... transfer Mode " << Neon::set::TransferModeUtils::toString(c.transferMode) << std::endl;
@@ -60,43 +61,58 @@ auto Config::parseArgs(const int argc, char* argv[])
     auto& config = *this;
 
     auto cli =
-        (
-            clipp::required("--deviceType") & clipp::value("deviceType", config.deviceType) % "Device ids to use",
-            clipp::required("--deviceIds") & clipp::integers("gpus", config.devices) % "Device ids to use",
-            clipp::option("--grid") & clipp::value("grid", config.gridType) % "Could be dGrid, eGrid, bGrid",
-            clipp::option("--domain-size") & clipp::integer("domain_size", config.N) % "Voxels along each dimension of the cube domain",
-            clipp::option("--warmup-iter") & clipp::integer("warmup_iter", config.benchIniIter) % "Number of iteration for warm up. max_iter = warmup_iter + timed_iters",
-            clipp::option("--max-iter") & clipp::integer("max_iter", config.benchMaxIter) % "Maximum solver iterations",
-            clipp::option("--repetitions") & clipp::integer("repetitions", config.repetitions) % "Number of times the benchmark is run.",
-            clipp::option("--report-filename ") & clipp::value("keeper_filename", config.reportFile) % "Output perf keeper filename",
-
-            clipp::option("--computeFP") & clipp::value("computeFP", config.computeType) % "Could be double or float",
-            clipp::option("--storageFP") & clipp::value("storageFP", config.storeType) % "Could be double or float",
-
-            (
-                (clipp::option("--sOCC").set(config.occ, Neon::skeleton::Occ::standard) % "Standard OCC") |
-                (clipp::option("--nOCC").set(config.occ, Neon::skeleton::Occ::none) % "No OCC (on by default)")),
-            (
-                (clipp::option("--put").set(config.transferMode, Neon::set::TransferMode::put) % "Set transfer mode to PUT") |
-                (clipp::option("--get").set(config.transferMode, Neon::set::TransferMode::get) % "Set transfer mode to GET (on by default)")),
-            (
-                (clipp::option("--huLattice").set(config.stencilSemantic, Neon::set::StencilSemantic::streaming) % "Halo update with lattice semantic (on by default)") |
-                (clipp::option("--huGrid").set(config.stencilSemantic, Neon::set::StencilSemantic::standard) % "Halo update with grid semantic ")),
-            (
-                (clipp::option("--benchmark").set(config.benchmark, true) % "Run benchmark mode") |
-                (clipp::option("--visual").set(config.benchmark, false) % "Run export partial data")),
-
-            (
-                clipp::option("--vti").set(config.vti, true) % "Standard OCC")
+        (clipp::required("--deviceType") & clipp::value("deviceType", config.deviceType) % "Type of device to run on (e.g. gpu)",
+         clipp::required("--deviceIds") & clipp::integers("gpus", config.devices) % "Device ids to use",
+         clipp::option("--grid") & clipp::value("grid", config.gridType) % "Could be dGrid, eGrid, bGrid",
+         clipp::option("--domain-size") & clipp::integer("domain_size", config.N) % "Voxels along each dimension of the cube domain",
+         clipp::option("--warmup-iter") & clipp::integer("warmup_iter", config.benchIniIter) % "Number of iteration for warm up. max_iter = warmup_iter + timed_iters",
+         clipp::option("--max-iter") & clipp::integer("max_iter", config.benchMaxIter) % "Maximum solver iterations",
+         clipp::option("--repetitions") & clipp::integer("repetitions", config.repetitions) % "Number of times the benchmark is run.",
+         clipp::option("--report-filename ") & clipp::value("keeper_filename", config.reportFile) % "Output perf keeper filename",
+
+         clipp::option("--computeFP") & clipp::value("computeFP", config.computeType) % "Could be double or float",
+         clipp::option("--storageFP") & clipp::value("storageFP", config.storeType) % "Could be double or float",
+
+         clipp::option("--curve") & clipp::value("curve", config.curve) % "Could be sweep (the default), morton, or hilbert",
+         (
+             (clipp::option("--sOCC").set(config.occ, Neon::skeleton::Occ::standard) % "Standard OCC") |
+             (clipp::option("--nOCC").set(config.occ, Neon::skeleton::Occ::none) % "No OCC (on by default)")),
+         (
+             (clipp::option("--put").set(config.transferMode, Neon::set::TransferMode::put) % "Set transfer mode to PUT") |
+             (clipp::option("--get").set(config.transferMode, Neon::set::TransferMode::get) % "Set transfer mode to GET (on by default)")),
+         (
+             (clipp::option("--huLattice").set(config.stencilSemantic, Neon::set::StencilSemantic::streaming) % "Halo update with lattice semantic (on by default)") |
+             (clipp::option("--huGrid").set(config.stencilSemantic, Neon::set::StencilSemantic::standard) % "Halo update with grid semantic ")),
+         (
+             (clipp::option("--benchmark").set(config.benchmark, true) % "Run benchmark mode") |
+             (clipp::option("--visual").set(config.benchmark, false) % "Run export partial data")),
+
+         (
+             clipp::option("--vti").set(config.vti, true) % "Export vti files")
 
         );
 
+
     if (!clipp::parse(argc, argv, cli)) {
         auto fmt = clipp::doc_formatting{}.doc_column(31);
         std::cout << make_man_page(cli, argv[0], fmt) << '\n';
         return -1;
     }
 
+    // Map the textual --curve value onto the encoder enum; any other value prints the man page and fails.
+    if (config.curve == "sweep") {
+        config.spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::sweep;
+    } else if (config.curve == "morton") {
+        config.spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::morton;
+    } else if (config.curve == "hilbert") {
+        config.spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::hilbert;
+    } else {
+        std::cout << config.curve << " is not a supported configuration" << std::endl;
+        const auto manFormat = clipp::doc_formatting{}.doc_column(31);
+        std::cout << make_man_page(cli, argv[0], manFormat) << '\n';
+        return -1;
+    }
+
     helpSetLbmParameters();
 
     return 0;
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Config.h b/benchmarks/lbm-lid-driven-cavity-flow/src/Config.h
index af32972e..18695ce4 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/src/Config.h
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Config.h
@@ -3,6 +3,7 @@
 #include <string>
 #include <vector>
 #include "Neon/core/tools/clipp.h"
+#include "Neon/domain/tools/SpaceCurves.h"
 #include "Neon/skeleton/Skeleton.h"
 
 template <typename ComputeType>
@@ -16,28 +17,29 @@ struct LbmParameters
 
 struct Config
 {
-    double                     Re = 100.;            // Reynolds number
-    double                     ulb = 0.04;           // Velocity in lattice units
-    int                        N = 160;              // Number of nodes in x-direction
-    bool                       benchmark = false;    // Run in benchmark mode ?
-    double                     max_t = 10.0;         // Non-benchmark mode: Total time in dim.less units
-    int                        outFrequency = 200;   // Non-benchmark mode: Frequency in LU for output of terminal message and profiles (use 0 for no messages)
-    int                        dataFrequency = 0;    // Non-benchmark mode: Frequency in LU of full data dump (use 0 for no data dump)
-    int                        benchIniIter = 1000;  // Benchmark mode: Number of warmup iterations
-    int                        benchMaxIter = 2000;  // Benchmark mode: Total number of iterations
-    int                        repetitions = 1;      // Benchmark mode: number of time the test is run
-    std::string                deviceType = "gpu";
-    std::vector<int>           devices = std::vector<int>(0);                // Devices for the execution
-    std::string                reportFile = "lbm-lid-driven-cavity-flow";    // Report file name
-    std::string                gridType = "dGrid";                           // Neon grid type
-    Neon::skeleton::Occ        occ = Neon::skeleton::Occ::none;              // Neon OCC type
-    Neon::set::TransferMode    transferMode = Neon::set::TransferMode::get;  // Neon transfer mode for halo update
-    Neon::set::StencilSemantic stencilSemantic = Neon::set::StencilSemantic::streaming;
-    bool                       vti = false;  // Export vti file
-    std::string                computeType = "double";
-    std::string                storeType = "double";
-
-    LbmParameters<double> mLbmParameters;
+    double                                       Re = 100.;            // Reynolds number
+    double                                       ulb = 0.04;           // Velocity in lattice units
+    int                                          N = 160;              // Number of nodes in x-direction
+    bool                                         benchmark = false;    // Run in benchmark mode ?
+    double                                       max_t = 10.0;         // Non-benchmark mode: Total time in dim.less units
+    int                                          outFrequency = 200;   // Non-benchmark mode: Frequency in LU for output of terminal message and profiles (use 0 for no messages)
+    int                                          dataFrequency = 0;    // Non-benchmark mode: Frequency in LU of full data dump (use 0 for no data dump)
+    int                                          benchIniIter = 1000;  // Benchmark mode: Number of warmup iterations
+    int                                          benchMaxIter = 2000;  // Benchmark mode: Total number of iterations
+    int                                          repetitions = 1;      // Benchmark mode: number of time the test is run
+    std::string                                  deviceType = "gpu";
+    std::vector<int>                             devices = std::vector<int>(0);                // Devices for the execution
+    std::string                                  reportFile = "lbm-lid-driven-cavity-flow";    // Report file name
+    std::string                                  gridType = "dGrid";                           // Neon grid type
+    Neon::skeleton::Occ                          occ = Neon::skeleton::Occ::none;              // Neon OCC type
+    Neon::set::TransferMode                      transferMode = Neon::set::TransferMode::get;  // Neon transfer mode for halo update
+    Neon::set::StencilSemantic                   stencilSemantic = Neon::set::StencilSemantic::streaming;
+    bool                                         vti = false;  // Export vti file
+    std::string                                  computeType = "double";
+    std::string                                  storeType = "double";
+    std::string                                  curve = "sweep";
+    Neon::domain::tool::spaceCurves::EncoderType spaceCurve = Neon::domain::tool::spaceCurves::EncoderType::sweep;
+    LbmParameters<double>                        mLbmParameters;
 
     auto toString()
         const -> std::string;
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h
new file mode 100644
index 00000000..ce5f69a2
--- /dev/null
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainerFactory.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include "CellType.h"
+#include "D3Q19.h"
+#include "Neon/Neon.h"
+#include "Neon/set/Containter.h"
+
+// Primary ContainerFactory templates; they are empty here. ContainersD3Q19.h partially
+// specializes them for D3Q19 (ContainersD3Q27.h is presumably the D3Q27 counterpart), so
+// the two includes at the bottom must stay below these declarations.
+
+namespace pull {
+template <typename Precision_, typename Lattice, typename Grid>
+struct ContainerFactory
+{
+};
+}  // namespace pull
+
+namespace push {
+template <typename Precision_, typename Lattice, typename Grid>
+struct ContainerFactory
+{
+};
+}  // namespace push
+
+namespace common {
+template <typename Precision_, typename Lattice, typename Grid>
+struct ContainerFactory
+{
+};
+}  // namespace common
+#include "ContainersD3Q19.h"
+#include "ContainersD3Q27.h"
\ No newline at end of file
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h
new file mode 100644
index 00000000..fcbda83d
--- /dev/null
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q19.h
@@ -0,0 +1,392 @@
+#pragma once
+
+#include "CellType.h"
+#include "D3Q19.h"
+#include "DeviceD3Q19.h"
+#include "Methods.h"
+#include "Neon/Neon.h"
+#include "Neon/set/Containter.h"
+
+namespace pull {
+/**
+ * Specialization for D3Q19
+ */
+template <typename Precision_, typename Grid_>
+struct ContainerFactory<Precision_,
+                        D3Q19<Precision_>,
+                        Grid_>
+{
+    using Lattice = D3Q19<Precision_>;
+    using Precision = Precision_;
+    using Compute = typename Precision::Compute;
+    using Storage = typename Precision::Storage;
+    using Grid = Grid_;
+
+    using PopField = typename Grid::template Field<Storage, Lattice::Q>;
+    using CellTypeField = typename Grid::template Field<CellType, 1>;
+
+    using Idx = typename PopField::Idx;
+    using Rho = typename Grid::template Field<Storage, 1>;
+    using U = typename Grid::template Field<Storage, 3>;
+
+    using PullFunctions = pull::DeviceD3Q19<Precision, Grid>;
+    using CommonFunctions = common::DeviceD3Q19<Precision, Grid>;
+    // Builds the "D3Q19_TwoPop_Pull" container: bulk cells pull-stream from fInField, then BGK-collide into fOutField.
+    static auto
+    iteration(Neon::set::StencilSemantic stencilSemantic,
+              const PopField&            fInField /*!      Input population field */,
+              const CellTypeField&       cellTypeField /*! Cell type field     */,
+              const Compute              omega /*!         LBM omega parameter */,
+              PopField&                  fOutField /*!     Output Population field */)
+        -> Neon::set::Container
+    {
+        Neon::set::Container container = fInField.getGrid().newContainer(
+            "D3Q19_TwoPop_Pull",
+            // omega and stencilSemantic are by-value parameters: copy them in case the loader runs after this function returns.
+            [&, omega, stencilSemantic](Neon::set::Loader& L) -> auto {
+                auto&       fIn = L.load(fInField,
+                                         Neon::Pattern::STENCIL, stencilSemantic);
+                auto&       fOut = L.load(fOutField);
+                const auto& cellInfoPartition = L.load(cellTypeField);
+
+                return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                    CellType cellInfo = cellInfoPartition(gidx, 0);
+                    if (cellInfo.classification == CellType::bulk) {
+                        Storage popIn[Lattice::Q];
+                        PullFunctions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn);
+
+                        Compute                rho;
+                        std::array<Compute, 3> u{.0, .0, .0};
+                        CommonFunctions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u);
+
+                        Compute usqr = 1.5 * (u[0] * u[0] +
+                                              u[1] * u[1] +
+                                              u[2] * u[2]);
+
+                        PullFunctions::collideBgkUnrolled(gidx,
+                                                          popIn,
+                                                          rho, u,
+                                                          usqr, omega,
+                                                          NEON_OUT fOut);
+                    }
+                };
+            });
+        return container;
+    }
+
+};
+}  // namespace pull
+namespace push {
+/**
+ * Specialization for D3Q19
+ */
+template <typename Precision_, typename Grid_>
+struct ContainerFactory<Precision_,
+                        D3Q19<Precision_>,
+                        Grid_>
+{
+    using Lattice = D3Q19<Precision_>;
+    using Precision = Precision_;
+    using Compute = typename Precision::Compute;
+    using Storage = typename Precision::Storage;
+    using Grid = Grid_;
+
+    using PopField = typename Grid::template Field<Storage, Lattice::Q>;
+    using CellTypeField = typename Grid::template Field<CellType, 1>;
+
+    using Idx = typename PopField::Idx;
+    using Rho = typename Grid::template Field<Storage, 1>;
+    using U = typename Grid::template Field<Storage, 3>;
+
+    using PushFunctions = push::DeviceD3Q19<Precision, Grid>;
+    using CommonFunctions = common::DeviceD3Q19<Precision, Grid>;
+    // Builds the "D3Q19_TwoPop" container: bulk cells load local populations, BGK-collide in place, then push-stream into fOutField.
+    static auto
+    iteration(Neon::set::StencilSemantic stencilSemantic,
+              const PopField&            fInField /*!      Input population field */,
+              const CellTypeField&       cellTypeField /*! Cell type field     */,
+              const Compute              omega /*!         LBM omega parameter */,
+              PopField&                  fOutField /*!     Output Population field */)
+        -> Neon::set::Container
+    {
+        Neon::set::Container container = fInField.getGrid().newContainer(
+            "D3Q19_TwoPop",
+            // omega and stencilSemantic are by-value parameters: copy them in case the loader runs after this function returns.
+            [&, omega, stencilSemantic](Neon::set::Loader& L) -> auto {
+                auto&       fIn = L.load(fInField,
+                                         Neon::Pattern::STENCIL, stencilSemantic);
+                auto&       fOut = L.load(fOutField);
+                const auto& cellInfoPartition = L.load(cellTypeField);
+
+                return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                    CellType cellInfo = cellInfoPartition(gidx, 0);
+                    if (cellInfo.classification == CellType::bulk) {
+                        Storage popIn[Lattice::Q];
+                        PushFunctions::localLoad(gidx, fIn, NEON_OUT popIn);
+
+                        Compute                rho;
+                        std::array<Compute, 3> u{.0, .0, .0};
+                        CommonFunctions::macroscopic(popIn,
+                                                     NEON_OUT rho, NEON_OUT u);
+
+                        Compute usqr = 1.5 * (u[0] * u[0] +
+                                              u[1] * u[1] +
+                                              u[2] * u[2]);
+
+                        CommonFunctions::collideBgkUnrolled(gidx,
+                                                            rho, u,
+                                                            usqr, omega,
+                                                            NEON_IO popIn);
+
+                        PushFunctions::pushStream(gidx, cellInfo.wallNghBitflag, popIn, NEON_OUT fOut);
+                    }
+                };
+            });
+        return container;
+    }
+    // Builds a container that recomputes wallNghBitflag for every bulk cell of infoInField and stores it in infoOutpeField.
+    // Bit GOMemoryId is set when the neighbour along the opposite direction (BKMemoryId) is not classified as bulk.
+    static auto
+    computeWallNghMask(const CellTypeField& infoInField,
+                       CellTypeField&       infoOutpeField)
+
+        -> Neon::set::Container
+    {
+        Neon::set::Container container = infoInField.getGrid().newContainer(
+            "LBM_iteration",
+            [&](Neon::set::Loader& L) -> auto {
+                auto& infoIn = L.load(infoInField, Neon::Pattern::STENCIL);
+                auto& infoOut = L.load(infoOutpeField);
+
+                return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                    CellType cellType = infoIn(gidx, 0);
+                    cellType.wallNghBitflag = 0;
+
+                    if (cellType.classification == CellType::bulk) {
+                        Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) {
+                            if constexpr (GOMemoryId != Lattice::Memory::center) {
+                                constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId];
+                                constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x;
+                                constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y;
+                                constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z;
+                                // NOTE(review): CellType::undefined is presumably the value used when the neighbour is missing - confirm.
+                                CellType nghCellType = infoIn.template getNghData<BKx, BKy, BKz>(gidx, 0, CellType::undefined)();
+                                if (nghCellType.classification != CellType::bulk) {
+                                    cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOMemoryId));
+                                }
+                            }
+                        });
+                        // Only bulk cells are written; this kernel does not touch other cells of the output field.
+                        infoOut(gidx, 0) = cellType;
+                    }
+                };
+            });
+        return container;
+    }
+};
+}  // namespace push
+namespace common {
+/**
+ * Specialization for D3Q19
+ */
+template <typename Precision_, typename Grid_>
+struct ContainerFactory<Precision_,
+                        D3Q19<Precision_>,
+                        Grid_>
+{
+    using Lattice = D3Q19<Precision_>;
+    using Precision = Precision_;
+    using Compute = typename Precision::Compute;
+    using Storage = typename Precision::Storage;
+    using Grid = Grid_;
+
+    using PopField = typename Grid::template Field<Storage, Lattice::Q>;
+    using CellTypeField = typename Grid::template Field<CellType, 1>;
+
+    using Idx = typename PopField::Idx;
+    using Rho = typename Grid::template Field<Storage, 1>;
+    using U = typename Grid::template Field<Storage, 3>;
+
+    using PullFunctions = pull::DeviceD3Q19<Precision, Grid>;
+    using PushFunctions = push::DeviceD3Q19<Precision, Grid>;
+    using CommonFunctions = common::DeviceD3Q19<Precision, Grid>;
+    // Compile-time dispatch to push:: or pull:: ContainerFactory::iteration; all five arguments are forwarded unchanged.
+    template <int method>
+    static auto
+    iteration(Neon::set::StencilSemantic stencilSemantic,
+              const PopField&            fInField /*!      Input population field */,
+              const CellTypeField&       cellTypeField /*! Cell type field     */,
+              const Compute              omega /*!         LBM omega parameter */,
+              PopField&                  fOutField /*!     Output Population field */)
+        -> Neon::set::Container
+    {
+        if constexpr (method == int(Method::push)) {
+            using Factory = push::ContainerFactory<Precision_,
+                                                   D3Q19<Precision_>,
+                                                   Grid_>;
+            return Factory::iteration(stencilSemantic,
+                                      fInField,
+                                      cellTypeField,
+                                      omega,
+                                      fOutField);
+        }
+        if constexpr (method == int(Method::pull)) {
+            using Factory = pull::ContainerFactory<Precision_,
+                                                   D3Q19<Precision_>,
+                                                   Grid_>;
+            return Factory::iteration(stencilSemantic,
+                                      fInField,
+                                      cellTypeField,
+                                      omega,
+                                      fOutField);
+        }
+    }
+
+    /** Builds a container that recomputes wallNghBitflag for every bulk cell of infoInField and stores the result in infoOutpeField. */
+    static auto
+    computeWallNghMask(const CellTypeField& infoInField,
+                       CellTypeField&       infoOutpeField)
+
+        -> Neon::set::Container
+    {
+        Neon::set::Container container = infoInField.getGrid().newContainer(
+            "LBM_iteration",
+            [&](Neon::set::Loader& L) -> auto {
+                auto& infoIn = L.load(infoInField, Neon::Pattern::STENCIL);
+                auto& infoOut = L.load(infoOutpeField);
+
+                return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                    CellType cellType = infoIn(gidx, 0);
+                    cellType.wallNghBitflag = 0;
+                    // The mask is rebuilt from scratch, and only for bulk cells.
+                    if (cellType.classification == CellType::bulk) {
+                        Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) {
+                            if constexpr (GOMemoryId != Lattice::Memory::center) {
+                                constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId];
+                                constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x;
+                                constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y;
+                                constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z;
+                                // Inspect the neighbour in the direction opposite to GOMemoryId; CellType::undefined is presumably the fallback for a missing neighbour - confirm.
+                                CellType nghCellType = infoIn.template getNghData<BKx, BKy, BKz>(gidx, 0, CellType::undefined)();
+                                if (nghCellType.classification != CellType::bulk) {
+                                    cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOMemoryId));  // set bit GOMemoryId
+                                }
+                            }
+                        });
+                        // Written for bulk cells only: entries of infoOut belonging to other cells are not touched here.
+                        infoOut(gidx, 0) = cellType;
+                    }
+                };
+            });
+        return container;
+    }
+
+    /** Builds a container that derives rho and the three u components of every cell from the populations in fInField. */
+    static auto
+    computeRhoAndU([[maybe_unused]] const PopField& fInField /*!   input population field */,
+                   const CellTypeField&             cellTypeField /*!       Cell type field     */,
+                   Rho&                             rhoField /*!  output rho field */,
+                   U&                               uField /*!  output u field (3 components) */)
+
+        -> Neon::set::Container
+    {
+        Neon::set::Container container =
+            fInField.getGrid().newContainer(
+                "LBM_iteration",
+                [&](Neon::set::Loader& L) -> auto {
+                    auto& fIn = L.load(fInField,
+                                       Neon::Pattern::STENCIL);
+                    auto& rhoXpu = L.load(rhoField);
+                    auto& uXpu = L.load(uField);
+
+                    const auto& cellInfoPartition = L.load(cellTypeField);
+
+                    return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                        CellType               cellInfo = cellInfoPartition(gidx, 0);
+                        Compute                rho = 0;
+                        std::array<Compute, 3> u{.0, .0, .0};
+                        // popIn presumably receives the Q populations of this cell - confirm against localLoad.
+                        Storage                popIn[Lattice::Q];
+                        CommonFunctions::localLoad(gidx, fIn, NEON_OUT popIn);
+                        // Bulk cells use the macroscopic moments; moving-wall cells get rho = 1 and a u rebuilt from popIn[0..2].
+                        if (cellInfo.classification == CellType::bulk) {
+                            CommonFunctions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u);
+                        } else {
+                            if (cellInfo.classification == CellType::movingWall) {
+                                rho = 1.0;
+                                u = std::array<Compute, 3>{static_cast<Compute>(popIn[0]) / static_cast<Compute>(6. * 1. / 18.),  // inverts problemSetup's -6 * t * ulb * (c . ulid)
+                                                           static_cast<Compute>(popIn[1]) / static_cast<Compute>(6. * 1. / 18.),  // for directions 0..2, which are -x, -y, -z
+                                                           static_cast<Compute>(popIn[2]) / static_cast<Compute>(6. * 1. / 18.)};  // with weight 1/18 each
+                            }
+                        }
+                        // Cells that are neither bulk nor movingWall keep the initial rho = 0 and u = 0.
+                        rhoXpu(gidx, 0) = static_cast<Storage>(rho);
+                        uXpu(gidx, 0) = static_cast<Storage>(u[0]);
+                        uXpu(gidx, 1) = static_cast<Storage>(u[1]);
+                        uXpu(gidx, 2) = static_cast<Storage>(u[2]);
+                    };
+                });
+        return container;
+    }
+    /** Builds the set-up container: classifies every cell and writes the initial populations into both population fields. */
+    static auto
+    problemSetup(PopField&       fInField /*!   population field, initialised here */,
+                 PopField&       fOutField,
+                 CellTypeField&  cellTypeField,
+                 Neon::double_3d ulid,
+                 double          ulb)
+
+        -> Neon::set::Container
+    {
+        Neon::set::Container container = fInField.getGrid().newContainer(
+            "LBM_iteration",
+            [&, ulid, ulb](Neon::set::Loader& L) -> auto {
+                auto& fIn = L.load(fInField, Neon::Pattern::MAP);
+                auto& fOut = L.load(fOutField, Neon::Pattern::MAP);
+                auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP);
+
+                return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                    const auto globalIdx = fIn.getGlobalIndex(gidx);
+                    const auto domainDim = fIn.getDomainSize();
+
+                    CellType flagVal;
+                    flagVal.classification = CellType::bulk;
+                    flagVal.wallNghBitflag = 0;
+
+                    typename Lattice::Precision::Storage popVal = 0;
+                    // A cell on any of the six faces of the domain box is a boundary cell.
+                    if (globalIdx.x == 0 || globalIdx.x == domainDim.x - 1 ||
+                        globalIdx.y == 0 || globalIdx.y == domainDim.y - 1 ||
+                        globalIdx.z == 0 || globalIdx.z == domainDim.z - 1) {
+                        flagVal.classification = CellType::bounceBack;
+                        // The face at y == domainDim.y - 1 overrides bounceBack with movingWall.
+                        if (globalIdx.y == domainDim.y - 1) {
+                            flagVal.classification = CellType::movingWall;
+                        }
+                        // Moving-wall populations are -6 * t_q * ulb * (c_q . ulid); every other boundary population is 0.
+                        Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) {
+                            if (globalIdx.y == domainDim.y - 1) {
+                                popVal = -6. * Lattice::Memory::template getT<q>() * ulb *
+                                         (Lattice::Memory::template getDirection<q>().v[0] * ulid.v[0] +
+                                          Lattice::Memory::template getDirection<q>().v[1] * ulid.v[1] +
+                                          Lattice::Memory::template getDirection<q>().v[2] * ulid.v[2]);
+                            } else {
+                                popVal = 0;
+                            }
+                            fIn(gidx, q) = popVal;
+                            fOut(gidx, q) = popVal;
+                        });
+                    } else {
+                        flagVal.classification = CellType::bulk;  // already bulk from the initialisation above
+                        Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) {
+                            fIn(gidx, q) = Lattice::Memory::template getT<q>();  // interior cells start from the lattice weights
+                            fOut(gidx, q) = Lattice::Memory::template getT<q>();
+                        });
+                    }
+                    cellInfoPartition(gidx, 0) = flagVal;  // stored for every cell, boundary or interior
+                };
+            });
+        return container;
+    }
+};
+}  // namespace common
\ No newline at end of file
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h
new file mode 100644
index 00000000..d5d024ea
--- /dev/null
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/ContainersD3Q27.h
@@ -0,0 +1,227 @@
+#pragma once
+
+#include "CellType.h"
+#include "D3Q27.h"
+#include "DeviceD3Q27.h"
+#include "Neon/Neon.h"
+#include "Neon/set/Containter.h"
+#if 0
+/**
+ * Specialization for D3Q27
+ */
+template <typename Precision_, typename Grid_>
+struct ContainerFactory<Precision_,
+                        D3Q27<Precision_>,
+                        Grid_>
+{
+    using Lattice = D3Q27<Precision_>;
+    using Precision = Precision_;
+    using Compute = typename Precision::Compute;
+    using Storage = typename Precision::Storage;
+    using Grid = Grid_;
+
+    using PopField = typename Grid::template Field<Storage, Lattice::Q>;
+    using CellTypeField = typename Grid::template Field<CellType, 1>;
+
+    using Idx = typename PopField::Idx;
+    using Rho = typename Grid::template Field<Storage, 1>;
+    using U = typename Grid::template Field<Storage, 3>;
+
+    using Functions = DeviceD3Q19<Precision, Grid>;
+    /** Builds the container for one iteration on bulk cells: pullStream, macroscopic, then collideBgkUnrolled (this header is compiled out by `#if 0`). */
+    static auto
+    iteration(Neon::set::StencilSemantic stencilSemantic,
+              const PopField&            fInField /*!      Input population field */,
+              const CellTypeField&       cellTypeField /*! Cell type field     */,
+              const Compute              omega /*!         LBM omega parameter */,
+              PopField&                  fOutField /*!     Output Population field */)
+        -> Neon::set::Container
+    {
+        Neon::set::Container container = fInField.getGrid().newContainer(
+            "D3Q19_TwoPop",  // NOTE(review): label says D3Q19 inside the D3Q27 factory - looks like a leftover, confirm
+            [&, omega](Neon::set::Loader& L) -> auto {
+                auto&       fIn = L.load(fInField,
+                                         Neon::Pattern::STENCIL, stencilSemantic);
+                auto&       fOut = L.load(fOutField);
+                const auto& cellInfoPartition = L.load(cellTypeField);
+
+                return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                    CellType cellInfo = cellInfoPartition(gidx, 0);
+                    if (cellInfo.classification == CellType::bulk) {
+                        // NOTE(review): Functions is aliased to DeviceD3Q19 in this D3Q27 specialization - confirm that is intended.
+                        Storage popIn[Lattice::Q];
+                        Functions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn);
+
+                        Compute                rho;
+                        std::array<Compute, 3> u{.0, .0, .0};
+                        Functions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u);
+                        // usqr = 1.5 * |u|^2, handed to the collision step.
+                        Compute usqr = 1.5 * (u[0] * u[0] +
+                                              u[1] * u[1] +
+                                              u[2] * u[2]);
+
+                        Functions::collideBgkUnrolled(gidx,
+                                                      popIn,
+                                                      rho, u,
+                                                      usqr, omega,
+                                                      NEON_OUT fOut);
+                    }
+                };
+            });
+        return container;
+    }
+
+    /** Builds a container that recomputes wallNghBitflag for every bulk cell of infoInField and stores the result in infoOutpeField. */
+    static auto
+    computeWallNghMask(const CellTypeField& infoInField,
+                       CellTypeField&       infoOutpeField)
+
+        -> Neon::set::Container
+    {
+        Neon::set::Container container = infoInField.getGrid().newContainer(
+            "LBM_iteration",
+            [&](Neon::set::Loader& L) -> auto {
+                auto& infoIn = L.load(infoInField, Neon::Pattern::STENCIL);
+                auto& infoOut = L.load(infoOutpeField);
+
+                return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                    CellType cellType = infoIn(gidx, 0);
+                    cellType.wallNghBitflag = 0;
+                    // The mask is rebuilt from scratch, and only for bulk cells.
+                    if (cellType.classification == CellType::bulk) {
+                        Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) {
+                            if constexpr (GOMemoryId != Lattice::Memory::center) {
+                                constexpr int BKMemoryId = Lattice::Memory::opposite[GOMemoryId];
+                                constexpr int BKx = Lattice::Memory::stencil[BKMemoryId].x;
+                                constexpr int BKy = Lattice::Memory::stencil[BKMemoryId].y;
+                                constexpr int BKz = Lattice::Memory::stencil[BKMemoryId].z;
+                                // Inspect the neighbour in the direction opposite to GOMemoryId; CellType::undefined is presumably the fallback for a missing neighbour - confirm.
+                                CellType nghCellType = infoIn.template getNghData<BKx, BKy, BKz>(gidx, 0, CellType::undefined)();
+                                if (nghCellType.classification != CellType::bulk) {
+                                    cellType.wallNghBitflag = cellType.wallNghBitflag | ((uint32_t(1) << GOMemoryId));  // set bit GOMemoryId
+                                }
+                            }
+                        });
+                        // Written for bulk cells only: entries of infoOut belonging to other cells are not touched here.
+                        infoOut(gidx, 0) = cellType;
+                    }
+                };
+            });
+        return container;
+    }
+
+    /** Builds a container that derives rho and the three u components of every cell from the populations in fInField. */
+    static auto
+    computeRhoAndU([[maybe_unused]] const PopField& fInField /*!   input population field */,
+                   const CellTypeField&             cellTypeField /*!       Cell type field     */,
+                   Rho&                             rhoField /*!  output rho field */,
+                   U&                               uField /*!  output u field (3 components) */)
+
+        -> Neon::set::Container
+    {
+        Neon::set::Container container = fInField.getGrid().newContainer(
+            "LBM_iteration",
+            [&](Neon::set::Loader& L) -> auto {
+                auto& fIn = L.load(fInField,
+                                   Neon::Pattern::STENCIL);
+                auto& rhoXpu = L.load(rhoField);
+                auto& uXpu = L.load(uField);
+
+                const auto& cellInfoPartition = L.load(cellTypeField);
+
+                return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                    CellType               cellInfo = cellInfoPartition(gidx, 0);
+                    Compute                rho = 0;
+                    std::array<Compute, 3> u{.0, .0, .0};
+                    Storage                popIn[Lattice::Q];
+                    // Bulk cells: stream into popIn, then take the macroscopic moments.
+                    if (cellInfo.classification == CellType::bulk) {
+
+                        Functions::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popIn);
+                        Functions::macroscopic(popIn, NEON_OUT rho, NEON_OUT u);
+
+                    } else {
+                        if (cellInfo.classification == CellType::movingWall) {
+                            Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GORegisterId) {  // copy this cell's populations into register order
+                                if constexpr (GORegisterId == Lattice::Registers::center) {
+                                    popIn[Lattice::Registers::center] = fIn(gidx, Lattice::Memory::center);
+                                } else {
+                                    popIn[GORegisterId] = fIn(gidx, Lattice::Memory::template mapFromRegisters<GORegisterId>());
+                                }
+                            });
+                            // NOTE(review): 6 * 1/18 below matches the D3Q19 weight of directions 0..2; confirm it is right for D3Q27.
+                            rho = 1.0;
+                            u = std::array<Compute, 3>{static_cast<Compute>(popIn[0]) / static_cast<Compute>(6. * 1. / 18.),
+                                                       static_cast<Compute>(popIn[1]) / static_cast<Compute>(6. * 1. / 18.),
+                                                       static_cast<Compute>(popIn[2]) / static_cast<Compute>(6. * 1. / 18.)};
+                        }
+                    }
+                    // Cells that are neither bulk nor movingWall keep the initial rho = 0 and u = 0.
+                    rhoXpu(gidx, 0) = static_cast<Storage>(rho);
+                    uXpu(gidx, 0) = static_cast<Storage>(u[0]);
+                    uXpu(gidx, 1) = static_cast<Storage>(u[1]);
+                    uXpu(gidx, 2) = static_cast<Storage>(u[2]);
+                };
+            });
+        return container;
+    }
+    /** Builds the set-up container: classifies every cell (bulk, bounceBack, or movingWall at y == domainDim.y - 1) and initialises both population fields. */
+    static auto
+    problemSetup(PopField&       fInField /*!   population field, initialised here */,
+                 PopField&       fOutField,
+                 CellTypeField&  cellTypeField,
+                 Neon::double_3d ulid,
+                 double          ulb)
+
+        -> Neon::set::Container
+    {
+        Neon::set::Container container = fInField.getGrid().newContainer(
+            "LBM_iteration",
+            [&, ulid, ulb](Neon::set::Loader& L) -> auto {
+                auto& fIn = L.load(fInField, Neon::Pattern::MAP);
+                auto& fOut = L.load(fOutField, Neon::Pattern::MAP);
+                auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP);
+
+                return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                    const auto globalIdx = fIn.getGlobalIndex(gidx);
+                    const auto domainDim = fIn.getDomainSize();
+                    CellType   flagVal;
+                    flagVal.classification = CellType::bulk;
+                    flagVal.wallNghBitflag = 0;
+                    typename Lattice::Precision::Storage val = 0;
+
+                    if (globalIdx.x == 0 || globalIdx.x == domainDim.x - 1 ||
+                        globalIdx.y == 0 || globalIdx.y == domainDim.y - 1 ||
+                        globalIdx.z == 0 || globalIdx.z == domainDim.z - 1) {
+                        flagVal.classification = CellType::bounceBack;
+
+                        if (globalIdx.y == domainDim.y - 1) {
+                            flagVal.classification = CellType::movingWall;
+                        }
+
+                        Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) {
+                            if (globalIdx.y == domainDim.y - 1) {
+                                val = -6. * Lattice::Memory::template getT<q>() * ulb *
+                                      (Lattice::Memory::template getDirection<q>().v[0] * ulid.v[0] +
+                                       Lattice::Memory::template getDirection<q>().v[1] * ulid.v[1] +
+                                       Lattice::Memory::template getDirection<q>().v[2] * ulid.v[2]);
+                            } else {
+                                val = 0;
+                            }
+                            fIn(gidx, q) = val;
+                            fOut(gidx, q) = val;
+                        });
+                    } else {
+                        Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) {
+                            fIn(gidx, q) = Lattice::Memory::template getT<q>();
+                            fOut(gidx, q) = Lattice::Memory::template getT<q>();
+                        });
+                    }
+                    // Store the classification for every cell; keeping this inside the else branch would drop the boundary flags.
+                    cellInfoPartition(gidx, 0) = flagVal;
+                };
+            });
+        return container;
+    }
+};
+#endif
\ No newline at end of file
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h
index 15e8e0b1..4f9d4c8b 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q19.h
@@ -3,113 +3,72 @@
 #include "Neon/Neon.h"
 #include "Neon/set/Backend.h"
 #include "Neon/set/memory/memSet.h"
+#include "Precision.h"
 
-template <typename StorageFP, typename ComputeFP>
-struct D3Q19Template
+
+/** In each lattice we define two indexing schemes:
+ * - the first one works at the register level. For this one we rely on the same sequence of directions defined by the STLBM code.
+ * - the second one works at the memory (RAM) level and defines how lattice directions are stored in Neon fields.
+ *
+ * We keep these two aspects separate so that we can experiment with different orderings of the directions in memory, as the order impacts the number of halo update transitions.
+ *
+ */
+template <typename Precision_>
+struct D3Q19
 {
    public:
+    D3Q19() = delete;
+
     static constexpr int Q = 19; /** number of directions */
     static constexpr int D = 3;  /** Space dimension */
+    using Precision = Precision_;
+    using Self = D3Q19<Precision>;
 
-    static constexpr int centerDirection = 9; /** Position of direction {0,0,0} */
-    static constexpr int goRangeBegin = 0;    /** Symmetry is represented as "go" direction and the "back" their opposite */
-    static constexpr int goRangeEnd = 8;
-    static constexpr int goBackOffset = 10; /** Offset to compute apply symmetry */
-
+    static constexpr int RegisterMapping = 1;
+    static constexpr int MemoryMapping = 2;
 
-    explicit D3Q19Template(const Neon::Backend& backend)
+    struct Registers
     {
-        // The discrete velocities of the Lattice mesh.
-        c_vect = std::vector<Neon::index_3d>(
-            {
-                {-1, 0, 0} /*!  0  Symmetry first section (GO) */,
-                {0, -1, 0} /*!  1  */,
-                {0, 0, -1} /*!  2  */,
-                {-1, -1, 0} /*! 3  */,
-                {-1, 1, 0} /*!  4  */,
-                {-1, 0, -1} /*! 5  */,
-                {-1, 0, 1} /*!  6  */,
-                {0, -1, -1} /*! 7  */,
-                {0, -1, 1} /*!  8  */,
-                {0, 0, 0} /*!   9  The center */,
-                {1, 0, 0} /*!   10 Symmetry mirror section (BK) */,
-                {0, 1, 0} /*!   11 */,
-                {0, 0, 1} /*!   12 */,
-                {1, 1, 0} /*!   13 */,
-                {1, -1, 0} /*!  14 */,
-                {1, 0, 1} /*!   15 */,
-                {1, 0, -1} /*!  16 */,
-                {0, 1, 1} /*!   17 */,
-                {0, 1, -1} /*!  18 */,
-            });
-
-        auto c_neon = backend.devSet().newMemSet<Neon::index_3d>(
-            Neon::DataUse::HOST_DEVICE,
-            1,
-            Neon::MemoryOptions(),
-            backend.devSet().newDataSet<size_t>([&](Neon::SetIdx const&, auto& val) {
-                val = c_vect.size();
-            }));
-
-        for (Neon::SetIdx i = 0; i < backend.devSet().setCardinality(); i++) {
-            for (int j = 0; j < int(c_vect.size()); j++) {
-                c_neon.eRef(i, j).x = static_cast<int8_t>(c_vect[j].x);
-                c_neon.eRef(i, j).y = static_cast<int8_t>(c_vect[j].y);
-                c_neon.eRef(i, j).z = static_cast<int8_t>(c_vect[j].z);
-            }
-        }
-        // The opposite of a given direction.
-        std::vector<int> opp_vect = {
-            10 /*!  0   */,
-            11 /*! 1  */,
-            12 /*! 2  */,
-            13 /*! 3  */,
-            14 /*! 4  */,
-            15 /*! 5  */,
-            16 /*! 6  */,
-            17 /*! 7  */,
-            18 /*! 8  */,
-            9 /*!  9 */,
-            0 /*!  10  */,
-            1 /*!  11 */,
-            2 /*!  12 */,
-            3 /*!  13 */,
-            4 /*!  14 */,
-            5 /*!  15 */,
-            6 /*!  16 */,
-            7 /*!  17 */,
-            8 /*!  18 */,
-        };
+        using Self = D3Q19<Precision>::Registers;
+        static constexpr std::array<const Neon::index_3d, Q> stencil{
+            Neon::index_3d(-1, 0, 0),
+            Neon::index_3d(0, -1, 0),
+            Neon::index_3d(0, 0, -1),
+            Neon::index_3d(-1, -1, 0),
+            Neon::index_3d(-1, 1, 0),
+            Neon::index_3d(-1, 0, -1),
+            Neon::index_3d(-1, 0, 1),
+            Neon::index_3d(0, -1, -1),
+            Neon::index_3d(0, -1, 1),
+            Neon::index_3d(0, 0, 0),
+            Neon::index_3d(1, 0, 0),
+            Neon::index_3d(0, 1, 0),
+            Neon::index_3d(0, 0, 1),
+            Neon::index_3d(1, 1, 0),
+            Neon::index_3d(1, -1, 0),
+            Neon::index_3d(1, 0, 1),
+            Neon::index_3d(1, 0, -1),
+            Neon::index_3d(0, 1, 1),
+            Neon::index_3d(0, 1, -1)};
+
+        static constexpr int center = 9; /** Position of direction {0,0,0} */
 
-        {  // Check correctness of opposite
-            for (int i = 0; i < static_cast<int>(c_vect.size()); i++) {
-                auto point = c_vect[i];
-                auto opposite = point * -1;
-                if (opposite != c_vect[opp_vect[i]]) {
-                    Neon::NeonException exp("");
-                    exp << "Incompatible opposite";
-                    NEON_THROW(exp);
+        template <int go>  // Linear search of the register stencil for the direction that is the negation of stencil[go].
+        static constexpr auto getOpposite()
+            -> int  // NOTE(review): no return statement is reached when the loop finds no match.
+        {
+            auto opposite3d = stencil[go] * -1;
+            for (int i = 0; i < Q; ++i) {
+                if (stencil[i] == opposite3d) {
+                    return i;
                 }
             }
         }
 
-        this->opp = backend.devSet().newMemSet<int>(
-            Neon::DataUse::HOST_DEVICE,
-            1,
-            Neon::MemoryOptions(),
-            backend.devSet().newDataSet<size_t>([&](Neon::SetIdx const&, auto& val) {
-                val = opp_vect.size();
-            }));
-
-
-        for (Neon::SetIdx i = 0; i < backend.devSet().setCardinality(); i++) {
-            for (size_t j = 0; j < opp_vect.size(); j++) {
-                this->opp.eRef(i, j, 0) = opp_vect[j];
-            }
-        }
+        static constexpr std::array<const int, Q> opposite{
+            10, 11, 12, 13, 14, 15, 16, 17, 18, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8};
 
-        // The lattice weights.
-        t_vect = {
+        static constexpr std::array<const typename Precision::Storage, Q> t{
             1. / 18. /*!  0   */,
             1. / 18. /*!  1   */,
             1. / 18. /*!  2   */,
@@ -128,48 +87,195 @@ struct D3Q19Template
             1. / 36. /*!  15  */,
             1. / 36. /*!  16  */,
             1. / 36. /*!  17  */,
-            1. / 36. /*!  18  */,
+            1. / 36. /*!  18  */
         };
 
-        this->t = backend.devSet().newMemSet<StorageFP>(
-            Neon::DataUse::HOST_DEVICE,
-            1,
-            Neon::MemoryOptions(),
-            backend.devSet().newDataSet<size_t>([&](Neon::SetIdx const&, auto&val) {
-                val= opp_vect.size();
-            }));
+        static constexpr int fwdRegIdxListLen = (Q-1)/2;
+        static constexpr std::array<const int, fwdRegIdxListLen> fwdRegIdxList{0, 1, 2, 3, 4, 5, 6, 7, 8};
 
+        /** Returns the combination of u components shared by register direction tegIdx and its opposite (tegIdx + 10, per Registers::opposite). */
+        template <int tegIdx, typename Compute>
+        static inline NEON_CUDA_HOST_DEVICE auto
+        getCk_u(std::array<Compute, 3> const& u) -> Compute
+        {
+            if constexpr (tegIdx == 0 || tegIdx == 10) {
+                return u[0];
+            }
+            if constexpr (tegIdx == 1 || tegIdx == 11) {
+                return u[1];
+            }
+            if constexpr (tegIdx == 2 || tegIdx == 12) {
+                return u[2];
+            }
+            if constexpr (tegIdx == 3 || tegIdx == 13) {
+                return u[0] + u[1];
+            }
+            if constexpr (tegIdx == 4 || tegIdx == 14) {
+                return u[0] - u[1];
+            }
+            if constexpr (tegIdx == 5 || tegIdx == 15) {
+                return u[0] + u[2];
+            }
+            if constexpr (tegIdx == 6 || tegIdx == 16) {
-        for (Neon::SetIdx i = 0; i < backend.devSet().setCardinality(); i++) {
-            for (size_t j = 0; j < t_vect.size(); j++) {
-                this->t.eRef(i, j, 0) = t_vect[j];
+                return u[0] - u[2];
+            }
+            if constexpr (tegIdx == 7 || tegIdx == 17) {
+                return u[1] + u[2];
+            }
+            // No branch matches the centre (index 9), so nothing is returned for it.
+            if constexpr (tegIdx == 8 || tegIdx == 18) {
+                return u[1] - u[2];
             }
         }
+    };
+
+    struct Memory
+    {
+        using Self = D3Q19<Precision>::Memory;
+
+        static constexpr std::array<const Neon::index_3d, Q> stencil{
+            Neon::index_3d(-1, 0, 0),
+            Neon::index_3d(0, -1, 0),
+            Neon::index_3d(0, 0, -1),
+            Neon::index_3d(-1, -1, 0),
+            Neon::index_3d(-1, 1, 0),
+            Neon::index_3d(-1, 0, -1),
+            Neon::index_3d(-1, 0, 1),
+            Neon::index_3d(0, -1, -1),
+            Neon::index_3d(0, -1, 1),
+            Neon::index_3d(0, 0, 0),
+            Neon::index_3d(1, 0, 0),
+            Neon::index_3d(0, 1, 0),
+            Neon::index_3d(0, 0, 1),
+            Neon::index_3d(1, 1, 0),
+            Neon::index_3d(1, -1, 0),
+            Neon::index_3d(1, 0, 1),
+            Neon::index_3d(1, 0, -1),
+            Neon::index_3d(0, 1, 1),
+            Neon::index_3d(0, 1, -1)};
+
+
+        static constexpr int center = 9; /** Position of direction {0,0,0} */
+
+        static constexpr std::array<const int, Q> toRegisters{
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
+
+        static constexpr std::array<const int, Q> toMemory{
+            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
 
-        if (backend.runtime() == Neon::Runtime::stream) {
-            this->c.template update<Neon::run_et::et::sync>(backend.streamSet(0), Neon::DeviceType::CUDA);
-            this->opp.template update<Neon::run_et::et::sync>(backend.streamSet(0), Neon::DeviceType::CUDA);
-            this->t.template update<Neon::run_et::et::sync>(backend.streamSet(0), Neon::DeviceType::CUDA);
+
+        template <int go>
+        NEON_CUDA_HOST_DEVICE static constexpr auto mapToRegisters()
+            -> int
+        {
+            return toRegisters[go];
         }
-    }
 
+        template <int go>
+        NEON_CUDA_HOST_DEVICE static constexpr auto mapFromRegisters()
+            -> int
+        {
+            return toMemory[go];
+        }
 
-    template <int go>
-    static constexpr auto getOpposite()
-        -> int
-    {
-        if constexpr (go == centerDirection)
-            return centerDirection;
-        if constexpr (go <= goRangeEnd)
-            return go + goBackOffset;
-        if constexpr (go <= goRangeEnd + goBackOffset)
-            return go - goBackOffset;
-    }
+        template <int go>
+        NEON_CUDA_HOST_DEVICE static constexpr auto getOpposite()
+            -> int
+        {
+            return opposite[go];
+        }
+
+        static constexpr std::array<const int, Q> opposite{
+            10, 11, 12, 13, 14, 15, 16, 17, 18, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8};
+
+        template <int go>
+        static constexpr auto helpGetValueforT()
+            -> typename Precision::Storage
+        {
+            auto goInRegisterSpace = Self::template mapToRegisters<go>();
+            return Registers::t[goInRegisterSpace];
+        }
+
+        template <int fwMemIdx_>
+        struct MemMapper
+        {
+            constexpr static int fwMemIdx = fwMemIdx_;
+            constexpr static int fwX = Memory::stencil[fwMemIdx].x;
+            constexpr static int fwY = Memory::stencil[fwMemIdx].y;
+            constexpr static int fwZ = Memory::stencil[fwMemIdx].z;
 
+            constexpr static int bkMemIdx = Memory::opposite[fwMemIdx];
+            constexpr static int bkX = Memory::stencil[bkMemIdx].x;
+            constexpr static int bkY = Memory::stencil[bkMemIdx].y;
+            constexpr static int bkZ = Memory::stencil[bkMemIdx].z;
 
-    Neon::set::MemSet<Neon::int8_3d> c;
-    Neon::set::MemSet<int>           opp;
-    Neon::set::MemSet<StorageFP>     t;
-    std::vector<double>              t_vect;
-    std::vector<Neon::index_3d>      c_vect;
+            constexpr static int fwRegIdx = Memory::template mapToRegisters<fwMemIdx>();
+            constexpr static int centerRegIdx = Registers::center;
+            constexpr static int centerMemIdx = Memory::center;
+        };
+
+        template <int fwRegIdx_>  // Compile-time index bundle for a register-space direction and its opposite.
+        struct RegMapper
+        {
+            constexpr static int fwRegIdx = fwRegIdx_;
+            constexpr static int bkRegIdx = Registers::opposite[fwRegIdx];
+            constexpr static int fwMemIdx = Memory::template mapFromRegisters<fwRegIdx>();  // Registers declares no mapToMemory;
+            constexpr static int bkMemIdx = Memory::template mapFromRegisters<bkRegIdx>();  // the register -> memory table lives in Memory
+            constexpr static int centerRegIdx = Registers::center;
+            constexpr static int centerMemIdx = Memory::center;
+        };
+
+        static constexpr std::array<const typename Precision::Storage, Q> t{
+            1. / 18. /*!  0   */,
+            1. / 18. /*!  1   */,
+            1. / 18. /*!  2   */,
+            1. / 36. /*!  3   */,
+            1. / 36. /*!  4   */,
+            1. / 36. /*!  5   */,
+            1. / 36. /*!  6   */,
+            1. / 36. /*!  7   */,
+            1. / 36. /*!  8   */,
+            1. / 3. /*!   9  */,
+            1. / 18. /*!  10   */,
+            1. / 18. /*!  11  */,
+            1. / 18. /*!  12  */,
+            1. / 36. /*!  13  */,
+            1. / 36. /*!  14  */,
+            1. / 36. /*!  15  */,
+            1. / 36. /*!  16  */,
+            1. / 36. /*!  17  */,
+            1. / 36. /*!  18  */};
+
+        template <int direction>
+        NEON_CUDA_HOST_DEVICE static constexpr auto getT()
+            -> typename Precision::Storage
+        {
+            return t[direction];
+        }
+        template <int direction>
+        NEON_CUDA_HOST_DEVICE static constexpr auto getDirection()
+            -> typename Neon::index_3d
+        {
+            return stencil[direction];
+        }
+    };
+
+
+   public:
+    /** Returns the lattice directions of the requested mapping as a std::vector; any other mappingType yields an empty vector. */
+    template <int mappingType>
+    static auto getDirectionAsVector()
+        -> std::vector<Neon::index_3d>
+    {
+        std::vector<Neon::index_3d> directions;
+        if constexpr (mappingType == RegisterMapping) {
+            // Register-level ordering of the stencil.
+            directions.insert(directions.end(), Registers::stencil.begin(), Registers::stencil.end());
+        }
+        if constexpr (mappingType == MemoryMapping) {
+            // Memory-level ordering of the stencil.
+            directions.insert(directions.end(), Memory::stencil.begin(), Memory::stencil.end());
+        }
+        return directions;
+    }
 };
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q27.h b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q27.h
new file mode 100644
index 00000000..9f2c7f95
--- /dev/null
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/D3Q27.h
@@ -0,0 +1,200 @@
+#pragma once
+
+#include "Neon/Neon.h"
+#include "Neon/set/Backend.h"
+#include "Neon/set/memory/memSet.h"
+#include "Precision.h"
+
+
+/** In each lattice we define two indexing schemes:
+ * - the first one works at the register level. For this one we rely on the same sequence of directions defined by the STLBM code.
+ * - the second one works at the memory (RAM) level and defines how lattice directions are stored in Neon fields.
+ *
+ * We keep these two aspects separate to experiment with different orderings of directions in memory, as the order impacts the number of halo update transitions.
+ *
+ */
+template <typename Precision_>
+struct D3Q27
+{
+   public:
+    D3Q27() = delete;
+
+    static constexpr int Q = 27; /** number of directions */
+    static constexpr int D = 3;  /** Space dimension */
+    using Precision = Precision_;
+    using Self = D3Q27<Precision>;
+
+    static constexpr int RegisterMapping = 1;
+    static constexpr int MemoryMapping = 2;
+
+    struct Registers
+    {
+        using Self = D3Q27<Precision>::Registers;
+        static constexpr std::array<const Neon::index_3d, Q> stencil{
+            Neon::index_3d(-1, 0, 0),
+            Neon::index_3d(0, -1, 0),
+            Neon::index_3d(0, 0, -1),
+            Neon::index_3d(-1, -1, 0),
+            Neon::index_3d(-1, 1, 0),
+            Neon::index_3d(-1, 0, -1),
+            Neon::index_3d(-1, 0, 1),
+            Neon::index_3d(0, -1, -1),
+            Neon::index_3d(0, -1, 1),
+            Neon::index_3d(-1, -1, -1),
+            Neon::index_3d(-1, -1, 1),
+            Neon::index_3d(-1, 1, -1),
+            Neon::index_3d(-1, 1, 1),
+            Neon::index_3d(0, 0, 0),
+            Neon::index_3d(1, 0, 0),
+            Neon::index_3d(0, 1, 0),
+            Neon::index_3d(0, 0, 1),
+            Neon::index_3d(1, 1, 0),
+            Neon::index_3d(1, -1, 0),
+            Neon::index_3d(1, 0, 1),
+            Neon::index_3d(1, 0, -1),
+            Neon::index_3d(0, 1, 1),
+            Neon::index_3d(0, 1, -1),
+            Neon::index_3d(1, 1, 1),
+            Neon::index_3d(1, 1, -1),
+            Neon::index_3d(1, -1, 1),
+            Neon::index_3d(1, -1, -1)};
+
+        static constexpr int center = 13; /** Position of direction {0,0,0} */
+
+        template <int go>
+        static constexpr auto getOpposite() -> int
+        {
+            static_assert(go >= 0 && go < Q, "getOpposite: direction index out of range");
+            auto opposite3d = stencil[go] * -1;
+            for (int i = 0; i < Q; ++i) {  // linear scan for the negated direction
+                if (stencil[i] == opposite3d) {
+                    return i;
+                }
+            }  // no return past this point: the scan relies on the table holding every negation
+        }
+
+        static constexpr std::array<const int, Q> opposite{
+            14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
+            13,
+            0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12
+        };
+
+        static constexpr std::array<const typename Precision::Storage, Q> t{
+            2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54.,
+            1. / 216., 1. / 216., 1. / 216., 1. / 216.,
+            8. / 27.,
+            2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54.,
+            1. / 216., 1. / 216., 1. / 216., 1. / 216.};
+    };
+
+    struct Memory
+    {
+        using Self = D3Q27<Precision>::Memory;
+        static constexpr std::array<const Neon::index_3d, Q> stencil{
+            Neon::index_3d(-1, 0, 0),
+            Neon::index_3d(0, -1, 0),
+            Neon::index_3d(0, 0, -1),
+            Neon::index_3d(-1, -1, 0),
+            Neon::index_3d(-1, 1, 0),
+            Neon::index_3d(-1, 0, -1),
+            Neon::index_3d(-1, 0, 1),
+            Neon::index_3d(0, -1, -1),
+            Neon::index_3d(0, -1, 1),
+            Neon::index_3d(-1, -1, -1),
+            Neon::index_3d(-1, -1, 1),
+            Neon::index_3d(-1, 1, -1),
+            Neon::index_3d(-1, 1, 1),
+            Neon::index_3d(0, 0, 0),
+            Neon::index_3d(1, 0, 0),
+            Neon::index_3d(0, 1, 0),
+            Neon::index_3d(0, 0, 1),
+            Neon::index_3d(1, 1, 0),
+            Neon::index_3d(1, -1, 0),
+            Neon::index_3d(1, 0, 1),
+            Neon::index_3d(1, 0, -1),
+            Neon::index_3d(0, 1, 1),
+            Neon::index_3d(0, 1, -1),
+            Neon::index_3d(1, 1, 1),
+            Neon::index_3d(1, 1, -1),
+            Neon::index_3d(1, -1, 1),
+            Neon::index_3d(1, -1, -1)};
+
+
+        static constexpr int center = 13;       /** Position of direction {0,0,0} */
+   
+        template <int go>
+        static constexpr auto mapToRegisters() -> int
+        {
+            static_assert(go >= 0 && go < Q, "mapToRegisters: direction index out of range");
+            auto direction = stencil[go];
+            for (int i = 0; i < Q; ++i) {  // find the same direction in the register ordering
+                if (Registers::stencil[i] == direction) {
+                    return i;
+                }
+            }  // no return past this point: both tables must list the same directions
+        }
+
+        template <int go>
+        static constexpr auto mapFromRegisters() -> int
+        {
+            static_assert(go >= 0 && go < Q, "mapFromRegisters: direction index out of range");
+            auto direction = Registers::stencil[go];
+            for (int i = 0; i < Q; ++i) {  // find the same direction in the memory ordering
+                if (Self::stencil[i] == direction) {
+                    return i;
+                }
+            }  // no return past this point: both tables must list the same directions
+        }
+
+        template <int go>
+        static constexpr auto getOpposite() -> int
+        {
+            static_assert(go >= 0 && go < Q, "getOpposite: direction index out of range");
+            auto opposite3d = stencil[go] * -1;
+            for (int i = 0; i < Q; ++i) {  // linear scan for the negated direction
+                if (stencil[i] == opposite3d) {
+                    return i;
+                }
+            }  // no return past this point: the scan relies on the table holding every negation
+        }
+
+        static constexpr std::array<const int, Q> opposite{
+            14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
+            13,
+            0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12
+        };
+
+        /** Value of `Registers::t` for the direction stored at memory position `go`. */
+        template <int go>
+        static constexpr auto helpGetValueforT() -> typename Precision::Storage
+        {
+            // Translate the memory index to its register index, then read the table.
+            return Registers::t[Self::template mapToRegisters<go>()];
+        }
+
+        static constexpr std::array<const typename Precision::Storage, Q> t{
+            2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54.,
+            1. / 216., 1. / 216., 1. / 216., 1. / 216.,
+            8. / 27.,
+            2. / 27., 2. / 27., 2. / 27., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54., 1. / 54.,
+            1. / 216., 1. / 216., 1. / 216., 1. / 216.};
+    };
+
+   public:
+    /** Copies the stencil selected by `mappingType` into a std::vector. */
+    template <int mappingType>
+    static auto getDirectionAsVector()
+        -> std::vector<Neon::index_3d>
+    {
+        // Any other mapping type falls through and produces an empty vector.
+        std::vector<Neon::index_3d> directions;
+        if constexpr (mappingType == RegisterMapping) {
+            directions.insert(directions.end(),
+                              std::begin(Registers::stencil), std::end(Registers::stencil));
+        } else if constexpr (mappingType == MemoryMapping) {
+            directions.insert(directions.end(),
+                              std::begin(Memory::stencil), std::end(Memory::stencil));
+        }
+        return directions;
+    }
+};
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h
new file mode 100644
index 00000000..fff6f2b3
--- /dev/null
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q19.h
@@ -0,0 +1,214 @@
+#pragma once
+#include "CellType.h"
+#include "D3Q19.h"
+#include "Neon/Neon.h"
+#include "Neon/set/Containter.h"
+
+namespace pull {
+template <typename Precision_, typename Grid_>
+struct DeviceD3Q19
+{
+    using Lattice = D3Q19<Precision_>;
+    using Precision = Precision_;
+    using Compute = typename Precision::Compute;
+    using Storage = typename Precision::Storage;
+    using Grid = Grid_;
+
+    using PopField = typename Grid::template Field<Storage, Lattice::Q>;
+    using CellTypeField = typename Grid::template Field<CellType, 1>;
+
+    using Idx = typename PopField::Idx;
+    using Rho = typename Grid::template Field<Storage, 1>;
+    using U = typename Grid::template Field<Storage, 3>;
+
+
+    static inline NEON_CUDA_HOST_DEVICE auto  // fills popIn[] from fin, one lattice direction per pass
+    pullStream(Idx const&                          gidx,
+               const uint32_t&                     wallBitFlag,  // NOTE(review): never read in this body
+               typename PopField::Partition const& fin,
+               NEON_OUT Storage                    popIn[Lattice::Q])
+    {
+        Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto fwMemIdx) {
+            using M = typename Lattice::template MappersIdxSetWithFwdMem<fwMemIdx>;
+            // M supplies the memory/register indices and the neighbour offset used below.
+            if constexpr (fwMemIdx == Lattice::Memory::center) {
+                popIn[M::centerRegIdx] = fin(gidx, M::centerMemIdx);  // center: read from the cell itself
+            } else {
+                if (CellType::isWall<M::bkMemIdx>()) {
+                    popIn[M::fwRegIdx] = fin(gidx, M::bkMemIdx) +  // local value plus neighbour value, both of component bkMemIdx
+                                         fin.template getNghData<M::bkX, M::bkY, M::bkZ>(gidx, M::bkMemIdx)();
+                } else {
+                    popIn[M::fwRegIdx] = fin.template getNghData<M::bkX, M::bkY, M::bkZ>(gidx, fwMemIdx)();  // neighbour value of component fwMemIdx
+                }
+            }
+        });
+    }
+};
+
+#undef CAST_TO_COMPUTE
+}  // namespace pull
+
+namespace push {
+template <typename Precision_, typename Grid_>
+struct DeviceD3Q19
+{
+    using Lattice = D3Q19<Precision_>;
+    using Precision = Precision_;
+    using Compute = typename Precision::Compute;
+    using Storage = typename Precision::Storage;
+    using Grid = Grid_;
+
+    using PopField = typename Grid::template Field<Storage, Lattice::Q>;
+    using CellTypeField = typename Grid::template Field<CellType, 1>;
+
+    using Idx = typename PopField::Idx;
+    using Rho = typename Grid::template Field<Storage, 1>;
+    using U = typename Grid::template Field<Storage, 3>;
+
+
+    static inline NEON_CUDA_HOST_DEVICE auto  // writes the values held in pOut[] into fOut, one direction per pass
+    pushStream(Idx const&                                   gidx,
+               const uint32_t&                              wallNghBitFlag,  // NOTE(review): never read in this body
+               NEON_OUT Storage                             pOut[Lattice::Q],
+               NEON_OUT typename PopField::Partition const& fOut)
+    {
+        Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto fwMemIdx_) {
+            using M = typename Lattice::template MappersIdxSetWithFwdMem<fwMemIdx_>;
+            // NOTE(review): both M::fwRegIdx and M::fwdRegIdx are used below - confirm both exist in the mapper.
+            if constexpr (M::fwMemIdx == M::centerMemIdx) {
+                fOut(gidx, M::fwMemIdx) = pOut[M::fwRegIdx];
+            } else {
+                if (CellType::isWall<M::fwRegIdx>()) {
+                    // fout(i, opp[k]) =
+                    //      pop_out +
+                    //      f(nb, k);
+                    fOut(gidx, M::bkMemIdx) =
+                        pOut[M::fwdRegIdx] +
+                        fOut.template getNghData<M::fwX, M::fwY, M::fwZ>(gidx, M::fwMemIdx)();
+                } else {
+                    // fout(nb, k) = pop_out;  (`template` is required: fOut has a dependent type)
+                    fOut.template writeNgh<M::fwX, M::fwY, M::fwZ>(gidx, M::fwMemIdx, pOut[M::fwdRegIdx]);
+                }
+            }
+        });
+    }
+
+    static inline NEON_CUDA_HOST_DEVICE auto
+    localLoad(Idx const&                                  gidx,
+              NEON_IN typename PopField::Partition const& fOut,
+              Storage NEON_RESTRICT                       pOut[Lattice::Q])
+    {
+        Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto memIdx) {  // one pass per stored component
+            using Mapper = typename Lattice::template MappersIdxSetWithFwdMem<memIdx>;
+            pOut[Mapper::fwdRegIdx] = fOut(gidx, Mapper::fwMemIdx);  // field value at gidx -> local array
+        });
+    }
+};
+}  // namespace push
+
+
+namespace common {
+template <typename Precision_, typename Grid_>
+struct DeviceD3Q19
+{
+    using Lattice = D3Q19<Precision_>;
+    using Precision = Precision_;
+    using Compute = typename Precision::Compute;
+    using Storage = typename Precision::Storage;
+    using Grid = Grid_;
+
+    using PopField = typename Grid::template Field<Storage, Lattice::Q>;
+    using CellTypeField = typename Grid::template Field<CellType, 1>;
+
+    using Idx = typename PopField::Idx;
+    using Rho = typename Grid::template Field<Storage, 1>;
+    using U = typename Grid::template Field<Storage, 3>;
+
+
+    static inline NEON_CUDA_HOST_DEVICE auto
+    macroscopic(const Storage     pop[Lattice::Q],
+                NEON_OUT Compute& rho,
+                NEON_OUT std::array<Compute, 3>& u)
+        -> void
+    {
+        // Each population is widened to Compute before being added to its partial sum.
+        const auto f = [pop](const int k) -> Compute { return static_cast<Compute>(pop[k]); };
+        const Compute xMinus = f(0) + f(3) + f(4) + f(5) + f(6);
+        const Compute xPlus = f(10) + f(13) + f(14) + f(15) + f(16);
+        const Compute xZero = f(9) + f(1) + f(2) + f(7) + f(8) + f(11) + f(12) + f(17) + f(18);
+
+        const Compute yMinus = f(1) + f(3) + f(7) + f(8) + f(14);
+        const Compute yPlus = f(4) + f(11) + f(13) + f(17) + f(18);
+
+        const Compute zMinus = f(2) + f(5) + f(7) + f(16) + f(18);
+        const Compute zPlus = f(6) + f(8) + f(12) + f(15) + f(17);
+        // The three x groups together cover every index 0..18, so their sum is the total rho.
+
+        rho = xMinus + xPlus + xZero;
+        u[0] = (xPlus - xMinus) / rho;
+        u[1] = (yPlus - yMinus) / rho;
+        u[2] = (zPlus - zMinus) / rho;
+    }
+
+
+    static inline NEON_CUDA_HOST_DEVICE auto
+    collideBgkUnrolled(Idx const&                    i /*!     Compute iterator   */,
+                       Compute const&                rho /*!   Density            */,
+                       std::array<Compute, 3> const& u /*!     Velocity           */,
+                       Compute const&                usqr /*!  Usqr               */,
+                       Compute const&                omega /*! Omega              */,
+                       NEON_IO Storage               pop[Lattice::Q])
+
+        -> void
+    {
+        // BGK relaxation applied in place on pop[]. Directions are processed as
+        // (forward, backward) pairs so the backward equilibrium is derived from
+        // the forward one; the center population is handled separately at the end.
+        constexpr Compute c4dot5 = 4.5;
+        constexpr Compute c3 = 3.;
+        constexpr Compute c1 = 1.;
+        constexpr Compute c6 = 6.;
+
+        // The backward equilibrium differs from the forward one only by the sign of the
+        // term linear in ck_u, hence eqBk = eqFw - 6 * rho * t[k] * ck_u (no extra weight).
+
+        Neon::ConstexprFor<0, Lattice::Registers::fwdRegIdxListLen, 1>(
+            [&](auto fwdRegIdxListIdx) {
+                using M = typename Lattice::template RegMapper<fwdRegIdxListIdx>;
+                using T = typename Lattice::Registers;
+
+                Compute eqFw;
+                Compute eqBk;
+
+                const Compute ck_u = T::template getCk_u<M::fwRegIdx, Compute>(u);
+                // double eq = rho * t[k] *
+                //             (1. +
+                //             3. * ck_u +
+                //             4.5 * ck_u * ck_u -
+                //             usqr);
+                eqFw = rho * T::t[M::fwRegIdx] *
+                       (c1 +
+                        c3 * ck_u +
+                        c4dot5 * ck_u * ck_u -
+                        usqr);
+
+                // double eqopp = eq - 6.* rho * t[k] * ck_u;
+                eqBk = eqFw -
+                       c6 * rho * T::t[M::fwRegIdx] * ck_u;
+
+                // pop_out       = (1. - omega) * fin(i, k)                              + omega * eq;
+                pop[M::fwRegIdx] = (c1 - omega) * static_cast<Compute>(pop[M::fwRegIdx]) + omega * eqFw;
+                // pop_out_opp   = (1. - omega) * fin(i, opp[k])                         + omega * eqopp;
+                pop[M::bkRegIdx] = (c1 - omega) * static_cast<Compute>(pop[M::bkRegIdx]) + omega * eqBk;
+            });
+        {  // Center;
+            using T = typename Lattice::Registers;
+            using M = typename Lattice::template RegMapper<Lattice::Registers::center>;
+            //            eq       = rho * t[k]              * (1. - usqr);
+            const Compute eqCenter = rho * T::t[M::fwRegIdx] * (c1 - usqr);
+            //                   fout(i, k) = (1. - omega) * fin(i, k)                              + omega * eq;
+            pop[Lattice::Registers::center] = (c1 - omega) * static_cast<Compute>(pop[M::fwRegIdx]) + omega * eqCenter;
+        }
+    }
+};
+}  // namespace common
\ No newline at end of file
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q27.h b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q27.h
new file mode 100644
index 00000000..f977492b
--- /dev/null
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/DeviceD3Q27.h
@@ -0,0 +1,217 @@
+#pragma once
+#include "CellType.h"
+#include "D3Q27.h"
+#include "Neon/Neon.h"
+#include "Neon/set/Containter.h"
+
+
+template <typename Precision_, typename Grid_>
+struct DeviceD3Q27
+{
+    using Lattice = D3Q27<Precision_>;
+    using Precision = Precision_;
+    using Compute = typename Precision::Compute;
+    using Storage = typename Precision::Storage;
+    using Grid = Grid_;
+
+    using PopField = typename Grid::template Field<Storage, Lattice::Q>;
+    using CellTypeField = typename Grid::template Field<CellType, 1>;
+
+    using Idx = typename PopField::Idx;
+    using Rho = typename Grid::template Field<Storage, 1>;
+    using U = typename Grid::template Field<Storage, 3>;
+
+
+    static inline NEON_CUDA_HOST_DEVICE auto
+    pullStream(Idx const&                          gidx,
+               const uint32_t&                     wallBitFlag,
+               typename PopField::Partition const& fin,
+               NEON_OUT Storage                    popIn[Lattice::Q])
+    {
+        // Gathers the Q populations for cell gidx into popIn; popIn is indexed in register order.
+        Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto memIdx) {
+            if constexpr (memIdx == Lattice::Memory::center) {
+                popIn[Lattice::Registers::center] = fin(gidx, Lattice::Memory::center);
+            } else {
+                constexpr int regIdx = Lattice::Memory::template mapToRegisters<memIdx>();
+                constexpr int oppMemIdx = Lattice::Memory::opposite[memIdx];
+                constexpr int oppX = Lattice::Memory::stencil[oppMemIdx].x;
+                constexpr int oppY = Lattice::Memory::stencil[oppMemIdx].y;
+                constexpr int oppZ = Lattice::Memory::stencil[oppMemIdx].z;
+                // Bit number memIdx of wallBitFlag selects the wall formula for this direction.
+                if (wallBitFlag & (uint32_t(1) << memIdx)) {
+                    popIn[regIdx] =
+                        fin(gidx, oppMemIdx) +
+                        fin.template getNghData<oppX, oppY, oppZ>(gidx, oppMemIdx)();
+                } else {
+                    popIn[regIdx] =
+                        fin.template getNghData<oppX, oppY, oppZ>(gidx, memIdx)();
+                }
+            }
+        });
+    }
+
+    static inline NEON_CUDA_HOST_DEVICE auto
+    macroscopic(const Storage     pop[Lattice::Q],
+                NEON_OUT Compute& rho,
+                NEON_OUT std::array<Compute, 3>& u)
+        -> void
+    {
+        // Partial sums group the populations by the sign of each velocity component (D3Q27 register order).
+#define POP(IDX) static_cast<Compute>(pop[IDX])
+        const Compute X_M1 = POP(0) + POP(3) + POP(4) + POP(5) + POP(6) + POP(9) + POP(10) + POP(11) + POP(12);
+        const Compute X_P1 = POP(14) + POP(17) + POP(18) + POP(19) + POP(20) + POP(23) + POP(24) + POP(25) + POP(26);
+        const Compute X_0 = POP(1) + POP(2) + POP(7) + POP(8) + POP(13) + POP(15) + POP(16) + POP(21) + POP(22);
+
+        const Compute Y_M1 = POP(1) + POP(3) + POP(7) + POP(8) + POP(9) + POP(10) + POP(18) + POP(25) + POP(26);
+        const Compute Y_P1 = POP(15) + POP(17) + POP(21) + POP(22) + POP(23) + POP(24) + POP(4) + POP(11) + POP(12);
+
+        const Compute Z_M1 = POP(2) + POP(5) + POP(7) + POP(9) + POP(11) + POP(20) + POP(22) + POP(24) + POP(26);
+        const Compute Z_P1 = POP(16) + POP(19) + POP(21) + POP(23) + POP(25) + POP(6) + POP(8) + POP(10) + POP(12);
+#undef POP
+        // rho sums all 27 populations; each velocity component is (plus group - minus group) / rho.
+        rho = X_M1 + X_P1 + X_0;
+        u[0] = (X_P1 - X_M1) / rho;
+        u[1] = (Y_P1 - Y_M1) / rho;
+        u[2] = (Z_P1 - Z_M1) / rho;
+    }
+
+
+    static inline NEON_CUDA_HOST_DEVICE auto
+    collideBgkUnrolled(Idx const&                    i /*!     Compute iterator   */,
+                       const Storage                 pop[Lattice::Q],
+                       Compute const&                rho /*!   Density            */,
+                       std::array<Compute, 3> const& u /*!     Velocity           */,
+                       Compute const&                usqr /*!  Usqr               */,
+                       Compute const&                omega /*! Omega              */,
+                       typename PopField::Partition& fOut /*!  Population         */)
+        // BGK collision: blends pop with the equilibrium built from rho, u and usqr, and writes the result to fOut.
+        -> void
+    {
+        const Compute cku1 = u[0] + u[1];  // ckuN: signed sums of velocity components used by the diagonal directions
+        const Compute cku2 = -u[0] + u[1];
+        const Compute cku3 = u[0] + u[2];
+        const Compute cku4 = -u[0] + u[2];
+        const Compute cku5 = u[1] + u[2];
+        const Compute cku6 = -u[1] + u[2];
+        const Compute cku7 = u[0] + u[1] + u[2];
+        const Compute cku8 = -u[0] + u[1] + u[2];
+        const Compute cku9 = u[0] - u[1] + u[2];
+        const Compute cku0 = u[0] + u[1] - u[2];
+
+        std::array<Compute, Lattice::Q> feqRM;  // equilibrium values, indexed by the F*** constants below
+        // Index of each direction: the three letters give the x, y, z component (M = -1, 0 = 0, P = +1).
+        constexpr int F000 = 13;
+        constexpr int FM00 = 0;
+        constexpr int F0M0 = 1;
+        constexpr int F00M = 2;
+        constexpr int FMM0 = 3;
+        constexpr int FMP0 = 4;
+        constexpr int FM0M = 5;
+        constexpr int FM0P = 6;
+        constexpr int F0MM = 7;
+        constexpr int F0MP = 8;
+        constexpr int FMMM = 9;
+        constexpr int FMMP = 10;
+        constexpr int FMPM = 11;
+        constexpr int FMPP = 12;
+        constexpr int FP00 = 14;
+        constexpr int F0P0 = 15;
+        constexpr int F00P = 16;
+        constexpr int FPP0 = 17;
+        constexpr int FPM0 = 18;
+        constexpr int FP0P = 19;
+        constexpr int FP0M = 20;
+        constexpr int F0PP = 21;
+        constexpr int F0PM = 22;
+        constexpr int FPPP = 23;
+        constexpr int FPPM = 24;
+        constexpr int FPMP = 25;
+        constexpr int FPMM = 26;
+
+        constexpr Compute c1over18 = 1. / 18.;  // NOTE(review): not referenced in this function
+        constexpr Compute c1over36 = 1. / 36.;  // NOTE(review): not referenced in this function
+        constexpr Compute c4dot5 = 4.5;
+        constexpr Compute c3 = 3.;
+        constexpr Compute c1 = 1.;
+        constexpr Compute c6 = 6.;
+        // For each pair, one equilibrium is computed in full and its opposite is obtained by adding t * 6 * (c.u) * rho.
+        feqRM[F000] = rho * Lattice::Registers::t[F000] * (c1- usqr);
+
+        feqRM[FM00] = rho * Lattice::Registers::t[FM00] * (c1- c3* u[0] + c4dot5* u[0] * u[0] - usqr);
+        feqRM[FP00] = rho * Lattice::Registers::t[FP00] * (c6 * u[0]) + feqRM[FM00];
+
+        feqRM[F0M0] = rho * Lattice::Registers::t[F0M0] * (c1- c3* u[1] + c4dot5* u[1] * u[1] - usqr);
+        feqRM[F0P0] = rho * Lattice::Registers::t[F0P0] * (c6 * u[1]) + feqRM[F0M0];
+
+        feqRM[F00M] = rho * Lattice::Registers::t[F00M] * (c1- c3* u[2] + c4dot5* u[2] * u[2] - usqr);
+        feqRM[F00P] = rho * Lattice::Registers::t[F00P] * (c6 * u[2]) + feqRM[F00M];
+
+        feqRM[FMM0] = rho * Lattice::Registers::t[FMM0] * (c1- c3* cku1 + c4dot5* cku1 * cku1 - usqr);
+        feqRM[FPP0] = rho * Lattice::Registers::t[FPP0] * (c6 * cku1) + feqRM[FMM0];
+        feqRM[FPM0] = rho * Lattice::Registers::t[FPM0] * (c1- c3* cku2 + c4dot5* cku2 * cku2 - usqr);
+        feqRM[FMP0] = rho * Lattice::Registers::t[FMP0] * (c6 * cku2) + feqRM[FPM0];
+
+        feqRM[FM0M] = rho * Lattice::Registers::t[FM0M] * (c1- c3* cku3 + c4dot5* cku3 * cku3 - usqr);
+        feqRM[FP0P] = rho * Lattice::Registers::t[FP0P] * (c6 * cku3) + feqRM[FM0M];
+        feqRM[FP0M] = rho * Lattice::Registers::t[FP0M] * (c1- c3* cku4 + c4dot5* cku4 * cku4 - usqr);
+        feqRM[FM0P] = rho * Lattice::Registers::t[FM0P] * (c6 * cku4) + feqRM[FP0M];
+
+        feqRM[F0MM] = rho * Lattice::Registers::t[F0MM] * (c1- c3* cku5 + c4dot5* cku5 * cku5 - usqr);
+        feqRM[F0PP] = rho * Lattice::Registers::t[F0PP] * (c6 * cku5) + feqRM[F0MM];
+        feqRM[F0PM] = rho * Lattice::Registers::t[F0PM] * (c1- c3* cku6 + c4dot5* cku6 * cku6 - usqr);
+        feqRM[F0MP] = rho * Lattice::Registers::t[F0MP] * (c6 * cku6) + feqRM[F0PM];
+
+        feqRM[FMMM] = rho * Lattice::Registers::t[FMMM] * (c1- c3* cku7 + c4dot5* cku7 * cku7 - usqr);
+        feqRM[FPPP] = rho * Lattice::Registers::t[FPPP] * (c6 * cku7) + feqRM[FMMM];
+        feqRM[FPMM] = rho * Lattice::Registers::t[FPMM] * (c1- c3* cku8 + c4dot5* cku8 * cku8 - usqr);
+        feqRM[FMPP] = rho * Lattice::Registers::t[FMPP] * (c6 * cku8) + feqRM[FPMM];
+        feqRM[FMPM] = rho * Lattice::Registers::t[FMPM] * (c1- c3* cku9 + c4dot5* cku9 * cku9 - usqr);
+        feqRM[FPMP] = rho * Lattice::Registers::t[FPMP] * (c6 * cku9) + feqRM[FMPM];
+        feqRM[FMMP] = rho * Lattice::Registers::t[FMMP] * (c1- c3* cku0 + c4dot5* cku0 * cku0 - usqr);
+        feqRM[FPPM] = rho * Lattice::Registers::t[FPPM] * (c6 * cku0) + feqRM[FMMP];
+
+        // BGK Collision based on the second-order equilibrium
+        std::array<Compute, Lattice::Q>  foutRM;
+
+        foutRM[F000] = (c1- omega) * static_cast<Compute>(pop[F000]) + omega * feqRM[F000];
+
+        foutRM[FP00] = (c1- omega) * static_cast<Compute>(pop[FP00]) + omega * feqRM[FP00];
+        foutRM[FM00] = (c1- omega) * static_cast<Compute>(pop[FM00]) + omega * feqRM[FM00];
+
+        foutRM[F0P0] = (c1- omega) * static_cast<Compute>(pop[F0P0]) + omega * feqRM[F0P0];
+        foutRM[F0M0] = (c1- omega) * static_cast<Compute>(pop[F0M0]) + omega * feqRM[F0M0];
+
+        foutRM[F00P] = (c1- omega) * static_cast<Compute>(pop[F00P]) + omega * feqRM[F00P];
+        foutRM[F00M] = (c1- omega) * static_cast<Compute>(pop[F00M]) + omega * feqRM[F00M];
+
+        foutRM[FPP0] = (c1- omega) * static_cast<Compute>(pop[FPP0]) + omega * feqRM[FPP0];
+        foutRM[FMP0] = (c1- omega) * static_cast<Compute>(pop[FMP0]) + omega * feqRM[FMP0];
+        foutRM[FPM0] = (c1- omega) * static_cast<Compute>(pop[FPM0]) + omega * feqRM[FPM0];
+        foutRM[FMM0] = (c1- omega) * static_cast<Compute>(pop[FMM0]) + omega * feqRM[FMM0];
+
+        foutRM[FP0P] = (c1- omega) * static_cast<Compute>(pop[FP0P]) + omega * feqRM[FP0P];
+        foutRM[FM0P] = (c1- omega) * static_cast<Compute>(pop[FM0P]) + omega * feqRM[FM0P];
+        foutRM[FP0M] = (c1- omega) * static_cast<Compute>(pop[FP0M]) + omega * feqRM[FP0M];
+        foutRM[FM0M] = (c1- omega) * static_cast<Compute>(pop[FM0M]) + omega * feqRM[FM0M];
+
+        foutRM[F0PP] = (c1- omega) * static_cast<Compute>(pop[F0PP]) + omega * feqRM[F0PP];
+        foutRM[F0MP] = (c1- omega) * static_cast<Compute>(pop[F0MP]) + omega * feqRM[F0MP];
+        foutRM[F0PM] = (c1- omega) * static_cast<Compute>(pop[F0PM]) + omega * feqRM[F0PM];
+        foutRM[F0MM] = (c1- omega) * static_cast<Compute>(pop[F0MM]) + omega * feqRM[F0MM];
+
+        foutRM[FPPP] = (c1- omega) * static_cast<Compute>(pop[FPPP]) + omega * feqRM[FPPP];
+        foutRM[FMPP] = (c1- omega) * static_cast<Compute>(pop[FMPP]) + omega * feqRM[FMPP];
+        foutRM[FPMP] = (c1- omega) * static_cast<Compute>(pop[FPMP]) + omega * feqRM[FPMP];
+        foutRM[FPPM] = (c1- omega) * static_cast<Compute>(pop[FPPM]) + omega * feqRM[FPPM];
+        foutRM[FMMP] = (c1- omega) * static_cast<Compute>(pop[FMMP]) + omega * feqRM[FMMP];
+        foutRM[FMPM] = (c1- omega) * static_cast<Compute>(pop[FMPM]) + omega * feqRM[FMPM];
+        foutRM[FPMM] = (c1- omega) * static_cast<Compute>(pop[FPMM]) + omega * feqRM[FPMM];
+        foutRM[FMMM] = (c1- omega) * static_cast<Compute>(pop[FMMM]) + omega * feqRM[FMMM];
+        // Store every result at its memory position: memory index -> register index lookup, then cast to Storage.
+        Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto GOMemoryId) {
+            fOut(i, GOMemoryId) = static_cast<Storage>(foutRM[Lattice::Memory::template mapToRegisters<GOMemoryId>()]);
+        });
+    }
+};
+
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmIteration.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmIteration.h
deleted file mode 100644
index b92d9acc..00000000
--- a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmIteration.h
+++ /dev/null
@@ -1,101 +0,0 @@
-#include "CellType.h"
-#include "D3Q19.h"
-#include "LbmTools.h"
-#include "Neon/Neon.h"
-#include "Neon/set/Backend.h"
-#include "Neon/set/Containter.h"
-#include "Neon/skeleton/Skeleton.h"
-
-template <typename DKQW,
-          typename PopulationField,
-          typename LbmComputeType>
-struct LbmSkeleton
-{
-};
-
-
-template <typename PopulationField,
-          typename LbmComputeType>
-struct LbmIterationD3Q19
-{
-    using LbmStoreType = typename PopulationField::Type;
-    using CellTypeField = typename PopulationField::Grid::template Field<CellType, 1>;
-    using D3Q19 = D3Q19Template<LbmStoreType, LbmComputeType>;
-    using LbmTools = LbmContainers<D3Q19, PopulationField, LbmComputeType>;
-
-
-    LbmIterationD3Q19(Neon::set::StencilSemantic stencilSemantic,
-                      Neon::skeleton::Occ        occ,
-                      Neon::set::TransferMode    transfer,
-                      PopulationField&           fIn /*!   inpout population field */,
-                      PopulationField&           fOut,
-                      CellTypeField&             cellTypeField /*!       Cell type field     */,
-                      LbmComputeType             omega /*! LBM omega parameter */)
-    {
-        pop[0] = fIn;
-        pop[1] = fOut;
-
-        setupSkeletons(0, stencilSemantic, occ, transfer, pop[0], pop[1], cellTypeField, omega);
-        setupSkeletons(1, stencilSemantic, occ, transfer, pop[1], pop[0], cellTypeField, omega);
-
-        parity = 0;
-    }
-    auto getInput()
-        -> PopulationField&
-    {
-        return pop[parity];
-    }
-
-    auto getOutput()
-        -> PopulationField&
-    {
-        int other = parity == 0 ? 1 : 0;
-        return pop[other];
-    }
-
-    auto run()
-        -> void
-    {
-        lbmTwoPop[parity].run();
-        updateParity();
-    }
-
-    auto sync()
-        -> void
-    {
-        pop[0].getBackend().syncAll();
-    }
-
-   private:
-    auto updateParity()
-        -> void
-    {
-        parity = parity == 0 ? 1 : 0;
-    }
-
-    auto setupSkeletons(int                        target,
-                        Neon::set::StencilSemantic stencilSemantic,
-                        Neon::skeleton::Occ        occ,
-                        Neon::set::TransferMode    transfer,
-                        PopulationField&           inField /*!   inpout population field */,
-                        PopulationField&           outField,
-                        CellTypeField&             cellTypeField /*!       Cell type field     */,
-                        LbmComputeType             omega /*! LBM omega parameter */)
-    {
-        std::vector<Neon::set::Container> ops;
-        lbmTwoPop[target] = Neon::skeleton::Skeleton(inField.getBackend());
-        Neon::skeleton::Options opt(occ, transfer);
-        ops.push_back(LbmTools::iteration(stencilSemantic,
-                                          inField,
-                                          cellTypeField,
-                                          omega,
-                                          outField));
-        std::stringstream appName;
-        appName << "LBM_iteration_" << std::to_string(target);
-        lbmTwoPop[target].sequence(ops, appName.str(), opt);
-    }
-
-    Neon::skeleton::Skeleton lbmTwoPop[2];
-    PopulationField          pop[2];
-    int                      parity;
-};
\ No newline at end of file
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h
new file mode 100644
index 00000000..22ae8177
--- /dev/null
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmSkeleton.h
@@ -0,0 +1,153 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "CellType.h"
+#include "ContainerFactory.h"
+#include "ContainersD3Q19.h"
+#include "D3Q19.h"
+#include "Neon/Neon.h"
+#include "Neon/set/Backend.h"
+#include "Neon/set/Containter.h"
+#include "Neon/skeleton/Skeleton.h"
+
+/**
+ * Primary template, intentionally left empty: only the D3Q19 partial
+ * specialization below provides an implementation.
+ */
+template <typename Method_,
+          typename Precision_,
+          typename Lattice,
+          typename Grid>
+struct LbmSkeleton
+{
+};
+
+
+/**
+ * LBM iteration driver for the D3Q19 lattice built on two population fields.
+ *
+ * One Neon skeleton is prepared per parity: skeleton 0 is built with pop[0] as
+ * input field and pop[1] as output field, skeleton 1 with the roles swapped.
+ * Each call to run() executes the skeleton of the current parity and then
+ * flips the parity.
+ */
+template <typename Method_,
+          typename Precision_,
+          typename Grid_>
+struct LbmSkeleton<Method_,
+                   Precision_,
+                   D3Q19<Precision_>,
+                   Grid_>
+{
+    using Lattice = D3Q19<Precision_>;
+    using Precision = Precision_;
+    using Compute = typename Precision::Compute;
+    using Storage = typename Precision::Storage;
+    using Grid = Grid_;
+
+    using PopField = typename Grid::template Field<Storage, Lattice::Q>;
+    using CellTypeField = typename Grid::template Field<CellType, 1>;
+
+    using Idx = typename PopField::Idx;
+    using Rho = typename Grid::template Field<Storage, 1>;
+    using U = typename Grid::template Field<Storage, 3>;
+
+    using ContainerFactory = common::ContainerFactory<Precision, Lattice, Grid>;
+
+    /**
+     * Stores the two population fields and builds one skeleton per parity.
+     *
+     * @param stencilSemantic forwarded to the iteration container
+     * @param occ             forwarded to the skeleton options
+     * @param transfer        forwarded to the skeleton options
+     * @param fIn             input population field of the first iteration
+     * @param fOut            output population field of the first iteration
+     * @param cellTypeField   cell type field
+     * @param omega           LBM omega parameter
+     */
+    LbmSkeleton(Neon::set::StencilSemantic stencilSemantic,
+                Neon::skeleton::Occ        occ,
+                Neon::set::TransferMode    transfer,
+                PopField&                  fIn,
+                PopField&                  fOut,
+                CellTypeField&             cellTypeField,
+                Compute                    omega)
+        : parity(0)
+    {
+        pop[0] = fIn;
+        pop[1] = fOut;
+
+        setupSkeletons(0, stencilSemantic, occ, transfer, pop[0], pop[1], cellTypeField, omega);
+        setupSkeletons(1, stencilSemantic, occ, transfer, pop[1], pop[0], cellTypeField, omega);
+    }
+
+    /** Population field used as input by the next call to run(). */
+    auto getInput()
+        -> PopField&
+    {
+        return pop[parity];
+    }
+
+    /** Population field used as output by the next call to run(). */
+    auto getOutput()
+        -> PopField&
+    {
+        return pop[1 - parity];
+    }
+
+    /** Runs the skeleton of the current parity, then flips the parity. */
+    auto run()
+        -> void
+    {
+        lbmTwoPop[parity].run();
+        updateParity();
+    }
+
+    /** Calls syncAll() on the backend of the population fields. */
+    auto sync()
+        -> void
+    {
+        pop[0].getBackend().syncAll();
+    }
+
+   private:
+    /** Toggles parity between 0 and 1. */
+    auto updateParity()
+        -> void
+    {
+        parity = 1 - parity;
+    }
+
+    /**
+     * Builds the skeleton stored in lbmTwoPop[target]: a sequence holding a
+     * single iteration container created from inField and outField.
+     */
+    auto setupSkeletons(int                        target,
+                        Neon::set::StencilSemantic stencilSemantic,
+                        Neon::skeleton::Occ        occ,
+                        Neon::set::TransferMode    transfer,
+                        PopField&                  inField,
+                        PopField&                  outField,
+                        CellTypeField&             cellTypeField,
+                        Compute                    omega)
+        -> void
+    {
+        lbmTwoPop[target] = Neon::skeleton::Skeleton(inField.getBackend());
+        Neon::skeleton::Options opt(occ, transfer);
+
+        std::vector<Neon::set::Container> ops;
+        ops.push_back(ContainerFactory::template iteration<Method_>(stencilSemantic,
+                                                                    inField,
+                                                                    cellTypeField,
+                                                                    omega,
+                                                                    outField));
+
+        const std::string appName = "LBM_iteration_" + std::to_string(target);
+        lbmTwoPop[target].sequence(ops, appName, opt);
+    }
+
+    Neon::skeleton::Skeleton lbmTwoPop[2];
+    PopField                 pop[2];
+    int                      parity;
+};
\ No newline at end of file
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h
similarity index 97%
rename from benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h
rename to benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h
index 5728a5d3..489b3782 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/src/LbmTools.h
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/LbmToolsTemplateOnly.h
@@ -8,7 +8,7 @@
 template <typename Lattice,
           typename PopulationField,
           typename LbmComputeType>
-struct LbmContainers
+struct LbmContainersTemplateOnly  // primary template is empty; a D3Q19 specialization follows
 {
 };
 
@@ -19,13 +19,13 @@ struct LbmContainers
  */
 template <typename PopulationField,
           typename LbmComputeType>
-struct LbmContainers<D3Q19Template<typename PopulationField::Type, LbmComputeType>,
-                     PopulationField,
-                     LbmComputeType>
+struct LbmContainersTemplateOnly<D3Q19<typename PopulationField::Type, LbmComputeType>,
+                                 PopulationField,
+                                 LbmComputeType>
 {
     using LbmStoreType = typename PopulationField::Type;
     using CellTypeField = typename PopulationField::Grid::template Field<CellType, 1>;
-    using Lattice = D3Q19Template<LbmStoreType, LbmComputeType>;
+    using Lattice = D3Q19<LbmStoreType, LbmComputeType>;
     using Idx = typename PopulationField::Idx;
     using Grid = typename PopulationField::Grid;
     using Rho = typename Grid::template Field<LbmStoreType, 1>;
@@ -36,21 +36,21 @@ struct LbmContainers<D3Q19Template<typename PopulationField::Type, LbmComputeTyp
         { /*GO*/                                                                                                        \
             if (wallBitFlag & (uint32_t(1) << GOid)) {                                                                  \
                 /*std::cout << "cell " << i.mLocation << " direction " << GOid << " opposite " << BKid << std::endl; */ \
-                popIn[GOid] = fin(i, BKid);                                                                             \
+                popIn[GOid] = fin.template read<BKid>(gidx);                                                            \
             } else {                                                                                                    \
-                popIn[GOid] = fin.template nghVal<BKx, BKy, BKz>(i, GOid, 0.0).value;                                   \
+                popIn[GOid] = fin.template nghVal<BKx, BKy, BKz, GOid>(gidx).value;                                     \
             }                                                                                                           \
         }                                                                                                               \
         { /*BK*/                                                                                                        \
             if (wallBitFlag & (uint32_t(1) << BKid)) {                                                                  \
-                popIn[BKid] = fin(i, GOid);                                                                             \
+                popIn[BKid] = fin.template read<GOid>(gidx);                                                            \
             } else {                                                                                                    \
-                popIn[BKid] = fin.template nghVal<GOx, GOy, GOz>(i, BKid, 0.0).value;                                   \
+                popIn[BKid] = fin.template nghVal<GOx, GOy, GOz, BKid>(gidx).value;                                     \
             }                                                                                                           \
         }                                                                                                               \
     }
     static inline NEON_CUDA_HOST_DEVICE auto
-    loadPopulation(Idx const&                                 i,
+    loadPopulation(Idx const&                                 gidx,
                    const uint32_t&                            wallBitFlag,
                    typename PopulationField::Partition const& fin,
                    NEON_OUT LbmStoreType                      popIn[19])
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Methods.h b/benchmarks/lbm-lid-driven-cavity-flow/src/Methods.h
new file mode 100644
index 00000000..4d3bf178
--- /dev/null
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Methods.h
@@ -0,0 +1,8 @@
+#pragma once
+
+enum class Method  // NOTE(review): presumably the LBM streaming scheme; the names match the --streamingMethod values (push, pull, aa) in benchmarks/lbm/lbm.py — confirm
+{
+    push,
+    pull,
+    aa
+};
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Metrics.h b/benchmarks/lbm-lid-driven-cavity-flow/src/Metrics.h
index be94ab76..7e6697ef 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/src/Metrics.h
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Metrics.h
@@ -23,6 +23,12 @@ void recordBackend(Neon::Backend& bk,
     report.recordBk(bk);
 }
 
+// Forwards the grid to Report::recordGrid so its description lands in the report.
+void recordGrid(Neon::domain::interface::GridBase& g, Report& report)
+{
+    report.recordGrid(g);
+}
+
 }  // namespace
 
 
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Precision.h b/benchmarks/lbm-lid-driven-cavity-flow/src/Precision.h
new file mode 100644
index 00000000..a45ff69e
--- /dev/null
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Precision.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "Neon/Neon.h"
+#include "Neon/set/Backend.h"
+#include "Neon/set/memory/memSet.h"
+
+template <typename StorageFP,
+          typename ComputeFP>
+struct Precision  // Bundles the two floating-point types of a run into a single template argument.
+{
+    using Storage = StorageFP;  // element type of the fields (e.g. Field<Storage, Lattice::Q> in LbmSkeleton.h)
+    using Compute = ComputeFP;  // type of computed parameters such as omega (see LbmSkeleton.h)
+};
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Repoert.h b/benchmarks/lbm-lid-driven-cavity-flow/src/Repoert.h
index 565a9108..4ca0827b 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/src/Repoert.h
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Repoert.h
@@ -3,7 +3,7 @@
 #include <string>
 #include <vector>
 #include "Config.h"
-
+#include "Neon/domain/interface/GridBase.h"
 struct Report
 {
     Neon::Report mReport;
@@ -36,4 +36,5 @@ struct Report
     auto save()
         -> void;
     void recordBk(Neon::Backend& backend);
+    void recordGrid(Neon::domain::interface::GridBase& g);
 };
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/Report.cpp b/benchmarks/lbm-lid-driven-cavity-flow/src/Report.cpp
index 2e88f907..049d1735 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/src/Report.cpp
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/Report.cpp
@@ -29,6 +29,7 @@ Report::Report(const Config& c)
 
     mReport.addMember("computeType", c.computeType);
     mReport.addMember("storeType", c.storeType);
+    mReport.addMember("spaceCurve", Neon::domain::tool::spaceCurves::EncoderTypeUtil::toString(c.spaceCurve));
 
 
     mReport.addMember("occ", Neon::skeleton::OccUtils::toString(c.occ));
@@ -100,3 +101,8 @@ void Report::recordBk(Neon::Backend& backend)
 {
     backend.toReport(mReport);
 }
+
+void Report::recordGrid(Neon::domain::interface::GridBase& g)  // Lets the grid add its own entries to mReport.
+{
+    g.toReport(mReport, true);  // NOTE(review): the meaning of the boolean flag is not visible here — confirm against GridBase::toReport
+}
\ No newline at end of file
diff --git a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
index c603415c..146f108f 100644
--- a/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
+++ b/benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
@@ -2,10 +2,11 @@
 #include "D3Q19.h"
 #include "Neon/domain/bGrid.h"
 #include "Neon/domain/dGrid.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
 #include "Neon/domain/eGrid.h"
 
 #include "CellType.h"
-#include "LbmIteration.h"
+#include "LbmSkeleton.h"
 #include "Metrics.h"
 #include "Repoert.h"
 
@@ -14,15 +15,28 @@ namespace CavityTwoPop {
 int backendWasReported = false;
 
 namespace details {
-template <typename Grid,
-          typename StorageFP,
-          typename ComputeFP>
+template <typename Method_,
+          typename Grid,
+          typename Storage_,
+          typename Compute_>
 auto run(Config& config,
          Report& report) -> void
 {
-    using Lattice = D3Q19Template<StorageFP, ComputeFP>;
-    using PopulationField = typename Grid::template Field<StorageFP, Lattice::Q>;
+    using Storage = Storage_;
+    using Compute = Compute_;
+    using Precision = Precision<Storage, Compute>;
+    using Lattice = D3Q19<Precision>;
+    using PopulationField = typename Grid::template Field<Storage, Lattice::Q>;
 
+    using PopField = typename Grid::template Field<typename Precision::Storage, Lattice::Q>;
+    using CellTypeField = typename Grid::template Field<CellType, 1>;
+
+    using Idx = typename PopField::Idx;
+    using RhoField = typename Grid::template Field<typename Precision::Storage, 1>;
+    using UField = typename Grid::template Field<typename Precision::Storage, 3>;
+
+    using Skeleton = LbmSkeleton<Precision, Lattice, Grid>;
+    using ContainerFactory = ContainerFactory<Precision, Lattice, Grid>;
 
     Neon::Backend bk = [&] {
         if (config.deviceType == "cpu") {
@@ -38,49 +52,50 @@ auto run(Config& config,
         NEON_THROW(exce);
     }();
 
-    if (!backendWasReported) {
-        metrics::recordBackend(bk, report);
-        backendWasReported = true;
-    }
 
     Neon::double_3d ulid(1., 0., 0.);
-    Lattice         lattice(bk);
-
     // Neon Grid and Fields initialization
     auto [start, clock_iter] = metrics::restartClock(bk, true);
     Grid grid(
         bk, {config.N, config.N, config.N},
         [](const Neon::index_3d&) { return true; },
-        lattice.c_vect);
+        Lattice::template getDirectionAsVector<Lattice::MemoryMapping>(),
+        1.0, 0.0,
+        config.spaceCurve);
 
-    PopulationField pop0 = grid.template newField<StorageFP, Lattice::Q>("Population", Lattice::Q, StorageFP(0.0));
-    PopulationField pop1 = grid.template newField<StorageFP, Lattice::Q>("Population", Lattice::Q, StorageFP(0.0));
+    if (!backendWasReported) {
+        metrics::recordBackend(bk, report);
+        metrics::recordGrid(grid, report);
+        backendWasReported = true;
+    }
 
-    typename Grid::template Field<StorageFP, 1> rho;
-    typename Grid::template Field<StorageFP, 3> u;
+    PopulationField pop0 = grid.template newField<Storage, Lattice::Q>("Population", Lattice::Q, Storage(0.0));
+    PopulationField pop1 = grid.template newField<Storage, Lattice::Q>("Population", Lattice::Q, Storage(0.0));
+
+    typename Grid::template Field<Storage, 1> rho;
+    typename Grid::template Field<Storage, 3> u;
 
     if (!config.benchmark) {
         std::cout << "Allocating rho and u" << std::endl;
-        rho = grid.template newField<StorageFP, 1>("rho", 1, StorageFP(0.0));
-        u = grid.template newField<StorageFP, 3>("u", 3, StorageFP(0.0));
+        rho = grid.template newField<Storage, 1>("rho", 1, Storage(0.0));
+        u = grid.template newField<Storage, 3>("u", 3, Storage(0.0));
     }
 
 
     CellType defaultCelltype;
     auto     flag = grid.template newField<CellType, 1>("Material", 1, defaultCelltype);
-    auto     lbmParameters = config.getLbmParameters<ComputeFP>();
+    auto     lbmParameters = config.getLbmParameters<Compute>();
 
-    LbmIterationD3Q19<PopulationField, ComputeFP>
-        iteration(config.stencilSemantic,
-                  config.occ,
-                  config.transferMode,
-                  pop0,
-                  pop1,
-                  flag,
-                  lbmParameters.omega);
+    Skeleton iteration(config.stencilSemantic,
+                       config.occ,
+                       config.transferMode,
+                       pop0,
+                       pop1,
+                       flag,
+                       lbmParameters.omega);
 
     auto exportRhoAndU = [&bk, &rho, &u, &iteration, &flag, &grid, &ulid](int iterationId) {
-        if ((iterationId) % 100 == 0) {
+        if ((iterationId) % 1 == 0) {
             auto& f = iteration.getInput();
             {
                 bk.syncAll();
@@ -91,7 +106,7 @@ auto run(Config& config,
                 bk.syncAll();
             }
 
-            auto container = LbmContainers<Lattice, PopulationField, ComputeFP>::computeRhoAndU(f, flag, rho, u);
+            auto container = ContainerFactory::computeRhoAndU(f, flag, rho, u);
             container.run(Neon::Backend::mainStreamIdx);
             u.updateHostData(Neon::Backend::mainStreamIdx);
             rho.updateHostData(Neon::Backend::mainStreamIdx);
@@ -105,7 +120,8 @@ auto run(Config& config,
             u.ioToVtk("u_" + iterIdStr, "u", false);
             rho.ioToVtk("rho_" + iterIdStr, "rho", false);
             // iteration.getInput().ioToVtk("pop_" + iterIdStr, "u", false);
-            // flag.ioToVtk("flag_" + iterIdStr, "u", false);
+            flag.template ioToVtk<int>("flag_" + iterIdStr, "flag", false);
+            flag.template ioToVtk<int>("flag_" + iterIdStr, "flag", false);
 
             std::vector<std::pair<double, double>> xPosVal;
             std::vector<std::pair<double, double>> yPosVal;
@@ -162,71 +178,20 @@ auto run(Config& config,
 
         Neon::index_3d dim(config.N, config.N, config.N);
 
-        const auto& t = lattice.t_vect;
-        const auto& c = lattice.c_vect;
-
-        inPop.forEachActiveCell([&c, &t, &dim, &flag, &ulid, &config](const Neon::index_3d& idx,
-                                                                      const int&            k,
-                                                                      StorageFP&            val) {
-            val = t.at(k);
-
-            if (idx.x == 0 || idx.x == dim.x - 1 ||
-                idx.y == 0 || idx.y == dim.y - 1 ||
-                idx.z == 0 || idx.z == dim.z - 1) {
-
-                if (idx.y == dim.y - 1) {
-                    val = -6. * t.at(k) * config.ulb *
-                          (c.at(k).v[0] * ulid.v[0] +
-                           c.at(k).v[1] * ulid.v[1] +
-                           c.at(k).v[2] * ulid.v[2]);
-                } else {
-                    val = 0;
-                }
-            }
-        });
-
-        outPop.forEachActiveCell([&c, &t, &dim, &flag, &ulid, &config](const Neon::index_3d& idx,
-                                                                       const int&            k,
-                                                                       StorageFP&            val) {
-            val = t.at(k);
-
-            if (idx.x == 0 || idx.x == dim.x - 1 ||
-                idx.y == 0 || idx.y == dim.y - 1 ||
-                idx.z == 0 || idx.z == dim.z - 1) {
-
-                if (idx.y == dim.y - 1) {
-                    val = -6. * t.at(k) * config.ulb *
-                          (c.at(k).v[0] * ulid.v[0] +
-                           c.at(k).v[1] * ulid.v[1] +
-                           c.at(k).v[2] * ulid.v[2]);
-                } else {
-                    val = 0;
-                }
-            }
-        });
-
-        flag.forEachActiveCell([&dim](const Neon::index_3d& idx,
-                                      const int&,
-                                      CellType& flagVal) {
-            flagVal.classification = CellType::bulk;
-            flagVal.wallNghBitflag = 0;
-
-            if (idx.x == 0 || idx.x == dim.x - 1 ||
-                idx.y == 0 || idx.y == dim.y - 1 ||
-                idx.z == 0 || idx.z == dim.z - 1) {
+        //        const auto& t = Lattice::Memory::t;
+        //        const auto& c = Lattice::Memory::stencil;
 
-                flagVal.classification = CellType::bounceBack;
-
-                if (idx.y == dim.y - 1) {
-                    flagVal.classification = CellType::movingWall;
-                }
-            }
-        });
+        ContainerFactory::problemSetup(inPop,
+                                       outPop,
+                                       flag,
+                                       ulid,
+                                       config.ulb)
+            .run(Neon::Backend::mainStreamIdx);
 
-        inPop.updateDeviceData(Neon::Backend::mainStreamIdx);
-        outPop.updateDeviceData(Neon::Backend::mainStreamIdx);
 
-        flag.updateDeviceData(Neon::Backend::mainStreamIdx);
+        inPop.updateHostData(Neon::Backend::mainStreamIdx);
+        outPop.updateHostData(Neon::Backend::mainStreamIdx);
+        flag.updateHostData(Neon::Backend::mainStreamIdx);
         {
             bk.syncAll();
             flag.newHaloUpdate(Neon::set::StencilSemantic::standard /*semantic*/,
@@ -236,7 +201,7 @@ auto run(Config& config,
             bk.syncAll();
         }
 
-        auto container = LbmContainers<Lattice, PopulationField, ComputeFP>::computeWallNghMask(flag, flag);
+        auto container = ContainerFactory::computeWallNghMask(flag, flag);
         container.run(Neon::Backend::mainStreamIdx);
         bk.syncAll();
     }
@@ -275,15 +240,15 @@ auto run(Config& config,
     metrics::recordMetrics(bk, config, report, start, clock_iter);
 }
 
-template <typename Grid, typename StorageFP>
+template <typename Grid, typename Storage>  // Dispatches on config.computeType; only "double" is currently enabled.
 auto runFilterComputeType(Config& config, Report& report) -> void
 {
     if (config.computeType == "double") {
-        return run<Grid, StorageFP, double>(config, report);
-    }
-    if (config.computeType == "float") {
-        return run<Grid, StorageFP, float>(config, report);
+        return run<Grid, Storage, double>(config, report);  // NOTE(review): run is declared above with four template parameters (Method_, Grid, Storage_, Compute_) but three are passed here — confirm this still compiles
     }
+//    if (config.computeType == "float") {
+//        return run<Grid, Storage, float>(config, report);
+//    }
     NEON_DEV_UNDER_CONSTRUCTION("");
 }
 
@@ -295,23 +260,96 @@ auto runFilterStoreType(Config& config,
     if (config.storeType == "double") {
         return runFilterComputeType<Grid, double>(config, report);
     }
-    if (config.storeType == "float") {
-        return runFilterComputeType<Grid, float>(config, report);
-    }
+//    if (config.storeType == "float") {
+//        return runFilterComputeType<Grid, float>(config, report);
+//    }
+    NEON_DEV_UNDER_CONSTRUCTION("");
 }
 }  // namespace details
 
+#ifdef NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS
+constexpr bool skipTest = false;
+#else
+constexpr bool skipTest = false;  // NOTE(review): same value as the #ifdef branch, so the macro currently has no effect — confirm whether `true` was intended
+#endif
+
 auto run(Config& config,
          Report& report) -> void
 {
     if (config.gridType == "dGrid") {
         return details::runFilterStoreType<Neon::dGrid>(config, report);
     }
-    if (config.gridType == "eGrid") {
-        return details::runFilterStoreType<Neon::eGrid>(config, report);
-    }
-    if (config.gridType == "bGrid") {
-        return details::runFilterStoreType<Neon::bGrid>(config, report);
-    }
+//    if (config.gridType == "eGrid") {
+//        if constexpr (!skipTest) {
+//            return details::runFilterStoreType<Neon::eGrid>(config, report);
+//        } else {
+//            NEON_THROW_UNSUPPORTED_OPERATION("This option was disabled. Please define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+//        }
+//    }
+//    if (config.gridType == "bGrid" || config.gridType == "bGrid_8_8_8") {
+//        return details::runFilterStoreType<Neon::bGrid>(config, report);
+//    }
+//    if (config.gridType == "bGrid_4_4_4") {
+//        if constexpr (!skipTest) {
+//            using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>;
+//            using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+//            return details::runFilterStoreType<Grid>(config, report);
+//        } else {
+//            NEON_THROW_UNSUPPORTED_OPERATION("This option was disabled. Please define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+//        }
+//    }
+//    if (config.gridType == "bGrid_2_2_2") {
+//        if constexpr (!skipTest) {
+//            using Sblock = Neon::domain::details::bGrid::StaticBlock<2, 2, 2>;
+//            using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+//            return details::runFilterStoreType<Grid>(config, report);
+//        } else {
+//            NEON_THROW_UNSUPPORTED_OPERATION("This option was disabled. Please define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+//        }
+//    }
+    //    if (config.gridType == "bGrid_32_8_4") {
+    //        if constexpr (!skipTest) {
+    //            using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>;
+    //            using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+    //            return details::runFilterStoreType<Grid>(config, report);
+    //        } else {
+    //            NEON_THROW_UNSUPPORTED_OPERATION("This option was disabled. Please define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+    //        }
+    //    }
+    //    if (config.gridType == "bGrid_32_8_4") {  // NOTE(review): exact duplicate of the commented-out block above
+    //        if constexpr (!skipTest) {
+    //            using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>;
+    //            using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+    //            return details::runFilterStoreType<Grid>(config, report);
+    //        } else {
+    //            NEON_THROW_UNSUPPORTED_OPERATION("This option was disabled. Please define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+    //        }
+    //    }
+    //    if (config.gridType == "bGrid_32_2_8") {
+    //        if constexpr (!skipTest) {
+    //            using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 2, 8>;
+    //            using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+    //            return details::runFilterStoreType<Grid>(config, report);
+    //        } else {
+    //            NEON_THROW_UNSUPPORTED_OPERATION("This option was disabled. Please define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+    //        }
+    //    }
+    //    if (config.gridType == "bGrid_32_8_2") {
+    //        if constexpr (!skipTest) {
+    //            using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 2>;
+    //            using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+    //            return details::runFilterStoreType<Grid>(config, report);
+    //        } else {
+    //            NEON_THROW_UNSUPPORTED_OPERATION("This option was disabled. Please define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+    //        }
+    //    }
+    //    if (config.gridType == "dGridSoA") {
+    //        if constexpr (!skipTest) {
+    //            return details::runFilterStoreType<Neon::domain::details::dGridSoA::dGridSoA>(config, report);
+    //        } else {
+    //            NEON_THROW_UNSUPPORTED_OPERATION("This option was disabled. Please define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+    //        }
+    //    }
+    NEON_THROW_UNSUPPORTED_OPERATION("Unknown grid type: " + config.gridType);  // reached for every gridType other than "dGrid": all other branches above are commented out
 }
 }  // namespace CavityTwoPop
diff --git a/benchmarks/lbm/CMakeLists.txt b/benchmarks/lbm/CMakeLists.txt
new file mode 100644
index 00000000..7f0c1415
--- /dev/null
+++ b/benchmarks/lbm/CMakeLists.txt
@@ -0,0 +1,39 @@
+cmake_minimum_required(VERSION 3.19 FATAL_ERROR)
+
+set(APP "lbm")
+
+# Every file under src/ belongs to the executable. CONFIGURE_DEPENDS makes the
+# build re-check the glob, so newly added sources are picked up without a
+# manual CMake re-configure.
+file(GLOB_RECURSE SrcFiles CONFIGURE_DEPENDS src/*.*)
+
+add_executable(${APP} ${SrcFiles})
+
+target_link_libraries(${APP}
+		PUBLIC libNeonDomain
+		PUBLIC libNeonSkeleton)
+
+set_target_properties(${APP} PROPERTIES
+		CUDA_SEPARABLE_COMPILATION ON
+		CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+
+# NOTE(review): INTERFACE options are propagated to consumers of a target only;
+# confirm whether these flags are meant to apply to ${APP} itself (PRIVATE).
+target_compile_options(${APP} INTERFACE
+		$<$<COMPILE_LANGUAGE:CXX>:${NeonCXXFlags}>
+		$<$<COMPILE_LANGUAGE:CUDA>:${NeonCUDAFlags}>
+		)
+
+# Copy the driver scripts into ${CMAKE_BINARY_DIR}/bin after each build.
+add_custom_command(
+		TARGET ${APP}  POST_BUILD
+		COMMAND ${CMAKE_COMMAND} -E copy
+		${CMAKE_CURRENT_SOURCE_DIR}/${APP}.sh
+		${CMAKE_BINARY_DIR}/bin/${APP}.sh)
+
+add_custom_command(
+		TARGET ${APP}  POST_BUILD
+		COMMAND ${CMAKE_COMMAND} -E copy
+		${CMAKE_CURRENT_SOURCE_DIR}/${APP}.py
+		${CMAKE_BINARY_DIR}/bin/${APP}.py
+)
\ No newline at end of file
diff --git a/benchmarks/lbm/lbm.py b/benchmarks/lbm/lbm.py
new file mode 100644
index 00000000..730dd05c
--- /dev/null
+++ b/benchmarks/lbm/lbm.py
@@ -0,0 +1,123 @@
+"""Design-of-experiments driver for the `lbm` benchmark executable.
+
+Runs `./lbm` once for every valid combination of the option lists below and
+writes each command line, followed by the program's stdout, to `./lbm.log`.
+"""
+import itertools
+import subprocess
+import sys
+
+deviceType_LIST = 'cpu gpu'.split()
+deviceIds_LIST = "0 1 2 3 4 5 6 7".split()
+grid_LIST = "dGrid bGrid_4_4_4".split()
+domainSize_LIST = "64 128 192 256 320 384 448 512".split()
+computeFP_LIST = "double float".split()
+storageFP_LIST = "double float".split()
+occ_LIST = "none".split()
+transferMode_LIST = "get put".split()
+stencilSemantic_LIST = "grid streaming".split()
+spaceCurve_LIST = "sweep morton hilbert".split()
+collision_LIST = "bgk kbc".split()
+streamingMethod_LIST = "push pull aa".split()
+lattice_LIST = "d3q19 d3q27".split()
+
+warmupIter_INT = 10
+repetitions_INT = 5
+maxIter_INT = 10000
+
+
+def printProgressBar(value, label):
+    """Redraw a one-line text progress bar; `value` is a percentage (0-100)."""
+    n_bar = 40  # size of progress bar
+    fraction = value / 100
+    sys.stdout.write('\r')
+    bar = '█' * int(n_bar * fraction)
+    bar = bar + '-' * int(n_bar * (1 - fraction))
+
+    sys.stdout.write(f"{label.ljust(10)} | [{bar:{n_bar}s}] {int(100 * fraction)}% ")
+    sys.stdout.flush()
+
+
+def deviceSets(deviceType):
+    """Return the device-id strings to sweep for `deviceType`.
+
+    The first id alone is always included; for 'gpu' every longer prefix of
+    deviceIds_LIST ("0 1", "0 1 2", ...) is added as well.
+    """
+    sets = [deviceIds_LIST[0]]
+    if deviceType == 'gpu':
+        for device in deviceIds_LIST[1:]:
+            sets.append(sets[-1] + ' ' + device)
+    return sets
+
+
+def iterParameterSets():
+    """Yield the list of command-line parameters of every run, in execution order.
+
+    countAll() and the run loop both consume this generator, so the progress
+    total always matches the set of commands that are actually executed.
+    """
+    for deviceType in deviceType_LIST:
+        for deviceSet in deviceSets(deviceType):
+            for (occ, domainSize, storageFP, computeFP, grid, curve, lattice, transferMode,
+                 stencilSemantic, collision, streamingMethod) in itertools.product(
+                    occ_LIST, domainSize_LIST, storageFP_LIST, computeFP_LIST, grid_LIST,
+                    spaceCurve_LIST, lattice_LIST, transferMode_LIST, stencilSemantic_LIST,
+                    collision_LIST, streamingMethod_LIST):
+                if lattice != "d3q27" and collision == 'kbc':
+                    continue
+                # Streaming methods other than 'pull' are swept on single-device sets
+                # only. The size of the *current* device set is what matters here:
+                # the length of the list of all sets is greater than 1 for every gpu
+                # run, which would also drop push/aa on a single GPU.
+                if streamingMethod != 'pull' and len(deviceSet.split()) != 1:
+                    continue
+                if (storageFP, computeFP) in (('double', 'float'), ('float', 'double')):
+                    continue
+
+                yield [
+                    '--deviceType ' + deviceType,
+                    '--deviceIds ' + deviceSet,
+                    '--grid ' + grid,
+                    '--domain-size ' + domainSize,
+                    '--max-iter ' + str(maxIter_INT),
+                    '--report-filename ' + 'lbm',
+                    '--computeFP ' + computeFP,
+                    '--storageFP ' + storageFP,
+                    '--occ ' + occ,
+                    '--transferMode ' + transferMode,
+                    '--stencilSemantic ' + stencilSemantic,
+                    '--spaceCurve ' + curve,
+                    '--collision ' + collision,
+                    '--streamingMethod ' + streamingMethod,
+                    '--lattice ' + lattice,
+                    '--benchmark ',
+                    '--warmup-iter ' + str(warmupIter_INT),
+                    '--repetitions ' + str(repetitions_INT),
+                ]
+
+
+def countAll():
+    """Return the number of runs the sweep will execute."""
+    return sum(1 for _ in iterParameterSets())
+
+
+SAMPLES = countAll()
+counter = 0
+command = './lbm'
+# command = 'echo'
+with open(command + '.log', 'w') as fp:
+    for parameters in iterParameterSets():
+        commandList = [command]
+        for el in parameters:
+            commandList.extend(el.split())
+
+        fp.write("\n-------------------------------------------\n")
+        fp.write(' '.join(commandList))
+        fp.write("\n-------------------------------------------\n")
+        fp.flush()
+        print(' '.join(commandList))
+        subprocess.run(commandList, text=True, stdout=fp)
+
+        counter += 1
+        printProgressBar(counter * 100.0 / SAMPLES, 'Progress')
diff --git a/benchmarks/lbm/lbm.sh b/benchmarks/lbm/lbm.sh
new file mode 100644
index 00000000..7cc5108c
--- /dev/null
+++ b/benchmarks/lbm/lbm.sh
@@ -0,0 +1,30 @@
+set -x
+# Prints one benchmark command line per parameter combination listed below.
+DOMAIN_SIZE_LIST="64 128 192 256 320 384 448 512"
+GRID_LIST="dGrid bGrid eGrid"
+STORAGE_FP_LIST="double float"
+COMPUTE_FP_LIST="double float"
+OCC="nOCC"
+# Sweep: domain size x storage precision x compute precision x grid type.
+for DOMAIN_SIZE in ${DOMAIN_SIZE_LIST}; do
+  for STORAGE_FP in ${STORAGE_FP_LIST}; do
+    for COMPUTE_FP in ${COMPUTE_FP_LIST}; do
+      for GRID in ${GRID_LIST}; do
+        # Skip the combination "double storage / float compute".
+        if [ "${STORAGE_FP}_${COMPUTE_FP}" = "double_float" ]; then
+          continue
+        fi
+        # NOTE(review): echo only prints the command line; the binary is not executed here - confirm this dry run is intended.
+        echo ./lbm-lid-driven-cavity-flow \
+          --deviceType gpu --deviceIds 0 \
+          --grid "${GRID}" \
+          --domain-size "${DOMAIN_SIZE}" \
+          --warmup-iter 10 --max-iter 100 --repetitions 5 \
+          --report-filename "lbm-lid-driven-cavity-flow_${DOMAIN_SIZE}_${GRID}_STORAGE_${STORAGE_FP}_COMPUTE_${COMPUTE_FP}" \
+          --computeFP "${COMPUTE_FP}" \
+          --storageFP "${STORAGE_FP}" \
+          --${OCC} --benchmark
+      done
+    done
+  done
+done
diff --git a/benchmarks/lbm/src/CellType.h b/benchmarks/lbm/src/CellType.h
new file mode 100644
index 00000000..47c0397b
--- /dev/null
+++ b/benchmarks/lbm/src/CellType.h
@@ -0,0 +1,56 @@
+#pragma once
+
+// Per-cell metadata: a classification plus a 32-bit flag mask
+// (presumably one bit per lattice direction marking wall neighbours - confirm).
+struct CellType
+{
+    enum Classification : int
+    {
+        bounceBack,
+        movingWall,
+        bulk,
+        undefined
+    };
+
+    // Default state: bulk cell, no flag set. `dummy` is accepted and ignored.
+    NEON_CUDA_HOST_DEVICE CellType(int dummy = 0)
+        : wallNghBitflag(0), classification(bulk)
+    {
+        (void)dummy;
+    }
+
+    // Cell of class `c` carrying the flag mask `n`.
+    NEON_CUDA_HOST_DEVICE explicit CellType(Classification c, uint32_t n)
+        : wallNghBitflag(n), classification(c)
+    {}
+
+    // Cell of class `c` with an empty flag mask.
+    NEON_CUDA_HOST_DEVICE explicit CellType(Classification c)
+        : wallNghBitflag(0), classification(c)
+    {}
+
+    // Converting to int to exportVti
+    operator int() const { return static_cast<int>(classification); }
+
+    // True when bit `fwdRegQ` of `wallNghBitFlag` is set.
+    template <int fwdRegQ>
+    NEON_CUDA_HOST_DEVICE static auto isWall(const uint32_t& wallNghBitFlag) -> bool
+    {
+        return (wallNghBitFlag & (uint32_t(1) << fwdRegQ)) != 0;
+    }
+
+    // Sets bit `fwdRegIdx` of this cell's flag mask.
+    NEON_CUDA_HOST_DEVICE auto setWall(int fwdRegIdx) -> void
+    {
+        wallNghBitflag |= uint32_t(1) << fwdRegIdx;
+    }
+
+    uint32_t       wallNghBitflag;
+    Classification classification;
+};
+
+// Streams the classification as a number; `inline` because this header may be included by several translation units.
+inline std::ostream& operator<<(std::ostream& os, const CellType& dt)
+{
+    return os << static_cast<double>(dt.classification);
+}
\ No newline at end of file
diff --git a/benchmarks/lbm/src/Collision.cpp b/benchmarks/lbm/src/Collision.cpp
new file mode 100644
index 00000000..3f7510cd
--- /dev/null
+++ b/benchmarks/lbm/src/Collision.cpp
@@ -0,0 +1,127 @@
+#include "Collision.h"
+
+
+// Maps a collision model to its textual name.
+auto CollisionUtils::toString(Collision occ) -> std::string
+{
+    if (occ == Collision::bgk) {
+        return "bgk";
+    }
+    if (occ == Collision::kbc) {
+        return "kbc";
+    }
+    // Reached only for a value that matches none of the enumerators above.
+    NEON_THROW_UNSUPPORTED_OPTION("");
+}
+
+auto CollisionUtils::fromString(const std::string& occ) -> Collision
+{
+    // Linear scan over the known models, comparing `occ` with each textual name.
+    const std::array<Collision, nOptions> candidates{Collision::bgk, Collision::kbc};
+    for (auto it = candidates.begin(); it != candidates.end(); ++it) {
+        if (occ == toString(*it)) { return *it; }
+    }
+    // No model is spelled like `occ`.
+    NEON_THROW_UNSUPPORTED_OPTION("");
+}
+
+// Lists every selectable collision model.
+auto CollisionUtils::getOptions() -> std::array<Collision, nOptions>
+{
+    return std::array<Collision, nOptions>{Collision::bgk, Collision::kbc};
+}
+
+// Creates a holder with no model selected.
+CollisionUtils::Cli::Cli()
+    : mSet(false)
+{}
+
+CollisionUtils::Cli::Cli(std::string s)
+{
+    this->set(s);  // set() resolves the name to a model and marks the holder as set
+}
+
+// Creates a holder preset to `model`.
+CollisionUtils::Cli::Cli(Collision model)
+    : mSet(true), mOption(model)
+{
+}
+
+auto CollisionUtils::Cli::getOption() const -> Collision
+{
+    // Reading an unset holder is reported through NEON_ERROR before the stored value is returned.
+    if (mSet == false) {
+        std::stringstream report("Collision model was not set.");
+        NEON_ERROR(report.str());
+    }
+    return mOption;
+}
+
+auto CollisionUtils::Cli::getOptionStr() const -> std::string
+{
+    // An unset holder is reported through NEON_ERROR; then the stored model is rendered as text.
+    if (mSet == false) {
+        std::stringstream report("Collision model was not set.");
+        NEON_ERROR(report.str());
+    }
+    return CollisionUtils::toString(mOption);
+}
+
+// Selects the collision model named by `opt`.
+//
+// A name that CollisionUtils::fromString rejects is reported through
+// NEON_ERROR together with the comma-separated list of valid names.
+auto CollisionUtils::Cli::set(const std::string& opt)
+    -> void
+{
+    try {
+        mOption = CollisionUtils::fromString(opt);
+    } catch (...) {
+        // fromString signals an unknown name by throwing; turn that into a
+        // message that tells the user what would have been accepted.
+        std::stringstream errorMsg;
+        errorMsg << "Collision: " << opt << " is not a valid option (valid options are {";
+        // getAllOptionsStr() yields each valid name exactly once, separated by
+        // ", ", so the braces enclose a clean list such as "bgk, kbc".
+        errorMsg << getAllOptionsStr();
+        errorMsg << "})";
+        NEON_ERROR(errorMsg.str());
+    }
+    mSet = true;
+}
+
+// Builds the comma-separated list of every collision model name.
+auto CollisionUtils::Cli::getAllOptionsStr() const -> std::string
+{
+    std::string joined;
+    bool        needSeparator = false;
+    for (const Collision model : CollisionUtils::getOptions()) {
+        if (needSeparator) {
+            joined += ", ";
+        }
+        joined += CollisionUtils::toString(model);
+        // Every name after the first one is preceded by a separator.
+        needSeparator = true;
+    }
+    return joined;
+}
+
+
+// Help text: all valid names followed by the currently stored model, labelled "default".
+auto CollisionUtils::Cli::getDoc() const -> std::string
+{
+    std::string doc = getAllOptionsStr();
+    doc += " default: " + CollisionUtils::toString(getOption());
+    return doc;
+}
+
+auto CollisionUtils::Cli::addToReport(Neon::Report& report) const -> void
+{
+    report.addMember("Collision", getOptionStr());  // entry keyed "Collision" holding the model name
+}
+
+auto CollisionUtils::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void
+{
+    report.addMember("Collision", getOptionStr(), &subBlock);  // same entry, passed together with `subBlock`
+}
+
diff --git a/benchmarks/lbm/src/Collision.h b/benchmarks/lbm/src/Collision.h
new file mode 100644
index 00000000..c022a018
--- /dev/null
+++ b/benchmarks/lbm/src/Collision.h
@@ -0,0 +1,43 @@
+#pragma once
+#include "Neon/Report.h"
+#include "Neon/set/Backend.h"
+#include "Neon/set/Containter.h"
+
+
+enum class Collision  // collision model identifier; CollisionUtils converts it to and from text
+{
+    bgk,  // rendered as "bgk" by CollisionUtils::toString
+    kbc   // rendered as "kbc" by CollisionUtils::toString
+};
+
+struct CollisionUtils  // string conversion helpers and command-line holder for Collision
+{
+    static constexpr int nOptions = 2;
+    // Conversions between Collision values and their textual names.
+    static auto toString(Collision occ) -> std::string;
+    static auto fromString(const std::string& occ) -> Collision;
+    static auto getOptions() -> std::array<Collision, nOptions>;
+    // Holds the collision model selected on the command line.
+    struct Cli
+    {
+        explicit Cli(std::string);
+        explicit Cli(Collision model);
+        Cli();
+        // Accessors; both report through NEON_ERROR when no model was set.
+        auto getOption() const -> Collision;
+        auto getOptionStr() const -> std::string;
+        // Selection by name and help-text helpers.
+        auto set(const std::string& opt) -> void;
+        auto getAllOptionsStr() const -> std::string;
+        auto getDoc() const -> std::string;
+        // Adds a "Collision" entry to the report.
+        auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void;
+        auto addToReport(Neon::Report& report) const -> void;
+
+       private:
+        bool mSet = false;
+        Collision  mOption = Collision::bgk;  // defined value even before set(); mSet tells whether it was chosen
+    };
+};
+
+
diff --git a/benchmarks/lbm/src/Config.cpp b/benchmarks/lbm/src/Config.cpp
new file mode 100644
index 00000000..ae30c720
--- /dev/null
+++ b/benchmarks/lbm/src/Config.cpp
@@ -0,0 +1,151 @@
+#include "Config.h"
+#include <string>
+#include <vector>
+
+// Renders the configuration as a human-readable listing made of four sections
+// (Neon runtime, LBM implementation, physics, test parameters). Each entry is
+// written on its own line as a dot-padded label followed by the value.
+auto Config::toString() const -> std::string
+{
+    std::stringstream out;
+
+    // Device ids joined by single spaces.
+    auto joinIds = [](const std::vector<int>& ids) {
+        std::string joined;
+        for (std::size_t k = 0; k < ids.size(); ++k) {
+            if (k != 0) {
+                joined += " ";
+            }
+            joined += std::to_string(ids[k]);
+        }
+        return joined;
+    };
+
+    out << "\n==>[Neon Runtime Parameters]" << '\n';
+    out << ".......... deviceType " << deviceType << '\n';
+    out << ".......... numDevices " << devices.size() << '\n';
+    out << "............. devices " << joinIds(devices) << '\n';
+    out << ".......... reportFile " << reportFile << '\n';
+    out << "............ gridType " << gridType << '\n';
+
+    out << ".......... spaceCurve " << spaceCurveCli.getStringOption() << '\n';
+    out << "................. occ " << occCli.getStringOption() << '\n';
+    out << "........ transferMode " << transferModeCli.getStringOption() << '\n';
+    out << "..... stencilSemantic " << stencilSemanticCli.getStringOption() << '\n';
+
+    out << "\n==>[LBM Implementation]" << '\n';
+    out << "............. lattice " << lattice << '\n';
+    out << ".... streaming method " << streamingMethod << '\n';
+    out << "........... collision " << collisionCli.getOptionStr() << '\n';
+    out << "......... computeType " << computeTypeStr << '\n';
+    out << "........... storeType " << storeTypeStr << '\n';
+
+    out << "\n==>[Physics Parameters]" << '\n';
+    out << ".................. Re " << Re << '\n';
+    out << "................. ulb " << ulb << '\n';
+    out << "................... N " << N << '\n';
+    out << "................. nu " << mLbmParameters.nu << '\n';
+    out << ".............. omega " << mLbmParameters.omega << '\n';
+    out << "................. dx " << mLbmParameters.dx << '\n';
+    out << "................. dt " << mLbmParameters.dt << '\n';
+
+    out << "\n==>[Test Parameters]" << '\n';
+    out << "........... benchmark " << benchmark << '\n';
+    out << "............... max_t " << max_t << '\n';
+    out << "................. vti " << vti << '\n';
+    out << "........ benchIniIter " << benchIniIter << '\n';
+    out << "........ benchMaxIter " << benchMaxIter << '\n';
+
+
+    return out.str();
+}
+
+// Parses the command line into this Config.
+// Returns -1 after printing the man page and usage examples when parsing fails;
+// otherwise derives the LBM parameters, records the command line in mArgv and returns 0.
+auto Config::parseArgs(const int argc, char* argv[]) -> int
+{
+    auto& config = *this;
+
+    auto cli =
+        (
+            // Mandatory device selection.
+            clipp::required("--deviceType") & clipp::value("deviceType", config.deviceType) % "Device type (cpu or gpu)",
+            clipp::required("--deviceIds") & clipp::integers("ids", config.devices) % "Device ids",
+
+            clipp::option("--grid") & clipp::value("grid", config.gridType) % Config::getOptionList(config.gridTypeOptions, config.gridType),
+            clipp::option("--domain-size") & clipp::integer("domain_size", config.N) % "Voxels along each dimension of the cube domain",
+            clipp::option("--max-iter") & clipp::integer("max_iter", config.benchMaxIter) % "Maximum solver iterations",
+            clipp::option("--report-filename") & clipp::value("keeper_filename", config.reportFile) % "Output perf keeper filename",
+
+            clipp::option("--computeFP") & clipp::value("computeFP", config.computeTypeStr) % Config::getOptionList(config.computeTypeOptions, config.computeTypeStr),
+            clipp::option("--storageFP") & clipp::value("storageFP", config.storeTypeStr) % "double, float",
+
+            clipp::option("--occ") & clipp::value("occ")([&config](const std::string& s) { config.occCli.set(s); }) % config.occCli.getDoc(),
+            clipp::option("--transferMode") & clipp::value("transferMode")([&config](const std::string& s) { config.transferModeCli.set(s); }) % config.transferModeCli.getDoc(),
+            clipp::option("--stencilSemantic") & clipp::value("stencilSemantic")([&config](const std::string& s) { config.stencilSemanticCli.set(s); }) % config.stencilSemanticCli.getDoc(),
+            clipp::option("--spaceCurve") & clipp::value("spaceCurve")([&config](const std::string& s) { config.spaceCurveCli.set(s); }) % config.spaceCurveCli.getDoc(),
+            clipp::option("--collision") & clipp::value("collision")([&config](const std::string& s) { config.collisionCli.set(s); }) % config.collisionCli.getDoc(),
+
+            clipp::option("--streamingMethod") & clipp::value("streamingMethod", config.streamingMethod) % Config::getOptionList(config.streamingMethodOption, config.streamingMethod),
+            clipp::option("--lattice") & clipp::value("lattice", config.lattice) % Config::getOptionList(config.latticeOptions, config.lattice),
+            // The benchmark options and the vti export are alternatives (`|`).
+            (
+                (
+                    clipp::option("--benchmark").set(config.benchmark, true) % "Run benchmark mode",
+                    clipp::option("--warmup-iter") & clipp::integer("warmup_iter", config.benchIniIter) % "Number of iteration for warm up. max_iter = warmup_iter + timed_iters",
+                    clipp::option("--repetitions") & clipp::integer("repetitions", config.repetitions) % "Number of times the benchmark is run."
+                    ) |
+                (clipp::option("--vti") & clipp::integer("OutputFrequency", config.vti) % "Output frequency of the vti export"))
+        );
+
+    if (!clipp::parse(argc, argv, cli)) {
+        auto fmt = clipp::doc_formatting{}.doc_column(31);
+        std::cout << make_man_page(cli, argv[0], fmt) << '\n';
+        std::cout << '\n';
+        std::cout << '\n';
+        std::cout << "Export example" << '\n';
+        std::cout << "./lbm --deviceType cpu --deviceIds 0  --grid dGrid  --domain-size 100 --max-iter 2000 --occ none --stencilSemantic grid --vti 1" << '\n';
+        std::cout << "Benchmark example " << '\n';
+        std::cout << "./lbm --deviceType gpu --deviceIds 0 1 2 3 4  --grid dGrid  --domain-size 100 --max-iter 2000 --computeFP double --storageFP double --occ none --stencilSemantic grid --benchmark --warmup-iter 10 --repetitions 5" << '\n';
+        // Fully spelled-out invocation, one option per line.
+        std::cout <<" ./lbm --deviceType gpu\\\n"
+                     "     --deviceIds 0\\\n"
+                     "     --grid dGrid\\\n"
+                     "     --domain-size 100\\\n"
+                     "     --max-iter 1000\\\n"
+                     "     --computeFP float\\\n"
+                     "     --storageFP float\\\n"
+                     "     --occ none\\\n"
+                     "     --transferMode put\\\n"
+                     "     --stencilSemantic grid\\\n"
+                     "     --spaceCurve sweep\\\n"
+                     "     --collision bgk\\\n"
+                     "     --streamingMethod pull\\\n"
+                     "     --lattice d3q19\\\n"
+                     "     --vti 10" << '\n';
+
+        return -1;
+    }
+    // nu, omega, dx and dt depend on the parsed Re, ulb and N.
+    helpSetLbmParameters();
+    // Keep the full command line, space separated, in mArgv.
+    std::stringstream s;
+    for (int i = 0; i < argc; i++) {
+        s << argv[i];
+        if (i + 1 != argc) {
+            s << " ";
+        }
+    }
+    mArgv = s.str();
+
+    return 0;
+}
+
+auto Config::helpSetLbmParameters() -> void
+{
+    const double cells = static_cast<double>(N - 2);  // N minus two (presumably the two wall nodes - confirm)
+    const double nu = ulb * cells / Re;
+    const double dx = 1. / cells;
+    mLbmParameters = LbmParameters<double>{nu, 1. / (3. * nu + 0.5), dx, dx * ulb};  // {nu, omega, dx, dt}
+}
diff --git a/benchmarks/lbm/src/Config.h b/benchmarks/lbm/src/Config.h
new file mode 100644
index 00000000..b5a1607a
--- /dev/null
+++ b/benchmarks/lbm/src/Config.h
@@ -0,0 +1,105 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include "Collision.h"
+#include "Neon/core/tools/clipp.h"
+#include "Neon/domain/tools/SpaceCurves.h"
+#include "Neon/skeleton/Skeleton.h"
+
+template <typename ComputeType>
+struct LbmParameters  // values filled in by Config::helpSetLbmParameters from Re, ulb and N
+{
+    ComputeType nu = 0;     // ulb * (N - 2) / Re
+    ComputeType omega = 0;  // 1 / (3 * nu + 0.5)
+    ComputeType dx = 0;     // 1 / (N - 2)
+    ComputeType dt = 0;     // dx * ulb
+};
+
+// Run-time settings of the LBM benchmark (physics, benchmark control, Neon backend and
+// LBM implementation choices) together with command-line parsing and printing.
+struct Config
+{
+    double Re = 100.;            // Reynolds number
+    double ulb = 0.04;           // Velocity in lattice units
+    int    N = 160;              // Number of nodes in x-direction
+    bool   benchmark = false;    // Run in benchmark mode ?
+    double max_t = 10.0;         // Non-benchmark mode: Total time in dim.less units
+    int    benchIniIter = 1000;  // Benchmark mode: Number of warmup iterations
+    int    benchMaxIter = 2000;  // Benchmark mode: Total number of iterations
+    int    repetitions = 1;      // Benchmark mode: number of time the test is run
+
+    std::string      deviceType = "gpu";
+    std::vector<int> devices = std::vector<int>(0);              // Devices for the execution
+    std::string      reportFile = "lbm-lid-driven-cavity-flow";  // Report file name
+
+    std::vector<std::string> gridTypeOptions = {"dGrid", "eGrid", "bGrid"};
+    std::string              gridType = gridTypeOptions[0];  // Neon grid type
+
+    Neon::skeleton::OccUtils::Cli                         occCli{Neon::skeleton::Occ::none};              // Neon OCC type
+    Neon::set::TransferModeUtils::Cli                     transferModeCli{Neon::set::TransferMode::get};  // Neon transfer mode for halo update
+    Neon::set::StencilSemanticUtils::Cli                  stencilSemanticCli{Neon::set::StencilSemantic::lattice};
+    Neon::domain::tool::spaceCurves::EncoderTypeUtil::Cli spaceCurveCli{Neon::domain::tool::spaceCurves::EncoderType::sweep};
+    CollisionUtils::Cli                                   collisionCli{Collision::bgk};
+    int                                                   vti = 0;  // Export vti file
+
+    std::vector<std::string> computeTypeOptions = {"double", "float"};
+    std::string              computeTypeStr = computeTypeOptions[0];
+
+    std::vector<std::string> storeTypeOptions = {"double", "float"};
+    std::string              storeTypeStr = storeTypeOptions[0];
+
+    std::vector<std::string> latticeOptions = {"d3q19", "d3q27"};
+    std::string              lattice = latticeOptions[0];
+
+    std::vector<std::string> streamingMethodOption = {"push", "pull"};
+    std::string              streamingMethod = "push";
+
+    LbmParameters<double> mLbmParameters;
+
+    std::string mArgv;
+    // Returns the entries of `list` separated by ", "; the entry equal to `defaultVal` is tagged " (default)".
+    auto getOptionList(std::vector<std::string> list, std::string defaultVal) -> std::string
+    {
+        std::stringstream s;
+        for (int i = 0; i < int(list.size()); i++) {
+            s << (i == 0 ? "" : ", ") << list[i];  // separator before every entry but the first
+            if (list[i] == defaultVal) {
+                s << " (default)";
+            }
+        }
+        return s.str();
+    }
+    // Returns true when `userValue` is one of the entries of `list`.
+    auto check(std::vector<std::string> list, std::string userValue) -> bool
+    {
+        for (int i = 0; i < int(list.size()); i++) {
+            if (list[i] == userValue) {
+                return true;
+            }
+        }
+        return false;
+    }
+    // Human-readable listing of the configuration (defined in Config.cpp).
+    auto toString() const -> std::string;
+    // Fills this object from the command line; returns 0 on success and -1 on a parse error.
+    auto parseArgs(int argc, char* argv[])
+        -> int;
+    // Copy of the derived LBM parameters, each value cast to ComputeType.
+    template <class ComputeType>
+    auto getLbmParameters()
+        -> LbmParameters<ComputeType>
+    {
+        LbmParameters<ComputeType> output;
+        output.nu = static_cast<ComputeType>(mLbmParameters.nu);
+        output.omega = static_cast<ComputeType>(mLbmParameters.omega);
+        output.dx = static_cast<ComputeType>(mLbmParameters.dx);
+        output.dt = static_cast<ComputeType>(mLbmParameters.dt);
+
+        return output;
+    }
+
+   private:
+    auto helpSetLbmParameters()
+        -> void;
+};
diff --git a/benchmarks/lbm/src/ContainersD3QXX.h b/benchmarks/lbm/src/ContainersD3QXX.h
new file mode 100644
index 00000000..bb4adb0d
--- /dev/null
+++ b/benchmarks/lbm/src/ContainersD3QXX.h
@@ -0,0 +1,616 @@
+#pragma once
+
+#include "./Methods.h"
+#include "CellType.h"
+#include "D3Q19.h"
+#include "DeviceD3QXX.h"
+#include "Methods.h"
+#include "Neon/Neon.h"
+#include "Neon/set/Containter.h"
+
+/**
+ * Container factory for the LBM kernels, parameterized on precision, grid, lattice and collision model
+ */
+template <typename Precision_, typename Grid_, typename Lattice_, Collision CollisionId>
+struct ContainerFactoryD3QXX
+{
+    using Lattice = Lattice_;
+    using Precision = Precision_;
+    using Compute = typename Precision::Compute;
+    using Storage = typename Precision::Storage;
+    using Grid = Grid_;
+
+    using PopField = typename Grid::template Field<Storage, Lattice::Q>;
+    using CellTypeField = typename Grid::template Field<CellType, 1>;
+
+    using Idx = typename PopField::Idx;
+    using Rho = typename Grid::template Field<Storage, 1>;
+    using U = typename Grid::template Field<Storage, 3>;
+
+    //    using PullFunctions = pull::DeviceD3Q19<Precision, Grid>;
+    //    using CommonFunctions = common::DeviceD3Q19<Precision, Grid>;
+    using Device = DeviceD3QXX<Precision, Grid, Lattice>;
+
+    struct AA
+    {
+        struct Even
+        {
+            // collide
+            /// Builds the container of the even step: per bulk cell, load the populations, collide, store via localStoreOpposite.
+            static auto
+            iteration(const CellTypeField& cellTypeField /*! Cell type field     */,
+                      const Compute        omega /*!         LBM omega parameter */,
+                      NEON_IO PopField&    fpopField /*!     Population field, read and written in place */)
+                -> Neon::set::Container
+            {
+                Neon::set::Container container = fpopField.getGrid().newContainer(
+                    "D3Q19_TwoPop_Pull",
+                    [&, omega](Neon::set::Loader& L) -> auto {
+                        auto&                          popMem = L.load(fpopField);
+                        const auto&                    cellInfoPartition = L.load(cellTypeField);
+                        [[maybe_unused]] const Compute beta = omega * 0.5;
+                        [[maybe_unused]] const Compute invBeta = 1.0 / beta;
+                        // Compute lambda handed back to Neon; it receives one cell index (gidx) per call.
+                        return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                            [[maybe_unused]] const Compute capturedOmega = omega;
+                            [[maybe_unused]] const Compute capturedInvBeta = invBeta;
+                            // Only cells classified as bulk are processed.
+                            CellType cellInfo = cellInfoPartition(gidx, 0);
+                            if (cellInfo.classification == CellType::bulk) {
+                                // popRegisters is filled by Device::Common::localLoad from popMem.
+                                Storage popRegisters[Lattice::Q];
+                                Device::Common::localLoad(gidx, popMem, NEON_OUT popRegisters);
+                                // rho and u are outputs of Device::Common::macroscopic.
+                                Compute                rho;
+                                std::array<Compute, 3> u{.0, .0, .0};
+                                Device::Common::macroscopic(popRegisters, NEON_OUT rho, NEON_OUT u);
+                                // usqr = 1.5 * (u . u)
+                                Compute usqr = 1.5 * (u[0] * u[0] +
+                                                      u[1] * u[1] +
+                                                      u[2] * u[2]);
+                                // The collision operator is selected at compile time from CollisionId.
+                                if constexpr (CollisionId == Collision::bgk) {
+                                    Device::Common::collideBgkUnrolled(rho, u,
+                                                                       usqr, capturedOmega,
+                                                                       NEON_IO popRegisters);
+                                }
+                                if constexpr (CollisionId == Collision::kbc) {
+                                    Device::Common::collideKBCUnrolled(rho, u,
+                                                                       usqr, capturedOmega,
+                                                                       capturedInvBeta,
+                                                                       NEON_IO popRegisters);
+                                }
+                                Device::Common::localStoreOpposite(gidx, popRegisters, popMem);
+                            }
+                        };
+                    });
+                return container;
+            }
+
+            // Even step: rho and u are computed by Push::computeRhoAndU, called with the same arguments.
+            static auto
+            computeRhoAndU(const PopField&      fInField /*!      input population field */,
+                           const CellTypeField& cellTypeField /*! cell type field */,
+                           Rho&                 rhoField /*!      output rho field */,
+                           U&                   uField /*!        output u field */)
+                -> Neon::set::Container
+            {
+                return Push::computeRhoAndU(fInField, cellTypeField, rhoField, uField);
+            }
+        };
+        struct Odd
+        {
+            // pullStream - collide - pushStream
+            /// Builds the container of the odd step: per bulk cell, AA::pullStream, collide, Push::pushStream on the same field.
+            static auto
+            iteration(const CellTypeField& cellTypeField /*! Cell type field     */,
+                      const Compute        omega /*!         LBM omega parameter */,
+                      NEON_IO PopField&    fpopField /*!     Population field, read and written in place */)
+                -> Neon::set::Container
+            {
+                Neon::set::Container container = fpopField.getGrid().newContainer(
+                    "D3Q19_TwoPop_Pull",
+                    [&, omega](Neon::set::Loader& L) -> auto {
+                        auto&                          fpop = L.load(fpopField);
+                        const auto&                    cellInfoPartition = L.load(cellTypeField);
+                        [[maybe_unused]] const Compute beta = omega * 0.5;
+                        [[maybe_unused]] const Compute invBeta = 1.0 / beta;
+                        // NOTE(review): fpop is loaded without Neon::Pattern::STENCIL although it feeds pullStream/pushStream - confirm.
+                        return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                            [[maybe_unused]] const Compute capturedOmega = omega;
+                            [[maybe_unused]] const Compute capturedInvBeta = invBeta;
+                            // Only cells classified as bulk are processed.
+                            CellType cellInfo = cellInfoPartition(gidx, 0);
+                            if (cellInfo.classification == CellType::bulk) {
+                                // popRegisters is filled by Device::AA::pullStream, which also receives the cell's wall flag mask.
+                                Storage popRegisters[Lattice::Q];
+                                Device::AA::pullStream(gidx, cellInfo.wallNghBitflag, fpop, NEON_OUT popRegisters);
+                                // rho and u are outputs of Device::Common::macroscopic.
+                                Compute                rho;
+                                std::array<Compute, 3> u{.0, .0, .0};
+                                Device::Common::macroscopic(popRegisters,
+                                                            NEON_OUT rho, NEON_OUT u);
+                                // usqr = 1.5 * (u . u)
+                                Compute usqr = 1.5 * (u[0] * u[0] +
+                                                      u[1] * u[1] +
+                                                      u[2] * u[2]);
+
+                                // The collision operator is selected at compile time from CollisionId.
+                                if constexpr (CollisionId == Collision::bgk) {
+                                    Device::Common::collideBgkUnrolled(rho, u,
+                                                                       usqr, capturedOmega,
+                                                                       NEON_IO popRegisters);
+                                }
+                                if constexpr (CollisionId == Collision::kbc) {
+                                    Device::Common::collideKBCUnrolled(rho, u,
+                                                                       usqr, capturedOmega,
+                                                                       capturedInvBeta,
+                                                                       NEON_IO popRegisters);
+                                }
+                                Device::Push::pushStream(gidx, cellInfo.wallNghBitflag, popRegisters, NEON_OUT fpop);
+                            }
+                        };
+                    });
+                return container;
+            }
+
+            // Odd step: rho and u are computed by Pull::computeRhoAndU, called with the same arguments.
+            static auto
+            computeRhoAndU(const PopField&      fInField /*!      input population field */,
+                           const CellTypeField& cellTypeField /*! cell type field */,
+                           Rho&                 rhoField /*!      output rho field */,
+                           U&                   uField /*!        output u field */)
+                -> Neon::set::Container
+            {
+                return Pull::computeRhoAndU(fInField, cellTypeField, rhoField, uField);
+            }
+        };
+    };
+
+    struct Pull
+    {
+        /// Builds the pull-scheme container: per bulk cell, pullStream from fInField, collide, localStore into fOutField.
+        static auto
+        iteration(Neon::set::StencilSemantic stencilSemantic,
+                  const PopField&            fInField /*!      Input population field */,
+                  const CellTypeField&       cellTypeField /*! Cell type field     */,
+                  const Compute              omega /*!         LBM omega parameter */,
+                  PopField&                  fOutField /*!     Output Population field */)
+            -> Neon::set::Container
+        {
+            Neon::set::Container container = fInField.getGrid().newContainer(
+                "D3Q19_TwoPop_Pull",
+                [&, omega, stencilSemantic](Neon::set::Loader& L) -> auto {
+                    auto&                          fIn = L.load(fInField, Neon::Pattern::STENCIL, stencilSemantic);
+                    auto&                          fOut = L.load(fOutField);
+                    const auto&                    cellInfoPartition = L.load(cellTypeField);
+                    [[maybe_unused]] const Compute beta = omega * 0.5;
+                    [[maybe_unused]] const Compute invBeta = 1.0 / beta;
+                    // stencilSemantic and omega are by-value parameters, so the loading lambda keeps its own copies of them.
+                    return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                        [[maybe_unused]] const Compute capturedOmega = omega;
+                        [[maybe_unused]] const Compute capturedInvBeta = invBeta;
+                        // Only cells classified as bulk are processed.
+                        CellType cellInfo = cellInfoPartition(gidx, 0);
+                        if (cellInfo.classification == CellType::bulk) {
+                            // popRegisters is filled by Device::Pull::pullStream, which also receives the cell's wall flag mask.
+                            Storage popRegisters[Lattice::Q];
+                            Device::Pull::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popRegisters);
+                            // rho and u are outputs of Device::Common::macroscopic.
+                            Compute                rho;
+                            std::array<Compute, 3> u{.0, .0, .0};
+                            Device::Common::macroscopic(popRegisters, NEON_OUT rho, NEON_OUT u);
+                            // usqr = 1.5 * (u . u)
+                            Compute usqr = 1.5 * (u[0] * u[0] +
+                                                  u[1] * u[1] +
+                                                  u[2] * u[2]);
+                            // The collision operator is selected at compile time from CollisionId.
+                            if constexpr (CollisionId == Collision::bgk) {
+                                Device::Common::collideBgkUnrolled(rho, u,
+                                                                   usqr, capturedOmega,
+                                                                   NEON_IO popRegisters);
+                            }
+                            if constexpr (CollisionId == Collision::kbc) {
+                                Device::Common::collideKBCUnrolled(rho, u,
+                                                                   usqr, capturedOmega,
+                                                                   capturedInvBeta,
+                                                                   NEON_IO popRegisters);
+                            }
+                            Device::Common::localStore(gidx, popRegisters, fOut);
+                        }
+                    };
+                });
+            return container;
+        }
+        /// Builds a container without streaming: per bulk cell, localLoad from fInField, collide, localStore into fOutField.
+        static auto
+        localCollide(const PopField&      fInField /*!      Input population field */,
+                     const CellTypeField& cellTypeField /*! Cell type field     */,
+                     const Compute        omega /*!         LBM omega parameter */,
+                     PopField&            fOutField /*!     Output Population field */)
+            -> Neon::set::Container
+        {
+            Neon::set::Container container = fInField.getGrid().newContainer(
+                "D3Q19_TwoPop_Pull",
+                [&, omega](Neon::set::Loader& L) -> auto {
+                    auto&                          fIn = L.load(fInField);
+                    auto&                          fOut = L.load(fOutField);
+                    const auto&                    cellInfoPartition = L.load(cellTypeField);
+                    [[maybe_unused]] const Compute beta = omega * 0.5;
+                    [[maybe_unused]] const Compute invBeta = 1.0 / beta;
+                    // Compute lambda handed back to Neon; it receives one cell index (gidx) per call.
+                    return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                        [[maybe_unused]] const Compute capturedOmega = omega;
+                        [[maybe_unused]] const Compute capturedInvBeta = invBeta;
+                        // Only cells classified as bulk are processed.
+                        CellType cellInfo = cellInfoPartition(gidx, 0);
+                        if (cellInfo.classification == CellType::bulk) {
+                            // popRegisters is filled by Device::Common::localLoad from fIn.
+                            Storage popRegisters[Lattice::Q];
+                            Device::Common::localLoad(gidx, fIn, NEON_OUT popRegisters);
+                            // rho and u are outputs of Device::Common::macroscopic.
+                            Compute                rho;
+                            std::array<Compute, 3> u{.0, .0, .0};
+                            Device::Common::macroscopic(popRegisters, NEON_OUT rho, NEON_OUT u);
+                            // usqr = 1.5 * (u . u)
+                            Compute usqr = 1.5 * (u[0] * u[0] +
+                                                  u[1] * u[1] +
+                                                  u[2] * u[2]);
+                            // The collision operator is selected at compile time from CollisionId.
+                            if constexpr (CollisionId == Collision::bgk) {
+                                Device::Common::collideBgkUnrolled(rho, u,
+                                                                   usqr, capturedOmega,
+                                                                   NEON_IO popRegisters);
+                            }
+                            if constexpr (CollisionId == Collision::kbc) {
+                                Device::Common::collideKBCUnrolled(rho, u,
+                                                                   usqr, capturedOmega,
+                                                                   capturedInvBeta,
+                                                                   NEON_IO popRegisters);
+                            }
+                            Device::Common::localStore(gidx, popRegisters, fOut);
+                        }
+                    };
+                });
+            return container;
+        }
+
+        static auto
+        computeRhoAndU([[maybe_unused]] const PopField& fInField /*!      Input population field */,
+                       const CellTypeField&             cellTypeField /*! Cell type field */,
+                       Rho&                             rhoField /*!      Output density field */,
+                       U&                               uField /*!        Output velocity field */)
+
+            -> Neon::set::Container
+        {
+            // Writes rho and u for every cell. bulk: populations are pulled from the
+            // neighbours and reduced by macroscopic(); movingWall: rho = 1 and u is read
+            // from the first three local populations / (6/18); otherwise both stay zero.
+            Neon::set::Container container =
+                fInField.getGrid().newContainer(
+                    "LBM_iteration",
+                    [&](Neon::set::Loader& L) -> auto {
+                        auto& fIn = L.load(fInField, Neon::Pattern::STENCIL);
+                        auto& rhoXpu = L.load(rhoField);
+                        auto& uXpu = L.load(uField);
+                        const auto& cellInfoPartition = L.load(cellTypeField);
+
+                        return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                            CellType               cellInfo = cellInfoPartition(gidx, 0);
+                            Compute                rho = 0;
+                            std::array<Compute, 3> u{.0, .0, .0};
+
+                            // One register buffer shared by both branches (no shadowing copy).
+                            Storage popRegisters[Lattice::Q];
+                            if (cellInfo.classification == CellType::bulk) {
+                                Device::Pull::pullStream(gidx, cellInfo.wallNghBitflag, fIn, NEON_OUT popRegisters);
+                                Device::Common::macroscopic(popRegisters, NEON_OUT rho, NEON_OUT u);
+                            } else {
+                                Device::Common::localLoad(gidx, fIn, NEON_OUT popRegisters);
+                                if (cellInfo.classification == CellType::movingWall) {
+                                    const Compute wallScale = static_cast<Compute>(6. * 1. / 18.);
+                                    rho = 1.0;
+                                    u = std::array<Compute, 3>{static_cast<Compute>(popRegisters[0]) / wallScale,
+                                                               static_cast<Compute>(popRegisters[1]) / wallScale,
+                                                               static_cast<Compute>(popRegisters[2]) / wallScale};
+                                }
+                            }
+
+                            rhoXpu(gidx, 0) = static_cast<Storage>(rho);
+                            uXpu(gidx, 0) = static_cast<Storage>(u[0]);
+                            uXpu(gidx, 1) = static_cast<Storage>(u[1]);
+                            uXpu(gidx, 2) = static_cast<Storage>(u[2]);
+                        };
+                    });
+            return container;
+        }
+    };
+    struct Push
+    {
+        static auto
+        iteration(Neon::set::StencilSemantic stencilSemantic,
+                  const PopField&            fInField /*!      Input population field */,
+                  const CellTypeField&       cellTypeField /*! Cell type field     */,
+                  const Compute              omega /*!         LBM omega parameter */,
+                  PopField&                  fOutField /*!     Output Population field */)
+            -> Neon::set::Container
+        {
+            // Push-scheme step: each bulk cell loads its own populations, collides
+            // them in registers and hands the result to Device::Push::pushStream
+            // together with the cell's wall-neighbour bit flags.
+            return fInField.getGrid().newContainer(
+                "LBM-iteration",
+                [=](Neon::set::Loader& L) -> auto {
+                    auto& popIn = L.load(fInField,
+                                         Neon::Pattern::STENCIL, stencilSemantic);
+                    // NOTE(review): held by value, unlike the other partitions, and
+                    // the fields are captured by copy here - confirm this is intended.
+                    auto        popOut = L.load(fOutField);
+                    const auto& cellTypes = L.load(cellTypeField);
+
+                    // invBeta is only consumed by the KBC collision branch.
+                    [[maybe_unused]] const Compute beta = omega * 0.5;
+                    [[maybe_unused]] const Compute invBeta = 1.0 / beta;
+
+                    return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                        [[maybe_unused]] const Compute capturedOmega = omega;
+                        [[maybe_unused]] const Compute capturedInvBeta = invBeta;
+
+                        CellType cell = cellTypes(gidx, 0);
+                        if (cell.classification != CellType::bulk) {
+                            return;  // non-bulk cells are not processed by this kernel
+                        }
+
+                        Storage pops[Lattice::Q];
+                        Device::Common::localLoad(gidx, popIn, NEON_OUT pops);
+
+                        Compute                rho;
+                        std::array<Compute, 3> u{.0, .0, .0};
+                        Device::Common::macroscopic(pops, NEON_OUT rho, NEON_OUT u);
+                        Compute usqr = 1.5 * (u[0] * u[0] +
+                                              u[1] * u[1] +
+                                              u[2] * u[2]);
+
+                        if constexpr (CollisionId == Collision::bgk) {
+                            Device::Common::collideBgkUnrolled(rho, u, usqr, capturedOmega,
+                                                               NEON_IO pops);
+                        }
+                        if constexpr (CollisionId == Collision::kbc) {
+                            Device::Common::collideKBCUnrolled(rho, u, usqr, capturedOmega,
+                                                               capturedInvBeta, NEON_IO pops);
+                        }
+                        Device::Push::pushStream(gidx, cell.wallNghBitflag, pops, NEON_OUT popOut);
+                    };
+                });
+        }
+
+        static auto
+        computeRhoAndU([[maybe_unused]] const PopField& fInField /*!      Input population field */,
+                       const CellTypeField&             cellTypeField /*! Cell type field */,
+                       Rho&                             rhoField /*!      Output density field */,
+                       U&                               uField /*!        Output velocity field */)
+
+            -> Neon::set::Container
+        {
+            // For every cell: load the locally stored populations, derive rho and u
+            // from them according to the cell classification, and write both out.
+            // Cells that are neither bulk nor movingWall get rho = 0 and u = 0.
+            return fInField.getGrid().newContainer(
+                "LBM_iteration",
+                [&](Neon::set::Loader& L) -> auto {
+                    auto&       popIn = L.load(fInField, Neon::Pattern::STENCIL);
+                    auto&       rhoOut = L.load(rhoField);
+                    auto&       velOut = L.load(uField);
+                    const auto& cellTypes = L.load(cellTypeField);
+
+                    return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                        CellType               cell = cellTypes(gidx, 0);
+                        Compute                rho = 0;
+                        std::array<Compute, 3> u{.0, .0, .0};
+
+                        Storage pops[Lattice::Q];
+                        Device::Common::localLoad(gidx, popIn, NEON_OUT pops);
+
+                        if (cell.classification == CellType::bulk) {
+                            Device::Common::macroscopic(pops, NEON_OUT rho, NEON_OUT u);
+                        } else if (cell.classification == CellType::movingWall) {
+                            // Wall cells: rho is fixed to 1, u comes from the first three
+                            // stored populations divided by 6/18.
+                            const Compute wallScale = static_cast<Compute>(6. * 1. / 18.);
+                            rho = 1.0;
+                            u = std::array<Compute, 3>{static_cast<Compute>(pops[0]) / wallScale,
+                                                       static_cast<Compute>(pops[1]) / wallScale,
+                                                       static_cast<Compute>(pops[2]) / wallScale};
+                        }
+
+                        // Results are cast to Storage before being written.
+                        rhoOut(gidx, 0) = static_cast<Storage>(rho);
+                        velOut(gidx, 0) = static_cast<Storage>(u[0]);
+                        velOut(gidx, 1) = static_cast<Storage>(u[1]);
+                        velOut(gidx, 2) = static_cast<Storage>(u[2]);
+                    };
+                });
+        }
+    };
+    struct Common
+    {
+
+
+
+        static auto
+        computeWallNghMask(const CellTypeField& infoInField,
+                           CellTypeField&       infoOutpeField)
+            -> Neon::set::Container
+        {
+            // For each bulk cell, records in wallNghBitflag which lattice directions
+            // point at a bounceBack or movingWall neighbour (bit index = M::fwdMemQ).
+            return infoInField.getGrid().newContainer(
+                "LBM_iteration",
+                [&](Neon::set::Loader& L) -> auto {
+                    auto& typeIn = L.load(infoInField, Neon::Pattern::STENCIL);
+                    auto& typeOut = L.load(infoOutpeField);
+
+                    return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                        CellType cell = typeIn(gidx, 0);
+                        cell.wallNghBitflag = 0;
+                        if (cell.classification != CellType::bulk) {
+                            return;  // only bulk cells are written to the output field
+                        }
+                        Neon::ConstexprFor<0, Lattice::Q, 1>([&, gidx](auto fwdRegIdx) {
+                            using M = typename Lattice::template RegisterMapper<fwdRegIdx>;
+                            if constexpr (M::centerMemQ != M::fwdMemQ) {  // skip the center direction
+                                CellType ngh = typeIn.template getNghData<M::fwdMemQX, M::fwdMemQY, M::fwdMemQZ>(gidx, 0, CellType::undefined)();
+                                if (ngh.classification == CellType::bounceBack ||
+                                    ngh.classification == CellType::movingWall) {
+                                    cell.wallNghBitflag |= (uint32_t(1) << M::fwdMemQ);
+                                }
+                            }
+                        });
+                        typeOut(gidx, 0) = cell;
+                    };
+                });
+        }
+
+
+        template <typename UserLambda>
+        static auto
+        userSettingBc(UserLambda     userLambda,
+                      PopField&      pField,
+                      CellTypeField& cellTypeField /*! Cell type field     */)
+            -> Neon::set::Container
+        {
+            // Builds a container that, for every cell, calls userLambda(globalIdx,
+            // pValues, cellClass) and stores the populations and class it produced.
+            return pField.getGrid().newContainer(
+                "UserSettingBc",
+                // userLambda is a by-value parameter: copy it, a reference could dangle.
+                [&, userLambda](Neon::set::Loader& L) -> auto {
+                    auto& p = L.load(pField, Neon::Pattern::MAP);
+                    auto& flag = L.load(cellTypeField, Neon::Pattern::MAP);
+
+                    return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                        const auto               globalIdx = p.getGlobalIndex(gidx);
+                        Storage                  pValues[Lattice::Q];
+                        CellType::Classification cellClass;
+                        userLambda(globalIdx, pValues, cellClass);
+                        flag(gidx, 0) = CellType(cellClass);
+                        // pValues is indexed by register id, the field by memory id.
+                        Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) {
+                            using M = typename Lattice::template RegisterMapper<q>;
+                            p(gidx, M::fwdMemQ) = pValues[M::fwdRegQ];
+                        });
+                    };
+                });
+        }
+
+        static auto
+        copyPopulation(PopField& fInField,
+                       PopField& foutField)
+            -> Neon::set::Container
+        {
+            // Cell-wise copy of all Lattice::Q components from fInField to foutField;
+            // both fields are accessed with the MAP pattern.
+            return fInField.getGrid().newContainer(
+                "LBM_iteration",
+                [&](Neon::set::Loader& L) -> auto {
+                    auto const& src = L.load(fInField, Neon::Pattern::MAP);
+                    auto&       dst = L.load(foutField, Neon::Pattern::MAP);
+
+                    return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                        Neon::ConstexprFor<0, Lattice::Q, 1>(
+                            [&](auto q) { dst(gidx, q) = src(gidx, q); });
+                    };
+                });
+        }
+
+
+        static auto
+        problemSetup(PopField&       fInField /*!   Population field, initialised here */,
+                     PopField&       fOutField,
+                     CellTypeField&  cellTypeField,
+                     Neon::double_3d ulid,
+                     double          ulb)
+
+            -> Neon::set::Container
+        {
+            // Cavity setup driven by the global cell index: cells on a domain face are
+            // bounceBack, except the y == max face which is movingWall; the rest is bulk.
+            // movingWall cells store -6 * t_q * ulb * (c_q . ulid) per direction, other
+            // boundary cells store 0, bulk cells store the lattice weight t_q.
+            // t_q and c_q come from Registers::getT and RegisterMapper (as in
+            // setToEquilibrium and computeWallNghMask): D3Q19::Memory provides no
+            // getT / getDirection accessors.
+            Neon::set::Container container = fInField.getGrid().newContainer(
+                "LBM_iteration",
+                [&, ulid, ulb](Neon::set::Loader& L) -> auto {
+                    auto& fIn = L.load(fInField, Neon::Pattern::MAP);
+                    auto& fOut = L.load(fOutField, Neon::Pattern::MAP);
+                    auto& cellInfoPartition = L.load(cellTypeField, Neon::Pattern::MAP);
+
+                    return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                        const auto globalIdx = fIn.getGlobalIndex(gidx);
+                        const auto domainDim = fIn.getDomainSize();
+
+                        const bool onLid = globalIdx.y == domainDim.y - 1;
+                        const bool onBoundary = onLid || globalIdx.y == 0 ||
+                                                globalIdx.x == 0 || globalIdx.x == domainDim.x - 1 ||
+                                                globalIdx.z == 0 || globalIdx.z == domainDim.z - 1;
+
+                        CellType flagVal;
+                        flagVal.wallNghBitflag = 0;
+                        flagVal.classification = CellType::bulk;
+                        if (onBoundary) {
+                            flagVal.classification = onLid ? CellType::movingWall : CellType::bounceBack;
+                        }
+                        cellInfoPartition(gidx, 0) = flagVal;
+
+                        Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) {
+                            using M = typename Lattice::template RegisterMapper<q>;
+                            typename Lattice::Precision::Storage popVal =
+                                Lattice::Registers::template getT<M::fwdRegQ>();
+                            if (onLid) {
+                                popVal = -6. * popVal * ulb *
+                                         (M::fwdMemQX * ulid.v[0] +
+                                          M::fwdMemQY * ulid.v[1] +
+                                          M::fwdMemQZ * ulid.v[2]);
+                            } else if (onBoundary) {
+                                popVal = 0;
+                            }
+                            fIn(gidx, M::fwdMemQ) = popVal;
+                            fOut(gidx, M::fwdMemQ) = popVal;
+                        });
+                    };
+                });
+            return container;
+        }
+
+        static auto
+        setToEquilibrium(PopField&      fOutField,
+                         CellTypeField& cellTypeField)
+            -> Neon::set::Container
+        {
+            // Pre-sets every cell: classification becomes bulk and each population
+            // is set to the weight Lattice::Registers::getT of its direction.
+            return fOutField.getGrid().newContainer(
+                "LBM_setToEquilibrium",
+                [&](Neon::set::Loader& L) -> auto {
+                    auto& popOut = L.load(fOutField, Neon::Pattern::MAP);
+                    auto& typeOut = L.load(cellTypeField, Neon::Pattern::MAP);
+
+                    return [=] NEON_CUDA_HOST_DEVICE(const typename PopField::Idx& gidx) mutable {
+                        // All points are pre-set to bulk.
+                        // NOTE(review): wallNghBitflag is not assigned here - presumably
+                        // CellType's default constructor initialises it; confirm.
+                        CellType bulkCell;
+                        bulkCell.classification = CellType::bulk;
+                        typeOut(gidx, 0) = bulkCell;
+
+                        Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) {
+                            using M = typename Lattice::template RegisterMapper<q>;
+                            popOut(gidx, M::fwdMemQ) = Lattice::Registers::template getT<M::fwdRegQ>();
+                        });
+                    };
+                });
+        }
+    };
+};
\ No newline at end of file
diff --git a/benchmarks/lbm/src/D3Q19.h b/benchmarks/lbm/src/D3Q19.h
new file mode 100644
index 00000000..11a3408a
--- /dev/null
+++ b/benchmarks/lbm/src/D3Q19.h
@@ -0,0 +1,385 @@
+#pragma once
+
+#include "Neon/Neon.h"
+#include "Neon/set/Backend.h"
+#include "Neon/set/memory/memSet.h"
+#include "Precision.h"
+
+
+/** In each lattice we define two indexing schemes:
+ * - the first one works at the register level. For this one we rely on the same sequence of directions defined by the STLBM code.
+ * - the second one works at the memory (RAM) level and it defines how lattice directions are stored in Neon fields.
+ *
+ * We keep these two aspects separate to experiment with different orderings of the directions in memory, as the order impacts the number of halo update transitions.
+ *
+ */
+template <typename Precision_>
+struct D3Q19
+{
+   public:
+    D3Q19() = delete;
+
+    static constexpr int Q = 19; /** number of directions */
+    static constexpr int D = 3;  /** Space dimension */
+    using Precision = Precision_;
+    using Self = D3Q19<Precision>;
+
+    static constexpr int RegisterMapping = 1;
+    static constexpr int MemoryMapping = 2;
+
+    struct Registers
+    {
+
+        using Self = D3Q19<Precision>::Registers;
+
+        static constexpr int center = 9; /** Position of direction {0,0,0} */
+
+        template <int myQ, int myXYZ>
+        static constexpr auto getVelocityComponent() -> int
+        {
+            // Component myXYZ (0 = x, 1 = y, 2 = z) of the velocity of the direction
+            // with register index myQ. Each ADD_COMPONENT row below reads
+            // (register index, x, y, z). In this numbering index 9 is {0,0,0} and
+            // the directions q and q + 10 (q < 9) are opposite to each other.
+            //
+            // Both indices are range-checked on both sides: without the lower bound
+            // a negative index would match no row and the function would reach its
+            // end without returning a value.
+            static_assert(0 <= myQ && myQ < Q, "direction index out of range");
+            static_assert(0 <= myXYZ && myXYZ < 3, "component index out of range");
+
+#define ADD_COMPONENT(QQ, XXX, YYY, ZZZ)                                  \
+    if constexpr ((myQ) == (QQ)) {                                        \
+        return ((myXYZ) == 0) ? (XXX) : (((myXYZ) == 1) ? (YYY) : (ZZZ)); \
+    }
+
+            ADD_COMPONENT(0, -1, 0, 0)
+            ADD_COMPONENT(1, 0, -1, 0)
+            ADD_COMPONENT(2, 0, 0, -1)
+            ADD_COMPONENT(3, -1, -1, 0)
+            ADD_COMPONENT(4, -1, 1, 0)
+            ADD_COMPONENT(5, -1, 0, -1)
+            ADD_COMPONENT(6, -1, 0, 1)
+            ADD_COMPONENT(7, 0, -1, -1)
+            ADD_COMPONENT(8, 0, -1, 1)
+            ADD_COMPONENT(9, 0, 0, 0)
+            ADD_COMPONENT(10, 1, 0, 0)
+            ADD_COMPONENT(11, 0, 1, 0)
+            ADD_COMPONENT(12, 0, 0, 1)
+            ADD_COMPONENT(13, 1, 1, 0)
+            ADD_COMPONENT(14, 1, -1, 0)
+            ADD_COMPONENT(15, 1, 0, 1)
+            ADD_COMPONENT(16, 1, 0, -1)
+            ADD_COMPONENT(17, 0, 1, 1)
+            ADD_COMPONENT(18, 0, 1, -1)
+
+#undef ADD_COMPONENT
+        }
+
+        template <int myQ>
+        static constexpr auto getOpposite() -> int
+        {
+            // Register index of the direction opposite to myQ; rows read (index,
+            // opposite). Index 9 maps to itself. Negative indices are now rejected.
+            static_assert(0 <= myQ && myQ < Q, "direction index out of range");
+
+#define ADD_COMPONENT(QQ, XXX) \
+    if constexpr ((myQ) == (QQ)) { return XXX; }
+            ADD_COMPONENT(0, 10)
+            ADD_COMPONENT(1, 11)
+            ADD_COMPONENT(2, 12)
+            ADD_COMPONENT(3, 13)
+            ADD_COMPONENT(4, 14)
+            ADD_COMPONENT(5, 15)
+            ADD_COMPONENT(6, 16)
+            ADD_COMPONENT(7, 17)
+            ADD_COMPONENT(8, 18)
+            ADD_COMPONENT(9, 9)
+            ADD_COMPONENT(10, 0)
+            ADD_COMPONENT(11, 1)
+            ADD_COMPONENT(12, 2)
+            ADD_COMPONENT(13, 3)
+            ADD_COMPONENT(14, 4)
+            ADD_COMPONENT(15, 5)
+            ADD_COMPONENT(16, 6)
+            ADD_COMPONENT(17, 7)
+            ADD_COMPONENT(18, 8)
+#undef ADD_COMPONENT
+        }
+
+        template <int myQ>
+        static constexpr auto getT() -> typename Precision::Storage
+        {
+            // Lattice weight of register direction myQ: 1/3 for the center (9), 1/18
+            // for the six axis-aligned directions and 1/36 for the twelve diagonal
+            // ones. The index is checked on both sides so every valid myQ hits a row.
+            static_assert(0 <= myQ && myQ < Q, "direction index out of range");
+
+#define ADD_COMPONENT(QQ, XXX) \
+    if constexpr ((myQ) == (QQ)) { return XXX; }
+
+            ADD_COMPONENT(0, 1. / 18.)
+            ADD_COMPONENT(1, 1. / 18.)
+            ADD_COMPONENT(2, 1. / 18.)
+            ADD_COMPONENT(3, 1. / 36.)
+            ADD_COMPONENT(4, 1. / 36.)
+            ADD_COMPONENT(5, 1. / 36.)
+            ADD_COMPONENT(6, 1. / 36.)
+            ADD_COMPONENT(7, 1. / 36.)
+            ADD_COMPONENT(8, 1. / 36.)
+            ADD_COMPONENT(9, 1. / 3.)
+            ADD_COMPONENT(10, 1. / 18.)
+            ADD_COMPONENT(11, 1. / 18.)
+            ADD_COMPONENT(12, 1. / 18.)
+            ADD_COMPONENT(13, 1. / 36.)
+            ADD_COMPONENT(14, 1. / 36.)
+            ADD_COMPONENT(15, 1. / 36.)
+            ADD_COMPONENT(16, 1. / 36.)
+            ADD_COMPONENT(17, 1. / 36.)
+            ADD_COMPONENT(18, 1. / 36.)
+#undef ADD_COMPONENT
+        }
+
+
+        template <int q>
+        static constexpr auto getVelocity() -> const typename Neon::index_3d
+        {
+            // Velocity of register direction q; each component accessor must be called.
+            return Neon::index_3d(getVelocityComponent<q, 0>(), getVelocityComponent<q, 1>(),
+                                  getVelocityComponent<q, 2>());
+        }
+
+        // Identifying first half of the directions
+        // For each direction in the list, the opposite is not present.
+        // Center is also removed
+        static constexpr int                                  firstHalfQLen = (Q - 1) / 2;
+        static constexpr std::array<const int, firstHalfQLen> firstHalfQList{0, 1, 2, 3, 4, 5, 6, 7, 8};
+
+        template <int tegIdx, typename Compute>
+        static inline NEON_CUDA_HOST_DEVICE auto
+        getCk_u(std::array<Compute, 3> const& u)
+            -> Compute
+        {
+            // Returns a signed combination of the components of u selected by the
+            // register index tegIdx. Indices k and k + 10 (k in 0..8) share the same
+            // expression, which equals the dot product of u with the velocity of
+            // direction k as listed in getVelocityComponent.
+            // No value is returned for the center index 9.
+            // NOTE(review): presumably callers only pass the firstHalfQList indices
+            // and handle the sign of the opposite direction - confirm at call sites.
+
+            constexpr int pairIdx = (tegIdx > 9) ? (tegIdx - 10) : tegIdx;
+
+            if constexpr (pairIdx == 0) {
+                return -u[0];
+            } else if constexpr (pairIdx == 1) {
+                return -u[1];
+            } else if constexpr (pairIdx == 2) {
+                return -u[2];
+            } else if constexpr (pairIdx == 3) {
+                return -u[0] - u[1];
+            } else if constexpr (pairIdx == 4) {
+                return -u[0] + u[1];
+            } else if constexpr (pairIdx == 5) {
+                return -u[0] - u[2];
+            } else if constexpr (pairIdx == 6) {
+                return -u[0] + u[2];
+            } else if constexpr (pairIdx == 7) {
+                return -u[1] - u[2];
+            } else if constexpr (pairIdx == 8) {
+                return -u[1] + u[2];
+            }
+        }
+    };
+
+    struct Memory
+    {
+        using Self = D3Q19<Precision>::Memory;
+
+        template <int myQ, int myXYZ>
+        static constexpr auto getVelocityComponent() -> int
+        {
+            // Component myXYZ (0 = x, 1 = y, 2 = z) of the velocity of the direction
+            // stored at memory index myQ. Each ADD_COMPONENT row below reads
+            // (memory index, x, y, z); index 9 is the direction {0,0,0}.
+            //
+            // Both indices are range-checked on both sides: without the lower bound
+            // a negative index would match no row and the function would reach its
+            // end without returning a value.
+            static_assert(0 <= myQ && myQ < Q, "direction index out of range");
+            static_assert(0 <= myXYZ && myXYZ < 3, "component index out of range");
+
+#define ADD_COMPONENT(QQ, XXX, YYY, ZZZ)                                  \
+    if constexpr ((myQ) == (QQ)) {                                        \
+        return ((myXYZ) == 0) ? (XXX) : (((myXYZ) == 1) ? (YYY) : (ZZZ)); \
+    }
+
+            ADD_COMPONENT(0, -1, 0, 0)
+            ADD_COMPONENT(1, 0, -1, 0)
+            ADD_COMPONENT(2, 0, 0, -1)
+            ADD_COMPONENT(3, -1, -1, 0)
+            ADD_COMPONENT(4, -1, 1, 0)
+            ADD_COMPONENT(5, -1, 0, -1)
+            ADD_COMPONENT(6, -1, 0, 1)
+            ADD_COMPONENT(7, 0, -1, -1)
+            ADD_COMPONENT(8, 0, -1, 1)
+            ADD_COMPONENT(9, 0, 0, 0)
+            ADD_COMPONENT(10, 1, 0, 0)
+            ADD_COMPONENT(11, 0, 1, 0)
+            ADD_COMPONENT(12, 0, 0, 1)
+            ADD_COMPONENT(13, 1, 1, 0)
+            ADD_COMPONENT(14, 1, -1, 0)
+            ADD_COMPONENT(15, 1, 0, 1)
+            ADD_COMPONENT(16, 1, 0, -1)
+            ADD_COMPONENT(17, 0, 1, 1)
+            ADD_COMPONENT(18, 0, 1, -1)
+
+#undef ADD_COMPONENT
+        }
+
+
+        static constexpr int center = 9; /** Position of direction {0,0,0} */
+
+        template <int myQ>
+        static constexpr auto mapToRegisters()
+            -> int
+        {
+            // Index translation table, rows read (myQ, result); presumably memory ->
+            // register index. Currently the identity. Negative indices are rejected.
+            static_assert(0 <= myQ && myQ < Q, "direction index out of range");
+
+#define ADD_COMPONENT(QQ, XXX) \
+    if constexpr ((myQ) == (QQ)) { return XXX; }
+            ADD_COMPONENT(0, 0)
+            ADD_COMPONENT(1, 1)
+            ADD_COMPONENT(2, 2)
+            ADD_COMPONENT(3, 3)
+            ADD_COMPONENT(4, 4)
+            ADD_COMPONENT(5, 5)
+            ADD_COMPONENT(6, 6)
+            ADD_COMPONENT(7, 7)
+            ADD_COMPONENT(8, 8)
+            ADD_COMPONENT(9, 9)
+            ADD_COMPONENT(10, 10)
+            ADD_COMPONENT(11, 11)
+            ADD_COMPONENT(12, 12)
+            ADD_COMPONENT(13, 13)
+            ADD_COMPONENT(14, 14)
+            ADD_COMPONENT(15, 15)
+            ADD_COMPONENT(16, 16)
+            ADD_COMPONENT(17, 17)
+            ADD_COMPONENT(18, 18)
+#undef ADD_COMPONENT
+        }
+
+        template <int myQ>
+        static constexpr auto mapToMemory()
+            -> int
+        {
+            // Memory index for register index myQ (RegisterMapper passes register
+            // indices); rows read (myQ, result). Currently the identity mapping.
+            static_assert(0 <= myQ && myQ < Q, "direction index out of range");
+
+#define ADD_COMPONENT(QQ, XXX) \
+    if constexpr ((myQ) == (QQ)) { return XXX; }
+            ADD_COMPONENT(0, 0)
+            ADD_COMPONENT(1, 1)
+            ADD_COMPONENT(2, 2)
+            ADD_COMPONENT(3, 3)
+            ADD_COMPONENT(4, 4)
+            ADD_COMPONENT(5, 5)
+            ADD_COMPONENT(6, 6)
+            ADD_COMPONENT(7, 7)
+            ADD_COMPONENT(8, 8)
+            ADD_COMPONENT(9, 9)
+            ADD_COMPONENT(10, 10)
+            ADD_COMPONENT(11, 11)
+            ADD_COMPONENT(12, 12)
+            ADD_COMPONENT(13, 13)
+            ADD_COMPONENT(14, 14)
+            ADD_COMPONENT(15, 15)
+            ADD_COMPONENT(16, 16)
+            ADD_COMPONENT(17, 17)
+            ADD_COMPONENT(18, 18)
+#undef ADD_COMPONENT
+        }
+
+        template <int myQ>
+        static constexpr auto getOpposite() -> int
+        {
+            // Memory index of the direction opposite to myQ; rows read (index,
+            // opposite). Index 9 maps to itself. Negative indices are now rejected.
+            static_assert(0 <= myQ && myQ < Q, "direction index out of range");
+
+#define ADD_COMPONENT(QQ, XXX) \
+    if constexpr ((myQ) == (QQ)) { return XXX; }
+            ADD_COMPONENT(0, 10)
+            ADD_COMPONENT(1, 11)
+            ADD_COMPONENT(2, 12)
+            ADD_COMPONENT(3, 13)
+            ADD_COMPONENT(4, 14)
+            ADD_COMPONENT(5, 15)
+            ADD_COMPONENT(6, 16)
+            ADD_COMPONENT(7, 17)
+            ADD_COMPONENT(8, 18)
+            ADD_COMPONENT(9, 9)
+            ADD_COMPONENT(10, 0)
+            ADD_COMPONENT(11, 1)
+            ADD_COMPONENT(12, 2)
+            ADD_COMPONENT(13, 3)
+            ADD_COMPONENT(14, 4)
+            ADD_COMPONENT(15, 5)
+            ADD_COMPONENT(16, 6)
+            ADD_COMPONENT(17, 7)
+            ADD_COMPONENT(18, 8)
+#undef ADD_COMPONENT
+        }
+    };
+
+
+    // Compile-time bundle of register/memory indices for register direction fwdRegIdx_.
+    template <int fwdRegIdx_>
+    struct RegisterMapper {
+        static constexpr int fwdRegQ = fwdRegIdx_;                                  // register index
+        static constexpr int bkwRegQ = Registers::template getOpposite<fwdRegQ>();  // its opposite
+        static constexpr int fwdMemQ = Memory::template mapToMemory<fwdRegQ>();     // memory index
+        static constexpr int bkwMemQ = Memory::template mapToMemory<bkwRegQ>();     // memory, opposite
+        static constexpr int centerRegQ = Registers::center;
+        static constexpr int centerMemQ = Memory::center;
+        // Velocity components (x, y, z) of the forward memory direction.
+        static constexpr int fwdMemQX = Memory::template getVelocityComponent<fwdMemQ, 0>();
+        static constexpr int fwdMemQY = Memory::template getVelocityComponent<fwdMemQ, 1>();
+        static constexpr int fwdMemQZ = Memory::template getVelocityComponent<fwdMemQ, 2>();
+        // Velocity components (x, y, z) of the backward (opposite) memory direction.
+        static constexpr int bkwMemQX = Memory::template getVelocityComponent<bkwMemQ, 0>();
+        static constexpr int bkwMemQY = Memory::template getVelocityComponent<bkwMemQ, 1>();
+        static constexpr int bkwMemQZ = Memory::template getVelocityComponent<bkwMemQ, 2>();
+    };
+
+
+   public:
+    template <int mappingType>
+    static auto getDirectionAsVector()
+        -> std::vector<Neon::index_3d>
+    {
+        // Velocities of all Q directions, in register order for RegisterMapping and
+        // in memory order for MemoryMapping; empty for any other mappingType.
+        std::vector<Neon::index_3d> directions;
+        if constexpr (mappingType == RegisterMapping) {
+            // One entry per register index, appended in increasing index order.
+            Neon::ConstexprFor<0, Q, 1>([&directions](auto q) {
+                directions.push_back(Neon::index_3d(Registers::template getVelocityComponent<q, 0>(),
+                                                    Registers::template getVelocityComponent<q, 1>(),
+                                                    Registers::template getVelocityComponent<q, 2>()));
+            });
+        } else if constexpr (mappingType == MemoryMapping) {
+            // One entry per memory index, appended in increasing index order.
+            Neon::ConstexprFor<0, Q, 1>([&directions](auto q) {
+                directions.push_back(Neon::index_3d(Memory::template getVelocityComponent<q, 0>(),
+                                                    Memory::template getVelocityComponent<q, 1>(),
+                                                    Memory::template getVelocityComponent<q, 2>()));
+            });
+        }
+        return directions;
+    }
+};
\ No newline at end of file
diff --git a/benchmarks/lbm/src/D3Q27.h b/benchmarks/lbm/src/D3Q27.h
new file mode 100644
index 00000000..535dd2de
--- /dev/null
+++ b/benchmarks/lbm/src/D3Q27.h
@@ -0,0 +1,476 @@
+#pragma once
+
+#include "Neon/Neon.h"
+#include "Neon/set/Backend.h"
+#include "Neon/set/memory/memSet.h"
+#include "Precision.h"
+
+
+/** In each lattice we define two indexing schemes:
+ * - the first one works at the register level. For this one we rely on the same sequence of directions defined by the STLBM code.
+ * - the second one works at the memory (RAM) level and defines how the lattice directions are stored in Neon fields.
+ *
+ * We keep these two aspects separate so that we can experiment with different orderings of the directions in memory, as the order impacts the number of halo update transitions.
+ *
+ */
+template <typename Precision_>
+struct D3Q27
+{
+   public:
+    D3Q27() = delete;
+
+    static constexpr int Q = 27; /** number of directions */
+    static constexpr int D = 3;  /** Space dimension */
+    using Precision = Precision_;
+    using Self = D3Q27<Precision>;
+
+    static constexpr int RegisterMapping = 1;
+    static constexpr int MemoryMapping = 2;
+
+    struct Registers
+    {
+
+        using Self = D3Q27<Precision>::Registers;
+
+        static constexpr int center = 13; /** Index of direction {0,0,0} in the register-level ordering */
+        // First half of the directions: (Q - 1) / 2 indices chosen so that
+        // no entry of the list is the opposite (see getOpposite) of another entry.
+        // The center direction is not part of the list.
+        static constexpr int                                  firstHalfQLen = (Q - 1) / 2;
+        static constexpr std::array<const int, firstHalfQLen> firstHalfQList{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+
+        template <int myQ, int myXYZ>
+        static constexpr auto getVelocityComponent() -> int
+        {
+            static_assert(0 <= myQ && myQ < Q, "direction index out of range");
+            static_assert(0 <= myXYZ && myXYZ < 3, "component index must be 0 (x), 1 (y) or 2 (z)");
+            // Returns component myXYZ of the register-level direction myQ; the branch is selected at compile time.
+#define ADD_COMPONENT(QQ, XXX, YYY, ZZZ) \
+    if constexpr ((myQ) == (QQ)) {       \
+        if constexpr ((myXYZ) == 0) {    \
+            return XXX;                  \
+        }                                \
+        if constexpr ((myXYZ) == 1) {    \
+            return YYY;                  \
+        }                                \
+        if constexpr ((myXYZ) == 2) {    \
+            return ZZZ;                  \
+        }                                \
+    }
+            // One row per direction: ADD_COMPONENT(index, x, y, z).
+            ADD_COMPONENT(0, -1, 0, 0)
+            ADD_COMPONENT(1, 0, -1, 0)
+            ADD_COMPONENT(2, 0, 0, -1)
+            ADD_COMPONENT(3, -1, -1, 0)
+            ADD_COMPONENT(4, -1, 1, 0)
+            ADD_COMPONENT(5, -1, 0, -1)
+            ADD_COMPONENT(6, -1, 0, 1)
+            ADD_COMPONENT(7, 0, -1, -1)
+            ADD_COMPONENT(8, 0, -1, 1)
+            ADD_COMPONENT(9, -1, -1, -1)
+            ADD_COMPONENT(10, -1, -1, 1)
+            ADD_COMPONENT(11, -1, 1, -1)
+            ADD_COMPONENT(12, -1, 1, 1)
+            ADD_COMPONENT(13, 0, 0, 0)
+            ADD_COMPONENT(14, 1, 0, 0)
+            ADD_COMPONENT(15, 0, 1, 0)
+            ADD_COMPONENT(16, 0, 0, 1)
+            ADD_COMPONENT(17, 1, 1, 0)
+            ADD_COMPONENT(18, 1, -1, 0)
+            ADD_COMPONENT(19, 1, 0, 1)
+            ADD_COMPONENT(20, 1, 0, -1)
+            ADD_COMPONENT(21, 0, 1, 1)
+            ADD_COMPONENT(22, 0, 1, -1)
+            ADD_COMPONENT(23, 1, 1, 1)
+            ADD_COMPONENT(24, 1, 1, -1)
+            ADD_COMPONENT(25, 1, -1, 1)
+            ADD_COMPONENT(26, 1, -1, -1)
+            // Every index pair accepted by the asserts matches one row above, so a value is always returned.
+#undef ADD_COMPONENT
+        }
+
+        template <int myQ>
+        static constexpr auto getOpposite() -> int
+        {
+            static_assert(0 <= myQ && myQ < Q, "direction index out of range");
+            // Returns the register-level index of the direction whose velocity is the negation of direction myQ.
+#define ADD_COMPONENT(QQ, XXX)     \
+    if constexpr ((myQ) == (QQ)) { \
+        return XXX;                \
+    }
+            // One row per direction: ADD_COMPONENT(index, opposite index); the center (13) maps to itself.
+
+            ADD_COMPONENT(0, 14)
+            ADD_COMPONENT(1, 15)
+            ADD_COMPONENT(2, 16)
+            ADD_COMPONENT(3, 17)
+            ADD_COMPONENT(4, 18)
+            ADD_COMPONENT(5, 19)
+            ADD_COMPONENT(6, 20)
+            ADD_COMPONENT(7, 21)
+            ADD_COMPONENT(8, 22)
+            ADD_COMPONENT(9, 23)
+            ADD_COMPONENT(10, 24)
+            ADD_COMPONENT(11, 25)
+            ADD_COMPONENT(12, 26)
+            ADD_COMPONENT(13, 13)
+            ADD_COMPONENT(14, 0)
+            ADD_COMPONENT(15, 1)
+            ADD_COMPONENT(16, 2)
+            ADD_COMPONENT(17, 3)
+            ADD_COMPONENT(18, 4)
+            ADD_COMPONENT(19, 5)
+            ADD_COMPONENT(20, 6)
+            ADD_COMPONENT(21, 7)
+            ADD_COMPONENT(22, 8)
+            ADD_COMPONENT(23, 9)
+            ADD_COMPONENT(24, 10)
+            ADD_COMPONENT(25, 11)
+            ADD_COMPONENT(26, 12)
+
+
+#undef ADD_COMPONENT
+        }
+
+        template <int myQ>
+        static constexpr auto getT() -> typename Precision::Storage
+        {
+            static_assert(0 <= myQ && myQ < Q, "direction index out of range");
+            // Returns the lattice weight of register-level direction myQ; the double literal is converted to Storage on return.
+#define ADD_COMPONENT(QQ, XXX)     \
+    if constexpr ((myQ) == (QQ)) { \
+        return XXX;                \
+    }
+            // Rows: ADD_COMPONENT(index, weight). 8/27 for the center; 2/27, 1/54, 1/216 for one, two, three non-zero velocity components.
+            ADD_COMPONENT(0, 2. / 27.)
+            ADD_COMPONENT(1, 2. / 27.)
+            ADD_COMPONENT(2, 2. / 27.)
+            ADD_COMPONENT(3, 1. / 54.)
+            ADD_COMPONENT(4, 1. / 54.)
+            ADD_COMPONENT(5, 1. / 54.)
+            ADD_COMPONENT(6, 1. / 54.)
+            ADD_COMPONENT(7, 1. / 54.)
+            ADD_COMPONENT(8, 1. / 54.)
+            ADD_COMPONENT(9, 1. / 216.)
+            ADD_COMPONENT(10, 1. / 216.)
+            ADD_COMPONENT(11, 1. / 216.)
+            ADD_COMPONENT(12, 1. / 216.)
+            ADD_COMPONENT(13, 8. / 27.)
+            ADD_COMPONENT(14, 2. / 27.)
+            ADD_COMPONENT(15, 2. / 27.)
+            ADD_COMPONENT(16, 2. / 27.)
+            ADD_COMPONENT(17, 1. / 54.)
+            ADD_COMPONENT(18, 1. / 54.)
+            ADD_COMPONENT(19, 1. / 54.)
+            ADD_COMPONENT(20, 1. / 54.)
+            ADD_COMPONENT(21, 1. / 54.)
+            ADD_COMPONENT(22, 1. / 54.)
+            ADD_COMPONENT(23, 1. / 216.)
+            ADD_COMPONENT(24, 1. / 216.)
+            ADD_COMPONENT(25, 1. / 216.)
+            ADD_COMPONENT(26, 1. / 216.)
+
+#undef ADD_COMPONENT
+        }
+
+        template <int myQ, int mementumID>
+        static constexpr auto getMomentumComponet() -> typename Precision::Storage
+        {
+            static_assert(0 <= myQ && myQ < Q, "direction index out of range");
+            static_assert(0 <= mementumID && mementumID < 6, "moment index must be in [0, 6)");
+            // Returns entry mementumID of the six second-order velocity products of register-level direction myQ.
+#define ADD_COMPONENT(QQ, AA, BB, CC, DD, EE, FF) \
+    if constexpr ((myQ) == (QQ)) {                \
+        if constexpr ((mementumID) == 0) {        \
+            return AA;                            \
+        }                                         \
+        if constexpr ((mementumID) == 1) {        \
+            return BB;                            \
+        }                                         \
+        if constexpr ((mementumID) == 2) {        \
+            return CC;                            \
+        }                                         \
+        if constexpr ((mementumID) == 3) {        \
+            return DD;                            \
+        }                                         \
+        if constexpr ((mementumID) == 4) {        \
+            return EE;                            \
+        }                                         \
+        if constexpr ((mementumID) == 5) {        \
+            return FF;                            \
+        }                                         \
+    }
+            // Rows: ADD_COMPONENT(index, x*x, x*y, x*z, y*y, y*z, z*z) with (x, y, z) taken from getVelocityComponent.
+            ADD_COMPONENT(0, 1, 0, 0, 0, 0, 0)
+            ADD_COMPONENT(1, 0, 0, 0, 1, 0, 0)
+            ADD_COMPONENT(2, 0, 0, 0, 0, 0, 1)
+            ADD_COMPONENT(3, 1, 1, 0, 1, 0, 0)
+            ADD_COMPONENT(4, 1, -1, 0, 1, 0, 0)
+            ADD_COMPONENT(5, 1, 0, 1, 0, 0, 1)
+            ADD_COMPONENT(6, 1, 0, -1, 0, 0, 1)
+            ADD_COMPONENT(7, 0, 0, 0, 1, 1, 1)
+            ADD_COMPONENT(8, 0, 0, 0, 1, -1, 1)
+            ADD_COMPONENT(9, 1, 1, 1, 1, 1, 1)
+            ADD_COMPONENT(10, 1, 1, -1, 1, -1, 1)
+            ADD_COMPONENT(11, 1, -1, 1, 1, -1, 1)
+            ADD_COMPONENT(12, 1, -1, -1, 1, 1, 1)
+            ADD_COMPONENT(13, 0, 0, 0, 0, 0, 0)
+            ADD_COMPONENT(14, 1, 0, 0, 0, 0, 0)
+            ADD_COMPONENT(15, 0, 0, 0, 1, 0, 0)
+            ADD_COMPONENT(16, 0, 0, 0, 0, 0, 1)
+            ADD_COMPONENT(17, 1, 1, 0, 1, 0, 0)
+            ADD_COMPONENT(18, 1, -1, 0, 1, 0, 0)
+            ADD_COMPONENT(19, 1, 0, 1, 0, 0, 1)
+            ADD_COMPONENT(20, 1, 0, -1, 0, 0, 1)
+            ADD_COMPONENT(21, 0, 0, 0, 1, 1, 1)
+            ADD_COMPONENT(22, 0, 0, 0, 1, -1, 1)
+            ADD_COMPONENT(23, 1, 1, 1, 1, 1, 1)
+            ADD_COMPONENT(24, 1, 1, -1, 1, -1, 1)
+            ADD_COMPONENT(25, 1, -1, 1, 1, -1, 1)
+            ADD_COMPONENT(26, 1, -1, -1, 1, 1, 1)
+
+#undef ADD_COMPONENT
+        }
+
+
+        template <int q>
+        static constexpr auto getVelocity() -> const typename Neon::index_3d
+        {
+            // Velocity vector (x, y, z) of the register-level direction q.
+            // getVelocityComponent is a function template: each component has to be invoked to obtain its int value.
+            return Neon::index_3d(getVelocityComponent<q, 0>(), getVelocityComponent<q, 1>(), getVelocityComponent<q, 2>());
+        }
+
+        //        // Identifying first half of the directions
+        //        // For each direction in the list, the opposite is not present.
+        //        // Center is also removed
+        //        static constexpr int                                  firstHalfQLen = (Q - 1) / 2;
+        //        static constexpr std::array<const int, firstHalfQLen> firstHalfQList{0, 1, 2, 3, 4, 5, 6, 7, 8};
+    };
+
+    struct Memory
+    {
+        using Self = D3Q27<Precision>::Memory;
+
+        template <int myQ, int myXYZ>
+        static constexpr auto getVelocityComponent() -> int
+        {
+            static_assert(0 <= myQ && myQ < Q, "direction index out of range");
+            static_assert(0 <= myXYZ && myXYZ < 3, "component index must be 0 (x), 1 (y) or 2 (z)");
+            // Returns component myXYZ of the memory-level direction myQ; the branch is selected at compile time.
+#define ADD_COMPONENT(QQ, XXX, YYY, ZZZ) \
+    if constexpr ((myQ) == (QQ)) {       \
+        if constexpr ((myXYZ) == 0) {    \
+            return XXX;                  \
+        }                                \
+        if constexpr ((myXYZ) == 1) {    \
+            return YYY;                  \
+        }                                \
+        if constexpr ((myXYZ) == 2) {    \
+            return ZZZ;                  \
+        }                                \
+    }
+            // One row per direction: ADD_COMPONENT(index, x, y, z).
+            ADD_COMPONENT(0, -1, 0, 0)
+            ADD_COMPONENT(1, 0, -1, 0)
+            ADD_COMPONENT(2, 0, 0, -1)
+            ADD_COMPONENT(3, -1, -1, 0)
+            ADD_COMPONENT(4, -1, 1, 0)
+            ADD_COMPONENT(5, -1, 0, -1)
+            ADD_COMPONENT(6, -1, 0, 1)
+            ADD_COMPONENT(7, 0, -1, -1)
+            ADD_COMPONENT(8, 0, -1, 1)
+            ADD_COMPONENT(9, -1, -1, -1)
+            ADD_COMPONENT(10, -1, -1, 1)
+            ADD_COMPONENT(11, -1, 1, -1)
+            ADD_COMPONENT(12, -1, 1, 1)
+            ADD_COMPONENT(13, 0, 0, 0)
+            ADD_COMPONENT(14, 1, 0, 0)
+            ADD_COMPONENT(15, 0, 1, 0)
+            ADD_COMPONENT(16, 0, 0, 1)
+            ADD_COMPONENT(17, 1, 1, 0)
+            ADD_COMPONENT(18, 1, -1, 0)
+            ADD_COMPONENT(19, 1, 0, 1)
+            ADD_COMPONENT(20, 1, 0, -1)
+            ADD_COMPONENT(21, 0, 1, 1)
+            ADD_COMPONENT(22, 0, 1, -1)
+            ADD_COMPONENT(23, 1, 1, 1)
+            ADD_COMPONENT(24, 1, 1, -1)
+            ADD_COMPONENT(25, 1, -1, 1)
+            ADD_COMPONENT(26, 1, -1, -1)
+            // Every index pair accepted by the asserts matches one row above, so a value is always returned.
+#undef ADD_COMPONENT
+        }
+
+        static constexpr int center = 13; /** Position of direction {0,0,0} */
+
+        template <int myQ>
+        static constexpr auto mapToRegisters()
+            -> int
+        {
+            static_assert(0 <= myQ && myQ < Q, "direction index out of range");
+            // Rows: ADD_COMPONENT(index, mapped index). For this lattice every row maps an index to itself.
+#define ADD_COMPONENT(QQ, XXX)     \
+    if constexpr ((myQ) == (QQ)) { \
+        return XXX;                \
+    }
+            ADD_COMPONENT(0, 0)
+            ADD_COMPONENT(1, 1)
+            ADD_COMPONENT(2, 2)
+            ADD_COMPONENT(3, 3)
+            ADD_COMPONENT(4, 4)
+            ADD_COMPONENT(5, 5)
+            ADD_COMPONENT(6, 6)
+            ADD_COMPONENT(7, 7)
+            ADD_COMPONENT(8, 8)
+            ADD_COMPONENT(9, 9)
+            ADD_COMPONENT(10, 10)
+            ADD_COMPONENT(11, 11)
+            ADD_COMPONENT(12, 12)
+            ADD_COMPONENT(13, 13)
+            ADD_COMPONENT(14, 14)
+            ADD_COMPONENT(15, 15)
+            ADD_COMPONENT(16, 16)
+            ADD_COMPONENT(17, 17)
+            ADD_COMPONENT(18, 18)
+
+            ADD_COMPONENT(19, 19)
+            ADD_COMPONENT(20, 20)
+            ADD_COMPONENT(21, 21)
+            ADD_COMPONENT(22, 22)
+            ADD_COMPONENT(23, 23)
+            ADD_COMPONENT(24, 24)
+            ADD_COMPONENT(25, 25)
+            ADD_COMPONENT(26, 26)
+
+#undef ADD_COMPONENT
+        }
+
+        template <int myQ>
+        static constexpr auto mapToMemory()
+            -> int
+        {
+            static_assert(0 <= myQ && myQ < Q, "direction index out of range");
+            // Rows: ADD_COMPONENT(index, mapped index). For this lattice every row maps an index to itself.
+#define ADD_COMPONENT(QQ, XXX)     \
+    if constexpr ((myQ) == (QQ)) { \
+        return XXX;                \
+    }
+            ADD_COMPONENT(0, 0)
+            ADD_COMPONENT(1, 1)
+            ADD_COMPONENT(2, 2)
+            ADD_COMPONENT(3, 3)
+            ADD_COMPONENT(4, 4)
+            ADD_COMPONENT(5, 5)
+            ADD_COMPONENT(6, 6)
+            ADD_COMPONENT(7, 7)
+            ADD_COMPONENT(8, 8)
+            ADD_COMPONENT(9, 9)
+            ADD_COMPONENT(10, 10)
+            ADD_COMPONENT(11, 11)
+            ADD_COMPONENT(12, 12)
+            ADD_COMPONENT(13, 13)
+            ADD_COMPONENT(14, 14)
+            ADD_COMPONENT(15, 15)
+            ADD_COMPONENT(16, 16)
+            ADD_COMPONENT(17, 17)
+            ADD_COMPONENT(18, 18)
+
+            ADD_COMPONENT(19, 19)
+            ADD_COMPONENT(20, 20)
+            ADD_COMPONENT(21, 21)
+            ADD_COMPONENT(22, 22)
+            ADD_COMPONENT(23, 23)
+            ADD_COMPONENT(24, 24)
+            ADD_COMPONENT(25, 25)
+            ADD_COMPONENT(26, 26)
+#undef ADD_COMPONENT
+        }
+
+        template <int myQ>
+        static constexpr auto getOpposite() -> int
+        {
+            static_assert(0 <= myQ && myQ < Q, "direction index out of range");
+            // Rows: ADD_COMPONENT(index, index of the direction with the negated velocity); the center (13) maps to itself.
+#define ADD_COMPONENT(QQ, XXX)     \
+    if constexpr ((myQ) == (QQ)) { \
+        return XXX;                \
+    }
+            ADD_COMPONENT(0, 14)
+            ADD_COMPONENT(1, 15)
+            ADD_COMPONENT(2, 16)
+            ADD_COMPONENT(3, 17)
+            ADD_COMPONENT(4, 18)
+            ADD_COMPONENT(5, 19)
+            ADD_COMPONENT(6, 20)
+            ADD_COMPONENT(7, 21)
+            ADD_COMPONENT(8, 22)
+            ADD_COMPONENT(9, 23)
+            ADD_COMPONENT(10, 24)
+            ADD_COMPONENT(11, 25)
+            ADD_COMPONENT(12, 26)
+            ADD_COMPONENT(13, 13)
+            ADD_COMPONENT(14, 0)
+            ADD_COMPONENT(15, 1)
+            ADD_COMPONENT(16, 2)
+            ADD_COMPONENT(17, 3)
+            ADD_COMPONENT(18, 4)
+            ADD_COMPONENT(19, 5)
+            ADD_COMPONENT(20, 6)
+            ADD_COMPONENT(21, 7)
+            ADD_COMPONENT(22, 8)
+            ADD_COMPONENT(23, 9)
+            ADD_COMPONENT(24, 10)
+            ADD_COMPONENT(25, 11)
+            ADD_COMPONENT(26, 12)
+#undef ADD_COMPONENT
+        }
+    };
+
+
+    template <int fwdRegIdx_>
+    struct RegisterMapper  // compile-time indices derived from the register-level direction fwdRegIdx_
+    {
+        static constexpr int fwdRegQ = fwdRegIdx_;
+        static constexpr int bkwRegQ = Registers::template getOpposite<fwdRegIdx_>();
+        static constexpr int centerRegQ = Registers::center;
+        static constexpr int fwdMemQ = Memory::template mapToMemory<fwdRegIdx_>();
+        static constexpr int bkwMemQ = Memory::template mapToMemory<bkwRegQ>();
+        static constexpr int centerMemQ = Memory::center;
+        // Velocity components of the forward direction, looked up in the Memory table.
+        static constexpr int fwdMemQX = Memory::template getVelocityComponent<fwdMemQ, 0>();
+        static constexpr int fwdMemQY = Memory::template getVelocityComponent<fwdMemQ, 1>();
+        static constexpr int fwdMemQZ = Memory::template getVelocityComponent<fwdMemQ, 2>();
+        // Velocity components of the backward (opposite) direction, looked up in the Memory table.
+        static constexpr int bkwMemQX = Memory::template getVelocityComponent<bkwMemQ, 0>();
+        static constexpr int bkwMemQY = Memory::template getVelocityComponent<bkwMemQ, 1>();
+        static constexpr int bkwMemQZ = Memory::template getVelocityComponent<bkwMemQ, 2>();
+    };
+
+
+   public:
+    /** Returns the Q lattice directions, in index order, for the requested mapping (empty for an unknown mappingType). */
+    template <int mappingType>
+    static auto getDirectionAsVector() -> std::vector<Neon::index_3d>
+    {
+        std::vector<Neon::index_3d> directions;
+        // At most Q entries are appended below: allocate once instead of growing repeatedly.
+        directions.reserve(static_cast<std::size_t>(Q));
+        if constexpr (mappingType == RegisterMapping) {
+            // Directions as enumerated by the Registers table.
+            Neon::ConstexprFor<0, Q, 1>([&directions](auto q) {
+                directions.emplace_back(Registers::template getVelocityComponent<q, 0>(),
+                                        Registers::template getVelocityComponent<q, 1>(),
+                                        Registers::template getVelocityComponent<q, 2>());
+            });
+        } else if constexpr (mappingType == MemoryMapping) {
+            // Directions as enumerated by the Memory table.
+            Neon::ConstexprFor<0, Q, 1>([&directions](auto q) {
+                directions.emplace_back(Memory::template getVelocityComponent<q, 0>(),
+                                        Memory::template getVelocityComponent<q, 1>(),
+                                        Memory::template getVelocityComponent<q, 2>());
+            });
+        }
+        return directions;
+    }
+};
\ No newline at end of file
diff --git a/benchmarks/lbm/src/DeviceD3QXX.h b/benchmarks/lbm/src/DeviceD3QXX.h
new file mode 100644
index 00000000..d7d16550
--- /dev/null
+++ b/benchmarks/lbm/src/DeviceD3QXX.h
@@ -0,0 +1,363 @@
+#pragma once
+#include "CellType.h"
+#include "D3Q19.h"
+#include "Neon/Neon.h"
+#include "Neon/set/Containter.h"
+template <typename Precision_, typename Grid_, typename Lattice_>
+struct DeviceD3QXX
+{
+    using Lattice = Lattice_;
+    using Precision = Precision_;
+    using Compute = typename Precision::Compute;
+    using Storage = typename Precision::Storage;
+    using Grid = Grid_;
+
+    using PopField = typename Grid::template Field<Storage, Lattice::Q>;
+    using CellTypeField = typename Grid::template Field<CellType, 1>;
+
+    using Idx = typename PopField::Idx;
+    using Rho = typename Grid::template Field<Storage, 1>;
+    using U = typename Grid::template Field<Storage, 3>;
+
+    struct Pull  // streaming that reads the neighbours' populations from fin into popIn
+    {
+        static inline NEON_CUDA_HOST_DEVICE auto
+        pullStream(Idx const&                          gidx,
+                   const uint32_t&                     wallBitFlag,
+                   typename PopField::Partition const& fin,
+                   NEON_OUT Storage                    popIn[Lattice::Q])
+        {
+            Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) {
+                using QPullingReference = typename Lattice::template RegisterMapper<q>;
+                // popIn is indexed with register-level indices (*RegQ), the field fin with memory-level ones (*MemQ).
+                if constexpr (QPullingReference::fwdRegQ == QPullingReference::centerRegQ) {
+                    popIn[QPullingReference::centerRegQ] = fin(gidx, QPullingReference::centerMemQ);
+                } else {
+                    if (CellType::isWall<QPullingReference::bkwRegQ>(wallBitFlag)) {
+                        // The cell in the opposite direction of the pull is a wall: local opposite population plus the value stored in that cell
+                        popIn[QPullingReference::fwdRegQ] = fin(gidx, QPullingReference::bkwMemQ) +
+                                                            fin.template getNghData<QPullingReference::bkwMemQX,
+                                                                                    QPullingReference::bkwMemQY,
+                                                                                    QPullingReference::bkwMemQZ>(gidx, QPullingReference::fwdMemQ)();
+                    } else {
+                        popIn[QPullingReference::fwdRegQ] = fin.template getNghData<QPullingReference::bkwMemQX,
+                                                                                    QPullingReference::bkwMemQY,
+                                                                                    QPullingReference::bkwMemQZ>(gidx, QPullingReference::fwdMemQ)();
+                    }
+                }
+            });
+        }
+    };
+
+    struct AA  // streaming variant that reads the neighbours' opposite (bkwMemQ) slots from fin into popIn
+    {
+        static inline NEON_CUDA_HOST_DEVICE auto
+        pullStream(Idx const&                          gidx,
+                   const uint32_t&                     wallBitFlag,
+                   typename PopField::Partition const& fin,
+                   NEON_OUT Storage                    popIn[Lattice::Q])
+        {
+            Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) {
+                using QPullingReference = typename Lattice::template RegisterMapper<q>;
+                // popIn is indexed with register-level indices (*RegQ), the field fin with memory-level ones (*MemQ).
+                if constexpr (QPullingReference::fwdRegQ == QPullingReference::centerRegQ) {
+                    popIn[QPullingReference::centerRegQ] = fin(gidx, QPullingReference::centerMemQ);
+                } else {
+                    if (CellType::isWall<QPullingReference::bkwRegQ>(wallBitFlag)) {
+                        // The cell in the opposite direction of the pull is a wall: local forward population plus the value stored in that cell
+                        popIn[QPullingReference::fwdRegQ] = fin(gidx, QPullingReference::fwdMemQ) +
+                                                            fin.template getNghData<QPullingReference::bkwMemQX,
+                                                                                    QPullingReference::bkwMemQY,
+                                                                                    QPullingReference::bkwMemQZ>(gidx, QPullingReference::bkwMemQ)();
+                    } else {
+                        popIn[QPullingReference::fwdRegQ] = fin.template getNghData<QPullingReference::bkwMemQX,
+                                                                                    QPullingReference::bkwMemQY,
+                                                                                    QPullingReference::bkwMemQZ>(gidx, QPullingReference::bkwMemQ)();
+                    }
+                }
+            });
+        }
+    };
+
+    struct Push
+    {
+        /**
+         * Writes the populations held in pOut (register-level order) into the field fOut.
+         * For each direction k of the lattice, seen from cell gidx:
+         *  - the center population is stored in place;
+         *  - if CellType::isWall reports a wall for direction k, the opposite slot of this
+         *    same cell receives pOut[k] plus the value read from that neighbour's slot k;
+         *  - otherwise pOut[k] is written into slot k of the neighbour in direction k.
+         */
+        static inline NEON_CUDA_HOST_DEVICE auto
+        pushStream(Idx const&                             gidx,
+                   const uint32_t&                        wallNghBitFlag,
+                   NEON_OUT Storage                       pOut[Lattice::Q],
+                   NEON_OUT typename PopField::Partition& fOut)
+        {
+            Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto dir) {
+                using Map = typename Lattice::template RegisterMapper<dir>;
+                if constexpr (Map::fwdMemQ == Map::centerMemQ) {
+                    // Center population: nothing to stream.
+                    fOut(gidx, Map::centerMemQ) = pOut[Map::centerRegQ];
+                } else if (CellType::isWall<Map::fwdRegQ>(wallNghBitFlag)) {
+                    // Reference formula: fout(i, opp[k]) = pop_out + f(nb, k);
+                    const auto outgoing = pOut[Map::fwdRegQ];
+                    const auto storedInNgh = fOut.template getNghData<Map::fwdMemQX, Map::fwdMemQY, Map::fwdMemQZ>(gidx, Map::fwdMemQ)();
+                    fOut(gidx, Map::bkwMemQ) = outgoing + storedInNgh;
+                } else {
+                    // Reference formula: fout(nb, k) = pop_out;
+                    fOut.template writeNghData<Map::fwdMemQX, Map::fwdMemQY, Map::fwdMemQZ>(gidx,
+                                                                                            Map::fwdMemQ,
+                                                                                            pOut[Map::fwdRegQ]);
+                }
+            });
+        }
+        // NOTE(review): pOut is only read in this function although it is tagged NEON_OUT.
+    };
+
+
+    struct Common
+    {
+        static inline NEON_CUDA_HOST_DEVICE auto
+        macroscopic(const Storage     pop[Lattice::Q],
+                    NEON_OUT Compute& rho,
+                    NEON_OUT std::array<Compute, 3>& u)
+            -> void
+        {
+            // Computes rho as the sum of all populations and u[d] as (sum of one group - sum of
+            // another group) / rho, using hard-coded index groups for Q == 19 and Q == 27.
+            // For any other Q only an error message is printed and rho / u are left untouched.
+#define POP(IDX) static_cast<Compute>(pop[IDX])
+            if constexpr (Lattice::Q == 19) {
+                const Compute xNeg = POP(0) + POP(3) + POP(4) + POP(5) + POP(6);
+                const Compute xPos = POP(10) + POP(13) + POP(14) + POP(15) + POP(16);
+                const Compute xNil = POP(9) + POP(1) + POP(2) + POP(7) + POP(8) + POP(11) + POP(12) + POP(17) + POP(18);
+
+                const Compute yNeg = POP(1) + POP(3) + POP(7) + POP(8) + POP(14);
+                const Compute yPos = POP(4) + POP(11) + POP(13) + POP(17) + POP(18);
+
+                const Compute zNeg = POP(2) + POP(5) + POP(7) + POP(16) + POP(18);
+                const Compute zPos = POP(6) + POP(8) + POP(12) + POP(15) + POP(17);
+
+                rho = xNeg + xPos + xNil;
+                u[0] = (xPos - xNeg) / rho;
+                u[1] = (yPos - yNeg) / rho;
+                u[2] = (zPos - zNeg) / rho;
+            } else if constexpr (Lattice::Q == 27) {
+                const Compute xNeg = POP(0) + POP(3) + POP(4) + POP(5) + POP(6) + POP(9) + POP(10) + POP(11) + POP(12);
+                const Compute xPos = POP(14) + POP(17) + POP(18) + POP(19) + POP(20) + POP(23) + POP(24) + POP(25) + POP(26);
+                const Compute xNil = POP(1) + POP(2) + POP(7) + POP(8) + POP(13) + POP(15) + POP(16) + POP(21) + POP(22);
+
+                const Compute yNeg = POP(1) + POP(3) + POP(7) + POP(8) + POP(9) + POP(10) + POP(18) + POP(25) + POP(26);
+                const Compute yPos = POP(15) + POP(17) + POP(21) + POP(22) + POP(23) + POP(24) + POP(4) + POP(11) + POP(12);
+
+                const Compute zNeg = POP(2) + POP(5) + POP(7) + POP(9) + POP(11) + POP(20) + POP(22) + POP(24) + POP(26);
+                const Compute zPos = POP(16) + POP(19) + POP(21) + POP(23) + POP(25) + POP(6) + POP(8) + POP(10) + POP(12);
+                rho = xNeg + xPos + xNil;
+                u[0] = (xPos - xNeg) / rho;
+                u[1] = (yPos - yNeg) / rho;
+                u[2] = (zPos - zNeg) / rho;
+            } else {
+                printf("Error: macroscopic function does not support the selected lattice.\n");
+            }
+#undef POP
+        }
+
+        static inline NEON_CUDA_HOST_DEVICE auto
+        collideBgkUnrolled(Compute const&                rho /*!   Density            */,
+                           std::array<Compute, 3> const& u /*!     Velocity           */,
+                           Compute const&                usqr /*!  Subtracted as-is inside the equilibrium; definition not visible here - presumably 1.5 * |u|^2, confirm with the caller */,
+                           Compute const&                omega /*! Relaxation weight: pop = (1 - omega) * pop + omega * eq */,
+                           NEON_IO Storage               pop[Lattice::Q])
+
+            -> void
+        {
+            // Relaxes every entry of pop (register-level order) in place towards its equilibrium value.
+            // constexpr Compute c1over18 = 1. / 18.;
+            constexpr Compute c4dot5 = 4.5;
+            constexpr Compute c3 = 3.;
+            constexpr Compute c1 = 1.;
+            constexpr Compute c6 = 6.;
+
+            // constexpr int regCenter = Lattice::Registers::center;
+            // constexpr int regFir = Lattice::Registers::center;
+            // Each direction q < firstHalfQLen is handled together with its opposite (M::bkwRegQ); the center follows below.
+            Neon::ConstexprFor<0, Lattice::Registers::firstHalfQLen, 1>(
+                [&](auto q) {
+                    using M = typename Lattice::template RegisterMapper<q>;
+                    using T = typename Lattice::Registers;
+
+                    Compute eqFw;
+                    Compute eqBk;
+                    // ck_u: dot product between the velocity of direction q and u.
+                    const Compute ck_u = u[0] * Lattice::Registers::template getVelocityComponent<q, 0>() +
+                                         u[1] * Lattice::Registers::template getVelocityComponent<q, 1>() +
+                                         u[2] * Lattice::Registers::template getVelocityComponent<q, 2>();
+
+                    // Reference formula: double eq = rho * t[k] *
+                    //             (1. +
+                    //             3. * ck_u +
+                    //             4.5 * ck_u * ck_u -
+                    //             usqr);
+                    eqFw = rho * T::template getT<M::fwdRegQ>() *
+                           (c1 +
+                            c3 * ck_u +
+                            c4dot5 * ck_u * ck_u -
+                            usqr);
+
+                    // Reference formula: double eqopp = eq - 6.* rho * t[k] * ck_u;  (the weight of fwdRegQ is reused for the opposite direction)
+                    eqBk = eqFw - c6 * rho * T::template getT<M::fwdRegQ>() * ck_u;
+
+                    // pop_out      = (1. - omega) * fin(i, k)                             + omega * eq;
+                    pop[M::fwdRegQ] = (c1 - omega) * static_cast<Compute>(pop[M::fwdRegQ]) + omega * eqFw;
+                    // pop_out_opp  = (1. - omega) * fin(i, opp[k])                        + omega * eqopp;
+                    pop[M::bkwRegQ] = (c1 - omega) * static_cast<Compute>(pop[M::bkwRegQ]) + omega * eqBk;
+                });
+            {  // Center direction: its velocity is zero, so the ck_u terms drop out of the equilibrium.
+                using T = typename Lattice::Registers;
+                using M = typename Lattice::template RegisterMapper<Lattice::Registers::center>;
+                //                  eq = rho * t[k]                * (1. - usqr);
+                const Compute eqCenter = rho * T::template getT<M::centerRegQ>() * (c1 - usqr);
+                //      fout(i, k) = (1. - omega) * fin(i, k)                                + omega * eq;
+                pop[M::centerRegQ] = (c1 - omega) * static_cast<Compute>(pop[M::centerRegQ]) + omega * eqCenter;
+            }
+        }
+
+        /** Copies the Lattice::Q values of cell gidx from fOut (memory-level slots) into pOut (register-level slots). */
+        static inline NEON_CUDA_HOST_DEVICE auto localLoad(Idx const&                                  gidx,
+                                                           NEON_IN typename PopField::Partition const& fOut,
+                                                           Storage NEON_RESTRICT                       pOut[Lattice::Q])
+        {
+            Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto dir) {
+                using Slot = typename Lattice::template RegisterMapper<dir>;
+                pOut[Slot::fwdRegQ] = fOut(gidx, Slot::fwdMemQ);
+            });
+        }
+
+        /** Copies the Lattice::Q values of pOut (register-level slots) into cell gidx of fOut (memory-level slots). */
+        static inline NEON_CUDA_HOST_DEVICE auto localStore(Idx const&                             gidx,
+                                                            Storage NEON_RESTRICT                  pOut[Lattice::Q],
+                                                            NEON_OUT typename PopField::Partition& fOut)
+        {
+            Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto dir) {
+                using Slot = typename Lattice::template RegisterMapper<dir>;
+                fOut(gidx, Slot::fwdMemQ) = pOut[Slot::fwdRegQ];
+            });
+        }
+
+        /** Like localStore, but each value pOut[k] is written into the memory slot of the direction opposite to k. */
+        static inline NEON_CUDA_HOST_DEVICE auto localStoreOpposite(Idx const&                             gidx,
+                                                                    Storage NEON_RESTRICT                  pOut[Lattice::Q],
+                                                                    NEON_OUT typename PopField::Partition& fOut)
+        {
+            Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto dir) {
+                using Slot = typename Lattice::template RegisterMapper<dir>;
+                fOut(gidx, Slot::bkwMemQ) = pOut[Slot::fwdRegQ];
+            });
+        }
+
+        static inline NEON_CUDA_HOST_DEVICE auto  // In-place collision update of pop[]; only implemented for Lattice::Q == 27
+        collideKBCUnrolled(Compute const&                   rho /*!   Density            */,
+                           std::array<Compute, 3> const&    u /*!     Velocity           */,
+                           Compute const&                   usqr /*!  Usqr               */,
+                           Compute const&                   omega /*! Omega              */,
+                           Compute const&                   invBeta /*! invBeta              */,
+                           [[maybe_unused]] NEON_IO Storage pop[Lattice::Q])
+
+            -> void
+        {
+            if constexpr (Lattice::Q == 27) {
+                constexpr Compute tiny = Compute(1e-7);  // added to e1 in the gamma denominator below
+
+                Compute       Pi[6] = {0, 0, 0, 0, 0, 0};  // six momentum-flux components, accumulated from fneq
+                Compute       e0 = 0;  // sum over q of deltaS * deltaH / feq
+                Compute       e1 = 0;  // sum over q of deltaH * deltaH / feq
+                Compute       deltaS[Lattice::Q];
+                Compute       fneq[Lattice::Q];
+                Compute       feq[Lattice::Q];
+                const Compute beta = omega * 0.5;
+
+                // Shear contribution for direction q, built from Pi; directions not listed below contribute 0.
+                auto fdecompose_shear = [&](const int q) -> Compute {
+                    const Compute Nxz = Pi[0] - Pi[5];
+                    const Compute Nyz = Pi[3] - Pi[5];
+                    if (q == 0 /* -1, 0, 0 */) {
+                        return (2.0 * Nxz - Nyz) / 6.0;
+                    } else if (q == 14 /* 1, 0, 0 */) {
+                        return (2.0 * Nxz - Nyz) / 6.0;
+                    } else if (q == 1 /*  0, -1, 0 */) {
+                        return (-Nxz + 2.0 * Nyz) / 6.0;
+                    } else if (q == 15 /* 0, 1, 0 */) {
+                        return (-Nxz + 2.0 * Nyz) / 6.0;
+                    } else if (q == 2 /* 0, 0, -1 */) {
+                        return (-Nxz - Nyz) / 6.0;
+                    } else if (q == 16 /* 0, 0, 1 */) {
+                        return (-Nxz - Nyz) / 6.0;
+                    } else if (q == 3 /* -1, -1, 0 */ || q == 17 /* 1, 1, 0 */) {
+                        return Pi[1] / 4.0;
+                    } else if (q == 18 /* 1, -1, 0 */ || q == 4 /* -1, 1, 0 */) {
+                        return -Pi[1] / 4.0;
+                    } else if (q == 5 /* -1, 0, -1 */ || q == 19 /* 1, 0, 1 */) {
+                        return Pi[2] / 4.0;
+                    } else if (q == 20 /* 1, 0, -1 */ || q == 6 /* -1, 0, 1 */) {
+                        return -Pi[2] / 4.0;
+                    } else if (q == 21 /* 0, 1, 1 */ || q == 7 /* 0, -1, -1 */) {
+                        return Pi[4] / 4.0;
+                    } else if (q == 22 /* 0, 1, -1 */ || q == 8 /* 0, -1, 1 */) {
+                        return -Pi[4] / 4.0;
+                    } else {
+                        return Compute(0);
+                    }
+                };
+
+                // equilibrium populations feq and non-equilibrium parts fneq = pop - feq
+                Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) {
+                    const Compute cu = Compute(3) *
+                                       (u[0] * Lattice::Registers::template getVelocityComponent<q, 0>() +
+                                        u[1] * Lattice::Registers::template getVelocityComponent<q, 1>() +
+                                        u[2] * Lattice::Registers::template getVelocityComponent<q, 2>());
+
+
+                    feq[q] = rho * Lattice::Registers::template getT<q>() * (1. + cu + 0.5 * cu * cu - usqr);
+                    fneq[q] = pop[q] - feq[q];
+                });
+
+                // momentum flux: Pi[i] = sum over q of fneq[q] * getMomentumComponet<q, i>()
+                Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) {
+                    //                    Neon::ConstexprFor<0, 6, 1>([&](auto i) {
+                    //                        Pi[i] += fneq[q] * Lattice::Registers::template getMomentByDirection<q, i>();
+                    //                    });
+                    Pi[0] += fneq[q] * Lattice::Registers::template getMomentumComponet<q, 0>();
+                    Pi[1] += fneq[q] * Lattice::Registers::template getMomentumComponet<q, 1>();
+                    Pi[2] += fneq[q] * Lattice::Registers::template getMomentumComponet<q, 2>();
+                    Pi[3] += fneq[q] * Lattice::Registers::template getMomentumComponet<q, 3>();
+                    Pi[4] += fneq[q] * Lattice::Registers::template getMomentumComponet<q, 4>();
+                    Pi[5] += fneq[q] * Lattice::Registers::template getMomentumComponet<q, 5>();
+                });
+
+                // split fneq into the shear part deltaS and the remainder deltaH, accumulating e0 and e1
+                Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) {
+                    deltaS[q] = rho * fdecompose_shear(q);
+
+                    Compute deltaH = fneq[q] - deltaS[q];
+
+                    e0 += (deltaS[q] * deltaH / feq[q]);
+                    e1 += (deltaH * deltaH / feq[q]);
+                });
+
+                // gamma: relaxation factor applied to deltaH; tiny keeps the denominator non-zero when e1 is 0
+                Compute gamma = invBeta - (2.0 - invBeta) * e0 / (tiny + e1);
+
+
+                // fout: relax pop in place using beta = omega / 2
+                Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) {
+                    Compute deltaH = fneq[q] - deltaS[q];
+                    pop[q] = pop[q] - beta * (2.0 * deltaS[q] + gamma * deltaH);
+                });
+            } else {
+                printf("ERROR %d \n", Lattice::Q);  // any other lattice size: pop is left untouched
+            }
+        }
+    };
+};
\ No newline at end of file
diff --git a/benchmarks/lbm/src/Lbm.h b/benchmarks/lbm/src/Lbm.h
new file mode 100644
index 00000000..ce465ce4
--- /dev/null
+++ b/benchmarks/lbm/src/Lbm.h
@@ -0,0 +1,475 @@
+#include "./Config.h"
+#include "./Methods.h"
+#include "./Metrics.h"
+#include "./Repoert.h"
+#include "CellType.h"
+#include "ContainersD3QXX.h"
+#include "D3Q19.h"
+#include "Methods.h"
+#include "Neon/Neon.h"
+#include "Neon/set/Backend.h"
+#include "Neon/set/Containter.h"
+#include "Neon/skeleton/Skeleton.h"
+
+inline int backendWasReported = false;  // inline (C++17): one definition even if several translation units include this header
+
+template <typename Grid_,
+          lbm::Method method,
+          Collision   CollisionId,
+          typename Precision_,
+          typename Lattice_>
+struct Lbm
+{
+    using Grid = Grid_;
+    using Lattice = Lattice_;
+    using Precision = Precision_;
+
+    using PField = typename Grid::template Field<typename Precision::Storage, Lattice::Q>;
+    using CField = typename Grid::template Field<CellType, 1>;
+    using RhoField = typename Grid::template Field<typename Precision::Storage, 1>;
+    using UField = typename Grid::template Field<typename Precision::Storage, 3>;
+
+    // using CommonContainerFactory = common::ContainerFactory<Precision, Lattice, Grid>;
+    using ContainerFactory = ContainerFactoryD3QXX<Precision, Grid, Lattice, CollisionId>;
+
+    /// Creates the backend, grid and fields requested by `config`; `activeMask(p)` is forwarded to the Grid constructor.
+    template <typename Lambda>
+    Lbm(Config& config,
+        Report& report,
+        Lambda  activeMask)
+    {
+        configurations = config;
+        reportPtr = &report;
+
+        // Setting the backend
+        Neon::Backend bk = [&] {
+            if (config.deviceType == "cpu") {
+                Neon::Backend bk(config.devices, Neon::Runtime::openmp);
+                return bk;
+            }
+            if (config.deviceType == "gpu") {
+                Neon::Backend bk(config.devices, Neon::Runtime::stream);
+                return bk;
+            }
+            Neon::NeonException exce("run");
+            exce << config.deviceType << " is not a supported option as device type";
+            NEON_THROW(exce);
+        }();
+
+        auto [gridInitClockStart, notcare] = metrics::restartClock(bk, true);
+
+        // Setting the grid
+        grid = Grid(
+            bk, {config.N, config.N, config.N},
+            [&](const Neon::index_3d& p) { return activeMask(p); },
+            Lattice::template getDirectionAsVector<Lattice::MemoryMapping>(),
+            1.0, 0.0,
+            config.spaceCurveCli.getOption());
+
+        // Allocating Populations (one field per entry required by the streaming method)
+        for (int i = 0; i < lbm::MethodUtils::getNumberOfPFields<method>(); i++) {
+            std::stringstream name;
+            name << "PopField_0" << i;
+            using Storage = typename Precision::Storage;
+            std::cout << "Allocating population field (#" << (i + 1) << ")" << std::endl;
+            auto field = grid.template newField<Storage,
+                                                Lattice::Q>(name.str(),
+                                                            Lattice::Q,
+                                                            Storage(0.0));
+            pFieldList.push_back(field);
+        }
+
+        // Allocating cell type field
+        CellType defaultCelltype;
+        cellFlagField = grid.template newField<CellType, 1>("cellFlags", 1, defaultCelltype);
+
+        // Allocating rho and u (only when VTI output is requested, i.e. config.vti != 0)
+        if (config.vti != 0) {
+            std::cout << "Allocating rho and u" << std::endl;
+            using Storage = typename Precision::Storage;
+            rho = grid.template newField<Storage, 1>("rho", 1, Storage(0.0));
+            u = grid.template newField<Storage, 3>("u", 3, Storage(0.0));
+        }
+
+        {  // Setting Equilibrium all population field
+            for (auto& pField : pFieldList) {
+                // Set all to eq
+                ContainerFactory::Common::setToEquilibrium(pField, cellFlagField).run(Neon::Backend::mainStreamIdx);
+            }
+        }
+        metrics::recordGridInitMetrics(bk, *reportPtr, gridInitClockStart);
+    }
+
+    // Sets the boundary conditions. Lambda = void(*)(Neon::Index3d) -> std::tuple<BcType, Array<Storage, Lattice::Q>>
+    template <typename Lambda>
+    auto setBC(Lambda bcSetFunction) -> void
+    {
+        auto [setBcClockStart, notcare] = metrics::restartClock(grid.getBackend(), true);
+
+        std::cout << "Setting the problem's boundary." << std::endl;
+        grid.getBackend().sync(Neon::Backend::mainStreamIdx);
+        // Hand the user BC function to userSettingBc together with population field 0 and the cell flags
+        ContainerFactory::Common::userSettingBc(bcSetFunction,
+                                                pFieldList[0],
+                                                cellFlagField)
+            .run(Neon::Backend::mainStreamIdx);
+
+        for (int i = 1; i < int(pFieldList.size()); i++) {  // copyPopulation from field 0 to every other field
+            ContainerFactory::Common::copyPopulation(pFieldList[0],
+                                                     pFieldList[i])
+                .run(Neon::Backend::mainStreamIdx);
+        }
+        cellFlagField.newHaloUpdate(Neon::set::StencilSemantic::standard,  // halo update before the mask computation
+                                    Neon::set::TransferMode::get,
+                                    Neon::Execution::device)
+            .run(Neon::Backend::mainStreamIdx);
+        grid.getBackend().sync(Neon::Backend::mainStreamIdx);
+        ContainerFactory::Common::computeWallNghMask(cellFlagField,  // same field passed as both arguments
+                                                     cellFlagField)
+            .run(Neon::Backend::mainStreamIdx);
+        cellFlagField.newHaloUpdate(Neon::set::StencilSemantic::standard,  // second halo update, after the mask computation
+                                    Neon::set::TransferMode::get,
+                                    Neon::Execution::device)
+            .run(Neon::Backend::mainStreamIdx);
+        metrics::recordProblemSetupMetrics(grid.getBackend(), *reportPtr, setBcClockStart);
+    }
+
+    auto helpPrep() -> void
+    {
+        grid.getBackend().sync(Neon::Backend::mainStreamIdx);
+        // Builds the two skeletons (even and odd phase) for the selected method, then runs the priming step:
+        // a single local collide for pull, one even-phase iteration for push and for aa.
+        if constexpr (lbm::Method::pull == method) {
+            // For pull we set up the system in a way that it does one single collide as first operation
+            using Compute = typename Precision::Compute;
+            auto lbmParameters = configurations.template getLbmParameters<Compute>();
+            {
+                skeleton = std::vector<Neon::skeleton::Skeleton>(2);
+                for (int iteration : {0, 1}) {
+                    iterationPhase.resetPhase(iteration);
+                    int  skIdx = iterationPhase.getSkeletonIdx();
+                    auto even = ContainerFactory::Pull::iteration(
+                        configurations.stencilSemanticCli.getOption(),
+                        pFieldList.at(iterationPhase.getInputIdx()),
+                        cellFlagField,
+                        lbmParameters.omega,
+                        pFieldList.at(iterationPhase.getOutputIdx()));
+
+                    std::vector<Neon::set::Container> ops;
+                    skeleton.at(skIdx) = Neon::skeleton::Skeleton(pFieldList[0].getBackend());
+                    Neon::skeleton::Options opt(configurations.occCli.getOption(), configurations.transferModeCli.getOption());
+                    ops.push_back(even);
+                    std::stringstream appName;
+
+                    if (skIdx % 2 == 0)
+                        appName << "LBM_pull_even";
+                    else
+                        appName << "LBM_pull_odd";
+
+                    skeleton.at(skIdx).sequence(ops, appName.str(), opt);
+
+                    if (skIdx % 2 == 0)
+                        skeleton.at(skIdx).ioToDot("lbm-pull-even","lbm_pull_even",true);
+                    else
+                        skeleton.at(skIdx).ioToDot("lbm-pull-odd","lbm_pull_odd", true);
+                }
+            }
+            {
+                // Let's compute 1 collide operation to prepare the input of the first iteration
+                iterationPhase.resetPhase(0);
+                ContainerFactory::Pull::localCollide(pFieldList.at(iterationPhase.getInputIdx()),
+                                                     cellFlagField,
+                                                     lbmParameters.omega,
+                                                     pFieldList.at(iterationPhase.getOutputIdx()))
+                    .run(Neon::Backend::mainStreamIdx);
+                pFieldList[0].getBackend().syncAll();
+                iterationPhase.updateIterationPhase();
+            }
+            return;
+        }
+        if constexpr (lbm::Method::push == method) {
+            using Compute = typename Precision::Compute;
+            auto lbmParameters = configurations.template getLbmParameters<Compute>();
+            skeleton = std::vector<Neon::skeleton::Skeleton>(2);
+            for (int iteration : {0, 1}) {
+                iterationPhase.resetPhase(iteration);
+                int  skIdx = iterationPhase.getSkeletonIdx();
+                auto even = ContainerFactory::Push::iteration(
+                    configurations.stencilSemanticCli.getOption(),
+                    pFieldList.at(iterationPhase.getInputIdx()),
+                    cellFlagField,
+                    lbmParameters.omega,
+                    pFieldList.at(iterationPhase.getOutputIdx()));
+
+                std::vector<Neon::set::Container> ops;
+                skeleton.at(skIdx) = Neon::skeleton::Skeleton(pFieldList[0].getBackend());
+                Neon::skeleton::Options opt(configurations.occCli.getOption(), configurations.transferModeCli.getOption());
+                ops.push_back(even);
+                std::stringstream appName;
+                if (iteration % 2 == 0)
+                    appName << "LBM_push_even";
+                else
+                    appName << "LBM_push_odd";
+                skeleton.at(skIdx).sequence(ops, appName.str(), opt);
+            }
+
+            {
+                iterationPhase.resetPhase(0);
+                int skIdx = iterationPhase.getSkeletonIdx();
+                skeleton.at(skIdx).run();
+                iterationPhase.updateIterationPhase();
+            }
+            return;
+        }
+        if constexpr (lbm::Method::aa == method) {
+            using Compute = typename Precision::Compute;
+            auto lbmParameters = configurations.template getLbmParameters<Compute>();
+            skeleton = std::vector<Neon::skeleton::Skeleton>(2);
+            for (int iteration : {0, 1}) {
+                iterationPhase.resetPhase(iteration);
+                int                  skIdx = iterationPhase.getSkeletonIdx();
+                Neon::set::Container lbmIteration;
+                std::stringstream    appName;
+                if (iterationPhase.getPhase() == IterationPhase::Phase::even) {
+                    lbmIteration = ContainerFactory::AA::Even::iteration(
+                        cellFlagField,
+                        lbmParameters.omega,
+                        pFieldList.at(0));
+                    appName << "LBM_aa_even";
+                } else {
+                    lbmIteration = ContainerFactory::AA::Odd::iteration(
+                        cellFlagField,
+                        lbmParameters.omega,
+                        pFieldList.at(0));
+                    appName << "LBM_aa_odd";
+                }
+                std::vector<Neon::set::Container> ops;
+                skeleton.at(skIdx) = Neon::skeleton::Skeleton(pFieldList[0].getBackend());
+                Neon::skeleton::Options opt(configurations.occCli.getOption(), configurations.transferModeCli.getOption());
+                ops.push_back(lbmIteration);
+                skeleton.at(skIdx).sequence(ops, appName.str(), opt);
+            }
+
+            {
+                iterationPhase.resetPhase(0);
+                int const skIdx = iterationPhase.getSkeletonIdx();
+                skeleton.at(skIdx).run();
+                iterationPhase.updateIterationPhase();
+            }
+            return;
+        }
+        NEON_DEV_UNDER_CONSTRUCTION("");
+    }
+
+    auto iterate() -> void  // Runs helpPrep(), then benchMaxIter skeleton iterations, then records the loop metrics.
+    {
+        helpPrep();
+        // time_iter counts every iteration of the loop below;
+        // clock_iter counts the iterations executed since the last clock reset.
+        std::cout << "Starting main LBM loop." << std::endl;
+
+        auto& bk = grid.getBackend();
+        auto [start, clock_iter] = metrics::restartClock(bk, true);
+        int time_iter = 0;
+        // Reset the clock, to be used when a benchmark simulation is executed.
+        tie(start, clock_iter) = metrics::restartClock(bk, true);
+
+        for (time_iter = 0; time_iter < configurations.benchMaxIter; ++time_iter) {
+            if ((configurations.vti > 1) && ((time_iter % configurations.vti) == 0)) {  // export every vti iterations; vti <= 1 never exports
+                bk.syncAll();
+                helpExportVti();
+            }
+
+            if (configurations.benchmark && time_iter == configurations.benchIniIter) {  // end of warm-up: restart timing
+                std::cout << "Warm up completed (" << time_iter << " iterations ).\n"
+                          << "Starting benchmark step ("
+                          << configurations.benchMaxIter - configurations.benchIniIter << " iterations)."
+                          << std::endl;
+                tie(start, clock_iter) = metrics::restartClock(bk, false);  // NOTE(review): no sync here; confirm warm-up work cannot leak into the timed window
+            }
+
+            skeleton[iterationPhase.getSkeletonIdx()].run();  // even or odd skeleton, chosen by the current phase
+
+            ++clock_iter;
+            iterationPhase.updateIterationPhase();
+        }
+        std::cout << "Iterations completed." << std::endl;
+        metrics::recordMetrics(bk, configurations, *reportPtr, start, clock_iter);
+    }
+
+    auto helpExportVti() -> void  // Computes rho and u from the current output populations and writes u, rho and the cell flags to VTK files.
+    {
+        grid.getBackend().syncAll();
+        auto& pop = pFieldList.at(iterationPhase.getOutputIdx());
+        bool  done = false;  // set by whichever method-specific branch below is compiled in
+        if constexpr (method == lbm::Method::push) {
+            auto computeRhoAndU = ContainerFactory::Push::computeRhoAndU(pop, cellFlagField, rho, u);
+            computeRhoAndU.run(Neon::Backend::mainStreamIdx);
+            done = true;
+        }
+        if constexpr (method == lbm::Method::pull) {
+            pop.newHaloUpdate(Neon::set::StencilSemantic::standard,  // pull only: halo update of pop before computing rho and u
+                              Neon::set::TransferMode::get,
+                              Neon::Execution::device)
+                .run(Neon::Backend::mainStreamIdx);
+            auto computeRhoAndU = ContainerFactory::Pull::computeRhoAndU(pop, cellFlagField, rho, u);
+            computeRhoAndU.run(Neon::Backend::mainStreamIdx);
+            done = true;
+        }
+        if constexpr (method == lbm::Method::aa) {
+            if (iterationPhase.getPhase() == IterationPhase::Phase::even) {  // aa uses a phase-specific container
+                auto computeRhoAndU = ContainerFactory::AA::Even::computeRhoAndU(pop, cellFlagField, rho, u);
+                computeRhoAndU.run(Neon::Backend::mainStreamIdx);
+            } else {
+                auto computeRhoAndU = ContainerFactory::AA::Odd::computeRhoAndU(pop, cellFlagField, rho, u);
+                computeRhoAndU.run(Neon::Backend::mainStreamIdx);
+            }
+            done = true;
+        }
+        if (!done) {
+            NEON_DEV_UNDER_CONSTRUCTION("helpExportVti");
+        }
+        u.updateHostData(Neon::Backend::mainStreamIdx);
+        rho.updateHostData(Neon::Backend::mainStreamIdx);
+        // pop.updateHostData(Neon::Backend::mainStreamIdx);
+        grid.getBackend().sync(Neon::Backend::mainStreamIdx);
+
+        size_t      numDigits = 5;
+        std::string iterIdStr = std::to_string(iterationPhase.getCounter());
+        iterIdStr = std::string(numDigits - std::min(numDigits, iterIdStr.length()), '0') + iterIdStr;  // left-pad with zeros up to numDigits characters
+
+        // pop.ioToVtk("pop_" + iterIdStr, "pop", false);
+        u.ioToVtk("u_" + iterIdStr, "u", false, Neon::IoFileType::BINARY);
+        rho.ioToVtk("rho_" + iterIdStr, "rho", false, Neon::IoFileType::BINARY);
+        cellFlagField.template ioToVtk<int>("cellFlagField_" + iterIdStr, "flag", false);
+
+#if 0  // compiled out: dumps u along two lines through the grid centre to .dat files (uses `ulid`, which is not a member visible in this struct)
+        std::vector<std::pair<double, double>> xPosVal;
+        std::vector<std::pair<double, double>> yPosVal;
+        const double scale = 1.0 / ulid.v[0];
+
+        const Neon::index_3d grid_dim = grid.getDimension();
+        u.forEachActiveCell([&](const Neon::index_3d& id, const int& card, auto& val) {
+            if (id.x == grid_dim.x / 2 && id.z == grid_dim.z / 2) {
+                if (card == 0) {
+                    yPosVal.push_back({static_cast<double>(id.v[1]) / static_cast<double>(grid_dim.y), val * scale});
+                }
+            }
+
+            if (id.y == grid_dim.y / 2 && id.z == grid_dim.z / 2) {
+                if (card == 1) {
+                    xPosVal.push_back({static_cast<double>(id.v[0]) / static_cast<double>(grid_dim.x), val * scale});
+                }
+            }
+        },
+                            Neon::computeMode_t::seq);
+
+        // sort the position so the linear interpolation works
+        std::sort(xPosVal.begin(), xPosVal.end(), [=](std::pair<double, double>& a, std::pair<double, double>& b) {
+            return a.first < b.first;
+        });
+
+        std::sort(yPosVal.begin(), yPosVal.end(), [=](std::pair<double, double>& a, std::pair<double, double>& b) {
+            return a.first < b.first;
+        });
+
+        auto writeToFile = [](const std::vector<std::pair<double, double>>& posVal, std::string filename) {
+            std::ofstream file;
+            file.open(filename);
+            for (auto v : posVal) {
+                file << v.first << " " << v.second << "\n";
+            }
+            file.close();
+        };
+        writeToFile(yPosVal, "NeonUniformLBM_" + iterIdStr + "_Y.dat");
+        writeToFile(xPosVal, "NeonUniformLBM_" + iterIdStr + "_X.dat");
+#endif
+    }
+
+
+    struct IterationPhase  // Tracks the even/odd phase of the iteration and how many phase updates happened since the last reset.
+    {
+        enum Phase
+        {
+            even,
+            odd,
+        };
+
+       private:
+        Phase state{Phase::even};
+
+        int counter = 0;  // incremented by updateIterationPhase(), zeroed by both resetPhase() overloads
+
+       public:
+        auto getCounter() const -> int
+        {
+            return counter;
+        }
+
+        auto resetPhase(Phase newPhase)  // jump to newPhase and restart the counter
+        {
+            state = newPhase;
+            counter = 0;
+        }
+
+        auto resetPhase(int iteration)  // 0 selects even, 1 selects odd; any other value is rejected
+        {
+            if (iteration != 0 && iteration != 1) {
+                NEON_THROW_UNSUPPORTED_OPERATION("");
+            }
+            state = iteration == 0 ? even : odd;
+            counter = 0;
+        }
+
+        auto getPhase() const -> Phase
+        {
+            return state;
+        }
+
+        auto updateIterationPhase() -> void  // flip even <-> odd and count the step
+        {
+            state = state == even ? odd : even;
+            counter++;
+        }
+
+        auto getInputIdx() const -> int  // index into pFieldList of the field read in this phase
+        {
+            if constexpr (method == lbm::Method::pull || method == lbm::Method::push) {
+                return state == IterationPhase::even ? 0 : 1;
+            }
+            if constexpr (method == lbm::Method::aa) {
+                return 0;
+            }
+            NEON_THROW_UNSUPPORTED_OPERATION("helpGetInputIdx");
+        }
+        auto getOutputIdx() const -> int  // index into pFieldList of the field written in this phase
+        {
+            if constexpr (method == lbm::Method::pull || method == lbm::Method::push) {
+                return state == IterationPhase::even ? 1 : 0;
+            }
+            if constexpr (method == lbm::Method::aa) {
+                return 0;
+            }
+            NEON_THROW_UNSUPPORTED_OPERATION("helpGetOutputIdx");
+        }
+
+        auto getSkeletonIdx() const -> int  // 0 for the even-phase skeleton, 1 for the odd-phase one
+        {
+            if constexpr (method == lbm::Method::pull || method == lbm::Method::push || method == lbm::Method::aa) {
+                return state == IterationPhase::even ? 0 : 1;
+            }
+            NEON_THROW_UNSUPPORTED_OPERATION("helpGetSkeletonIdx");
+        }
+    };
+
+    Config                                configurations;
+    IterationPhase                        iterationPhase;
+    bool                                  prepDone = false;
+    Grid                                  grid;
+    std::vector<PField>                   pFieldList;
+    CField                                cellFlagField;
+    RhoField                              rho;
+    UField                                u;
+    std::vector<Neon::skeleton::Skeleton> skeleton;
+    Report*                               reportPtr;
+};
diff --git a/benchmarks/lbm/src/Methods.h b/benchmarks/lbm/src/Methods.h
new file mode 100644
index 00000000..11a1da6f
--- /dev/null
+++ b/benchmarks/lbm/src/Methods.h
@@ -0,0 +1,59 @@
+#pragma once
+#include "Neon/core/core.h"
+
+namespace lbm {
+enum class Method  // LBM streaming method; the integer values are the ones accepted by MethodUtils::formInt
+{
+    push = 0,  // two population fields (MethodUtils::getNumberOfPFields)
+    pull = 1,  // two population fields
+    aa = 2  // a single population field
+};
+
+struct MethodUtils
+{
+    /// Number of population fields required by `method`: two for pull and push, one for aa.
+    template <lbm::Method method>
+    static auto getNumberOfPFields() -> int
+    {
+        switch (method) {
+            case Method::pull:
+            case Method::push:
+                return 2;
+            case Method::aa:
+                return 1;
+        }
+        std::stringstream msg;  // raw value is printed: toString() cannot name an unrecognized method
+        msg << "The following LBM method is not recognized: " << int(method) << std::endl;
+        NEON_THROW_UNSUPPORTED_OPERATION(msg.str());
+    }
+    /// Lower-case name of `method`; reports an unsupported operation for a value outside the enumeration.
+    static auto toString(lbm::Method method) -> std::string
+    {
+        switch (method) {
+            case Method::pull:
+                return "pull";
+            case Method::push:
+                return "push";
+            case Method::aa:
+                return "aa";
+        }
+        std::stringstream msg;  // print the raw value: calling toString() again here would recurse without end
+        msg << "The following LBM method is not recognized: " << int(method) << std::endl;
+        NEON_THROW_UNSUPPORTED_OPERATION(msg.str());
+    }
+    /// Converts the integer code of a method (see enum Method) into the enumerator.
+    static auto formInt(int method) -> Method
+    {
+        if (method == int(Method::pull))
+            return Method::pull;
+        if (method == int(Method::push))
+            return Method::push;
+        if (method == int(Method::aa))
+            return Method::aa;
+
+        std::stringstream msg;
+        msg << "The following LBM method is not recognized: " << method << std::endl;
+        NEON_THROW_UNSUPPORTED_OPERATION(msg.str());
+    }
+};
+}  // namespace lbm
\ No newline at end of file
diff --git a/benchmarks/lbm/src/Metrics.h b/benchmarks/lbm/src/Metrics.h
new file mode 100644
index 00000000..10356a4a
--- /dev/null
+++ b/benchmarks/lbm/src/Metrics.h
@@ -0,0 +1,88 @@
+#pragma once
+#include <iomanip>
+#include "Config.h"
+#include "Neon/Neon.h"
+#include "Neon/set/Backend.h"
+#include "Repoert.h"
+
+namespace metrics {
+// Return a new clock for the current time, for benchmarking.
+namespace {
+
+auto restartClock(Neon::Backend& bk, bool sync = true)
+{
+    // When requested, call bk.syncAll() before the clock is sampled.
+    if (sync) { bk.syncAll(); }
+    // Result: (current time point, iteration counter starting at 0).
+    const auto startPoint = std::chrono::high_resolution_clock::now();
+    return std::make_pair(startPoint, 0);
+}
+
+// Forwards the backend to Report::recordBk.
+void recordBackend(Neon::Backend& bk, Report& report)
+{
+    report.recordBk(bk);
+}
+
+// Forwards the grid to Report::recordGrid.
+void recordGrid(Neon::domain::interface::GridBase& g, Report& report)
+{
+    report.recordGrid(g);
+}
+
+}  // namespace
+
+
+// Records the loop time elapsed since `start` (in microseconds) and the matching throughput in Mega Lattice
+// site Updates Per Second (MLUPS), counting N^3 sites for each of the `clock_iter` iterations.
+template <class TimePoint>
+void recordMetrics(Neon::Backend& bk,
+                   const Config&  config,
+                   Report&        report,
+                   TimePoint      start,
+                   int            clock_iter)
+{
+    bk.syncAll();
+    const size_t nElements = static_cast<size_t>(config.N) * config.N * config.N;  // widen first: N^3 can exceed the range of a narrower N type
+    auto         stop = std::chrono::high_resolution_clock::now();
+    auto         duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
+    double       mlups = static_cast<double>(nElements) * clock_iter / static_cast<double>(duration.count());  // sites per microsecond
+
+    report.recordLoopTime(duration.count(), "microseconds");
+    report.recordMLUPS(mlups);
+
+    std::cout << "Metrics: " << std::endl;
+    std::cout << "-- time: " << std::setprecision(4) << duration.count() << " microseconds" << std::endl;
+    std::cout << "-- MLUPS: " << std::setprecision(4) << mlups << " MLUPS" << std::endl;
+}
+
+template <class TimePoint>
+void recordGridInitMetrics(Neon::Backend& bk,
+                           Report&        report,
+                           TimePoint      start)
+{
+    // Time from `start` to now (sampled after bk.syncAll()), stored in the report and echoed on stdout.
+    bk.syncAll();
+    const auto elapsed = std::chrono::high_resolution_clock::now() - start;
+    const auto elapsedUs = std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
+
+    report.recordNeonGridInitTime(elapsedUs, "microseconds");
+    std::cout << "Metrics: " << std::endl;
+    std::cout << "- Grid Init: " << std::setprecision(4) << elapsedUs << " microseconds" << std::endl;
+}
+
+
+template <class TimePoint>
+void recordProblemSetupMetrics(Neon::Backend& bk,
+                               Report&        report,
+                               TimePoint      start)
+{
+    // Time from `start` to now (sampled after bk.syncAll()), stored in the report and echoed on stdout.
+    bk.syncAll();
+    const auto elapsed = std::chrono::high_resolution_clock::now() - start;
+    const auto elapsedUs = std::chrono::duration_cast<std::chrono::microseconds>(elapsed).count();
+
+    report.recordProblemSetupTime(elapsedUs, "microseconds");
+    std::cout << "Metrics: " << std::endl;
+    std::cout << "    Problem Setup: " << std::setprecision(4) << elapsedUs << " microseconds" << std::endl;
+}
+}  // namespace metrics
\ No newline at end of file
diff --git a/benchmarks/lbm/src/Precision.h b/benchmarks/lbm/src/Precision.h
new file mode 100644
index 00000000..a45ff69e
--- /dev/null
+++ b/benchmarks/lbm/src/Precision.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "Neon/Neon.h"
+#include "Neon/set/Backend.h"
+#include "Neon/set/memory/memSet.h"
+
+template <typename StorageFP,  // Pairs the two floating-point types used by the benchmark.
+          typename ComputeFP>
+struct Precision
+{
+    using Storage = StorageFP;  // element type of the fields (Lbm.h declares its fields with Precision::Storage)
+    using Compute = ComputeFP;  // type requested for the LBM parameters (getLbmParameters<Compute> in Lbm.h)
+};
diff --git a/benchmarks/lbm/src/Repoert.h b/benchmarks/lbm/src/Repoert.h
new file mode 100644
index 00000000..095bce9a
--- /dev/null
+++ b/benchmarks/lbm/src/Repoert.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include "Config.h"
+#include "Neon/domain/interface/GridBase.h"
+struct Report  // Collects the run configuration and the recorded timings of one benchmark run.
+{
+    Neon::Report mReport;  // constructed with the name "lbm-lid-driven-cavity-flow" in Report.cpp
+    std::string  mFname;  // copied from Config::reportFile by the constructor
+
+    std::vector<double> mMLUPS;  // one entry per recordMLUPS call
+    std::vector<double> mLoopTime;  // one entry per recordLoopTime call
+    std::vector<double> mNeonGridInitTime;
+    std::vector<double> mProblemSetupTime;
+
+    std::string mtimeUnit = "";  // set from the unit of the first recordLoopTime call
+
+    explicit Report(const Config& c);
+
+    auto recordMLUPS(double mlups)  // appends mlups to mMLUPS
+        -> void;
+
+    auto recordLoopTime(double             time,  // appends time to mLoopTime after checking the unit against mtimeUnit
+                        const std::string& unit)
+        -> void;
+
+    auto recordNeonGridInitTime(double             time,
+                                const std::string& unit)
+        -> void;
+
+    auto recordProblemSetupTime(double             time,
+                                const std::string& unit)
+        -> void;
+
+    auto save(std::stringstream & testCode)
+        -> void;
+    void recordBk(Neon::Backend& backend);
+    void recordGrid(Neon::domain::interface::GridBase& g);
+};
diff --git a/benchmarks/lbm/src/Report.cpp b/benchmarks/lbm/src/Report.cpp
new file mode 100644
index 00000000..e332de43
--- /dev/null
+++ b/benchmarks/lbm/src/Report.cpp
@@ -0,0 +1,113 @@
+#include <string>
+#include <vector>
+#include "Repoert.h"
+
+Report::Report(const Config& c)
+    : mReport("lbm-lid-driven-cavity-flow")
+{
+    mFname = c.reportFile;
+    // Copy the run configuration into the report, starting with the raw arguments.
+    mReport.addMember("argv", c.mArgv);
+    // Problem and run-control settings.
+    mReport.addMember("Re", c.Re);
+    mReport.addMember("ulb", c.ulb);
+    mReport.addMember("N", c.N);
+    mReport.addMember("benchmark", c.benchmark);
+    mReport.addMember("max_t", c.max_t);
+    mReport.addMember("repetitions", c.repetitions);
+    mReport.addMember("vti", c.vti);
+
+    // Benchmark iteration bounds.
+    mReport.addMember("benchIniIter", c.benchIniIter);
+    mReport.addMember("benchMaxIter", c.benchMaxIter);
+    // Device and grid selection.
+    mReport.addMember("deviceType", c.deviceType);
+    mReport.addMember("numDevices", c.devices.size());
+    mReport.addMember("devices", c.devices);
+    mReport.addMember("reportFile", c.reportFile);
+    mReport.addMember("gridType", c.gridType);
+
+
+    // Each CLI option object writes itself into the report.
+    c.occCli.addToReport(mReport);
+    c.transferModeCli.addToReport(mReport);
+    c.stencilSemanticCli.addToReport(mReport);
+    c.spaceCurveCli.addToReport(mReport);
+    c.collisionCli.addToReport(mReport);
+    // Precision, streaming-method and lattice selectors (kept as strings in Config).
+    mReport.addMember("computeTypeStr", c.computeTypeStr);
+    mReport.addMember("storeTypeStr", c.storeTypeStr);
+    mReport.addMember("streamingMethod", c.streamingMethod);
+    mReport.addMember("lattice", c.lattice);
+
+    // Values taken from c.mLbmParameters.
+    mReport.addMember("nu", c.mLbmParameters.nu);
+    mReport.addMember("omega", c.mLbmParameters.omega);
+    mReport.addMember("dx", c.mLbmParameters.dx);
+    mReport.addMember("dt", c.mLbmParameters.dt);
+}
+
+// Stores one MLUPS sample.
+// The accumulated samples are added to the report by save().
+auto Report::recordMLUPS(double mlups) -> void
+{
+    mMLUPS.emplace_back(mlups);
+}
+
+// Stores one loop-time sample; the first unit ever recorded becomes the report-wide unit.
+auto Report::recordLoopTime(double time, const std::string& unit) -> void
+{
+    if (mtimeUnit.empty()) {
+        mtimeUnit = unit;
+    }
+    // Compare the unit text itself: comparing only the lengths would accept
+    // different units that happen to have the same length (e.g. "ms" vs "us").
+    if (unit != mtimeUnit) {
+        NEON_THROW_UNSUPPORTED_OPERATION("Time unit inconsistency");
+    }
+    mLoopTime.push_back(time);
+}
+
+auto Report::recordNeonGridInitTime(double time, const std::string& unit) -> void
+{
+    if (mtimeUnit.empty()) {
+        mtimeUnit = unit;  // the first unit ever recorded becomes the report-wide unit
+    }
+    if (unit != mtimeUnit) {  // compare the text, not only its length ("ms" vs "us" must not match)
+        NEON_THROW_UNSUPPORTED_OPERATION("Time unit inconsistency");
+    }
+    mNeonGridInitTime.push_back(time);
+}
+
+auto Report::recordProblemSetupTime(double time, const std::string& unit) -> void
+{
+    if (mtimeUnit.empty()) {
+        mtimeUnit = unit;  // the first unit ever recorded becomes the report-wide unit
+    }
+    if (unit != mtimeUnit) {  // compare the text, not only its length ("ms" vs "us" must not match)
+        NEON_THROW_UNSUPPORTED_OPERATION("Time unit inconsistency");
+    }
+    mProblemSetupTime.push_back(time);
+}
+
+// Adds every collected sample vector to the report, then writes it to mFname + testCode.
+auto Report::save(std::stringstream& testCode) -> void
+{
+    // Builds "<label><unit>)" so each timing key carries the shared time unit.
+    const auto timeKey = [this](const char* label) { return std::string(label) + mtimeUnit + ")"; };
+    mReport.addMember("MLUPS", mMLUPS);
+    mReport.addMember(timeKey("Loop Time ("), mLoopTime);
+    mReport.addMember(timeKey("Problem Setup Time ("), mProblemSetupTime);
+    mReport.addMember(timeKey("Neon Grid Init Time ("), mNeonGridInitTime);
+    mReport.write(mFname + testCode.str(), true);
+}
+
+auto Report::recordBk(Neon::Backend& backend) -> void
+{
+    backend.toReport(mReport);  // forwards to Neon::Backend::toReport with this report as target
+}
+
+auto Report::recordGrid(Neon::domain::interface::GridBase& g) -> void
+{
+    g.toReport(mReport, true);  // forwards to GridBase::toReport with this report as target
+}
\ No newline at end of file
diff --git a/benchmarks/lbm/src/RunCavityTwoPop.cu b/benchmarks/lbm/src/RunCavityTwoPop.cu
new file mode 100644
index 00000000..55d6bac5
--- /dev/null
+++ b/benchmarks/lbm/src/RunCavityTwoPop.cu
@@ -0,0 +1,307 @@
+#include "Config.h"
+
+#include "D3Q19.h"
+#include "D3Q27.h"
+
+#include "Neon/domain/bGrid.h"
+#include "Neon/domain/dGrid.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
+#include "Neon/domain/eGrid.h"
+
+#include "./Lbm.h"
+#include "CellType.h"
+#include "Metrics.h"
+#include "Repoert.h"
+namespace CavityTwoPop {
+
+int backendWasReported = false;
+// #include <fenv.h>
+// #include "/usr/include/fenv.h"
+
+namespace details {
+template <lbm::Method method_,
+          Collision   CollisionType,
+          typename Lattice_,
+          typename Grid,
+          typename Storage_,
+          typename Compute_>
+auto run(Config&                             config,
+         Report&                             report,
+         [[maybe_unused]] std::stringstream& code) -> void
+{
+    using Storage = Storage_;
+    using Compute = Compute_;
+    using Precision = Precision<Storage, Compute>;
+    using Lattice = Lattice_;  // D3Q27<Precision>;
+    // Lid-driven cavity for one concrete template configuration: tag the test code, set up the LBM, then iterate.
+    code << "_" << config.deviceType << "_";
+    for (auto const& id : config.devices) {
+        code << id;
+    }
+    code << "_SS" << config.stencilSemanticCli.getStringOption();
+    code << "_SF" << config.spaceCurveCli.getStringOption();
+    code << "_TM" << config.transferModeCli.getStringOption();
+    code << "_Occ" << config.occCli.getStringOption();
+    code << "__";
+    // using PopulationField = typename Grid::template Field<Storage, Lattice::Q>;
+
+    // using PopField = typename Grid::template Field<typename Precision::Storage, Lattice::Q>;
+    // using CellTypeField = typename Grid::template Field<CellType, 1>;
+
+    // using Idx = typename PopField::Idx;
+    // using RhoField = typename Grid::template Field<typename Precision::Storage, 1>;
+    // using UField = typename Grid::template Field<typename Precision::Storage, 3>;
+    // Lid velocity direction; it is scaled by config.ulb in the moving-wall populations below.
+    Neon::double_3d ulid(1., 0., 0.);
+    // Neon Grid and Fields initialization
+    Neon::index_3d domainDim(config.N, config.N, config.N);
+    // The predicate handed to Lbm accepts every index (presumably the active-cell mask — confirm in Lbm.h).
+    Lbm<Grid, method_, CollisionType, Precision, Lattice> lbm(config,
+                                                              report,
+                                                              [](Neon::index_3d const&) { return true; });
+    auto                                                  ulb = config.ulb;
+    lbm.setBC([=] NEON_CUDA_HOST_DEVICE(Neon::index_3d const& globalIdx,
+                                        NEON_OUT Storage      p[Lattice::Q],
+                                        NEON_OUT CellType::Classification& cellClass) {
+        typename Lattice::Precision::Storage popVal = 0;
+
+        if (globalIdx.x == 0 || globalIdx.x == domainDim.x - 1 ||
+            globalIdx.y == 0 || globalIdx.y == domainDim.y - 1 ||
+            globalIdx.z == 0 || globalIdx.z == domainDim.z - 1) {
+            cellClass = CellType::bounceBack;  // default for a cell on any face of the domain
+
+            if (globalIdx.y == domainDim.y - 1) {  // the y-max face is overridden as the moving wall
+                cellClass = CellType::movingWall;
+            }
+
+            Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) {
+                using M = typename Lattice::template RegisterMapper<q>;
+                if (globalIdx.y == domainDim.y - 1) {
+                    popVal = -6. * Lattice::Registers::template getT<M::fwdRegQ>() * ulb *
+                             (Lattice::Registers::template getVelocityComponent<M::fwdRegQ, 0>() * ulid.v[0] +
+                              Lattice::Registers::template getVelocityComponent<M::fwdRegQ, 1>() * ulid.v[1] +
+                              Lattice::Registers::template getVelocityComponent<M::fwdRegQ, 2>() * ulid.v[2]);
+                } else {
+                    popVal = 0;
+                }
+                p[q] = popVal;
+            });
+        } else {
+            cellClass = CellType::bulk;
+            Neon::ConstexprFor<0, Lattice::Q, 1>([&](auto q) {
+                using M = typename Lattice::template RegisterMapper<q>;
+                p[q] = Lattice::Registers::template getT<M::fwdRegQ>();  // presumably the lattice weight of direction q — confirm in the lattice headers
+            });
+        }
+    });
+    lbm.iterate();
+}
+
+
+// Dispatches on config.streamingMethod ("push" | "pull" | "aa"): tags testCode and forwards to run<>().
+template <Collision CollisionType, typename Lattice, typename Grid, typename Storage, typename Compute>
+auto runFilterMethod(Config& config, Report& report, std::stringstream& testCode) -> void
+{
+    //feenableexcept(FE_ALL_EXCEPT & ~FE_INEXACT);  // Enable all floating point exceptions but FE_INEXACT
+    const bool singleDevice = (config.devices.size() == 1);
+    if (config.streamingMethod == "push") {
+        if (!singleDevice) {
+            NEON_THROW_UNSUPPORTED_OPERATION("We only support PUSH in a single device configuration for now.")
+        }
+        testCode << "_push";
+        return run<lbm::Method::push, CollisionType, Lattice, Grid, Storage, Compute>(config, report, testCode);
+    }
+    if (config.streamingMethod == "pull") {
+        testCode << "_pull";
+        return run<lbm::Method::pull, CollisionType, Lattice, Grid, Storage, Compute>(config, report, testCode);
+    }
+    if (config.streamingMethod == "aa") {
+        if (!singleDevice) {
+            NEON_THROW_UNSUPPORTED_OPERATION("We only support AA in a single device configuration for now.")
+        }
+        testCode << "_aa";
+        return run<lbm::Method::aa, CollisionType, Lattice, Grid, Storage, Compute>(config, report, testCode);
+    }
+    NEON_DEV_UNDER_CONSTRUCTION("");
+}
+
+// Dispatches on the collision model selected in config.collisionCli and tags testCode.
+// KBC is only accepted together with the D3Q27 lattice; any other lattice string goes to NEON_THROW.
+template <typename Lattice, typename Grid, typename Storage, typename Compute>
+auto runFilterCollision(Config& config, Report& report, std::stringstream& testCode) -> void
+{
+    if (config.collisionCli.getOption() == Collision::bgk) {
+        testCode << "_bgk";
+        return runFilterMethod<Collision::bgk, Lattice, Grid, Storage, Compute>(config, report, testCode);
+    }
+    if (config.collisionCli.getOption() == Collision::kbc) {
+        if (!(config.lattice == "d3q27" || config.lattice == "D3Q27")) {
+            Neon::NeonException e("runFilterCollision");
+            e << "LBM kbc collision model only supports d3q27 lattice";
+            NEON_THROW(e);
+        }
+        testCode << "_kbc";
+        using KbcLattice = D3Q27<Precision<Storage, Compute>>;
+        if constexpr (std::is_same_v<Lattice, KbcLattice>) {
+            return runFilterMethod<Collision::kbc, Lattice, Grid, Storage, Compute>(config, report, testCode);
+        }
+    }
+    NEON_DEV_UNDER_CONSTRUCTION("");
+}
+
+// Dispatches on config.lattice (accepts "d3q19"/"D3Q19" and "d3q27"/"D3Q27"),
+// tags testCode with the lattice name and forwards to runFilterCollision.
+template <typename Grid, typename Storage, typename Compute>
+auto runFilterLattice(Config& config, Report& report, std::stringstream& testCode) -> void
+{
+    using P = Precision<Storage, Compute>;
+    const auto& latticeName = config.lattice;
+
+    if (latticeName == "d3q19" || latticeName == "D3Q19") {
+        testCode << "_D3Q19";
+        return runFilterCollision<D3Q19<P>, Grid, Storage, Compute>(config, report, testCode);
+    }
+    if (latticeName == "d3q27" || latticeName == "D3Q27") {
+        testCode << "_D3Q27";
+        return runFilterCollision<D3Q27<P>, Grid, Storage, Compute>(config, report, testCode);
+    }
+    NEON_DEV_UNDER_CONSTRUCTION("Lattice type not supported. Available options: D3Q19 and D3Q27");
+}
+
+
+// Dispatches on config.computeTypeStr ("double" or "float"), tags testCode with
+// "_C<type>" (C = compute type) and forwards to runFilterLattice.
+template <typename Grid, typename Storage>
+auto runFilterComputeType(Config& config, Report& report, std::stringstream& testCode) -> void
+{
+    if (config.computeTypeStr == "double") {
+        testCode << "_Cdouble";
+        return runFilterLattice<Grid, Storage, double>(config, report, testCode);
+    }
+    if (config.computeTypeStr == "float") {
+        testCode << "_Cfloat";
+        return runFilterLattice<Grid, Storage, float>(config, report, testCode);
+    }
+    NEON_DEV_UNDER_CONSTRUCTION("");
+}
+
+// Dispatches on config.storeTypeStr ("double" or "float"), tags testCode with
+// "_S<type>" (S = storage type) and forwards to runFilterComputeType.
+// S always labels the storage type and C the compute type, so mixed-precision runs are named unambiguously.
+template <typename Grid>
+auto runFilterStoreType(Config& config, Report& report, std::stringstream& testCode) -> void
+{
+    if (config.storeTypeStr == "double") {
+        testCode << "_Sdouble";
+        return runFilterComputeType<Grid, double>(config, report, testCode);
+    }
+    if (config.storeTypeStr == "float") {
+        testCode << "_Sfloat";
+        return runFilterComputeType<Grid, float>(config, report, testCode);
+    }
+    NEON_DEV_UNDER_CONSTRUCTION("");
+}
+}  // namespace details
+
+#ifdef NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS
+constexpr bool skipTest = false;
+#else
+constexpr bool skipTest = false;  // NOTE(review): same value as the #ifdef branch, so nothing is ever skipped — confirm intent
+#endif
+
+auto run(Config&            config,
+         Report&            report,
+         std::stringstream& testCode) -> void
+{
+    testCode << "___" << config.N << "_";  // the test code starts with the domain size N
+    testCode << "_numDevs_" << config.devices.size();
+    // Dispatch on config.gridType; only "dGrid" and "bGrid_4_4_4" are enabled, anything else reaches the final throw.
+    if (config.gridType == "dGrid") {
+        testCode << "_dGrid";
+        return details::runFilterStoreType<Neon::dGrid>(config, report, testCode);
+    }
+    //    if (config.gridType == "eGrid") {
+    //        if constexpr (!skipTest) {
+    //            return details::runFilterStoreType<Neon::eGrid>(config, report);
+    //        } else {
+    //            NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+    //        }
+    //    }
+    //    if (config.gridType == "bGrid" || config.gridType == "bGrid_8_8_8") {
+    //        return details::runFilterStoreType<Neon::bGrid>(config, report);
+    //    }
+    if (config.gridType == "bGrid_4_4_4") {
+        if constexpr (!skipTest) {
+            testCode << "_bGrid_4_4_4";
+            using Sblock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>;
+            using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+            return details::runFilterStoreType<Grid>(config, report, testCode);
+        } else {
+            NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+        }
+    }
+    //    if (config.gridType == "bGrid_8_8_8") {
+    //        if constexpr (!skipTest) {
+    //            using Sblock = Neon::domain::details::bGrid::StaticBlock<8, 8, 8>;
+    //            using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+    //            return details::runFilterStoreType<Grid>(config, report, testCode);
+    //        } else {
+    //            NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+    //        }
+    //    }
+    //    if (config.gridType == "bGrid_2_2_2") {
+    //        if constexpr (!skipTest) {
+    //            using Sblock = Neon::domain::details::bGrid::StaticBlock<2, 2, 2>;
+    //            using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+    //            return details::runFilterStoreType<Grid>(config, report);
+    //        } else {
+    //            NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+    //        }
+    //    }
+    //    if (config.gridType == "bGrid_32_8_4") {
+    //        if constexpr (!skipTest) {
+    //            using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>;
+    //            using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+    //            return details::runFilterStoreType<Grid>(config, report);
+    //        } else {
+    //            NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+    //        }
+    //    }
+    //    if (config.gridType == "bGrid_32_8_4") {
+    //        if constexpr (!skipTest) {
+    //            using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 4>;
+    //            using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+    //            return details::runFilterStoreType<Grid>(config, report);
+    //        } else {
+    //            NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+    //        }
+    //    }
+    //    if (config.gridType == "bGrid_32_2_8") {
+    //        if constexpr (!skipTest) {
+    //            using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 2, 8>;
+    //            using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+    //            return details::runFilterStoreType<Grid>(config, report);
+    //        } else {
+    //            NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+    //        }
+    //    }
+    //    if (config.gridType == "bGrid_32_8_2") {
+    //        if constexpr (!skipTest) {
+    //            using Sblock = Neon::domain::details::bGrid::StaticBlock<32, 8, 2>;
+    //            using Grid = Neon::domain::details::bGrid::bGrid<Sblock>;
+    //            return details::runFilterStoreType<Grid>(config, report);
+    //        } else {
+    //            NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+    //        }
+    //    }
+    //    if (config.gridType == "dGridSoA") {
+    //        if constexpr (!skipTest) {
+    //            return details::runFilterStoreType<Neon::domain::details::dGridSoA::dGridSoA>(config, report);
+    //        } else {
+    //            NEON_THROW_UNSUPPORTED_OPERATION("This option was disables. PLease define NEON_BENCHMARK_DESIGN_OF_EXPERIMENTS to enable it.")
+    //        }
+    //    }
+    NEON_THROW_UNSUPPORTED_OPERATION("Unknown grid type: " + config.gridType);
+}
+}  // namespace CavityTwoPop
diff --git a/benchmarks/lbm/src/RunCavityTwoPop.h b/benchmarks/lbm/src/RunCavityTwoPop.h
new file mode 100644
index 00000000..0386d28e
--- /dev/null
+++ b/benchmarks/lbm/src/RunCavityTwoPop.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <sstream>
+#include "Config.h"
+#include "D3Q19.h"
+#include "Neon/domain/dGrid.h"
+#include "Metrics.h"
+#include "Repoert.h"
+
+namespace CavityTwoPop {
+/// Runs the cavity benchmark for `config`; the stream receives the tags describing the configuration.
+auto run(Config& config, Report& report, std::stringstream&) -> void;
+}  // namespace CavityTwoPop
\ No newline at end of file
diff --git a/benchmarks/lbm/src/app.cpp b/benchmarks/lbm/src/app.cpp
new file mode 100644
index 00000000..8cbfc1cf
--- /dev/null
+++ b/benchmarks/lbm/src/app.cpp
@@ -0,0 +1,48 @@
+
+#include "Config.h"
+#include "Repoert.h"
+#include "RunCavityTwoPop.h"
+
+#include "Neon/Neon.h"
+#include "Neon/core/tools/clipp.h"
+#include "Neon/domain/dGrid.h"
+
+int main(int argc, char** argv)
+{
+    Config config;
+    Neon::init();
+
+    config.Re = 100.;         // Reynolds number
+    config.ulb = 0.04;        // Velocity in lattice units
+    config.N = 160;           // Number of nodes in x-direction
+    config.benchmark = true;  // Run in benchmark mode ?
+    config.max_t = 10.0;      // Non-benchmark mode: Total time in dim.less units
+                              //    config.out_freq = 20000000;     // Non-benchmark mode: Frequency in LU for output of terminal message and profiles (use 0 for no messages)
+                              //    config.data_freq = 20000000;    // Non-benchmark mode: Frequency in LU of full data dump (use 0 for no data dump)
+                              //    config.bench_ini_iter = 0;      // Benchmark mode: Number of warmup iterations
+                              //    config.bench_max_iter = 10000;  // Benchmark mode: Total number of iterations
+                              //    config.perKeeperFile = "perf";
+                              //    config.devices = {0};
+                              //    config.gridType = "dGrid";
+                              //    config.occ = Neon::skeleton::Options_t::Occ::none
+
+    // The defaults above are set before the command line is parsed; exit with -1 on a non-zero parse status.
+    if (config.parseArgs(argc, argv) != 0) {
+        return -1;
+    }
+
+    std::cout << "--------------- Parameters ---------------\n";
+    std::cout << config.toString();
+    std::cout << "-------------------------------------------\n";
+
+    Report            report(config);
+    std::stringstream testCode;
+    for (int rep = 0; rep < config.repetitions; ++rep) {
+        testCode = std::stringstream();  // every repetition rebuilds the test code from scratch
+        CavityTwoPop::run(config, report, testCode);
+    }
+    // The report is written once, after all repetitions, using the test code of the last run.
+    report.save(testCode);
+
+    return 0;
+}
diff --git a/libNeonCore/include/Neon/core/tools/metaprogramming.h b/libNeonCore/include/Neon/core/tools/metaprogramming.h
index 53678ed6..ea004a43 100644
--- a/libNeonCore/include/Neon/core/tools/metaprogramming.h
+++ b/libNeonCore/include/Neon/core/tools/metaprogramming.h
@@ -4,3 +4,4 @@
 #include "Neon/core/tools/metaprogramming/debugHelp.h"
 #include "Neon/core/tools/metaprogramming/extractTupleVecType.h"
 #include "Neon/core/tools/metaprogramming/tupleVecTable.h"
+#include "Neon/core/tools/metaprogramming/ConstexprFor.h"
\ No newline at end of file
diff --git a/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h b/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h
new file mode 100644
index 00000000..a6d8767e
--- /dev/null
+++ b/libNeonCore/include/Neon/core/tools/metaprogramming/ConstexprFor.h
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <type_traits>
+
+namespace Neon {
+
+/**
+ * Implementation of a constexpr for loop.
+ * Reference: https://artificial-mind.net/blog/2020/10/31/constexpr-for
+ *
+ * The loop is implemented as a recursive template function.
+ * It is equivalent to the following code, except that each index is passed to f
+ * as a std::integral_constant, so f can use it as a template argument:
+ *
+ * for(auto i = Start; i < End; i += Inc) {
+ *    f(i);
+ * }
+ */
+template <auto Start /**< First index for the loop */,
+          auto End /**< Last index for the loop is (End-1) */,
+          auto Inc /**< Loop increment */,
+          class F>
+constexpr void ConstexprFor(F&& f)
+{
+    if constexpr (Start < End) {
+        f(std::integral_constant<decltype(Start), Start>());
+        ConstexprFor<Start + Inc, End, Inc>(f);
+    }
+}
+
+}  // namespace Neon
\ No newline at end of file
diff --git a/libNeonCore/include/Neon/core/types/Macros.h b/libNeonCore/include/Neon/core/types/Macros.h
index 5e909d3a..d9f47914 100644
--- a/libNeonCore/include/Neon/core/types/Macros.h
+++ b/libNeonCore/include/Neon/core/types/Macros.h
@@ -206,8 +206,12 @@
 #define NEON_RESTRICT restrict
 #endif
 
-#ifdef NEON_COMPILER_CUDA
+#if defined(NEON_COMPILER_CUDA)
+#if!defined(_WIN32)
 #define NEON_RESTRICT __restrict__
+#else
+#define NEON_RESTRICT
+#endif
 #endif
 
 #ifdef NEON_COMPILER_CLANG
diff --git a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h
index acdae410..e41c8f26 100644
--- a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h
+++ b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.tdecl.h
@@ -56,6 +56,10 @@ class Vec_3d<IntegerType_ta, true, false>
         num_axis = 3
     };
 
+    static constexpr int directionX = axis_e::x_axis;
+    static constexpr int directionY = axis_e::y_axis;
+    static constexpr int directionZ = axis_e::z_axis;
+
     union
     {
         Integer v[axis_e::num_axis]{0, 0, 0};
@@ -120,10 +124,15 @@ class Vec_3d<IntegerType_ta, true, false>
 
     NEON_CUDA_HOST_DEVICE inline void constexpr set(Integer p[self_t::num_axis]);
 
-    NEON_CUDA_HOST_DEVICE inline void  constexpr set(const self_t& other);
+    NEON_CUDA_HOST_DEVICE inline void constexpr set(const self_t& other);
 
     NEON_CUDA_HOST_DEVICE inline void constexpr set(const Integer& xyz);
 
+    template <int componentId>
+    NEON_CUDA_HOST_DEVICE inline constexpr Integer getComponent() const
+    {
+        return v[componentId];
+    }
 
     //---- [REDUCE SECTION] --------------------------------------------------------------------------------------------
     //---- [REDUCE SECTION] --------------------------------------------------------------------------------------------
@@ -324,10 +333,10 @@ class Vec_3d<IntegerType_ta, true, false>
      *   @return Resulting point is C =(A.x / B.x, A.y / B.y, A.z / B.z)
      * */
     template <typename K_tt>
-    NEON_CUDA_HOST_DEVICE inline self_t operator*(const Vec_3d<K_tt>& B) const;
+    NEON_CUDA_HOST_DEVICE inline constexpr self_t operator*(const Vec_3d<K_tt>& B) const;
 
     template <typename K_tt>
-    NEON_CUDA_HOST_DEVICE inline self_t operator*(const K_tt& alpha) const;
+    NEON_CUDA_HOST_DEVICE inline constexpr self_t operator*(const K_tt& alpha) const;
     /**
      *   Compute the division between two points A and B, component by component (A.x/B.x, A.y/B.y, A.z/B.z).
      *   Be careful!!! if the type is int, the division will be an integer division!!!
@@ -364,15 +373,15 @@ class Vec_3d<IntegerType_ta, true, false>
      *   @param[in] B: second point for the operation.
      *   @return True if A.x <= B.x && A.y <= B.y && A.z <= B.z
      */
-    NEON_CUDA_HOST_DEVICE inline bool operator==(const self_t& B) const;
+    NEON_CUDA_HOST_DEVICE inline constexpr bool operator==(const self_t& B) const;
 
-    NEON_CUDA_HOST_DEVICE inline bool operator==(const Integer other[self_t::num_axis]) const;
+    NEON_CUDA_HOST_DEVICE inline constexpr bool operator==(const Integer other[self_t::num_axis]) const;
 
-    NEON_CUDA_HOST_DEVICE inline bool operator==(const Integer otherScalar) const;
+    NEON_CUDA_HOST_DEVICE inline constexpr bool operator==(const Integer otherScalar) const;
 
-    NEON_CUDA_HOST_DEVICE inline bool operator!=(const self_t& B) const;
+    NEON_CUDA_HOST_DEVICE inline constexpr bool operator!=(const self_t& B) const;
 
-    NEON_CUDA_HOST_DEVICE inline bool operator!=(const Integer other[self_t::num_axis]) const;
+    NEON_CUDA_HOST_DEVICE inline constexpr bool operator!=(const Integer other[self_t::num_axis]) const;
 
     NEON_CUDA_HOST_DEVICE inline self_t operator-() const;
 
diff --git a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.timp.h b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.timp.h
index fe7222eb..c5ceea55 100644
--- a/libNeonCore/include/Neon/core/types/vec/vec3d_integer.timp.h
+++ b/libNeonCore/include/Neon/core/types/vec/vec3d_integer.timp.h
@@ -458,7 +458,7 @@ NEON_CUDA_HOST_DEVICE inline Vec_3d<IntegerType_ta, true, false> Vec_3d<IntegerT
 
 template <typename IntegerType_ta>
 template <typename K_tt>
-NEON_CUDA_HOST_DEVICE inline Vec_3d<IntegerType_ta, true, false> Vec_3d<IntegerType_ta, true, false>::operator*(const Vec_3d<K_tt>& B) const
+NEON_CUDA_HOST_DEVICE inline constexpr Vec_3d<IntegerType_ta, true, false> Vec_3d<IntegerType_ta, true, false>::operator*(const Vec_3d<K_tt>& B) const
 {
     const Vec_3d<Integer>& A = *this;
     // Vec_3d<Integer>        C((Integer)(A.x * B.x), (Integer)(A.y * B.y), (Integer)(A.z * B.z));
@@ -468,7 +468,7 @@ NEON_CUDA_HOST_DEVICE inline Vec_3d<IntegerType_ta, true, false> Vec_3d<IntegerT
 
 template <typename IntegerType_ta>
 template <typename K_tt>
-NEON_CUDA_HOST_DEVICE inline Vec_3d<IntegerType_ta, true, false> Vec_3d<IntegerType_ta, true, false>::operator*(const K_tt& alpha) const
+NEON_CUDA_HOST_DEVICE inline constexpr Vec_3d<IntegerType_ta, true, false> Vec_3d<IntegerType_ta, true, false>::operator*(const K_tt& alpha) const
 {
     const Vec_3d<Integer>& A = *this;
     const auto             alpha_c = static_cast<Integer>(alpha);
@@ -526,35 +526,35 @@ NEON_CUDA_HOST_DEVICE inline bool Vec_3d<IntegerType_ta, true, false>::operator<
 
 
 template <typename IntegerType_ta>
-NEON_CUDA_HOST_DEVICE inline bool Vec_3d<IntegerType_ta, true, false>::operator==(const Vec_3d<IntegerType_ta, true, false>& B) const
+NEON_CUDA_HOST_DEVICE inline constexpr bool Vec_3d<IntegerType_ta, true, false>::operator==(const Vec_3d<IntegerType_ta, true, false>& B) const
 {
     const Vec_3d<Integer>& A = *this;
     return A.x == B.x && A.y == B.y && A.z == B.z;
 }
 
 template <typename IntegerType_ta>
-NEON_CUDA_HOST_DEVICE inline bool Vec_3d<IntegerType_ta, true, false>::operator==(const IntegerType_ta other[Vec_3d<IntegerType_ta, true, false>::num_axis]) const
+NEON_CUDA_HOST_DEVICE inline constexpr bool Vec_3d<IntegerType_ta, true, false>::operator==(const IntegerType_ta other[Vec_3d<IntegerType_ta, true, false>::num_axis]) const
 {
     const Vec_3d<Integer>& A = *this;
     return A.x == other[0] && A.y == other[1] && A.z == other[2];
 }
 
 template <typename IntegerType_ta>
-NEON_CUDA_HOST_DEVICE inline bool Vec_3d<IntegerType_ta, true, false>::operator==(const IntegerType_ta otherScalar) const
+NEON_CUDA_HOST_DEVICE inline constexpr bool Vec_3d<IntegerType_ta, true, false>::operator==(const IntegerType_ta otherScalar) const
 {
     const Vec_3d<Integer>& A = *this;
     return A.x == otherScalar && A.y == otherScalar && A.z == otherScalar;
 }
 
 template <typename IntegerType_ta>
-NEON_CUDA_HOST_DEVICE inline bool Vec_3d<IntegerType_ta, true, false>::operator!=(const Vec_3d<IntegerType_ta, true, false>& B) const
+NEON_CUDA_HOST_DEVICE inline constexpr bool Vec_3d<IntegerType_ta, true, false>::operator!=(const Vec_3d<IntegerType_ta, true, false>& B) const
 {
     const Vec_3d<Integer>& A = *this;
     return !(A == B);
 }
 
 template <typename IntegerType_ta>
-NEON_CUDA_HOST_DEVICE inline bool Vec_3d<IntegerType_ta, true, false>::operator!=(const IntegerType_ta other[Vec_3d<IntegerType_ta, true, false>::num_axis]) const
+NEON_CUDA_HOST_DEVICE inline constexpr bool Vec_3d<IntegerType_ta, true, false>::operator!=(const IntegerType_ta other[Vec_3d<IntegerType_ta, true, false>::num_axis]) const
 {
     const Vec_3d<Integer>& A = *this;
     return A.x != other[0] || A.y != other[1] || A.z != other[2];
diff --git a/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h b/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h
index 788291a6..940c6d2c 100644
--- a/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h
+++ b/libNeonCore/include/Neon/core/types/vec/vec4d_integer.tdecl.h
@@ -58,6 +58,7 @@ template <typename IntegerType_ta>
 class Vec_4d<IntegerType_ta, true, false>
 {
    public:
+    using Integer = IntegerType_ta;
     using element_t = IntegerType_ta;
     using self_t = Vec_4d<element_t, true, false>;
 
diff --git a/libNeonDomain/include/Neon/domain/Grids.h b/libNeonDomain/include/Neon/domain/Grids.h
index aad0cda5..7c899b98 100644
--- a/libNeonDomain/include/Neon/domain/Grids.h
+++ b/libNeonDomain/include/Neon/domain/Grids.h
@@ -3,3 +3,4 @@
 #include "Neon/domain/aGrid.h"
 #include "Neon/domain/eGrid.h"
 #include "Neon/domain/bGrid.h"
+#include "Neon/domain/dGridSoA.h"
diff --git a/libNeonDomain/include/Neon/domain/dGridSoA.h b/libNeonDomain/include/Neon/domain/dGridSoA.h
new file mode 100644
index 00000000..bdd63f25
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/dGridSoA.h
@@ -0,0 +1,7 @@
+#pragma once
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
+
+
+namespace Neon {
+using dGridSoA = Neon::domain::details::dGridSoA::dGridSoA;
+}  // namespace Neon
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
index 1e62f883..1ae2bf1d 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bField_imp.h
@@ -63,7 +63,8 @@ bField<T, C, SBlock>::bField(const std::string&  fieldUserName,
                                                      blockConnectivity.mem(),
                                                      bitmask.mem(),
                                                      dataBlockOrigins.mem(),
-                                                     mData->grid->helpGetStencilIdTo3dOffset().rawMem(execution, setIdx));
+                                                     mData->grid->helpGetStencilIdTo3dOffset().rawMem(execution, setIdx),
+                                                     mData->grid->getDimension());
             });
     }
 
@@ -311,8 +312,9 @@ auto bField<T, C, SBlock>::initHaloUpdateTable() -> void
                 T* srcMem = blockViewPartitions[Data::EndPoints::src]->mem();
                 T* dstMem = blockViewPartitions[Data::EndPoints::dst]->mem();
 
-                Neon::size_4d srcBoundaryBuff(boundaryZBeginIdx[Data::EndPoints::src][static_cast<int>(byDirection)], 0, 0, 0);
                 Neon::size_4d dstGhostBuff(ghostZBeginIdx[Data::EndPoints::dst][static_cast<int>(ByDirectionUtils::invert(byDirection))], 0, 0, 0);
+                Neon::size_4d srcBoundaryBuff(boundaryZBeginIdx[Data::EndPoints::src][static_cast<int>(byDirection)], 0, 0, 0);
+
                 size_t        transferDataBlockCount = mData->grid->mData->partitioner1D.getSpanLayout().getBoundsBoundary(setIdxVec[Data::EndPoints::src], byDirection).count;
 
                 //                std::cout << "To  " << dstGhostBuff << " prt " << blockViewPartitions[Data::EndPoints::dst]->prtID() << " From  " << srcBoundaryBuff << " prt " << blockViewPartitions[Data::EndPoints::src]->prtID() <<  std::endl;
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
index 59131cd7..e1c7e55d 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid.h
@@ -60,7 +60,8 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
           const ActiveCellLambda       activeCellLambda,
           const Neon::domain::Stencil& stencil,
           const double_3d&             spacingData = double_3d(1, 1, 1),
-          const double_3d&             origin = double_3d(0, 0, 0));
+          const double_3d&             origin = double_3d(0, 0, 0),
+          Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep);
 
 
     /**
@@ -72,10 +73,10 @@ class bGrid : public Neon::domain::interface::GridBaseTemplate<bGrid<SBlock>,
           const ActiveCellLambda       activeCellLambda /**< Function that identify the user domain inside the boxed Cartesian discretization  */,
           const Neon::domain::Stencil& stencil /**< union of tall the stencil that will be used in the computation */,
           const int                    multiResDiscreteIdxSpacing /**< Parameter for the multi-resolution. Index i and index (i+1) may be remapped as i*voxelSpacing  and (i+1)* voxelSpacing.
-                                                                   * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1 */
-          ,
+                                                                   * For a uniform bGrid, i.e outside the context of multi-resolution this parameter is always 1 */,
           const double_3d& spacingData /** Physical spacing between two consecutive data points in the Cartesian domain */,
-          const double_3d& origin /** Physical location in space of the origin of the Cartesian discretization */);
+          const double_3d& origin /** Physical location in space of the origin of the Cartesian discretization */,
+          Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep);
 
     /**
      * Returns some properties for a given cartesian in the Cartesian domain.
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
index 607237c6..a375c64d 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bGrid_imp.h
@@ -1,28 +1,31 @@
 #include "Neon/domain/details/bGrid/bGrid.h"
+#include "Neon/domain/tools/SpaceCurves.h"
 
 namespace Neon::domain::details::bGrid {
 
 template <typename SBlock>
 template <typename ActiveCellLambda>
-bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
-                     const Neon::int32_3d&        domainSize,
-                     const ActiveCellLambda       activeCellLambda,
-                     const Neon::domain::Stencil& stencil,
-                     const double_3d&             spacingData,
-                     const double_3d&             origin)
-    : bGrid(backend, domainSize, activeCellLambda, stencil, 1, spacingData, origin)
+bGrid<SBlock>::bGrid(const Neon::Backend&                         backend,
+                     const Neon::int32_3d&                        domainSize,
+                     const ActiveCellLambda                       activeCellLambda,
+                     const Neon::domain::Stencil&                 stencil,
+                     const double_3d&                             spacingData,
+                     const double_3d&                             origin,
+                     Neon::domain::tool::spaceCurves::EncoderType encoderType)
+    : bGrid(backend, domainSize, activeCellLambda, stencil, 1, spacingData, origin, encoderType)
 {
 }
 
 template <typename SBlock>
 template <typename ActiveCellLambda>
-bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
-                     const Neon::int32_3d&        domainSize,
-                     const ActiveCellLambda       activeCellLambda,
-                     const Neon::domain::Stencil& stencil,
-                     const int                    multiResDiscreteIdxSpacing,
-                     const double_3d&             spacingData,
-                     const double_3d&             origin)
+bGrid<SBlock>::bGrid(const Neon::Backend&                         backend,
+                     const Neon::int32_3d&                        domainSize,
+                     const ActiveCellLambda                       activeCellLambda,
+                     const Neon::domain::Stencil&                 stencil,
+                     const int                                    multiResDiscreteIdxSpacing,
+                     const double_3d&                             spacingData,
+                     const double_3d&                             origin,
+                     Neon::domain::tool::spaceCurves::EncoderType encoderType)
 {
 
 
@@ -35,18 +38,25 @@ bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
                                           SBlock::memBlockSizeY,
                                           SBlock::memBlockSizeZ);
 
+    std::stringstream gridName;
+    gridName << "bGrid_" << SBlock::memBlockSizeX << "_"
+             << SBlock::memBlockSizeY << "_"
+             << SBlock::memBlockSizeZ;
     {
         auto nElementsPerPartition = backend.devSet().template newDataSet<size_t>(0);
         // We do an initialization with nElementsPerPartition to zero,
         // then we reset to the computed number.
-        bGrid::GridBase::init("bGrid",
+
+        bGrid::GridBase::init(gridName.str(),
                               backend,
                               domainSize,
                               stencil,
                               nElementsPerPartition,
                               defaultKernelBlockSize,
                               multiResDiscreteIdxSpacing,
-                              origin);
+                              origin,
+                              encoderType,
+                              defaultKernelBlockSize);
     }
 
     {  // Initialization of the partitioner
@@ -58,6 +68,7 @@ bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
             SBlock::memBlockSize3D.template newType<int32_t>(),
             domainSize,
             Neon::domain::Stencil::s27_t(false),
+            encoderType,
             multiResDiscreteIdxSpacing);
 
         mData->mDataBlockOriginField = mData->partitioner1D.getGlobalMapping();
@@ -107,8 +118,8 @@ bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
                             for (int j = 0; j < SBlock::memBlockSize3D.template newType<int32_t>().y; j++) {
                                 for (int i = 0; i < SBlock::memBlockSize3D.template newType<int32_t>().x; i++) {
                                     auto       globalPosition = blockOrigin + Neon::int32_3d(i * this->mData->mMultiResDiscreteIdxSpacing,
-                                                                                       j * this->mData->mMultiResDiscreteIdxSpacing,
-                                                                                       k * this->mData->mMultiResDiscreteIdxSpacing);
+                                                                                             j * this->mData->mMultiResDiscreteIdxSpacing,
+                                                                                             k * this->mData->mMultiResDiscreteIdxSpacing);
                                     bool const isInDomain = globalPosition < domainSize * this->mData->mMultiResDiscreteIdxSpacing;
                                     bool const isActive = activeCellLambda(globalPosition);
                                     if (isActive && isInDomain) {
@@ -155,8 +166,8 @@ bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
                                                                   BlockIdx                                  blockNghIdx = Span::getInvalidBlockId();
                                                                   typename decltype(blockConnectivity)::Idx nghIdx;
                                                                   Neon::int8_3d                             stencilPoint(i - int8_t(1),
-                                                                                             j - int8_t(1),
-                                                                                             k - int8_t(1));
+                                                                                                                         j - int8_t(1),
+                                                                                                                         k - int8_t(1));
                                                                   bool                                      isValid = blockConnectivity.getNghIndex(idx, stencilPoint, nghIdx);
                                                                   if (isValid) {
                                                                       blockNghIdx = static_cast<BlockIdx>(nghIdx.helpGet());
@@ -220,14 +231,16 @@ bGrid<SBlock>::bGrid(const Neon::Backend&         backend,
         mData->stencilIdTo3dOffset.updateDeviceData(backend, Neon::Backend::mainStreamIdx);
     }
     // Init the base grid
-    bGrid::GridBase::init("bGrid",
+    bGrid::GridBase::init(gridName.str(),
                           backend,
                           domainSize,
                           Neon::domain::Stencil(),
                           mData->mNumActiveVoxel,
                           SBlock::memBlockSize3D.template newType<int32_t>(),
                           spacingData,
-                          origin);
+                          origin,
+                          encoderType,
+                          defaultKernelBlockSize);
     {  // setting launchParameters
         mData->launchParametersTable.forEachSeq([&](Neon::DataView               dw,
                                                     Neon::set::LaunchParameters& bLaunchParameters) {
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
index b12fa671..fc596898 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition.h
@@ -36,7 +36,8 @@ class bPartition
                         typename Idx::DataBlockIdx*                   mBlockConnectivity,
                         typename SBlock::BitMask const* NEON_RESTRICT mMask,
                         Neon::int32_3d*                               mOrigin,
-                        NghIdx*                                       mStencilNghIndex);
+                        NghIdx*                                       mStencilNghIndex,
+                        Neon::int32_3d                                mDomainSize);
 
     /**
      * Retrieve the cardinality of the field.
@@ -98,6 +99,27 @@ class bPartition
                T          defaultValue)
         const -> NghData;
 
+    /**
+     * Callback flavour of getNghData for the compile-time offset (xOff, yOff, zOff): funIfValid receives the
+     * neighbour value of cardinality card when the neighbour is valid; otherwise funIfNOTValid runs, if one was given.
+     */
+    template <int xOff, int yOff, int zOff, typename LambdaVALID, typename LambdaNOTValid = void*>
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(const Idx&     gidx,
+               int            card,
+               LambdaVALID    funIfValid,
+               LambdaNOTValid funIfNOTValid = nullptr)
+        const -> std::enable_if_t<std::is_invocable_v<LambdaVALID, T> && (std::is_invocable_v<LambdaNOTValid, T> || std::is_same_v<LambdaNOTValid, void*>), void>;
+
+    /** Writes value into cardinality card of the neighbour of gidx at the compile-time offset (xOff, yOff, zOff).
+     *  Returns false, without writing anything, when that neighbour is not valid; true otherwise. */
+    template <int xOff, int yOff, int zOff>
+    NEON_CUDA_HOST_DEVICE inline auto
+    writeNghData(const Idx& gidx,
+                 int        card,
+                 T          value)
+        -> bool;
+
     /**
      * Gets the global coordinates of the cartesian point.
      */
@@ -109,6 +131,11 @@ class bPartition
     isActive(const Idx&                      cell,
              const typename SBlock::BitMask* mask = nullptr) const -> bool;
 
+    /** Returns the domain size that was handed to this partition's constructor. */
+    NEON_CUDA_HOST_DEVICE inline auto
+    getDomainSize()
+        const -> Neon::index_3d;
+
     /**
      * Gets the Idx for in the block view space.
      */
@@ -116,7 +143,7 @@ class bPartition
     getBlockViewIdx(const Idx& cell)
         const -> BlockViewGridIdx;
 
-   
+
     NEON_CUDA_HOST_DEVICE inline auto
     helpGetPitch(const Idx& cell, int card)
         const -> uint32_t;
@@ -147,6 +174,7 @@ class bPartition
     helpGetNghIdx(const Idx& idx, const typename Idx::DataBlockIdx* blockConnectivity)
         const -> Idx;
 
+
     int                                             mCardinality;
     T*                                              mMem;
     NghIdx const* NEON_RESTRICT                     mStencilNghIndex;
@@ -154,6 +182,8 @@ class bPartition
     typename SBlock::BitMask const* NEON_RESTRICT   mMask;
     Neon::int32_3d const* NEON_RESTRICT             mOrigin;
     int                                             mSetIdx;
+    int                                             mMultiResDiscreteIdxSpacing = 1;
+    Neon::int32_3d                                  mDomainSize;
 };
 
 }  // namespace Neon::domain::details::bGrid
diff --git a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
index ec456913..75d2006b 100644
--- a/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/bGrid/bPartition_imp.h
@@ -25,14 +25,16 @@ bPartition<T, C, SBlock>::
                typename Idx::DataBlockIdx*                   blockConnectivity,
                typename SBlock::BitMask const* NEON_RESTRICT mask,
                Neon::int32_3d*                               origin,
-               NghIdx*                                       stencilNghIndex)
+               NghIdx*                                       stencilNghIndex,
+               Neon::int32_3d                                mDomainSize)
     : mCardinality(cardinality),
       mMem(mem),
       mStencilNghIndex(stencilNghIndex),
       mBlockConnectivity(blockConnectivity),
       mMask(mask),
       mOrigin(origin),
-      mSetIdx(setIdx)
+      mSetIdx(setIdx),
+      mDomainSize(mDomainSize)
 {
 }
 
@@ -45,9 +47,20 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
     location.x += gidx.mInDataBlockIdx.x;
     location.y += gidx.mInDataBlockIdx.y;
     location.z += gidx.mInDataBlockIdx.z;
+    if constexpr (SBlock::isMultiResMode) {
+        return location * mMultiResDiscreteIdxSpacing;
+    }
     return location;
 }
 
+/** Returns the domain size stored in mDomainSize when the partition was constructed. */
+template <typename T, int C, typename SBlock>
+NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
+    getDomainSize() const -> Neon::index_3d
+{
+    return mDomainSize;
+}
+
 template <typename T, int C, typename SBlock>
 NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
     getBlockViewIdx(const Idx& gidx)
@@ -68,7 +81,7 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, SBlock>::
 
 template <typename T, int C, typename SBlock>
 inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, SBlock>::
-                                  operator()(const Idx& cell,
+operator()(const Idx& cell,
            int        card) -> T&
 {
     return mMem[helpGetPitch(cell, card)];
@@ -76,7 +89,7 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, SBlock>::
 
 template <typename T, int C, typename SBlock>
 inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, SBlock>::
-                                  operator()(const Idx& cell,
+operator()(const Idx& cell,
            int        card) const -> const T&
 {
     return mMem[helpGetPitch(cell, card)];
@@ -97,7 +110,7 @@ inline NEON_CUDA_HOST_DEVICE auto bPartition<T, C, SBlock>::
     helpGetValidIdxPitchExplicit(const Idx& idx, int card)
         const -> uint32_t
 {
-    uint32_t const blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ;
+    uint32_t constexpr blockPitchByCard = SBlock::memBlockSizeX * SBlock::memBlockSizeY * SBlock::memBlockSizeZ;
     uint32_t const inBlockInCardPitch = idx.mInDataBlockIdx.x +
                                         SBlock::memBlockSizeX * idx.mInDataBlockIdx.y +
                                         (SBlock::memBlockSizeX * SBlock::memBlockSizeY) * idx.mInDataBlockIdx.z;
@@ -371,6 +384,54 @@ NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
     return result;
 }
 
+/**
+ * Callback-based neighbour read. Resolves the neighbour of gidx at the compile-time offset
+ * (xOff, yOff, zOff); if it is valid, funIfValid is called with the value stored for cardinality
+ * card. Otherwise funIfNOTValid is called, unless it was left at its void* default.
+ * NOTE(review): funIfNOTValid is invoked with no argument, while the return-type constraint asks
+ * for std::is_invocable_v<LambdaNOTValid, T>; a callable must satisfy both - confirm the intent.
+ */
+template <typename T, int C, typename SBlock>
+template <int xOff, int yOff, int zOff, typename LambdaVALID, typename LambdaNOTValid>
+NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
+    getNghData(const Idx&     gidx,
+               int            card,
+               LambdaVALID    funIfValid,
+               LambdaNOTValid funIfNOTValid)
+        const -> std::enable_if_t<std::is_invocable_v<LambdaVALID, T> && (std::is_invocable_v<LambdaNOTValid, T> || std::is_same_v<LambdaNOTValid, void*>), void>
+{
+    bIndex nghIdx = helpGetNghIdx<xOff, yOff, zOff>(gidx);
+    auto [isValid, pitch] = helpNghPitch(nghIdx, card);
+
+    if (isValid) {
+        auto const& value = mMem[pitch];
+        funIfValid(value);
+        return;
+    }
+
+    if constexpr (!std::is_same_v<LambdaNOTValid, void*>) {
+        funIfNOTValid();
+    }
+}
+
+/** Stores value at the (xOff, yOff, zOff) neighbour of gidx for cardinality card; returns false, writing nothing, if that neighbour is not valid. */
+template <typename T, int C, typename SBlock>
+template <int xOff, int yOff, int zOff>
+NEON_CUDA_HOST_DEVICE inline auto bPartition<T, C, SBlock>::
+    writeNghData(const Idx& gidx,
+                 int        card,
+                 T          value)
+        -> bool
+{
+    bIndex nghIdx = helpGetNghIdx<xOff, yOff, zOff>(gidx);
+    auto [isValid, pitch] = helpNghPitch(nghIdx, card);
+    if (!isValid) {
+        return false;
+    }
+    mMem[pitch] = value;
+    return true;
+}
+
 template <typename T, int C, typename SBlock>
 NEON_CUDA_HOST_DEVICE inline auto
 bPartition<T, C, SBlock>::isActive(const Idx&                      cell,
diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dField_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dField_imp.h
index 49f57dbd..11dda19e 100644
--- a/libNeonDomain/include/Neon/domain/details/dGrid/dField_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/dGrid/dField_imp.h
@@ -26,7 +26,8 @@ dField<T, C>::dField(const std::string&                        fieldUserName,
                                                                              T(0),
                                                                              dataUse,
                                                                              memoryOptions,
-                                                                             haloStatus) {
+                                                                             haloStatus)
+{
 
     // only works if dims in x and y direction for all partitions match
     for (int i = 0; i < dims.size() - 1; ++i) {
@@ -88,7 +89,7 @@ dField<T, C>::dField(const std::string&                        fieldUserName,
 
     {  // Setting up partitions
         Neon::aGrid const& aGrid = mData->grid->helpFieldMemoryAllocator();
-        mData->memoryField = aGrid.newField<T,C>(fieldUserName + "-storage", cardinality, T(), dataUse, memoryOptions);
+        mData->memoryField = aGrid.newField<T, C>(fieldUserName + "-storage", cardinality, T(), dataUse, memoryOptions);
         // const int setCardinality = mData->grid->getBackend().getDeviceCount();
         mData->partitionTable.forEachConfiguration(
             [&](Neon::Execution           execution,
@@ -306,7 +307,7 @@ auto dField<T, C>::operator()(const Neon::index_3d& idxGlobal,
     auto& partition = mData->partitionTable.getPartition(Neon::Execution::host,
                                                          partitionIdx,
                                                          Neon::DataView::STANDARD);
-    auto& span = mData->grid->getSpan(Neon::Execution::host,partitionIdx, Neon::DataView::STANDARD);
+    auto& span = mData->grid->getSpan(Neon::Execution::host, partitionIdx, Neon::DataView::STANDARD);
     Idx   idx;
     bool  isOk = span.setAndValidate(idx, localIDx.x, localIDx.y, localIDx.z);
     if (!isOk) {
@@ -326,7 +327,7 @@ auto dField<T, C>::getReference(const Neon::index_3d& idxGlobal,
     auto& partition = mData->partitionTable.getPartition(Neon::Execution::host,
                                                          partitionIdx,
                                                          Neon::DataView::STANDARD);
-    auto& span = mData->grid->getSpan(Neon::Execution::host,partitionIdx, Neon::DataView::STANDARD);
+    auto& span = mData->grid->getSpan(Neon::Execution::host, partitionIdx, Neon::DataView::STANDARD);
     Idx   idx;
     bool  isOk = span.setAndValidate(idx, localIDx.x, localIDx.y, localIDx.z);
     if (!isOk) {
@@ -484,6 +485,81 @@ auto dField<T, C>::initHaloUpdateTable()
                 transfersVec.push_back(transfer);
             }
         });
+
+    mData->latticeHaloUpdateTable.forEachPutConfiguration(
+        bk, [&](Neon::SetIdx                                  setIdxSrc,
+                Execution                                     execution,
+                Neon::domain::tool::partitioning::ByDirection byDirection,
+                std::vector<Neon::set::MemoryTransfer>&       transfersVec) {
+            {
+                using namespace Neon::domain::tool::partitioning;
+                // Builds the put transfers (src boundary -> dst ghost) for one (device, execution, direction) entry.
+                Neon::SetIdx setIdxDst = getNghSetIdx(setIdxSrc, byDirection);
+
+                int r = grid.getStencil().getRadius();
+
+                std::array<Partition*, Data::EndPointsUtils::nConfigs>                                  partitions;
+                std::array<std::array<int, ByDirectionUtils::nConfigs>, Data::EndPointsUtils::nConfigs> ghostZBeginIdx;
+                std::array<std::array<int, ByDirectionUtils::nConfigs>, Data::EndPointsUtils::nConfigs> boundaryZBeginIdx;
+                std::array<Neon::size_4d, Data::EndPointsUtils::nConfigs>                               memPhyDim;
+
+                partitions[Data::EndPoints::dst] = &this->getPartition(execution, setIdxDst, Neon::DataView::STANDARD);
+                partitions[Data::EndPoints::src] = &this->getPartition(execution, setIdxSrc, Neon::DataView::STANDARD);
+                // Per end point: z start index of the ghost/boundary slabs, plus memPhyDim, which looks like the per-axis strides for (x, y, z, card) offsets - confirm.
+                for (auto endPoint : {Data::EndPoints::dst, Data::EndPoints::src}) {
+                    ghostZBeginIdx[endPoint][static_cast<int>(ByDirection::down)] = 0;
+                    boundaryZBeginIdx[endPoint][static_cast<int>(ByDirection::down)] = r;
+                    boundaryZBeginIdx[endPoint][static_cast<int>(ByDirection::up)] = partitions[endPoint]->dim().z;
+                    ghostZBeginIdx[endPoint][static_cast<int>(ByDirection::up)] = partitions[endPoint]->dim().z + r;
+
+                    memPhyDim[endPoint] = Neon::size_4d(
+                        1,
+                        size_t(partitions[endPoint]->dim().x),
+                        size_t(partitions[endPoint]->dim().x) * partitions[endPoint]->dim().y,
+                        size_t(partitions[endPoint]->dim().x) * partitions[endPoint]->dim().y * (partitions[endPoint]->dim().z + 2 * r));
+                }
+                // Nothing is queued unless the field cardinality equals the number of stencil points (checked per iteration below).
+                for (int j = 0; j < this->getCardinality(); j++) {
+                    auto const& stencil = this->getGrid().getStencil();
+                    if (this->getCardinality() != stencil.nPoints()) {
+                        continue;
+                    }
+                    T* srcMem = partitions[Data::EndPoints::src]->mem();
+                    T* dstMem = partitions[Data::EndPoints::dst]->mem();
+
+                    Neon::size_4d srcBoundaryBuff(0, 0, boundaryZBeginIdx[Data::EndPoints::src][static_cast<int>(byDirection)], j);
+                    Neon::size_4d dstGhostBuff(0, 0, ghostZBeginIdx[Data::EndPoints::dst][static_cast<int>(ByDirectionUtils::invert(byDirection))], j);
+
+                    //                    std::cout << "To  " << dstGhostBuff << " prt " << partitions[Data::EndPoints::dst]->prtID() << " From  " << srcBoundaryBuff << "(src dim" << partitions[Data::EndPoints::src]->dim() << ")" << std::endl;
+                    //                    std::cout << "dst mem " << partitions[Data::EndPoints::dst]->mem() << " " << std::endl;
+                    //                    std::cout << "dst pitch " << (dstGhostBuff * memPhyDim[Data::EndPoints::dst]).rSum() << " " << std::endl;
+                    //                    std::cout << "dst dstGhostBuff " << dstGhostBuff << " " << std::endl;
+                    //                    std::cout << "dst pitch all" << memPhyDim[Data::EndPoints::dst] << " " << std::endl;
+                    // Candidate copy of r z-slices (dim.x * dim.y elements each) for component j; the guards below decide whether it is queued.
+                    Neon::set::MemoryTransfer transfer({setIdxDst, dstMem + (dstGhostBuff * memPhyDim[Data::EndPoints::dst]).rSum(), dstGhostBuff},
+                                                       {setIdxSrc, srcMem + (srcBoundaryBuff * memPhyDim[Data::EndPoints::src]).rSum(), srcBoundaryBuff},
+                                                       sizeof(T) *
+                                                           r *
+                                                           partitions[Data::EndPoints::src]->dim().x *
+                                                           partitions[Data::EndPoints::src]->dim().y);
+                    if (ByDirection::up == byDirection && bk.isLastDevice(setIdxSrc)) {
+                        return;
+                    }
+
+                    if (ByDirection::down == byDirection && bk.isFirstDevice(setIdxSrc)) {
+                        return;
+                    }
+                    if (ByDirection::up == byDirection && !(stencil.points()[j].z > 0)) {
+                        continue;
+                    }
+                    if (ByDirection::down == byDirection && !(stencil.points()[j].z < 0)) {
+                        continue;
+                    }
+                    // std::cout << transfer.toString() << std::endl;
+                    transfersVec.push_back(transfer);
+                }
+            }
+        });
     //
     //    mData->latticeHaloUpdateTable.forEachPutConfiguration(
     //        bk, [&](Neon::SetIdx                                  setIdxSrc,
@@ -608,7 +684,33 @@ auto dField<T, C>::
                     execution);
         }
     } else {
-        NEON_DEV_UNDER_CONSTRUCTION("");
+        auto transfers = bk.template newDataSet<std::vector<Neon::set::MemoryTransfer>>();
+        if (this->getMemoryOptions().getOrder() == Neon::MemoryLayout::structOfArrays) {
+            for (auto byDirection : {tool::partitioning::ByDirection::up,
+                                     tool::partitioning::ByDirection::down}) {
+                // Append this direction's entries of the lattice halo table to each device's transfer list.
+                auto const& tableEntryByDir = mData->latticeHaloUpdateTable.get(transferMode,
+                                                                                execution,
+                                                                                byDirection);
+
+                tableEntryByDir.forEachSeq([&](SetIdx setIdx, auto const& tableEntryByDirBySetIdx) {
+                    transfers[setIdx].insert(std::end(transfers[setIdx]),
+                                             std::begin(tableEntryByDirBySetIdx),
+                                             std::end(tableEntryByDirBySetIdx));
+                });
+            }
+            dataTransferContainer =
+                Neon::set::Container::factoryDataTransfer(
+                    *this,
+                    transferMode,
+                    stencilSemantic,
+                    transfers,
+                    execution);
+
+
+        } else {
+            NEON_DEV_UNDER_CONSTRUCTION("");
+        }
     }
     Neon::set::Container SyncContainer =
         Neon::set::Container::factorySynchronization(
diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dGrid.h b/libNeonDomain/include/Neon/domain/details/dGrid/dGrid.h
index 226b0bb8..5d56e526 100644
--- a/libNeonDomain/include/Neon/domain/details/dGrid/dGrid.h
+++ b/libNeonDomain/include/Neon/domain/details/dGrid/dGrid.h
@@ -20,7 +20,7 @@
 #include "Neon/domain/interface/LaunchConfig.h"
 #include "Neon/domain/interface/Stencil.h"
 #include "Neon/domain/interface/common.h"
-
+#include "Neon/domain/tools/SpaceCurves.h"
 #include "Neon/domain/tools/SpanTable.h"
 
 #include "Neon/domain/patterns/PatternScalar.h"
@@ -84,7 +84,8 @@ class dGrid : public Neon::domain::interface::GridBaseTemplate<dGrid, dIndex>
           const SparsityPattern&       activeCellLambda /**< InOrOutLambda({x,y,z}->{true, false}) */,
           const Neon::domain::Stencil& stencil /**< Stencil used by any computation on the grid */,
           const Vec_3d<double>&        spacing = Vec_3d<double>(1, 1, 1) /**< Spacing, i.e. size of a voxel */,
-          const Vec_3d<double>&        origin = Vec_3d<double>(0, 0, 0) /**< Origin  */);
+          const Vec_3d<double>&        origin = Vec_3d<double>(0, 0, 0) /**< Origin  */,
+          Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep);
 
     /**
      * Returns a LaunchParameters configured for the specified inputs.
diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h
index 297971de..a263400a 100644
--- a/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/dGrid/dGrid_imp.h
@@ -8,12 +8,18 @@ template <typename ActiveCellLambda>
 dGrid::dGrid(const Neon::Backend&  backend,
              const Neon::int32_3d& dimension,
              const ActiveCellLambda& /*activeCellLambda*/,
-             const Neon::domain::Stencil& stencil,
-             const Vec_3d<double>&        spacing,
-             const Vec_3d<double>&        origin)
+             const Neon::domain::Stencil&                 stencil,
+             const Vec_3d<double>&                        spacing,
+             const Vec_3d<double>&                        origin,
+             Neon::domain::tool::spaceCurves::EncoderType encoderType)
 {
     mData = std::make_shared<Data>(backend);
     const index_3d defaultBlockSize(256, 1, 1);
+    if (encoderType != Neon::domain::tool::spaceCurves::EncoderType::sweep) {  // only the sweep encoder is accepted
+        NeonException exce("dGrid");
+        exce << "dGrid only supports sweep space filling curves";
+        NEON_THROW(exce);
+    }
 
     {
         auto nElementsPerPartition = backend.devSet().template newDataSet<size_t>(0);
@@ -26,7 +32,9 @@ dGrid::dGrid(const Neon::Backend&  backend,
                               nElementsPerPartition,
                               Neon::index_3d(256, 1, 1),
                               spacing,
-                              origin);
+                              origin,
+                              Neon::domain::tool::spaceCurves::EncoderType::sweep,
+                              {0, 0, 0});
     }
 
     const int32_t numDevices = getBackend().devSet().setCardinality();
@@ -83,15 +91,17 @@ dGrid::dGrid(const Neon::Backend&  backend,
                                                   Neon::DataView dw,
                                                   dSpan&         span) {
             span.mDataView = dw;
-            span.mZHaloRadius = setCardinality == 1 ? 0 : mData->halo.z;
-            span.mZBoundaryRadius = mData->halo.z;
+            span.mZghostRadius = setCardinality == 1 ? 0 : mData->halo.z;
+            span.mZboundaryRadius = mData->halo.z;
+            span.mMaxZInDomain = mData->partitionDims[setIdx].z;
 
             switch (dw) {
                 case Neon::DataView::STANDARD: {
                     // Only works z partitions.
                     assert(mData->halo.x == 0 && mData->halo.y == 0);
 
-                    span.mDim = mData->partitionDims[setIdx];
+                    span.mSpanDim = mData->partitionDims[setIdx];
+
                     break;
                 }
                 case Neon::DataView::BOUNDARY: {
@@ -99,8 +109,8 @@ dGrid::dGrid(const Neon::Backend&  backend,
                     // Only works z partitions.
                     assert(mData->halo.x == 0 && mData->halo.y == 0);
 
-                    span.mDim = mData->partitionDims[setIdx];
-                    span.mDim.z = span.mZBoundaryRadius * 2;
+                    span.mSpanDim = mData->partitionDims[setIdx];
+                    span.mSpanDim.z = span.mZboundaryRadius * 2;
 
                     break;
                 }
@@ -109,12 +119,12 @@ dGrid::dGrid(const Neon::Backend&  backend,
                     // Only works z partitions.
                     assert(mData->halo.x == 0 && mData->halo.y == 0);
 
-                    span.mDim = mData->partitionDims[setIdx];
-                    span.mDim.z = span.mDim.z - span.mZBoundaryRadius * 2;
-                    if (span.mDim.z <= 0 && setCardinality > 1) {
+                    span.mSpanDim = mData->partitionDims[setIdx];
+                    span.mSpanDim.z = span.mSpanDim.z - span.mZboundaryRadius * 2;
+                    if (span.mSpanDim.z <= 0 && setCardinality > 1) {
                         NeonException exp("dGrid");
                         exp << "The grid size is too small to support the data view model correctly \n";
-                        exp << span.mDim << " for setIdx " << setIdx << " and device " << getDevSet().devId(setIdx);
+                        exp << span.mSpanDim << " for setIdx " << setIdx << " and device " << getDevSet().devId(setIdx);
                         NEON_THROW(exp);
                     }
 
@@ -132,7 +142,7 @@ dGrid::dGrid(const Neon::Backend&  backend,
                                                              Neon::DataView  dw,
                                                              int&            count) {
             if (Execution::host == execution) {
-                count = mData->spanTable.getSpan(Neon::Execution::host, setIdx, dw).mDim.rMul();
+                count = mData->spanTable.getSpan(Neon::Execution::host, setIdx, dw).mSpanDim.rMul();
             }
         });
     }
@@ -180,7 +190,9 @@ dGrid::dGrid(const Neon::Backend&  backend,
                               nElementsPerPartition,
                               defaultBlockSize,
                               spacing,
-                              origin);
+                              origin,
+                              Neon::domain::tool::spaceCurves::EncoderType::sweep,
+                              {0, 0, 0});
     }
 }
 
@@ -224,11 +236,11 @@ auto dGrid::newContainer(const std::string& name,
 {
     const Neon::index_3d& defaultBlockSize = getDefaultBlock();
     Neon::set::Container  c = Neon::set::Container::factory<execution>(name,
-                                                                     Neon::set::internal::ContainerAPI::DataViewSupport::on,
-                                                                     *this,
-                                                                     lambda,
-                                                                     defaultBlockSize,
-                                                                     [](const Neon::index_3d&) { return 0; });
+                                                                      Neon::set::internal::ContainerAPI::DataViewSupport::on,
+                                                                      *this,
+                                                                      lambda,
+                                                                      defaultBlockSize,
+                                                                      [](const Neon::index_3d&) { return 0; });
     return c;
 }
 
@@ -242,11 +254,11 @@ auto dGrid::newContainer(const std::string& name,
     -> Neon::set::Container
 {
     Neon::set::Container c = Neon::set::Container::factory<execution>(name,
-                                                                    Neon::set::internal::ContainerAPI::DataViewSupport::on,
-                                                                    *this,
-                                                                    lambda,
-                                                                    blockSize,
-                                                                    [sharedMem](const Neon::index_3d&) { return sharedMem; });
+                                                                      Neon::set::internal::ContainerAPI::DataViewSupport::on,
+                                                                      *this,
+                                                                      lambda,
+                                                                      blockSize,
+                                                                      [sharedMem](const Neon::index_3d&) { return sharedMem; });
     return c;
 }
 
diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h
index 3291e622..a2c57cdb 100644
--- a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h
+++ b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex.h
@@ -37,9 +37,9 @@ struct dIndex
 
     NEON_CUDA_HOST_DEVICE inline explicit dIndex(const Location& location);
 
-    NEON_CUDA_HOST_DEVICE inline auto set() -> Location&;
+    NEON_CUDA_HOST_DEVICE inline auto setLocation() -> Location&;
 
-    NEON_CUDA_HOST_DEVICE inline auto get() const -> const Location&;
+    NEON_CUDA_HOST_DEVICE inline auto getLocation() const -> const Location&;
 };
 
 }  // namespace Neon::domain::details::dGrid
diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h
index 4389fb3f..6426e43a 100644
--- a/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/dGrid/dIndex_imp.h
@@ -16,11 +16,11 @@ NEON_CUDA_HOST_DEVICE inline dIndex::dIndex(const Location::Integer &x,
     mLocation.z = z;
 }
 
-NEON_CUDA_HOST_DEVICE inline auto dIndex::set() -> Location&
+NEON_CUDA_HOST_DEVICE inline auto dIndex::setLocation() -> Location&
 {
     return mLocation;
 }
-NEON_CUDA_HOST_DEVICE inline auto dIndex::get() const -> const Location&
+NEON_CUDA_HOST_DEVICE inline auto dIndex::getLocation() const -> const Location&
 {
     return mLocation;
 }
diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h
index 196f6b70..c1e17b0b 100644
--- a/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h
+++ b/libNeonDomain/include/Neon/domain/details/dGrid/dPartition.h
@@ -44,16 +44,16 @@ class dPartition
                         int            cardinality,
                         Neon::index_3d fullGridSize,
                         NghIdx*        stencil = nullptr)
-        : m_dataView(dataView),
-          m_mem(mem),
-          m_dim(dim),
-          m_zHaloRadius(zHaloRadius),
-          m_zBoundaryRadius(zBoundaryRadius),
-          m_pitch(pitch),
-          m_prtID(prtID),
-          m_origin(origin),
-          m_cardinality(cardinality),
-          m_fullGridSize(fullGridSize),
+        : mDataView(dataView),
+          mMem(mem),
+          mDim(dim),
+          mZHaloRadius(zHaloRadius),
+          mZBoundaryRadius(zBoundaryRadius),
+          mPitch(pitch),
+          mPrtID(prtID),
+          mOrigin(origin),
+          mCardinality(cardinality),
+          mFullGridSize(fullGridSize),
           mPeriodicZ(false),
           mStencil(stencil)
     {
@@ -70,21 +70,21 @@ class dPartition
     prtID()
         const -> int
     {
-        return m_prtID;
+        return mPrtID;
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
     cardinality()
         const -> int
     {
-        return m_cardinality;
+        return mCardinality;
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
     getPitchData()
         const -> const Pitch&
     {
-        return m_pitch;
+        return mPitch;
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
@@ -92,76 +92,76 @@ class dPartition
              int        cardinalityIdx = 0)
         const -> int64_t
     {
-        return idx.get().x * int64_t(m_pitch.x) +
-               idx.get().y * int64_t(m_pitch.y) +
-               idx.get().z * int64_t(m_pitch.z) +
-               cardinalityIdx * int64_t(m_pitch.w);
+        return idx.getLocation().x * int64_t(mPitch.x) +
+               idx.getLocation().y * int64_t(mPitch.y) +
+               idx.getLocation().z * int64_t(mPitch.z) +
+               cardinalityIdx * int64_t(mPitch.w);
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
     dim()
         const -> const Neon::index_3d
     {
-        return m_dim;
+        return mDim;
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
     halo()
         const -> const Neon::index_3d
     {
-        return Neon::index_3d(0, 0, m_zHaloRadius);
+        return Neon::index_3d(0, 0, mZHaloRadius);
     }
 
     inline NEON_CUDA_HOST_DEVICE auto
     origin()
         const -> const Neon::index_3d
     {
-        return m_origin;
+        return mOrigin;
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
-    getNghData(const Idx& eId,
+    getNghData(const Idx& gidx,
                NghIdx     nghOffset,
                int        card,
                const T&   alternativeVal)
         const -> NghData
     {
-        Idx        cellNgh;
-        const bool isValidNeighbour = nghIdx(eId, nghOffset, cellNgh);
+        Idx        gidxNgh;
+        const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh);
         T          val = alternativeVal;
         if (isValidNeighbour) {
-            val = operator()(cellNgh, card);
+            val = operator()(gidxNgh, card);
         }
         return NghData(val, isValidNeighbour);
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
-    getNghData(const Idx& eId,
+    getNghData(const Idx& gidx,
                NghIdx     nghOffset,
                int        card)
         const -> NghData
     {
-        Idx        cellNgh;
-        const bool isValidNeighbour = nghIdx(eId, nghOffset, cellNgh);
+        Idx        gidxNgh;
+        const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh);
         T          val;
         if (isValidNeighbour) {
-            val = operator()(cellNgh, card);
+            val = operator()(gidxNgh, card);
         }
         return NghData(val, isValidNeighbour);
     }
 
-    template <int xOff, int yOff, int zOff, typename LambdaVALID, typename LambdaNOTValid = void* >
+    template <int xOff, int yOff, int zOff, typename LambdaVALID, typename LambdaNOTValid = void*>
     NEON_CUDA_HOST_DEVICE inline auto
-    getNghData(const Idx&     eId,
+    getNghData(const Idx&     gidx,
                int            card,
                LambdaVALID    funIfValid,
                LambdaNOTValid funIfNOTValid = nullptr)
-        const -> std::enable_if_t<std::is_invocable_v<LambdaVALID, T> , void>
+        const -> std::enable_if_t<std::is_invocable_v<LambdaVALID, T>, void>
     {
-        Idx        cellNgh;
-        const bool isValidNeighbour = nghIdx<xOff, yOff, zOff>(eId, cellNgh);
+        Idx        gidxNgh;
+        const bool isValidNeighbour = helpGetNghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
         if (isValidNeighbour) {
-            T val = this->operator()(cellNgh, card);
+            T val = this->operator()(gidxNgh, card);
             funIfValid(val);
         }
         if constexpr (!std::is_same_v<LambdaNOTValid, void*>) {
@@ -171,129 +171,146 @@ class dPartition
         }
     }
 
-    template <int xOff, int yOff, int zOff>
+    template <int xOff,
+              int yOff,
+              int zOff>
     NEON_CUDA_HOST_DEVICE inline auto
-    getNghData(const Idx& eId,
+    getNghData(const Idx& gidx,
                int        card)
         const -> NghData
     {
-        NghData    res;
-        Idx        cellNgh;
-        const bool isValidNeighbour = nghIdx<xOff, yOff, zOff>(eId, cellNgh);
+        Idx        gidxNgh;
+        const bool isValidNeighbour = helpGetNghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
         if (isValidNeighbour) {
-            T val = operator()(cellNgh, card);
-            res.set(val, true);
-        } else {
-            res.invalidate();
+            T val = operator()(gidxNgh, card);
+            return NghData(val, isValidNeighbour);
         }
-        return res;
+        return NghData();
+    }
+
+    template <int xOff,
+              int yOff,
+              int zOff>
+    NEON_CUDA_HOST_DEVICE inline auto
+    writeNghData(const Idx& gidx,
+                 int        card,
+                 T          value)
+        -> bool
+    {
+        Idx        gidxNgh;
+        const bool isValidNeighbour = helpGetNghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
+        if (isValidNeighbour) {
+            operator()(gidxNgh, card) = value;
+        }
+        return isValidNeighbour;
     }
 
     template <int xOff, int yOff, int zOff>
     NEON_CUDA_HOST_DEVICE inline auto
-    getNghData(const Idx& eId,
+    getNghData(const Idx& gidx,
                int        card,
                T const&   defaultValue)
         const -> NghData
     {
         NghData    res(defaultValue, false);
-        Idx        cellNgh;
-        const bool isValidNeighbour = nghIdx<xOff, yOff, zOff>(eId, cellNgh);
+        Idx        gidxNgh;
+        const bool isValidNeighbour = helpGetNghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
         if (isValidNeighbour) {
-            T val = operator()(cellNgh, card);
+            T val = operator()(gidxNgh, card);
             res.set(val, true);
         }
         return res;
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
-    nghVal(const Idx& eId,
+    nghVal(const Idx& gidx,
            uint8_t    nghID,
            int        card,
            const T&   alternativeVal)
         const -> NghData
     {
         NghIdx nghOffset = mStencil[nghID];
-        return getNghData(eId, nghOffset, card, alternativeVal);
+        return getNghData(gidx, nghOffset, card, alternativeVal);
     }
     /**
      * Get the index of the neighbor given the offset
      * @tparam dataView_ta
-     * @param[in] eId Index of the current element
+     * @param[in] gidx Index of the current element
      * @param[in] nghOffset Offset of the neighbor of interest from the current element
      * @param[in,out] neighbourIdx Index of the neighbor
      * @return Whether the neighbour is valid
      */
     NEON_CUDA_HOST_DEVICE inline auto
-    nghIdx(const Idx&    eId,
-           const NghIdx& nghOffset,
-           Idx&          neighbourIdx)
+    helpGetNghIdx(const Idx&    gidx,
+                  const NghIdx& nghOffset,
+                  Idx&          neighbourIdx)
         const -> bool
     {
-        Idx cellNgh(eId.get().x + nghOffset.x,
-                    eId.get().y + nghOffset.y,
-                    eId.get().z + nghOffset.z);
+        Idx gidxNgh(gidx.getLocation().x + nghOffset.x,
+                    gidx.getLocation().y + nghOffset.y,
+                    gidx.getLocation().z + nghOffset.z);
 
-        const auto cellNghGlobal = getGlobalIndex(cellNgh);
+        const auto gidxNghGlobal = getGlobalIndex(gidxNgh);
 
         bool isValidNeighbour = true;
 
-        if (mPeriodicZ) {
-            printf("Error, periodic not implemented yet");
-            assert(false);
-        }
-
-        isValidNeighbour = (cellNghGlobal.x >= 0) &&
-                           (cellNghGlobal.y >= 0) &&
-                           (cellNghGlobal.z >= 0);
-
-        //        isValidNeighbour = (cellNgh.get().x < m_dim.x) &&
-        //                           (cellNgh.get().y < m_dim.y) &&
-        //                           (cellNgh.get().z < m_dim.z + 2 * m_zHaloRadius) && isValidNeighbour;
+        isValidNeighbour = (gidxNghGlobal.x >= 0) &&
+                           (gidxNghGlobal.y >= 0) &&
+                           (gidxNghGlobal.z >= 0);
 
-        isValidNeighbour = (cellNghGlobal.x < m_fullGridSize.x) &&
-                           (cellNghGlobal.y < m_fullGridSize.y) &&
-                           (cellNghGlobal.z < m_fullGridSize.z) &&
+        isValidNeighbour = (gidxNghGlobal.x < mFullGridSize.x) &&
+                           (gidxNghGlobal.y < mFullGridSize.y) &&
+                           (gidxNghGlobal.z < mFullGridSize.z) &&
                            isValidNeighbour;
 
         if (isValidNeighbour) {
-            neighbourIdx = cellNgh;
+            neighbourIdx = gidxNgh;
         }
         return isValidNeighbour;
     }
 
     template <int xOff, int yOff, int zOff>
     NEON_CUDA_HOST_DEVICE inline auto
-    nghIdx(const Idx& eId,
-           Idx&       cellNgh)
+    helpGetNghIdx(const Idx& gidx,
+                  Idx&       gidxNgh)
         const -> bool
     {
-        cellNgh = Idx(eId.get().x + xOff,
-                      eId.get().y + yOff,
-                      eId.get().z + zOff);
-        Idx cellNgh_global(cellNgh.get() + m_origin);
-        // const bool isValidNeighbour = (cellNgh_global >= 0 && cellNgh < (m_dim + m_halo) && cellNgh_global < m_fullGridSize);
+        //        NghIdx offset(xOff, yOff, zOff);
+        //        return helpGetNghIdx(gidx, offset, gidxNgh);
+        gidxNgh = Idx(gidx.getLocation().x + xOff,
+                      gidx.getLocation().y + yOff,
+                      gidx.getLocation().z + zOff);
+
         bool isValidNeighbour = true;
         if constexpr (xOff > 0) {
-            isValidNeighbour = cellNgh.get().x < (m_dim.x) && isValidNeighbour;
-            isValidNeighbour = cellNgh_global.get().x <= m_fullGridSize.x && isValidNeighbour;
+            int constexpr direction = Neon::index_3d::directionX;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
         }
         if constexpr (xOff < 0) {
-            isValidNeighbour = cellNgh_global.get().x >= 0 && isValidNeighbour;
+            int constexpr direction = Neon::index_3d::directionX;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
         }
         if constexpr (yOff > 0) {
-            isValidNeighbour = cellNgh.get().y < (m_dim.y) && isValidNeighbour;
-            isValidNeighbour = cellNgh_global.get().y <= m_fullGridSize.y && isValidNeighbour;
+            int constexpr direction = Neon::index_3d::directionY;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
         }
         if constexpr (yOff < 0) {
-            isValidNeighbour = cellNgh_global.get().y >= 0 && isValidNeighbour;
+            int constexpr direction = Neon::index_3d::directionY;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
         }
         if constexpr (zOff > 0) {
-            isValidNeighbour = cellNgh.get().z < (m_dim.z + m_zHaloRadius * 2) && isValidNeighbour;
-            isValidNeighbour = cellNgh_global.get().z <= m_fullGridSize.z && isValidNeighbour;
+            int constexpr direction = Neon::index_3d::directionZ;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
         }
         if constexpr (zOff < 0) {
-            isValidNeighbour = cellNgh_global.get().z >= m_zHaloRadius && isValidNeighbour;
+            int constexpr direction = Neon::index_3d::directionZ;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
         }
         return isValidNeighbour;
     }
@@ -303,7 +320,7 @@ class dPartition
     mem()
         -> T*
     {
-        return m_mem;
+        return mMem;
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
@@ -311,7 +328,7 @@ class dPartition
         const
         -> const T*
     {
-        return m_mem;
+        return mMem;
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
@@ -319,7 +336,7 @@ class dPartition
         int        cardinalityIdx) -> T*
     {
         int64_t p = getPitch(cell, cardinalityIdx);
-        return m_mem[p];
+        return mMem[p];
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
@@ -327,7 +344,7 @@ class dPartition
                int        cardinalityIdx) -> T&
     {
         int64_t p = getPitch(cell, cardinalityIdx);
-        return m_mem[p];
+        return mMem[p];
     }
 
     NEON_CUDA_HOST_DEVICE inline auto
@@ -335,7 +352,7 @@ class dPartition
                int        cardinalityIdx) const -> const T&
     {
         int64_t p = getPitch(cell, cardinalityIdx);
-        return m_mem[p];
+        return mMem[p];
     }
 
     template <typename ComputeType>
@@ -377,7 +394,8 @@ class dPartition
         }
     }
 
-    NEON_CUDA_HOST_DEVICE inline auto getGlobalIndex(const Idx& local) const -> Neon::index_3d
+    NEON_CUDA_HOST_DEVICE inline auto
+    getGlobalIndex(const Idx& local) const -> Neon::index_3d
     {
         //        assert(local.mLocation.x >= 0 &&
         //               local.mLocation.y >= 0 &&
@@ -386,22 +404,35 @@ class dPartition
         //               local.mLocation.y < m_dim.y &&
         //               local.mLocation.z < m_dim.z + m_zHaloRadius);
 
-        Neon::index_3d result = local.mLocation + m_origin;
-        result.z -= m_zHaloRadius;
+        Neon::index_3d result = local.mLocation;
+        result.z = result.z + mOrigin.z - mZHaloRadius;
         return result;
     }
 
+    template <int direction>
+    NEON_CUDA_HOST_DEVICE inline auto getGlobalIndexByDirection(const Idx& local)
+        const -> int
+    {
+        if constexpr (Neon::index_3d::directionZ != direction) {
+            return local.mLocation.v[direction];
+        } else {
+            return local.mLocation.v[Neon::index_3d::directionZ] +
+                   mOrigin.v[Neon::index_3d::directionZ] -
+                   mZHaloRadius;
+        }
+    }
+
     NEON_CUDA_HOST_DEVICE inline auto getDomainSize()
         const -> Neon::index_3d
     {
-        return m_fullGridSize;
+        return mFullGridSize;
     }
 
     auto ioToVti(std::string const& fname, std::string const& fieldName)
     {
-        auto fnameCommplete = fname + "_" + std::to_string(m_prtID);
-        auto haloOrigin = Vec_3d<double>(m_origin.x, m_origin.y, m_origin.z - m_zHaloRadius);
-        auto haloDim = m_dim + Neon::index_3d(0, 0, 2 * m_zHaloRadius) + 1;
+        auto fnameCommplete = fname + "_" + std::to_string(mPrtID);
+        auto haloOrigin = Vec_3d<double>(mOrigin.x, mOrigin.y, mOrigin.z - mZHaloRadius);
+        auto haloDim = mDim + Neon::index_3d(0, 0, 2 * mZHaloRadius) + 1;
 
         IoToVTK<int, int64_t> io(fnameCommplete,
                                  haloDim,
@@ -413,25 +444,37 @@ class dPartition
         io.addField([&](const Neon::index_3d& idx, int i) {
             return operator()(dIndex(idx), i);
         },
-                    m_cardinality, "Partition", ioToVTKns::VtiDataType_e::voxel);
+                    mCardinality, "Partition", ioToVTKns::VtiDataType_e::voxel);
 
         io.flushAndClear();
         return;
     }
 
+    auto getDataView()
+        const -> Neon::DataView
+    {
+        return mDataView;
+    }
+
+    auto helpGetGlobalToLocalOffets()
+        const -> NghIdx*
+    {
+        return mStencil;
+    }
+
    private:
-    Neon::DataView m_dataView;
-    T*             m_mem;
-    Neon::index_3d m_dim;
-    int            m_zHaloRadius;
-    int            m_zBoundaryRadius;
-    Pitch          m_pitch;
-    int            m_prtID;
-    Neon::index_3d m_origin;
-    int            m_cardinality;
-    Neon::index_3d m_fullGridSize;
-    bool           mPeriodicZ;
-    NghIdx*        mStencil;
+    Neon::DataView        mDataView;
+    T* NEON_RESTRICT      mMem;
+    Neon::index_3d        mDim;
+    int                   mZHaloRadius;
+    int                   mZBoundaryRadius;
+    Pitch                 mPitch;
+    int                   mPrtID;
+    Neon::index_3d        mOrigin;
+    int                   mCardinality;
+    Neon::index_3d        mFullGridSize;
+    bool                  mPeriodicZ;
+    NghIdx* NEON_RESTRICT mStencil;
 };
 
 
diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan.h b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan.h
index 74ab5ff3..c81baace 100644
--- a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan.h
+++ b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan.h
@@ -43,11 +43,12 @@ class dSpan
 
    private:
     Neon::DataView mDataView;
-    int            mZHaloRadius;
-    int            mZBoundaryRadius;
-    Neon::index_3d mDim /** Dimension of the span, its values depends on the mDataView*/;
+    int            mZghostRadius;
+    int            mZboundaryRadius;
+    int            mMaxZInDomain;
+    Neon::index_3d mSpanDim /** Dimension of the span; its value depends on mDataView */;
 };
 
-}  // namespace Neon::domain::details::dGrid
+}  // namespace Neon::domain::details::dGrid
 
 #include "dSpan_imp.h"
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h
index 8f6f9fea..37bea7d7 100644
--- a/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/dGrid/dSpan_imp.h
@@ -10,29 +10,29 @@ dSpan::setAndValidate(Idx&            idx,
     const -> bool
 {
     bool res = false;
-    idx.set().x = int(x);
-    idx.set().y = int(y);
-    idx.set().z = int(z);
+    idx.setLocation().x = int(x);
+    idx.setLocation().y = int(y);
+    idx.setLocation().z = int(z);
 
-    if (idx.get() < mDim) {
+    if (idx.getLocation() < mSpanDim) {
         res = true;
     }
 
     switch (mDataView) {
         case Neon::DataView::STANDARD: {
-            idx.set().z += mZHaloRadius;
+            idx.setLocation().z += mZghostRadius;
             return res;
         }
         case Neon::DataView::INTERNAL: {
-            idx.set().z += mZHaloRadius + mZBoundaryRadius;
+            idx.setLocation().z += mZghostRadius + mZboundaryRadius;
             return res;
         }
         case Neon::DataView::BOUNDARY: {
 
-            idx.set().z += idx.get().z < mZBoundaryRadius
-                               ? 0
-                               : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */);
-            idx.set().z += mZHaloRadius;
+            idx.setLocation().z += idx.getLocation().z < mZboundaryRadius
+                                       ? 0
+                               : (mMaxZInDomain - 1) + (-1 * mZboundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */);
+            idx.setLocation().z += mZghostRadius;
 
             return res;
         }
@@ -51,19 +51,19 @@ NEON_CUDA_HOST_DEVICE inline auto dSpan::helpGetDataView()
 NEON_CUDA_HOST_DEVICE inline auto dSpan::helpGetZHaloRadius()
     const -> int const&
 {
-    return mZHaloRadius;
+    return mZghostRadius;
 }
 
 NEON_CUDA_HOST_DEVICE inline auto dSpan::helpGetZBoundaryRadius()
     const -> int const&
 {
-    return mZBoundaryRadius;
+    return mZboundaryRadius;
 }
 
 NEON_CUDA_HOST_DEVICE inline auto dSpan::helpGetDim()
     const -> Neon::index_3d const&
 {
-    return mDim;
+    return mSpanDim;
 }
 
 }  // namespace Neon::domain::details::dGrid
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h
new file mode 100644
index 00000000..7ce3e582
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dGridSoA.h
@@ -0,0 +1,98 @@
+#pragma once
+#include <assert.h>
+
+#include "Neon/core/core.h"
+#include "Neon/core/types/DataUse.h"
+#include "Neon/core/types/Macros.h"
+
+#include "Neon/set/BlockConfig.h"
+#include "Neon/set/Containter.h"
+#include "Neon/set/DevSet.h"
+#include "Neon/set/MemoryOptions.h"
+
+#include "Neon/sys/memory/MemDevice.h"
+
+#include "Neon/domain/aGrid.h"
+
+#include "Neon/domain/interface/GridBaseTemplate.h"
+#include "Neon/domain/interface/GridConcept.h"
+#include "Neon/domain/interface/KernelConfig.h"
+#include "Neon/domain/interface/LaunchConfig.h"
+#include "Neon/domain/interface/Stencil.h"
+#include "Neon/domain/interface/common.h"
+
+#include "Neon/domain/tools/GridTransformer.h"
+#include "Neon/domain/tools/SpanTable.h"
+
+#include "Neon/domain/details/eGrid/eGrid.h"
+#include "Neon/domain/patterns/PatternScalar.h"
+
+#include "dPartitionSoA.h"
+#include "dSpanSoA.h"
+
+namespace Neon::domain::details::dGridSoA {
+
+namespace details {
+struct dGridSoATransformation
+{
+    using FoundationGrid = Neon::domain::details::dGrid::dGrid;
+    using Idx = dIndexSoA;
+    using Span = dSpanSoA;
+    template <typename T, int C>
+    using Partition = dPartitionSoA<T, C>;
+
+    static constexpr Neon::set::internal::ContainerAPI::DataViewSupport dataViewSupport = Neon::set::internal::ContainerAPI::DataViewSupport::on;
+    static constexpr Neon::set::details::ExecutionThreadSpan            executionThreadSpan = FoundationGrid::executionThreadSpan;
+    using ExecutionThreadSpanIndexType = int32_t;
+
+    static auto getDefaultBlock(FoundationGrid& foundationGrid) -> Neon::index_3d const&
+    {
+        return foundationGrid.getDefaultBlock();
+    }
+
+    static auto initSpan(FoundationGrid& foundationGrid, Neon::domain::tool::SpanTable<Span>& spanTable) -> void
+    {
+        spanTable.forEachConfiguration([&](Neon::Execution execution,
+                                           Neon::SetIdx    setIdx,
+                                           Neon::DataView  dw,
+                                           Span&           span) {
+            span.helpInit(foundationGrid.getSpan(execution, setIdx, dw));
+        });
+    }
+
+    static auto initLaunchParameters(FoundationGrid&       foundationGrid,
+                                     Neon::DataView        dataView,
+                                     const Neon::index_3d& blockSize,
+                                     const size_t&         shareMem) -> Neon::set::LaunchParameters
+    {
+        return foundationGrid.getLaunchParameters(dataView, blockSize, shareMem);
+    }
+
+    //    static auto helpGetGridIdx(FoundationGrid&,
+    //                               Neon::SetIdx const&,
+    //                               FoundationGrid::Idx const& fgIdx)
+    //        -> dGridSoATransformation::Idx
+    //    {
+    //        dGridSoATransformation::Idx tgIdx = fgIdx;
+    //        return tgIdx;
+    //    }
+
+    template <typename T, int C>
+    static auto initFieldPartition(FoundationGrid::Field<T, C>&                         foundationField,
+                                   Neon::domain::tool::PartitionTable<Partition<T, C>>& partitionTable) -> void
+    {
+        partitionTable.forEachConfiguration(
+            [&](Neon::Execution  execution,
+                Neon::SetIdx     setIdx,
+                Neon::DataView   dw,
+                Partition<T, C>& partition) {
+                auto& foundationPartition = foundationField.getPartition(execution, setIdx, dw);
+                partition = Partition<T, C>(foundationPartition);
+            });
+    }
+};
+
+}  // namespace details
+using dGridSoA = Neon::domain::tool::GridTransformer<details::dGridSoATransformation>::Grid;
+
+}  // namespace Neon::domain::details::dGridSoA
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h
new file mode 100644
index 00000000..2ed82d86
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include "Neon/core/core.h"
+#include "Neon/domain/details/dGridSoA/dIndexSoA.h"
+
+namespace Neon::domain::details::dGridSoA {
+
+// Common forward declarations
+class dSpanSoA;
+template <typename T, int C>
+class dPartitionSoA;
+
+struct dIndexSoA  // Index made of a 3D location (mLocation) and a linear offset (mOffset).
+{
+    using OuterIdx = dIndexSoA;
+
+    template <typename T, int C>
+    friend class dPartitionSoA;  // the partition class template forward-declared above
+    friend dSpanSoA;
+
+    template <typename T,
+              int Cardinality>
+    friend class dField;  // NOTE(review): no dField is visible in this namespace - confirm it is still needed
+
+    // dGridSoA specific types
+    using Offset = int32_t;
+    using Location = index_3d;
+    using Count = int32_t;
+
+    dIndexSoA() = default;  // leaves both members at the zero defaults below
+    Location mLocation = 0;
+    Offset   mOffset = 0;
+
+    NEON_CUDA_HOST_DEVICE inline explicit dIndexSoA(Location const& location,
+                                                    Offset const&   offset);
+
+    NEON_CUDA_HOST_DEVICE inline explicit dIndexSoA(Location::Integer const& x,
+                                                    Location::Integer const& y,
+                                                    Location::Integer const& z,
+                                                    Offset const&            offset);
+
+    NEON_CUDA_HOST_DEVICE inline auto setLocation() -> Location&;  // mutable access to mLocation
+
+    NEON_CUDA_HOST_DEVICE inline auto setOffset() -> Offset&;  // mutable access to mOffset
+
+    NEON_CUDA_HOST_DEVICE inline auto getLocation() const -> const Location&;  // read-only access to mLocation
+
+    NEON_CUDA_HOST_DEVICE inline auto getOffset() const -> const Offset&;  // read-only access to mOffset
+};
+
+}  // namespace Neon::domain::details::dGridSoA
+
+#include "dIndexSoA_imp.h"
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h
new file mode 100644
index 00000000..790608c7
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dIndexSoA_imp.h
@@ -0,0 +1,50 @@
+#pragma once
+#include "Neon/core/core.h"
+
+namespace Neon::domain::details::dGridSoA {
+
+NEON_CUDA_HOST_DEVICE inline dIndexSoA::
+    dIndexSoA(const Location& location,
+              Offset const&   offset)
+    : mLocation(location),  // 3D location stored as given
+      mOffset(offset)       // linear offset stored as given
+{
+}
+
+NEON_CUDA_HOST_DEVICE inline dIndexSoA::
+    dIndexSoA(const Location::Integer& x,
+              const Location::Integer& y,
+              const Location::Integer& z,
+              Offset const&            offset)
+    : mLocation(x, y, z),
+      mOffset(offset)
+{
+    // Component-wise overload: the location is assembled from (x, y, z)
+    // and the linear offset is stored exactly as provided.
+}
+
+NEON_CUDA_HOST_DEVICE inline auto dIndexSoA::
+    setLocation() -> Location&
+{
+    return mLocation;  // mutable reference: callers write the 3D location through it
+}
+
+NEON_CUDA_HOST_DEVICE inline auto dIndexSoA::
+    setOffset() -> Offset&
+{
+    return mOffset;  // mutable reference: callers write the linear offset through it
+}
+
+NEON_CUDA_HOST_DEVICE inline auto dIndexSoA::
+    getLocation() const -> const Location&
+{
+    return mLocation;  // read-only view of the stored 3D location
+}
+
+NEON_CUDA_HOST_DEVICE inline auto dIndexSoA::
+    getOffset()
+        const -> const Offset&
+{
+    return mOffset;  // read-only view of the stored linear offset
+}
+}  // namespace Neon::domain::details::dGridSoA
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h
new file mode 100644
index 00000000..15c914a3
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dPartitionSoA.h
@@ -0,0 +1,365 @@
+#pragma once
+#include <assert.h>
+#include "Neon/core/core.h"
+#include "Neon/core/types/Macros.h"
+#include "Neon/domain/details/dGrid/dGrid.h"
+#include "Neon/domain/interface/NghData.h"
+#include "Neon/set/DevSet.h"
+#include "Neon/sys/memory/CudaIntrinsics.h"
+#include "cuda_fp16.h"
+#include "dIndexSoA.h"
+
+namespace Neon::domain::details::dGridSoA {
+
+template <typename T,
+          int C = 1>
+class dPartitionSoA
+{
+   public:
+    using Idx = dIndexSoA;
+    using NghData = Neon::domain::NghData<T>;
+    using Pitch = uint32_4d;
+    using NghIdx = int8_3d;
+    using Type = T;
+
+    dPartitionSoA()
+    {  // Empty body: no data member is initialised by this constructor.
+    }
+
+    dPartitionSoA(Neon::domain::details::dGrid::dPartition<T, C>& dPartitionOriginal)
+    {  // Builds this partition from an existing dGrid partition by copying its descriptors.
+        mDataView = dPartitionOriginal.getDataView();
+        mMem = dPartitionOriginal.mem();  // pointer copy only; presumably the buffer stays owned by the dGrid field - confirm
+        mDim = dPartitionOriginal.dim();
+        mZHaloRadius = dPartitionOriginal.halo().z;  // only the z component of the halo is kept
+        mPitch = dPartitionOriginal.getPitchData().template newType<Pitch::Integer>();  // converted to Pitch's integer type
+        mPrtID = dPartitionOriginal.prtID();
+        mOrigin = dPartitionOriginal.origin();
+        mCardinality = dPartitionOriginal.cardinality();
+        mFullGridSize = dPartitionOriginal.getDomainSize();
+        mStencil = dPartitionOriginal.helpGetGlobalToLocalOffets();  // table later indexed by nghID in nghVal()
+    }
+
+    inline NEON_CUDA_HOST_DEVICE auto
+    prtID()
+        const -> int
+    {
+        return mPrtID;  // value copied from the dGrid partition's prtID() by the converting constructor
+    }
+
+    inline NEON_CUDA_HOST_DEVICE auto
+    cardinality()
+        const -> int
+    {
+        return mCardinality;  // value copied from the dGrid partition's cardinality() by the converting constructor
+    }
+
+    inline NEON_CUDA_HOST_DEVICE auto
+    getPitchData()
+        const -> const Pitch&
+    {
+        return mPitch;  // 4-component pitch: x, y, z are used for neighbour jumps, w for the cardinality jump
+    }
+
+    inline NEON_CUDA_HOST_DEVICE auto
+    getPitch(const Idx& idx,
+             int        cardinality)
+        const -> Idx::Offset
+    {  // Linear position in mMem of component `cardinality` of the element addressed by idx.
+        return idx.getOffset() + cardinality * mPitch.w;  // mPitch.w is unsigned: the sum is narrowed back to Idx::Offset
+    }
+
+    inline NEON_CUDA_HOST_DEVICE auto
+    dim()
+        const -> const Neon::index_3d
+    {
+        return mDim;  // returned by value; copied from the dGrid partition's dim() by the converting constructor
+    }
+
+    inline NEON_CUDA_HOST_DEVICE auto
+    halo()
+        const -> const Neon::index_3d
+    {
+        return Neon::index_3d(0, 0, mZHaloRadius);  // only the z halo radius is stored; x and y are reported as 0
+    }
+
+    inline NEON_CUDA_HOST_DEVICE auto
+    origin()
+        const -> const Neon::index_3d
+    {
+        return mOrigin;  // returned by value; added to local locations by getGlobalIndex()
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(const Idx& gidx,           /**< index of the current element */
+               NghIdx     nghOffset,      /**< 3D offset of the neighbour, known only at run time */
+               int        card,           /**< cardinality (component) to read */
+               const T&   alternativeVal) /**< value returned when the neighbour is not valid */
+        const -> NghData
+    {
+        Idx        gidxNgh;
+        const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh);
+        T          val = alternativeVal;
+        if (isValidNeighbour) {
+            val = operator()(gidxNgh, card);  // memory is read only for a valid neighbour
+        }
+        return NghData(val, isValidNeighbour);
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(const Idx& gidx,      /**< index of the current element */
+               NghIdx     nghOffset, /**< 3D offset of the neighbour, known only at run time */
+               int        card)      /**< cardinality (component) to read */
+        const -> NghData
+    {
+        Idx        gidxNgh;
+        const bool isValidNeighbour = helpGetNghIdx(gidx, nghOffset, gidxNgh);
+        T          val{};  // value-initialised: an invalid neighbour must not hand back an indeterminate value
+        if (isValidNeighbour) {
+            val = operator()(gidxNgh, card);  // memory is read only for a valid neighbour
+        }
+        return NghData(val, isValidNeighbour);
+    }
+
+    template <int xOff,
+              int yOff,
+              int zOff,
+              typename LambdaVALID,
+              typename LambdaNOTValid = void*>
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(const Idx&     gidx,                    /**< index of the current element */
+               int            card,                    /**< cardinality (component) to read */
+               LambdaVALID    funIfValid,              /**< called with the neighbour value when it is valid */
+               LambdaNOTValid funIfNOTValid = nullptr) /**< optional; called with no argument when it is not valid */
+        const -> std::enable_if_t<std::is_invocable_v<LambdaVALID, T>, void>
+    {
+        Idx        gidxNgh;
+        const bool isValidNeighbour = helpGetNghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
+        if (isValidNeighbour) {
+            T val = this->operator()(gidxNgh, card);
+            funIfValid(val);
+        }
+        if constexpr (!std::is_same_v<LambdaNOTValid, void*>) {  // void* is the "no callback supplied" default
+            if (!isValidNeighbour) {
+                funIfNOTValid();
+            }
+        }
+    }
+
+    template <int xOff, int yOff, int zOff>
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(const Idx& gidx, /**< index of the current element */
+               int        card) /**< cardinality (component) to read */
+        const -> NghData
+    {  // Compile-time offset variant: the result is invalidated when the neighbour is outside the grid.
+        NghData    res;
+        Idx        gidxNgh;
+        const bool isValidNeighbour = helpGetNghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
+        if (isValidNeighbour) {
+            T val = operator()(gidxNgh, card);
+            res.set(val, true);
+        } else {
+            res.invalidate();
+        }
+        return res;
+    }
+
+    template <int xOff, int yOff, int zOff>
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(const Idx& gidx,         /**< index of the current element */
+               int        card,         /**< cardinality (component) to read */
+               T const&   defaultValue) /**< value carried by the result when the neighbour is not valid */
+        const -> NghData
+    {
+        NghData    res(defaultValue, false);  // starts as "not valid" holding the default value
+        Idx        gidxNgh;
+        const bool isValidNeighbour = helpGetNghIdx<xOff, yOff, zOff>(gidx, gidxNgh);
+        if (isValidNeighbour) {
+            T val = operator()(gidxNgh, card);
+            res.set(val, true);
+        }
+        return res;
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    nghVal(const Idx& gidx,           /**< index of the current element */
+           uint8_t    nghID,          /**< position of the neighbour in the mStencil table */
+           int        card,           /**< cardinality (component) to read */
+           const T&   alternativeVal) /**< value returned when the neighbour is not valid */
+        const -> NghData
+    {
+        NghIdx nghOffset = mStencil[nghID];  // no bounds check: nghID must be a valid position in the table
+        return getNghData(gidx, nghOffset, card, alternativeVal);
+    }
+
+    /**
+     * Get the index of the neighbor given the offset.
+     * The neighbour is valid when its global position lies inside the full grid.
+     * @param[in] gidx Index of the current element
+     * @param[in] nghOffset Offset of the neighbor of interest from the current element
+     * @param[out] neighbourIdx Index of the neighbor (written even when it is not valid)
+     * @return Whether the neighbour is valid
+     */
+    NEON_CUDA_HOST_DEVICE inline auto
+    helpGetNghIdx(const Idx&    gidx,
+                  const NghIdx& nghOffset,
+                  Idx&          neighbourIdx)
+        const -> bool
+    {
+        Neon::index_3d cartesian(gidx.getLocation().x + nghOffset.x,
+                                 gidx.getLocation().y + nghOffset.y,
+                                 gidx.getLocation().z + nghOffset.z);
+
+        // The pitch components are unsigned: cast them so that negative offsets are
+        // handled in signed arithmetic, as the compile-time overload already does.
+        neighbourIdx = Idx(cartesian, gidx.getOffset() +
+                                          nghOffset.x * static_cast<int>(getPitchData().x) +
+                                          nghOffset.y * static_cast<int>(getPitchData().y) +
+                                          nghOffset.z * static_cast<int>(getPitchData().z));
+
+        // Validity is decided on the global position of the neighbour.
+        Neon::index_3d const nghCartesianIdx = getGlobalIndex(neighbourIdx);
+
+        const bool isInsideLowerBound = (nghCartesianIdx.x >= 0) &&
+                                        (nghCartesianIdx.y >= 0) &&
+                                        (nghCartesianIdx.z >= 0);
+
+        const bool isInsideUpperBound = (nghCartesianIdx.x < mFullGridSize.x) &&
+                                        (nghCartesianIdx.y < mFullGridSize.y) &&
+                                        (nghCartesianIdx.z < mFullGridSize.z);
+
+        return isInsideLowerBound && isInsideUpperBound;
+    }
+
+    template <int xOff, int yOff, int zOff>
+    NEON_CUDA_HOST_DEVICE inline auto
+    helpGetNghIdx(const Idx& gidx,    /**< index of the current element */
+                  Idx&       gidxNgh) /**< receives the neighbour index; written even when it is not valid */
+        const -> bool
+    {  // Compile-time offset variant: only the bounds that a given offset sign can violate are tested.
+        {
+            Neon::index_3d cartesian(gidx.getLocation().x + xOff,
+                                     gidx.getLocation().y + yOff,
+                                     gidx.getLocation().z + zOff);
+            gidxNgh = Idx(cartesian, gidx.getOffset() +
+                                         xOff * static_cast<int>(getPitchData().x) +
+                                         yOff * static_cast<int>(getPitchData().y) +
+                                         zOff * static_cast<int>(getPitchData().z));
+        }
+
+        bool isValidNeighbour = true;  // stays true when all three offsets are zero
+        if constexpr (xOff > 0) {  // moving up in x can only cross the upper bound
+            int constexpr direction = Neon::index_3d::directionX;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
+        }
+        if constexpr (xOff < 0) {  // moving down in x can only cross the lower bound
+            int constexpr direction = Neon::index_3d::directionX;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
+        }
+        if constexpr (yOff > 0) {
+            int constexpr direction = Neon::index_3d::directionY;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
+        }
+        if constexpr (yOff < 0) {
+            int constexpr direction = Neon::index_3d::directionY;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
+        }
+        if constexpr (zOff > 0) {
+            int constexpr direction = Neon::index_3d::directionZ;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection < mFullGridSize.v[direction] && isValidNeighbour;
+        }
+        if constexpr (zOff < 0) {
+            int constexpr direction = Neon::index_3d::directionZ;
+            int const cartesianByDirection = getGlobalIndexByDirection<direction>(gidxNgh);
+            isValidNeighbour = cartesianByDirection >= 0 && isValidNeighbour;
+        }
+        return isValidNeighbour;
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    mem()
+        -> T*
+    {
+        return mMem;  // base pointer of the partition's buffer, mutable access
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    mem() const
+        -> const T*
+    {
+        return mMem;  // base pointer of the partition's buffer, read-only access
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    mem(const Idx& cell,           /**< element to address */
+        int        cardinalityIdx) /**< cardinality (component) to address */
+        -> T*
+    {
+        Idx::Offset p = getPitch(cell, cardinalityIdx);
+        return mMem + p;  // pointer to the element; mMem[p] would be a T&, not the T* this overload returns
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    operator()(const Idx& cell,           /**< element to access */
+               int        cardinalityIdx) /**< cardinality (component) to access */
+        -> T&
+    {
+        Idx::Offset p = getPitch(cell, cardinalityIdx);
+        return mMem[p];  // writable reference; no bounds check is performed
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    operator()(const Idx& cell,           /**< element to access */
+               int        cardinalityIdx) /**< cardinality (component) to access */
+        const -> const T&
+    {
+        Idx::Offset p = getPitch(cell, cardinalityIdx);
+        return mMem[p];  // read-only reference; no bounds check is performed
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto getGlobalIndex(const Idx& local)
+        const -> Neon::index_3d
+    {  // Global position = local location + partition origin, with the z halo radius removed.
+        Neon::index_3d result = local.mLocation + mOrigin;
+        result.z -= mZHaloRadius;  // local z values include the halo shift applied by the span
+        return result;
+    }
+
+    template <int direction>
+    NEON_CUDA_HOST_DEVICE inline auto getGlobalIndexByDirection(const Idx& local)
+        const -> int
+    {  // Single-component version of getGlobalIndex(): only the requested direction is computed.
+        if constexpr (Neon::index_3d::directionZ != direction) {
+            return local.mLocation.v[direction] +
+                   mOrigin.v[direction];
+        } else {
+            return local.mLocation.v[Neon::index_3d::directionZ] +
+                   mOrigin.v[Neon::index_3d::directionZ] -
+                   mZHaloRadius;  // z is the only direction corrected by the halo radius
+        }
+    }
+
+    NEON_CUDA_HOST_DEVICE inline auto getDomainSize()
+        const -> Neon::index_3d
+    {
+        return mFullGridSize;  // size used as the upper bound in the neighbour validity checks
+    }
+
+    Neon::DataView        mDataView;
+    T* NEON_RESTRICT      mMem;
+    Neon::index_3d        mDim;
+    int                   mZHaloRadius;
+    Pitch                 mPitch;
+    int                   mPrtID;
+    Neon::index_3d        mOrigin;
+    int                   mCardinality;
+    Neon::index_3d        mFullGridSize;
+    NghIdx* NEON_RESTRICT mStencil;
+};
+
+}  // namespace Neon::domain::details::dGridSoA
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h
new file mode 100644
index 00000000..3aee038c
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA.h
@@ -0,0 +1,57 @@
+#pragma once
+#include "Neon/set/DevSet.h"
+#include "dIndexSoA.h"
+#include "Neon/domain/details/dGrid/dSpan.h"
+
+namespace Neon::domain::details::dGridSoA {
+
+/**
+ * Abstraction that represents the cell space of a partition.
+ * This abstraction is used by the Neon lambda executor to
+ * run containers on a dGridSoA.
+ */
+class dSpanSoA
+{
+   public:
+    using Idx = dIndexSoA;
+
+    static constexpr Neon::set::details::ExecutionThreadSpan executionThreadSpan = Neon::set::details::ExecutionThreadSpan::d3;
+    using ExecutionThreadSpanIndexType = int32_t;  // NOTE(review): setAndValidate() takes uint32_t coordinates - confirm the intended type
+
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    setAndValidate(Idx&            idx, /**< index to fill with location and linear offset */
+                   const uint32_t& x,   /**< x coordinate of the executing thread */
+                   const uint32_t& y,   /**< y coordinate of the executing thread */
+                   const uint32_t& z) const /**< z coordinate of the executing thread */
+        -> bool;
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    helpGetDataView()
+        const -> Neon::DataView const&;
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    helpGetZHaloRadius()
+        const -> int const&;
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    helpGetZBoundaryRadius()
+        const -> int const&;
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    helpGetDim()
+        const -> Neon::index_3d const&;
+
+    NEON_CUDA_HOST_DEVICE inline auto
+    helpInit(Neon::domain::details::dGrid::dSpan const&) ->void;  // copies all four members from a dGrid span
+
+   private:
+    Neon::DataView mDataView;        // selects the branch taken by setAndValidate()
+    int            mZHaloRadius;     // added to z in every data view
+    int            mZBoundaryRadius; // used by the INTERNAL and BOUNDARY views only
+    Neon::index_3d mDim /** Dimension of the span, its values depends on the mDataView*/;
+};
+
+}  // namespace Neon::domain::details::dGridSoA
+
+#include "dSpanSoA_imp.h"
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h
new file mode 100644
index 00000000..f760adb5
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/details/dGridSoA/dSpanSoA_imp.h
@@ -0,0 +1,86 @@
+#pragma once
+
+namespace Neon::domain::details::dGridSoA {
+
+NEON_CUDA_HOST_DEVICE inline auto
+dSpanSoA::setAndValidate(Idx&            idx, /**< receives the location and the linear offset */
+                         const uint32_t& x,   /**< x coordinate of the executing thread */
+                         const uint32_t& y,   /**< y coordinate of the executing thread */
+                         const uint32_t& z)   /**< z coordinate of the executing thread */
+    const -> bool
+{
+    idx.setLocation().x = int(x);
+    idx.setLocation().y = int(y);
+    idx.setLocation().z = int(z);
+
+    bool  isValid = idx.getLocation() < mDim;  // tested on the raw coordinates, before any z shift below
+
+    switch (mDataView) {
+        case Neon::DataView::STANDARD: {
+            idx.setLocation().z += mZHaloRadius;  // presumably skips the lower z halo slices - TODO confirm
+            idx.setOffset() = idx.getLocation().x +
+                              idx.getLocation().y * mDim.x +
+                              idx.getLocation().z * mDim.x * mDim.y;  // linear offset with x varying fastest
+            break ;
+        }
+        case Neon::DataView::INTERNAL: {
+            idx.setLocation().z += mZHaloRadius + mZBoundaryRadius;  // additionally shifted by the boundary radius
+            idx.setOffset() = idx.getLocation().x +
+                              idx.getLocation().y * mDim.x +
+                              idx.getLocation().z * mDim.x * mDim.y;
+            break ;
+        }
+        case Neon::DataView::BOUNDARY: {
+            idx.setLocation().z += idx.getLocation().z < mZBoundaryRadius
+                                       ? 0
+                                       : (mDim.z - 1) + (-1 * mZBoundaryRadius /* we remove zBoundaryRadius as the first zBoundaryRadius will manage the lower slices */);
+            idx.setLocation().z += mZHaloRadius;  // applied after the lower/upper selection above
+            idx.setOffset() = idx.getLocation().x +
+                              idx.getLocation().y * mDim.x +
+                              idx.getLocation().z * mDim.x * mDim.y;
+            break ;
+        }
+        default: {  // any other data view: z is not shifted and the offset is left untouched
+        }
+    }
+    return isValid;  // the location and offset are written even when this is false
+}
+
+NEON_CUDA_HOST_DEVICE inline auto
+dSpanSoA::helpGetDataView()
+    const -> Neon::DataView const&
+{
+    return mDataView;  // data view this span iterates over
+}
+
+NEON_CUDA_HOST_DEVICE inline auto
+dSpanSoA::helpGetZHaloRadius()
+    const -> int const&
+{
+    return mZHaloRadius;  // z shift applied by setAndValidate() in every data view
+}
+
+NEON_CUDA_HOST_DEVICE inline auto
+dSpanSoA::helpGetZBoundaryRadius()
+    const -> int const&
+{
+    return mZBoundaryRadius;  // radius used by the INTERNAL and BOUNDARY branches of setAndValidate()
+}
+
+NEON_CUDA_HOST_DEVICE inline auto
+dSpanSoA::helpGetDim()
+    const -> Neon::index_3d const&
+{
+    return mDim;  // span dimension used for the validity test and the offset computation
+}
+
+NEON_CUDA_HOST_DEVICE inline auto dSpanSoA::helpInit(Neon::domain::details::dGrid::dSpan const& dspan) -> void
+{  // Mirrors the configuration of an existing dGrid span; every member of this span is overwritten.
+    mDataView = dspan.helpGetDataView();
+    mZHaloRadius = dspan.helpGetZHaloRadius();
+    mZBoundaryRadius = dspan.helpGetZBoundaryRadius();
+    mDim = dspan.helpGetDim();
+}
+
+
+}  // namespace Neon::domain::details::dGridSoA
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h
index c89cfdc3..2427dc57 100644
--- a/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/eGrid/eField_imp.h
@@ -65,7 +65,8 @@ eField<T, C>::eField(const std::string&         fieldUserName,
                                              mData->grid->getConnectivityField().getPartition(execution, setIdx, Neon::DataView::STANDARD).mem(),
                                              mData->grid->getGlobalMappingField().getPartition(execution, setIdx, Neon::DataView::STANDARD).mem(),
                                              mData->grid->getStencil3dTo1dOffset().rawMem(execution, setIdx),
-                                             mData->grid->getStencil().getRadius());
+                                             mData->grid->getStencil().getRadius(),
+                                             mData->grid->getDimension());
             });
     }
 
diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/eGrid.h b/libNeonDomain/include/Neon/domain/details/eGrid/eGrid.h
index 346c2121..8a6269eb 100644
--- a/libNeonDomain/include/Neon/domain/details/eGrid/eGrid.h
+++ b/libNeonDomain/include/Neon/domain/details/eGrid/eGrid.h
@@ -84,7 +84,8 @@ class eGrid : public Neon::domain::interface::GridBaseTemplate<eGrid, eIndex>
           const SparsityPattern&       activeCellLambda /**< InOrOutLambda({x,y,z}->{true, false}) */,
           const Neon::domain::Stencil& stencil /**< Stencil used by any computation on the grid */,
           const Vec_3d<double>&        spacing = Vec_3d<double>(1, 1, 1) /**< Spacing, i.e. size of a voxel */,
-          const Vec_3d<double>&        origin = Vec_3d<double>(0, 0, 0) /**< Origin  */);
+          const Vec_3d<double>&        origin = Vec_3d<double>(0, 0, 0) /**< Origin  */,
+          Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep);
 
     eGrid(const Neon::Backend&               backend /**< Target for computation */,
           const Neon::int32_3d&              dimension /**< Dimension of the bounding box containing the domain */,
diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/eGrid_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/eGrid_imp.h
index a12f87ce..1e5c444b 100644
--- a/libNeonDomain/include/Neon/domain/details/eGrid/eGrid_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/eGrid/eGrid_imp.h
@@ -10,7 +10,8 @@ eGrid::eGrid(const Neon::Backend&         backend,
              const ActiveCellLambda&      activeCellLambda,
              const Neon::domain::Stencil& stencil,
              const Vec_3d<double>&        spacing,
-             const Vec_3d<double>&        origin)
+             const Vec_3d<double>&        origin,
+             Neon::domain::tool::spaceCurves::EncoderType spaceFillingCode )
 {
     mData = std::make_shared<Data>(backend);
     mData->stencil = stencil;
@@ -29,7 +30,9 @@ eGrid::eGrid(const Neon::Backend&         backend,
                               nElementsPerPartition,
                               Neon::index_3d(256, 1, 1),
                               spacing,
-                              origin);
+                              origin,
+                              spaceFillingCode,
+                              {1,1,1});
     }
 
 
@@ -40,6 +43,7 @@ eGrid::eGrid(const Neon::Backend&         backend,
         1,
         dimension,
         stencil,
+        spaceFillingCode,
         1);
 
 
@@ -124,7 +128,9 @@ eGrid::eGrid(const Neon::Backend&         backend,
                               nElementsPerPartition,
                               defaultBlockSize,
                               spacing,
-                              origin);
+                              origin,
+                              spaceFillingCode,
+                              {1,1,1});
     }
 }
 
diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h
index 012a3588..4381a24c 100644
--- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h
+++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition.h
@@ -59,7 +59,7 @@ class ePartition
      *  |
      *  |   Connectivity table has the same layout of a field with cardinality equal to
      *  |   the number of neighbours and an SoA layout. Let's call this field nghField.
-     *  |   nghField(e, nghIdx) is the eIdx_t of the neighbour element as in a STANDARD
+     *  |   nghField(e, nghIdx) is the eIdx_t of the neighbour element as in a STANDARD
      *  |   view.
      *  |--)
      */
@@ -186,8 +186,21 @@ class ePartition
     NEON_CUDA_HOST_DEVICE inline auto
     getNghData(Idx eId,
                int card,
-               T defaultValue)
+               T   defaultValue)
         const -> NghData;
+
+    template <int xOff,
+              int yOff,
+              int zOff,
+              typename LambdaVALID,
+              typename LambdaNOTValid = void*>
+    NEON_CUDA_HOST_DEVICE inline auto
+    getNghData(const Idx&     gidx,
+               int            card,
+               LambdaVALID    funIfValid,
+               LambdaNOTValid funIfNOTValid = nullptr)
+        const -> std::enable_if_t<std::is_invocable_v<LambdaVALID, T> && (std::is_invocable_v<LambdaNOTValid, T> || std::is_same_v<LambdaNOTValid, void*>), void>;
+
     /**
      * Check is the
      * @tparam dataView_ta
@@ -211,6 +224,10 @@ class ePartition
     getGlobalIndex(Idx Idx) const
         -> Neon::index_3d;
 
+    NEON_CUDA_HOST_DEVICE inline auto
+    getDomainSize()
+        const -> Neon::index_3d;
+
     NEON_CUDA_HOST_DEVICE inline auto
     mem() const
         -> const T*;
@@ -231,7 +248,8 @@ class ePartition
                         Offset*         connRaw,
                         Neon::index_3d* toGlobal,
                         int8_t*         stencil3dTo1dOffset,
-                        int32_t         stencilRadius);
+                        int32_t         stencilRadius,
+                        Neon::index_3d  domainSize);
 
     /**
      * Returns a pointer to element eId with target cardinality cardinalityIdx
@@ -256,11 +274,6 @@ class ePartition
     getOffset(Idx eId, int cardinalityIdx) const
         -> Offset;
 
-    /**
-     * Returns raw pointer of the field
-     * @tparam dataView_ta
-     * @return
-     */
 
    protected:
     //-- [INTERNAL DATA] ----------------------------------------------------------------------------
@@ -278,6 +291,7 @@ class ePartition
     int8_t*         mStencil3dTo1dOffset = {nullptr};
     int32_t         mStencilTableYPitch;
     int32_t         mStencilRadius;  // Shift to be applied to all 3d offset component to access mStencil3dTo1dOffset table
+    Neon::index_3d  mDomainSize;
 };
 }  // namespace Neon::domain::details::eGrid
 
diff --git a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h
index 0063ee9e..29980a61 100644
--- a/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/eGrid/ePartition_imp.h
@@ -37,43 +37,43 @@ ePartition<T, C>::cardinality() const
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::operator()(eIndex eId, int cardinalityIdx) const
+ePartition<T, C>::operator()(eIndex gidx, int cardinalityIdx) const
     -> T
 {
-    Offset jump = getOffset(eId, cardinalityIdx);
+    Offset jump = getOffset(gidx, cardinalityIdx);
     return mMem[jump];
 }
 
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::operator()(eIndex eId, int cardinalityIdx) -> T&
+ePartition<T, C>::operator()(eIndex gidx, int cardinalityIdx) -> T&
 {
-    Offset jump = getOffset(eId, cardinalityIdx);
+    Offset jump = getOffset(gidx, cardinalityIdx);
     return mMem[jump];
 }
 
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::getNghData(eIndex eId,
+ePartition<T, C>::getNghData(eIndex gidx,
                              NghIdx nghIdx,
                              int    card)
     const -> NghData
 {
-    eIndex     eIdxNgh;
-    const bool isValidNeighbour = isValidNgh(eId, nghIdx, eIdxNgh);
+    eIndex     gidxxNgh;
+    const bool isValidNeighbour = isValidNgh(gidx, nghIdx, gidxxNgh);
     if (isValidNeighbour) {
-        T val = this->operator()(eIdxNgh, card);
+        T val = this->operator()(gidxxNgh, card);
         return NghData(val, isValidNeighbour);
     }
-    return NghData(isValidNeighbour);
+    return NghData();
 }
 
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::getNghData(eIndex               eId,
+ePartition<T, C>::getNghData(eIndex               gidx,
                              const Neon::int8_3d& ngh3dIdx,
                              int                  card)
     const -> NghData
@@ -82,7 +82,7 @@ ePartition<T, C>::getNghData(eIndex               eId,
                      (ngh3dIdx.y + mStencilRadius) * mStencilTableYPitch +
                      (ngh3dIdx.z + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch;
     NghIdx  nghIdx = mStencil3dTo1dOffset[tablePithc];
-    NghData res = getNghData(eId, nghIdx, card);
+    NghData res = getNghData(gidx, nghIdx, card);
 
     return res;
 }
@@ -91,15 +91,15 @@ template <typename T,
           int C>
 template <int xOff, int yOff, int zOff>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::getNghData(eIndex               eId,
-                             int                  card)
+ePartition<T, C>::getNghData(eIndex gidx,
+                             int    card)
     const -> NghData
 {
     int tablePithc = (xOff + mStencilRadius) +
                      (yOff + mStencilRadius) * mStencilTableYPitch +
                      (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch;
     NghIdx  nghIdx = mStencil3dTo1dOffset[tablePithc];
-    NghData res = getNghData(eId, nghIdx, card);
+    NghData res = getNghData(gidx, nghIdx, card);
 
     return res;
 }
@@ -108,37 +108,66 @@ template <typename T,
           int C>
 template <int xOff, int yOff, int zOff>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::getNghData(eIndex               eId,
-                             int                  card,
-                             T defaultVal)
+ePartition<T, C>::getNghData(eIndex gidx,
+                             int    card,
+                             T      defaultVal)
     const -> NghData
 {
     int tablePithc = (xOff + mStencilRadius) +
                      (yOff + mStencilRadius) * mStencilTableYPitch +
                      (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch;
     NghIdx  nghIdx = mStencil3dTo1dOffset[tablePithc];
-    NghData res = getNghData(eId, nghIdx, card);
+    NghData res = getNghData(gidx, nghIdx, card);
     if (!res.isValid()) {
         res.set(defaultVal, false);
     }
     return res;
 }
 
+template <typename T,
+          int C>
+template <int xOff,
+          int yOff,
+          int zOff,
+          typename LambdaVALID,
+          typename LambdaNOTValid>
+NEON_CUDA_HOST_DEVICE inline auto
+ePartition<T, C>::getNghData(const Idx&     gidx,          /**< index of the current element */
+                             int            card,          /**< cardinality (component) to read */
+                             LambdaVALID    funIfValid,    /**< called with the neighbour value when it is valid */
+                             LambdaNOTValid funIfNOTValid) /**< optional; called with no argument otherwise */
+    const -> std::enable_if_t<std::is_invocable_v<LambdaVALID, T> && (std::is_invocable_v<LambdaNOTValid, T> || std::is_same_v<LambdaNOTValid, void*>), void>
+{  // NOTE(review): the constraint asks for LambdaNOTValid invocable with a T, yet it is called below with no argument - confirm which is intended.
+    int tablePithc = (xOff + mStencilRadius) +
+                     (yOff + mStencilRadius) * mStencilTableYPitch +
+                     (zOff + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch;  // position of the 3D offset in the lookup table
+    NghIdx  nghIdx = mStencil3dTo1dOffset[tablePithc];
+    NghData res = getNghData(gidx, nghIdx, card);
+    if (res.isValid()) {
+        funIfValid(res.getData());
+        return;
+    }
+    if constexpr (!std::is_same_v<LambdaNOTValid, void*>) {  // void* is the "no callback supplied" default
+        funIfNOTValid();
+    }
+    return;
+}
+
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::getNghIndex(eIndex               eId,
+ePartition<T, C>::getNghIndex(eIndex               gidx,
                               const Neon::int8_3d& ngh3dIdx,
-                              eIndex&              eIdxNgh) const -> bool
+                              eIndex&              gidxxNgh) const -> bool
 {
     int tablePithc = (ngh3dIdx.x + mStencilRadius) +
                      (ngh3dIdx.y + mStencilRadius) * mStencilTableYPitch +
                      (ngh3dIdx.z + mStencilRadius) * mStencilTableYPitch * mStencilTableYPitch;
     NghIdx     nghIdx = mStencil3dTo1dOffset[tablePithc];
     eIndex     tmpEIdxNgh;
-    const bool isValidNeighbour = isValidNgh(eId, nghIdx, tmpEIdxNgh);
+    const bool isValidNeighbour = isValidNgh(gidx, nghIdx, tmpEIdxNgh);
     if (isValidNeighbour) {
-        eIdxNgh = tmpEIdxNgh;
+        gidxxNgh = tmpEIdxNgh;
     }
     return isValidNeighbour;
 }
@@ -146,17 +175,17 @@ ePartition<T, C>::getNghIndex(eIndex               eId,
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::isValidNgh(eIndex  eId,
+ePartition<T, C>::isValidNgh(eIndex  gidx,
                              NghIdx  nghIdx,
                              eIndex& neighbourIdx) const
     -> bool
 {
-    const eIndex::Offset connectivityJumo = mCountAllocated * nghIdx + eId.helpGet();
+    const eIndex::Offset connectivityJumo = mCountAllocated * nghIdx + gidx.helpGet();
     neighbourIdx.helpSet() = NEON_CUDA_CONST_LOAD((mConnectivity + connectivityJumo));
     const bool isValidNeighbour = (neighbourIdx.mIdx > -1);
-    //    printf("(prtId %d) getNghData id %d eIdxNgh %d connectivityJumo %d\n",
+    //    printf("(prtId %d) getNghData id %d gidxxNgh %d connectivityJumo %d\n",
     //           mPrtID,
-    //           eId.mIdx, neighbourIdx.mIdx, connectivityJumo);
+    //           gidx.mIdx, neighbourIdx.mIdx, connectivityJumo);
     return isValidNeighbour;
 }
 
@@ -181,7 +210,8 @@ ePartition<T, C>::ePartition(int             prtId,
                              Offset*         connRaw,
                              Neon::index_3d* toGlobal,
                              int8_t*         stencil3dTo1dOffset,
-                             int32_t         stencilRadius)
+                             int32_t         stencilRadius,
+                             Neon::index_3d  domainSize)
 {
     mPrtID = prtId;
     mMem = mem;
@@ -196,25 +226,26 @@ ePartition<T, C>::ePartition(int             prtId,
     mStencilTableYPitch = 2 * stencilRadius + 1;
 
     mStencilRadius = stencilRadius;
+    mDomainSize = domainSize;
 }
 
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE auto
-ePartition<T, C>::pointer(eIndex eId, int cardinalityIdx) const
+ePartition<T, C>::pointer(eIndex gidx, int cardinalityIdx) const
     -> const Type*
 {
-    Offset jump = getOffset(eId, cardinalityIdx);
+    Offset jump = getOffset(gidx, cardinalityIdx);
     return mMem + jump;
 }
 
 template <typename T,
           int C>
 NEON_CUDA_HOST_DEVICE inline auto
-ePartition<T, C>::getOffset(eIndex eId, int cardinalityIdx) const
+ePartition<T, C>::getOffset(eIndex gidx, int cardinalityIdx) const
     -> Offset
 {
-    return Offset(eId.helpGet() * mPitch.x + cardinalityIdx * mPitch.y);
+    return Offset(gidx.helpGet() * mPitch.x + cardinalityIdx * mPitch.y);
 }
 
 template <typename T,
@@ -235,4 +266,13 @@ ePartition<T, C>::mem() const
     return mMem;
 }
 
+template <typename T,
+          int C>
+NEON_CUDA_HOST_DEVICE inline auto
+ePartition<T, C>::getDomainSize()
+    const -> Neon::index_3d
+{
+    return mDomainSize;
+}
+
 }  // namespace Neon::domain::details::eGrid
diff --git a/libNeonDomain/include/Neon/domain/details/mGrid/mPartition_imp.h b/libNeonDomain/include/Neon/domain/details/mGrid/mPartition_imp.h
index 9473ca55..4ab44988 100644
--- a/libNeonDomain/include/Neon/domain/details/mGrid/mPartition_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/mGrid/mPartition_imp.h
@@ -33,7 +33,7 @@ mPartition<T, C>::mPartition(int                level,
                              NghIdx*            stencilNghIndex,
                              int*               refFactors,
                              int*               spacing)
-    : Neon::domain::details::bGrid::bPartition<T, C, kStaticBlock>(0, cardinality, mem, neighbourBlocks, mask, origin, stencilNghIndex),
+    : Neon::domain::details::bGrid::bPartition<T, C, kStaticBlock>(0, cardinality, mem, neighbourBlocks, mask, origin, stencilNghIndex, {0,0,0}),
       mLevel(level),
       mMemParent(memParent),
       mMemChild(memChild),
diff --git a/libNeonDomain/include/Neon/domain/details/sGrid/sGrid_imp.h b/libNeonDomain/include/Neon/domain/details/sGrid/sGrid_imp.h
index eed4c3bf..c76b2d42 100644
--- a/libNeonDomain/include/Neon/domain/details/sGrid/sGrid_imp.h
+++ b/libNeonDomain/include/Neon/domain/details/sGrid/sGrid_imp.h
@@ -41,7 +41,9 @@ sGrid<OuterGridT>::sGrid(const OuterGridT&                  outerGrid,
                                   nElementsPerPartition,
                                   defaultsBlockDim,
                                   outerGrid.getSpacing(),
-                                  outerGrid.getOrigin());
+                                  outerGrid.getOrigin(),
+                                  outerGrid.getSpaceCurve(),
+                                  outerGrid.getMemoryBlock());
 
     mStorage = std::make_shared<sStorage>();
     mStorage->init(outerGrid);
@@ -173,7 +175,9 @@ sGrid<OuterGridT>::sGrid(const OuterGridT&                  outerGrid,
                           mStorage->getCount(Neon::DataView::STANDARD),
                           defaultsBlockDim,
                           outerGrid.getSpacing(),
-                          outerGrid.getOrigin());
+                          outerGrid.getOrigin(),
+                          outerGrid.getSpaceCurve(),
+                          outerGrid.getMemoryBlock());
 }
 
 template <typename OuterGridT>
diff --git a/libNeonDomain/include/Neon/domain/interface/GridBase.h b/libNeonDomain/include/Neon/domain/interface/GridBase.h
index daa5d697..04837435 100644
--- a/libNeonDomain/include/Neon/domain/interface/GridBase.h
+++ b/libNeonDomain/include/Neon/domain/interface/GridBase.h
@@ -9,8 +9,8 @@
 #include "Neon/set/DevSet.h"
 
 #include "Neon/core/tools/io/ioToVti.h"
+#include "Neon/domain/tools/SpaceCurves.h"
 #include "Stencil.h"
-
 namespace Neon::domain::interface {
 
 /**
@@ -66,13 +66,6 @@ class GridBase
     auto getNumActiveCellsPerPartition() const
         -> const Neon::set::DataSet<size_t>&;
 
-    //    /**
-    //     * Return the number of cells stored per partition
-    //     * @return
-    //     */
-    //    auto getNumActiveCellsPerPartition() const
-    //        -> const Neon::set::DataSet<size_t>&;
-
     /**
      * Creates a DataSet object compatible with the number of GPU used by the grid.
      */
@@ -123,6 +116,8 @@ class GridBase
     auto getGridUID() const
         -> size_t;
 
+
+
     /**
      * Add the grid information in a Report object
      */
@@ -136,31 +131,40 @@ class GridBase
     auto getDefaultBlock() const
         -> const Neon::index_3d&;
 
+    auto getMemoryBlock() const
+        -> Neon::index_3d;
+
+    auto getSpaceCurve() const
+        -> Neon::domain::tool::spaceCurves::EncoderType;
 
    protected:
     /**
      * Protected constructor
      */
-    GridBase(const std::string&                gridImplementationName,
-             const Neon::Backend&              backend,
-             const Neon::index_3d&             dim,
-             const Neon::domain::Stencil&      stencil,
-             const Neon::set::DataSet<size_t>& nPartitionElements /**< Number of element per partition */,
-             const Neon::index_3d&             defaultBlockSize,
-             const Vec_3d<double>&             spacingData = Vec_3d<double>(1, 1, 1) /*! Spacing, i.e. size of a voxel */,
-             const Vec_3d<double>&             origin = Vec_3d<double>(0, 0, 0) /*!      Origin  */);
+    GridBase(const std::string&                           gridImplementationName,
+             const Neon::Backend&                         backend,
+             const Neon::index_3d&                        dim,
+             const Neon::domain::Stencil&                 stencil,
+             const Neon::set::DataSet<size_t>&            nPartitionElements /**< Number of element per partition */,
+             const Neon::index_3d&                        defaultBlockSize,
+             const Vec_3d<double>&                        spacingData /*! Spacing, i.e. size of a voxel */,
+             const Vec_3d<double>&                        origin /*!      Origin  */,
+             Neon::domain::tool::spaceCurves::EncoderType spaceCurve,
+             Neon::index_3d                               memoryBlock);
 
     /**
      * Protected initialization function used by derived classes to set some parameters.
      */
-    auto init(const std::string&                gridImplementationName /**< Name of the implementation, for example dGrid eGrid etc */,
-              const Neon::Backend&              backend /**< Backend used to create the grid */,
-              const Neon::index_3d&             dimension /**< Dimension of the grid */,
-              const Neon::domain::Stencil&      stencil /**< Union of all the stencil that will be used with the grid */,
-              const Neon::set::DataSet<size_t>& nPartitionElements /**< Elements associated to each partition */,
-              const Neon::index_3d&             defaultBlockSize /**< Default thread block size */,
-              const Vec_3d<double>&             spacingData /**< Grid spacing */,
-              const Vec_3d<double>&             origin /**< Position in space of the grid's origin */) -> void;
+    auto init(const std::string&                           gridImplementationName /**< Name of the implementation, for example dGrid eGrid etc */,
+              const Neon::Backend&                         backend /**< Backend used to create the grid */,
+              const Neon::index_3d&                        dimension /**< Dimension of the grid */,
+              const Neon::domain::Stencil&                 stencil /**< Union of all the stencil that will be used with the grid */,
+              const Neon::set::DataSet<size_t>&            nPartitionElements /**< Elements associated to each partition */,
+              const Neon::index_3d&                        defaultBlockSize /**< Default thread block size */,
+              const Vec_3d<double>&                        spacingData /**< Grid spacing */,
+              const Vec_3d<double>&                        origin /**< Position in space of the grid's origin */,
+              Neon::domain::tool::spaceCurves::EncoderType spaceCurve,
+              Neon::index_3d                               memoryBlock) -> void;
 
     /**
      * Protected method to set the default thread blocks size
@@ -175,6 +179,7 @@ class GridBase
         -> Neon::set::LaunchParameters&;
 
 
+
    private:
     struct Storage
     {
@@ -187,14 +192,16 @@ class GridBase
             index_3d blockDim;
         };
 
-        Neon::Backend              backend /**<            Backend used to create and run the grid. */;
-        Neon::index_3d             dimension /**<          Dimension of the grid                    */;
-        Neon::domain::Stencil      stencil /**<            Stencil used for the grid initialization */;
-        Neon::set::DataSet<size_t> nPartitionElements /**< Number of elements per partition         */;
-        Vec_3d<double>             spacing /**<            Spacing, i.e. size of a voxel            */;
-        Vec_3d<double>             origin /**<             Position in space of the grid's origin   */;
-        Defaults_t                 defaults;
-        std::string                gridImplementationName;
+        Neon::Backend                                backend /**<            Backend used to create and run the grid. */;
+        Neon::index_3d                               dimension /**<          Dimension of the grid                    */;
+        Neon::domain::Stencil                        stencil /**<            Stencil used for the grid initialization */;
+        Neon::set::DataSet<size_t>                   nPartitionElements /**< Number of elements per partition         */;
+        Vec_3d<double>                               spacing /**<            Spacing, i.e. size of a voxel            */;
+        Vec_3d<double>                               origin /**<             Position in space of the grid's origin   */;
+        Defaults_t                                   defaults;
+        std::string                                  gridImplementationName;
+        Neon::domain::tool::spaceCurves::EncoderType spaceCurve;
+        Neon::index_3d                               memoryBlock;
     };
 
     std::shared_ptr<Storage> mStorage;
diff --git a/libNeonDomain/include/Neon/domain/interface/NghData.h b/libNeonDomain/include/Neon/domain/interface/NghData.h
index 487c8fd7..b7de2fca 100644
--- a/libNeonDomain/include/Neon/domain/interface/NghData.h
+++ b/libNeonDomain/include/Neon/domain/interface/NghData.h
@@ -10,7 +10,7 @@ struct NghData
 {
     Type                  mData;
     bool                  mIsValid;
-    NEON_CUDA_HOST_DEVICE NghData(bool status = false)
+    NEON_CUDA_HOST_DEVICE NghData()
     {
         this->mIsValid = false;
     }
diff --git a/libNeonDomain/include/Neon/domain/tools/GridTransformer.h b/libNeonDomain/include/Neon/domain/tools/GridTransformer.h
index 90556fb9..47518f7a 100644
--- a/libNeonDomain/include/Neon/domain/tools/GridTransformer.h
+++ b/libNeonDomain/include/Neon/domain/tools/GridTransformer.h
@@ -1,10 +1,10 @@
 #pragma once
 
+#include "Neon/domain/tools/PartitionTable.h"
+#include "Neon/domain/tools/SpanTable.h"
 #include "Neon/domain/tools/gridTransformer/tField.h"
 #include "Neon/domain/tools/gridTransformer/tGrid.h"
 #include "Neon/domain/tools/gridTransformer/tGrid_ti.h"
-#include "Neon/domain/tools/PartitionTable.h"
-#include "Neon/domain/tools/SpanTable.h"
 
 namespace Neon::domain::tool {
 
@@ -24,9 +24,10 @@ template <typename GridTransformation>
 class GridTransformer
 {
    public:
+    using Idx = typename GridTransformation::Idx;
+    using Span = typename GridTransformation::Span;
     template <typename T, int C>
     using Partition = typename GridTransformation::template Partition<T, C>;
-    using Span = typename GridTransformation::Span;
     using FoundationGrid = typename GridTransformation::FoundationGrid;
 
     using Grid = details::tGrid<GridTransformation>;
diff --git a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h
index 67f7d9f7..6d110a1f 100644
--- a/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h
+++ b/libNeonDomain/include/Neon/domain/tools/Partitioner1D.h
@@ -105,13 +105,14 @@ class Partitioner1D
 
     template <typename ActiveIndexLambda,
               typename BcLambda>
-    Partitioner1D(const Neon::Backend&        backend,
-                  const ActiveIndexLambda&    activeIndexLambda,
-                  const BcLambda&             bcLambda,
-                  const Neon::index_3d&       dataBlockSize,
-                  const Neon::int32_3d&       domainSize,
-                  const Neon::domain::Stencil stencil,
-                  const int&                  multiResDiscreteIdxSpacing = 1)
+    Partitioner1D(const Neon::Backend&                         backend,
+                  const ActiveIndexLambda&                     activeIndexLambda,
+                  const BcLambda&                              bcLambda,
+                  const Neon::index_3d&                        dataBlockSize,
+                  const Neon::int32_3d&                        domainSize,
+                  const Neon::domain::Stencil                  stencil,
+                  Neon::domain::tool::spaceCurves::EncoderType spaceFillingType,
+                  const int&                                   multiResDiscreteIdxSpacing = 1)
     {
         mData = std::make_shared<Data>();
 
@@ -119,6 +120,7 @@ class Partitioner1D
         mData->mMultiResDiscreteIdxSpacing = multiResDiscreteIdxSpacing;
         mData->mStencil = stencil;
         mData->mDomainSize = domainSize;
+        mData->spaceCurve = spaceFillingType;
 
         // Block space interval (i.e. indexing space at the block granularity)
 
@@ -164,6 +166,7 @@ class Partitioner1D
             domainSize,
             stencil,
             multiResDiscreteIdxSpacing,
+            spaceFillingType,
             mData->spanDecomposition);
 
         mData->mSpanLayout = std::make_shared<partitioning::SpanLayout>(
@@ -182,7 +185,12 @@ class Partitioner1D
     {
         return mData->block3DSpan;
     }
-    
+
+    auto getSpaceCurve() const -> Neon::domain::tool::spaceCurves::EncoderType
+    {
+        return mData->spaceCurve;
+    }
+
     auto getMemoryGrid() -> Neon::aGrid&
     {
         return mData->mTopologyWithGhost;
@@ -288,7 +296,7 @@ class Partitioner1D
 
     auto getDenseMeta() -> const DenseMeta&
     {
-        //setDenseMeta();
+        // setDenseMeta();
         return *mData->mDenseMeta;
     }
 
@@ -443,13 +451,14 @@ class Partitioner1D
     class Data
     {
        public:
-        Neon::index_3d                        mDataBlockSize = 0;
-        int                                   mMultiResDiscreteIdxSpacing = 0;
-        Neon::domain::Stencil                 mStencil;
-        Neon::index_3d                        mDomainSize;
-        Neon::int32_3d                        block3DSpan;
-        bool                                  globalMappingInit = false;
-        Neon::aGrid::Field<Neon::int32_3d, 0> globalMapping;
+        Neon::index_3d                               mDataBlockSize = 0;
+        int                                          mMultiResDiscreteIdxSpacing = 0;
+        Neon::domain::Stencil                        mStencil;
+        Neon::index_3d                               mDomainSize;
+        Neon::int32_3d                               block3DSpan;
+        bool                                         globalMappingInit = false;
+        Neon::aGrid::Field<Neon::int32_3d, 0>        globalMapping;
+        Neon::domain::tool::spaceCurves::EncoderType spaceCurve;
 
         bool                      getStencil3dTo1dOffsetInit = false;
         Neon::set::MemSet<int8_t> stencil3dTo1dOffset;
diff --git a/libNeonDomain/include/Neon/domain/tools/PointHashTable.h b/libNeonDomain/include/Neon/domain/tools/PointHashTable.h
index 1b3e547e..d7bca923 100644
--- a/libNeonDomain/include/Neon/domain/tools/PointHashTable.h
+++ b/libNeonDomain/include/Neon/domain/tools/PointHashTable.h
@@ -61,6 +61,8 @@ class PointHashTable
     */
     auto size() const -> size_t;
 
+    auto getBBox() const -> Point const&;
+
    private:
     using Key = size_t;
 
diff --git a/libNeonDomain/include/Neon/domain/tools/PointHashTable_imp.h b/libNeonDomain/include/Neon/domain/tools/PointHashTable_imp.h
index 3a7375af..1c9abbef 100644
--- a/libNeonDomain/include/Neon/domain/tools/PointHashTable_imp.h
+++ b/libNeonDomain/include/Neon/domain/tools/PointHashTable_imp.h
@@ -105,4 +105,10 @@ auto PointHashTable<IntegerT, MetaT>::size() const -> size_t
 {
     return mMap.size();
 }
+
+template <typename IntegerT, typename MetaT>
+auto PointHashTable<IntegerT, MetaT>::getBBox() const -> Point const&{
+    return mBBox;
+}
+
 }  // namespace Neon::domain::tool
\ No newline at end of file
diff --git a/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h b/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h
new file mode 100644
index 00000000..add3f51e
--- /dev/null
+++ b/libNeonDomain/include/Neon/domain/tools/SpaceCurves.h
@@ -0,0 +1,352 @@
+#pragma once
+#include "Neon/Neon.h"
+#include "Neon/Report.h"
+
+namespace Neon::domain::tool::spaceCurves {
+
+
+enum struct EncoderType
+{
+    sweep = 0,
+    morton = 1,
+    hilbert = 2,
+};
+
+
+/**
+ * Set of utilities for EncoderType options.
+ */
+struct EncoderTypeUtil
+{
+    /**
+     * Number of configurations for the EncoderType enum
+     */
+    static const int nConfig{static_cast<int>(3)};
+
+    /**
+     * Convert enum value to string
+     *
+     * @param encoderType
+     * @return
+     */
+    static auto toString(EncoderType encoderType) -> std::string;
+
+    /**
+     * Returns all valid configurations for EncoderType
+     * @return
+     */
+    static auto getOptions() -> std::array<EncoderType, nConfig>;
+
+    static auto fromInt(int val) -> EncoderType;
+    static auto fromString(const std::string& opt) -> EncoderType;
+    static auto toInt(EncoderType encoderType) -> int;
+
+    struct Cli
+    {
+        explicit Cli(std::string);
+        explicit Cli(EncoderType model);
+        Cli();
+
+        auto getOption() const -> EncoderType;
+        auto set(const std::string& opt) -> void;
+        auto getStringOptions() const -> std::string;
+        auto getStringOption() const -> std::string;
+        auto getDoc() const -> std::string;
+
+        auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void;
+        auto addToReport(Neon::Report& report) const -> void;
+
+       private:
+        bool        mSet = false;
+        EncoderType mOption = EncoderType::sweep;
+    };
+};
+
+
+/**
+ * Stream insertion operator for EncoderType
+ *
+ * @param os output stream
+ * @param m encoder type to insert into the stream
+ * @return
+ */
+std::ostream& operator<<(std::ostream& os, EncoderType const& m);
+
+class Encoder
+{
+   private:
+    static constexpr uint8_t mortonToHilbertTable[] = {
+        48,
+        33,
+        27,
+        34,
+        47,
+        78,
+        28,
+        77,
+        66,
+        29,
+        51,
+        52,
+        65,
+        30,
+        72,
+        63,
+        76,
+        95,
+        75,
+        24,
+        53,
+        54,
+        82,
+        81,
+        18,
+        3,
+        17,
+        80,
+        61,
+        4,
+        62,
+        15,
+        0,
+        59,
+        71,
+        60,
+        49,
+        50,
+        86,
+        85,
+        84,
+        83,
+        5,
+        90,
+        79,
+        56,
+        6,
+        89,
+        32,
+        23,
+        1,
+        94,
+        11,
+        12,
+        2,
+        93,
+        42,
+        41,
+        13,
+        14,
+        35,
+        88,
+        36,
+        31,
+        92,
+        37,
+        87,
+        38,
+        91,
+        74,
+        8,
+        73,
+        46,
+        45,
+        9,
+        10,
+        7,
+        20,
+        64,
+        19,
+        70,
+        25,
+        39,
+        16,
+        69,
+        26,
+        44,
+        43,
+        22,
+        55,
+        21,
+        68,
+        57,
+        40,
+        58,
+        67,
+    };
+
+    static constexpr uint8_t hilbertToMortonTable[] = {
+        48,
+        33,
+        35,
+        26,
+        30,
+        79,
+        77,
+        44,
+        78,
+        68,
+        64,
+        50,
+        51,
+        25,
+        29,
+        63,
+        27,
+        87,
+        86,
+        74,
+        72,
+        52,
+        53,
+        89,
+        83,
+        18,
+        16,
+        1,
+        5,
+        60,
+        62,
+        15,
+        0,
+        52,
+        53,
+        57,
+        59,
+        87,
+        86,
+        66,
+        61,
+        95,
+        91,
+        81,
+        80,
+        2,
+        6,
+        76,
+        32,
+        2,
+        6,
+        12,
+        13,
+        95,
+        91,
+        17,
+        93,
+        41,
+        40,
+        36,
+        38,
+        10,
+        11,
+        31,
+        14,
+        79,
+        77,
+        92,
+        88,
+        33,
+        35,
+        82,
+        70,
+        10,
+        11,
+        23,
+        21,
+        41,
+        40,
+        4,
+        19,
+        25,
+        29,
+        47,
+        46,
+        68,
+        64,
+        34,
+        45,
+        60,
+        62,
+        71,
+        67,
+        18,
+        16,
+        49,
+    };
+
+    static inline auto transformCurve(uint64_t in, uint64_t bits, const uint8_t* lookupTable)
+    {
+        uint64_t transform = 0;  // table state: upper bits of the last entry read, selects the next 8-entry row
+        uint64_t out = 0;        // converted index, assembled 3 bits per iteration
+        // Consume the 3-bit groups of `in` from the most significant down; signed math makes bits == 0 a clean no-op.
+        for (int64_t i = 3 * (static_cast<int64_t>(bits) - 1); i >= 0; i -= 3) {
+            transform = lookupTable[transform | ((in >> i) & 7)];
+            out = (out << 3) | (transform & 7);
+            transform &= ~7;
+        }
+
+        return out;
+    }
+
+    static inline auto mortonToHilbert3D(uint64_t mortonIndex, uint64_t bits)
+    {
+        return transformCurve(mortonIndex, bits, mortonToHilbertTable);
+    }
+
+    static inline auto hilbertToMorton3D(uint64_t hilbertIndex, uint64_t bits)
+    {
+        return transformCurve(hilbertIndex, bits, hilbertToMortonTable);
+    }
+
+
+    static inline auto splitBy3(uint64_t a)
+    {
+        uint64_t x = a & 0x1fffff;              // we only care about 21 bits
+        x = (x | x << 32) & 0x1f00000000ffff;   // shift left 32 bits, mask out bits 21-31
+        x = (x | x << 16) & 0x1f0000ff0000ff;   // shift left 16 bits, mask out bits 11-20, 43-52
+        x = (x | x << 8) & 0x100f00f00f00f00f;  // shift left 8 bits, mask out bits 5-10, 21-26, 37-42, 53-58
+        x = (x | x << 4) & 0x10c30c30c30c30c3;  // shift left 4 bits, mask out bits 3-4, 11-12, 19-20, 27-28, 35-36, 43-44, 51-52, 59-60
+        x = (x | x << 2) & 0x1249249249249249;  // shift left 2 bits, mask out bits 2, 6-7, 10, 14-15, 18, 22-23, 26, 30-31, 34, 38-39, 42, 46-47, 50, 54-55, 58
+        return x;
+    }
+
+   public:
+    static inline auto mortonEncode([[maybe_unused]] Neon::index_3d dim, Neon::index_3d idx)
+        -> uint64_t
+    {
+        auto idxU64 = idx.newType<uint64_t>();
+        return splitBy3(idxU64.x) | (splitBy3(idxU64.y) << 1) | (splitBy3(idxU64.z) << 2);
+    }
+
+    static inline auto encodeHilbert(Neon::index_3d dim, Neon::index_3d idx)
+        -> uint64_t
+    {
+        uint64_t mortonEncoded = mortonEncode(dim, idx);
+        uint64_t bits = uint64_t(std::ceil(std::log2(dim.newType<uint64_t>().rMax())));
+        return mortonToHilbert3D(mortonEncoded, bits);
+    }
+
+    static inline auto encodeSweep(Neon::index_3d dim, Neon::index_3d idx)
+        -> uint64_t
+    {
+        auto idxU64 = idx.newType<uint64_t>();
+        auto dimU64 = dim.newType<uint64_t>();
+
+        uint64_t res = idxU64.x + idxU64.y * dimU64.x + idxU64.z * dimU64.x * dimU64.y;
+        return res;
+    }
+
+    static inline auto encode(EncoderType type, Neon::index_3d dim, Neon::index_3d idx)
+    {
+        switch (type) {
+            case EncoderType::morton:
+                return mortonEncode(dim, idx);
+            case EncoderType::hilbert:
+                return encodeHilbert(dim, idx);
+            case EncoderType::sweep:
+                return encodeSweep(dim, idx);
+            default:
+                NEON_THROW_UNSUPPORTED_OPERATION("Encoder type not supported");
+        }
+    }
+};
+}  // namespace Neon::domain::tool::spaceCurves
diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h
index c9ca59b9..a1b4c90d 100644
--- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h
+++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tField.h
@@ -26,6 +26,7 @@ class tField : public Neon::domain::interface::FieldBaseTemplate<T,
     using Partition = typename GridTransformation::template Partition<T, C>;
     using Idx = typename Partition::Idx;
     using NghIdx = typename Partition::NghIdx;  // for compatibility with eGrid
+    using NghData = typename Partition::NghData;  // for compatibility with eGrid
 
    private:
     using FoundationGrid = typename GridTransformation::FoundationGrid;
diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h
index d6d98be1..ac98983c 100644
--- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h
+++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid.h
@@ -8,8 +8,8 @@
 #include "Neon/domain/interface/Stencil.h"
 #include "Neon/domain/interface/common.h"
 #include "Neon/domain/patterns/PatternScalar.h"
+#include "Neon/domain/tools/SpaceCurves.h"
 #include "Neon/domain/tools/SpanTable.h"
-
 /**
  * template <typename FoundationGrid>
  * GridTransformation {
@@ -54,6 +54,16 @@ class tGrid : public Neon::domain::interface::GridBaseTemplate<tGrid<GridTransfo
     tGrid();
     virtual ~tGrid();
     explicit tGrid(FoundationGrid& foundationGrid);
+
+    template <typename SparsityPattern>
+    tGrid(const Neon::Backend&                         backend /**< Target for computation */,
+          const Neon::int32_3d&                        dimension /**< Dimension of the bounding box containing the domain */,
+          const SparsityPattern&                       activeCellLambda /**< InOrOutLambda({x,y,z}->{true, false}) */,
+          const Neon::domain::Stencil&                 stencil /**< Stencil used by any computation on the grid */,
+          const Vec_3d<double>&                        spacing = Vec_3d<double>(1, 1, 1) /**< Spacing, i.e. size of a voxel */,
+          const Vec_3d<double>&                        origin = Vec_3d<double>(0, 0, 0) /**< Origin  */,
+          Neon::domain::tool::spaceCurves::EncoderType encoderType = Neon::domain::tool::spaceCurves::EncoderType::sweep);
+
     tGrid(const tGrid& other);                 // copy constructor
     tGrid(tGrid&& other) noexcept;             // move constructor
     tGrid& operator=(const tGrid& other);      // copy assignment
@@ -109,7 +119,7 @@ class tGrid : public Neon::domain::interface::GridBaseTemplate<tGrid<GridTransfo
     struct Data
     {
         Data() = default;
-        explicit Data(Neon::Backend& bk)
+        explicit Data(Neon::Backend const& bk)
         {
             spanTable = Neon::domain::tool::SpanTable<Span>(bk);
         }
diff --git a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h
index 4ba1403d..b01b8718 100644
--- a/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h
+++ b/libNeonDomain/include/Neon/domain/tools/gridTransformer/tGrid_ti.h
@@ -27,7 +27,41 @@ tGrid<GridTransformation>::tGrid(FoundationGrid& foundationGrid)
                           foundationGrid.getNumActiveCellsPerPartition(),
                           foundationGrid.getDefaultBlock(),
                           foundationGrid.getSpacing(),
-                          foundationGrid.getOrigin());
+                          foundationGrid.getOrigin(),
+                          foundationGrid.getSpaceCurve(),
+                          foundationGrid.getMemoryBlock());
+}
+
+template <typename GridTransformation>
+template <typename SparsityPattern>
+tGrid<GridTransformation>::tGrid(const Neon::Backend&                         bk,
+                                 const Neon::int32_3d&                        dimension,
+                                 const SparsityPattern&                       activeCellLambda,
+                                 const Neon::domain::Stencil&                 stencil,
+                                 const Vec_3d<double>&                        spacing,
+                                 const Vec_3d<double>&                        origin,
+                                 Neon::domain::tool::spaceCurves::EncoderType encoderType)
+{
+    mData = std::make_shared<Data>(bk);
+    mData->foundationGrid = FoundationGrid(bk,
+                                           dimension,
+                                           activeCellLambda,
+                                           stencil,
+                                           spacing,
+                                           origin,
+                                           encoderType);
+    GridTransformation::initSpan(mData->foundationGrid,
+                                 NEON_OUT mData->spanTable);
+    tGrid::GridBase::init("tGrid",
+                          bk,
+                          mData->foundationGrid.getDimension(),
+                          mData->foundationGrid.getStencil(),
+                          mData->foundationGrid.getNumActiveCellsPerPartition(),
+                          mData->foundationGrid.getDefaultBlock(),
+                          mData->foundationGrid.getSpacing(),
+                          mData->foundationGrid.getOrigin(),
+                          encoderType,
+                          mData->foundationGrid.getMemoryBlock());
 }
 
 template <typename GridTransformation>
diff --git a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h
index 8833af7a..7cf442c6 100644
--- a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h
+++ b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanClassifier.h
@@ -2,12 +2,95 @@
 #include "Neon/core/core.h"
 
 
+#include <numeric>
 #include "Cassifications.h"
 #include "Neon/domain/tools/PointHashTable.h"
+#include "Neon/domain/tools/SpaceCurves.h"
 #include "Neon/domain/tools/partitioning/SpanDecomposition.h"
-
 namespace Neon::domain::tool::partitioning {
 
+/**
+ * Two-way mapping between the 1D position and the 3D location of a set of
+ * points.
+ *
+ *  - id1dTo3d[i] is the 3D point stored at 1D position i;
+ *  - id3dTo1d associates a 3D point with a uint64_t metadata value, which
+ *    reHash() sets to the 1D position of that point.
+ */
+struct Hash
+{
+    std::vector<Neon::index_3d>                           id1dTo3d;
+    Neon::domain::tool::PointHashTable<int32_t, uint64_t> id3dTo1d;
+
+    /**
+     * Re-orders the points along a space curve.
+     *
+     * Each point of id1dTo3d is encoded with Encoder::encode, using the value
+     * returned by id3dTo1d.getBBox(); id1dTo3d is then sorted by increasing
+     * code and the metadata stored in id3dTo1d for every point is overwritten
+     * with the point's new 1D position.
+     *
+     * NOTE(review): the pointer returned by getMetadata is dereferenced without
+     * a null check, i.e. every point of id1dTo3d is assumed to be present in
+     * id3dTo1d - confirm against the code that fills the two containers.
+     *
+     * @param encoderType encoder used to compute the code of each point
+     */
+    auto reHash(Neon::domain::tool::spaceCurves::EncoderType encoderType) -> void
+    {
+        // Encoding all points w.r.t the encoder type
+        std::vector<uint64_t> code;
+        code.reserve(id1dTo3d.size());
+        for (auto const& point : id1dTo3d) {
+            code.push_back(Neon::domain::tool::spaceCurves::Encoder::encode(encoderType, id3dTo1d.getBBox(), point));
+        }
+
+        // Sort id1dTo3d w.r.t. the codes
+        std::vector<std::size_t> const permutation = getSortedPermutation(code, [](uint64_t a, uint64_t b) {
+            return a < b;
+        });
+        id1dTo3d = applyPermutation(id1dTo3d, permutation);
+
+        // Store the new 1D position of every point in the 3D -> 1D table
+        for (uint64_t i = 0; i < id1dTo3d.size(); i++) {
+            *(id3dTo1d.getMetadata(id1dTo3d[i])) = i;
+        }
+    }
+
+   private:
+    /**
+     * Returns the index permutation that sorts vec according to compare:
+     * element k of the result is the index in vec of the k-th element in
+     * sorted order. vec itself is not modified.
+     */
+    template <typename T, typename Compare>
+    static std::vector<std::size_t> getSortedPermutation(
+        const std::vector<T>& vec,
+        Compare const&        compare)
+    {
+        std::vector<std::size_t> p(vec.size());
+        std::iota(p.begin(), p.end(), std::size_t{0});
+        std::sort(p.begin(), p.end(),
+                  [&](std::size_t i, std::size_t j) { return compare(vec[i], vec[j]); });
+        return p;
+    }
+
+    /**
+     * Returns a copy of vec re-ordered by p: result[k] = vec[p[k]].
+     * p is expected to have the same number of elements as vec.
+     */
+    template <typename T>
+    static std::vector<T> applyPermutation(
+        const std::vector<T>&           vec,
+        const std::vector<std::size_t>& p)
+    {
+        std::vector<T> sorted_vec(vec.size());
+        std::transform(p.begin(), p.end(), sorted_vec.begin(),
+                       [&](std::size_t i) { return vec[i]; });
+        return sorted_vec;
+    }
+};
+
 class SpanClassifier
 {
    public:
@@ -27,6 +110,7 @@ class SpanClassifier
                    const Neon::int32_3d&                            domainSize,
                    const Neon::domain::Stencil                      stencil,
                    const int&                                       discreteVoxelSpacing,
+                   Neon::domain::tool::spaceCurves::EncoderType     encoderType,
                    std::shared_ptr<partitioning::SpanDecomposition> sp);
 
 
@@ -48,7 +132,7 @@ class SpanClassifier
                                        ByPartition,
                                        ByDirection,
                                        ByDomain) const
-        -> const Neon::domain::tool::PointHashTable<int32_t, uint32_t>&;
+        -> const Neon::domain::tool::PointHashTable<int32_t, uint64_t>&;
 
     [[nodiscard]] auto countInternal(Neon::SetIdx setIdx,
                                      ByDomain     byDomain) const -> int;
@@ -72,7 +156,7 @@ class SpanClassifier
                          ByPartition,
                          ByDirection,
                          ByDomain)
-        -> Neon::domain::tool::PointHashTable<int32_t, uint32_t>&;
+        -> Neon::domain::tool::PointHashTable<int32_t, uint64_t>&;
 
    private:
     auto addPoint(Neon::SetIdx const&   setIdx,
@@ -82,13 +166,7 @@ class SpanClassifier
                   ByDomain              byDomain) -> void;
 
 
-    struct Info
-    {
-        std::vector<Neon::index_3d>                           id1dTo3d;
-        Neon::domain::tool::PointHashTable<int32_t, uint32_t> id3dTo1d;
-    };
-
-    using Leve0_Info = Info;
+    using Leve0_Info = Hash;
     using Leve1_ByDomain = std::array<Leve0_Info, 2>;
     using Leve2_ByDirection = std::array<Leve1_ByDomain, 2>;
     using Leve3_ByPartition = std::array<Leve2_ByDirection, 2>;
@@ -103,17 +181,18 @@ template <typename ActiveCellLambda,
           typename BcLambda,
           typename Block3dIdxToBlockOrigin,
           typename GetVoxelAbsolute3DIdx>
-SpanClassifier::SpanClassifier(const Neon::Backend&               backend,
-                               const ActiveCellLambda&            activeCellLambda,
-                               const BcLambda&                    bcLambda,
-                               const Block3dIdxToBlockOrigin&     block3dIdxToBlockOrigin,
-                               const GetVoxelAbsolute3DIdx&       getVoxelAbsolute3DIdx,
-                               const Neon::int32_3d&              block3DSpan,
-                               const Neon::int32_3d&              dataBlockSize3D,
-                               const Neon::int32_3d&              domainSize,
-                               const Neon::domain::Stencil        stencil,
-                               const int&                         discreteVoxelSpacing,
-                               std::shared_ptr<SpanDecomposition> spanDecompositionNoUse)
+SpanClassifier::SpanClassifier(const Neon::Backend&                         backend,
+                               const ActiveCellLambda&                      activeCellLambda,
+                               const BcLambda&                              bcLambda,
+                               const Block3dIdxToBlockOrigin&               block3dIdxToBlockOrigin,
+                               const GetVoxelAbsolute3DIdx&                 getVoxelAbsolute3DIdx,
+                               const Neon::int32_3d&                        block3DSpan,
+                               const Neon::int32_3d&                        dataBlockSize3D,
+                               const Neon::int32_3d&                        domainSize,
+                               const Neon::domain::Stencil                  stencil,
+                               const int&                                   discreteVoxelSpacing,
+                               Neon::domain::tool::spaceCurves::EncoderType spaceFillingType,
+                               std::shared_ptr<SpanDecomposition>           spanDecompositionNoUse)
 {
     mData = backend.devSet().newDataSet<Leve3_ByPartition>();
     mSpanDecomposition = spanDecompositionNoUse;
@@ -129,7 +208,7 @@ SpanClassifier::SpanClassifier(const Neon::Backend&               backend,
         for (auto& level2 : leve3ByPartition) {
             for (auto& level1 : level2) {
                 for (auto& level0 : level1) {
-                    level0.id3dTo1d = Neon::domain::tool::PointHashTable<int32_t, uint32_t>(block3DSpan);
+                    level0.id3dTo1d = Neon::domain::tool::PointHashTable<int32_t, uint64_t>(block3DSpan);
                 }
             }
         }
@@ -236,5 +315,20 @@ SpanClassifier::SpanClassifier(const Neon::Backend&               backend,
                     }
                 }
             });
+
+    mData.forEachSeq([&](SetIdx, auto& leve3ByPartition) {
+        //        using Leve0_Info = Info;
+        //        using Leve1_ByDomain = std::array<Leve0_Info, 2>;
+        //        using Leve2_ByDirection = std::array<Leve1_ByDomain, 2>;
+        //        using Leve3_ByPartition = std::array<Leve2_ByDirection, 2>;
+        //        using Data = Neon::set::DataSet<Leve3_ByPartition>;
+        for (auto& level2 : leve3ByPartition) {
+            for (auto& level1 : level2) {
+                for (auto& level0 : level1) {
+                    level0.reHash(spaceFillingType);
+                }
+            }
+        }
+    });
 }
 }  // namespace Neon::domain::tool::partitioning
diff --git a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanLayout.h b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanLayout.h
index a7e86f7c..4a01dd16 100644
--- a/libNeonDomain/include/Neon/domain/tools/partitioning/SpanLayout.h
+++ b/libNeonDomain/include/Neon/domain/tools/partitioning/SpanLayout.h
@@ -1,7 +1,7 @@
 #pragma once
 #include "Neon/core/core.h"
+#include "Neon/domain/tools/SpaceCurves.h"
 #include "Neon/domain/tools/partitioning/SpanClassifier.h"
-
 namespace Neon::domain::tool::partitioning {
 
 class SpanLayout
@@ -30,6 +30,10 @@ class SpanLayout
         std::shared_ptr<SpanDecomposition> spanPartitionerPtr,
         std::shared_ptr<SpanClassifier>    spanClassifierPtr);
 
+    auto sort(Neon::domain::tool::spaceCurves::EncoderType encoderType,
+              SpanClassifier&                              spanClassifier)
+        -> void;
+
     auto getCount()
         -> Neon::set::DataSet<uint64_t>;
 
diff --git a/libNeonDomain/src/domain/details/aGrid/aGrid.cpp b/libNeonDomain/src/domain/details/aGrid/aGrid.cpp
index be36fd4c..87942976 100644
--- a/libNeonDomain/src/domain/details/aGrid/aGrid.cpp
+++ b/libNeonDomain/src/domain/details/aGrid/aGrid.cpp
@@ -61,7 +61,9 @@ auto aGrid::init(const Neon::Backend&              backend,
                           lenghts,
                           blockDim,
                           spacingData,
-                          origin);
+                          origin,
+                          Neon::domain::tool::spaceCurves::EncoderType::sweep,
+                          {0, 0, 0});
 
     mStorage = std::make_shared<Storage>();
 
diff --git a/libNeonDomain/src/domain/details/dGrid/dGrid.cpp b/libNeonDomain/src/domain/details/dGrid/dGrid.cpp
index 890642b3..ec8b24d8 100644
--- a/libNeonDomain/src/domain/details/dGrid/dGrid.cpp
+++ b/libNeonDomain/src/domain/details/dGrid/dGrid.cpp
@@ -59,7 +59,7 @@ auto dGrid::getLaunchParameters(const Neon::DataView  dataView,
 
     auto dimsByDataView = getBackend().devSet().newDataSet<index_3d>([&](Neon::SetIdx const& setIdx,
                                                                          auto&               value) {
-        value = getSpan(Neon::Execution::host, setIdx, dataView).mDim;
+        value = getSpan(Neon::Execution::host, setIdx, dataView).mSpanDim;
     });
 
     ret.set(Neon::sys::GpuLaunchInfo::domainGridMode,
diff --git a/libNeonDomain/src/domain/details/eGrid/eGrid.cpp b/libNeonDomain/src/domain/details/eGrid/eGrid.cpp
index 164ae3b1..d4e12b7f 100644
--- a/libNeonDomain/src/domain/details/eGrid/eGrid.cpp
+++ b/libNeonDomain/src/domain/details/eGrid/eGrid.cpp
@@ -25,7 +25,9 @@ eGrid::eGrid(const Backend&                     backend,
                               nElementsPerPartition,
                               Neon::index_3d(256, 1, 1),
                               spacing,
-                              origin);
+                              origin,
+                              partitioner.getSpaceCurve(),
+                              {1,1,1});
     }
 
 
@@ -35,7 +37,7 @@ eGrid::eGrid(const Backend&                     backend,
     mData->mGlobalMappingAField = mData->partitioner1D.getGlobalMapping();
     mData->mStencil3dTo1dOffset = mData->partitioner1D.getStencil3dTo1dOffset();
     mData->memoryGrid = mData->partitioner1D.getMemoryGrid();
-    //mData->partitioner1D.getDenseMeta(mData->denseMeta);
+    // mData->partitioner1D.getDenseMeta(mData->denseMeta);
 
     const int32_t numDevices = getBackend().devSet().setCardinality();
 
@@ -109,7 +111,9 @@ eGrid::eGrid(const Backend&                     backend,
                               nElementsPerPartition,
                               defaultBlockSize,
                               spacing,
-                              origin);
+                              origin,
+                              partitioner.getSpaceCurve(),
+                              {1,1,1});
     }
 }
 
@@ -200,7 +204,7 @@ auto eGrid::convertToNghIdx(Neon::index_3d const& offset)
 
 auto eGrid::isInsideDomain(const index_3d& idx) const -> bool
 {
-    //auto const& metaInfo = mData->denseMeta.get(idx);
+    // auto const& metaInfo = mData->denseMeta.get(idx);
     auto const& metaInfo = mData->partitioner1D.getDenseMeta().get(idx);
     return metaInfo.isValid();
 }
@@ -225,7 +229,7 @@ auto eGrid::getProperties(const index_3d& idx) const -> GridBaseTemplate::CellPr
     if (this->getDevSet().setCardinality() == 1) {
         cellProperties.init(0, DataView::INTERNAL);
     } else {
-        //auto const& metaInfo = mData->denseMeta.get(idx);
+        // auto const& metaInfo = mData->denseMeta.get(idx);
         auto const& metaInfo = mData->partitioner1D.getDenseMeta().get(idx);
         cellProperties.init(metaInfo.setIdx, metaInfo.dw);
     }
@@ -262,7 +266,7 @@ auto eGrid::helpGetSetIdxAndGridIdx(Neon::index_3d idx) const -> std::tuple<Neon
     SetIdx setIdx;
     setIdx.invalidate();
     {
-        //auto const& meta = mData->denseMeta.get(idx);
+        // auto const& meta = mData->denseMeta.get(idx);
         auto const& meta = mData->partitioner1D.getDenseMeta().get(idx);
         if (meta.isValid()) {
             auto const& span = getSpan(Execution::host, meta.setIdx, Neon::DataView::STANDARD);
diff --git a/libNeonDomain/src/domain/interface/GridBase.cpp b/libNeonDomain/src/domain/interface/GridBase.cpp
index 81663239..3bfd8a21 100644
--- a/libNeonDomain/src/domain/interface/GridBase.cpp
+++ b/libNeonDomain/src/domain/interface/GridBase.cpp
@@ -3,14 +3,16 @@
 
 namespace Neon::domain::interface {
 
-auto GridBase::init(const std::string&                gridImplementationName,
-                    const Neon::Backend&              backend,
-                    const Neon::index_3d&             dimension,
-                    const Neon::domain::Stencil&      stencil,
-                    const Neon::set::DataSet<size_t>& nPartitionElements,
-                    const Neon::index_3d&             defaultBlockSize,
-                    const Vec_3d<double>&             spacingData,
-                    const Vec_3d<double>&             origin) -> void
+auto GridBase::init(const std::string&                           gridImplementationName,
+                    const Neon::Backend&                         backend,
+                    const Neon::index_3d&                        dimension,
+                    const Neon::domain::Stencil&                 stencil,
+                    const Neon::set::DataSet<size_t>&            nPartitionElements,
+                    const Neon::index_3d&                        defaultBlockSize,
+                    const Vec_3d<double>&                        spacingData,
+                    const Vec_3d<double>&                        origin,
+                    Neon::domain::tool::spaceCurves::EncoderType spaceCurve,
+                    Neon::index_3d memoryBlock) -> void
 {
     mStorage->backend = backend;
     mStorage->dimension = dimension;
@@ -24,6 +26,8 @@ auto GridBase::init(const std::string&                gridImplementationName,
         mStorage->defaults.launchParameters[DataViewUtil::toInt(dw)] = backend.devSet().newLaunchParameters();
     }
     mStorage->defaults.blockDim = defaultBlockSize;
+    mStorage->spaceCurve = spaceCurve;
+    mStorage->memoryBlock = memoryBlock;
 }
 
 GridBase::GridBase()
@@ -31,14 +35,16 @@ GridBase::GridBase()
 {
 }
 
-GridBase::GridBase(const std::string&                gridImplementationName,
-                   const Neon::Backend&              backend,
-                   const Neon::index_3d&             dimension,
-                   const Neon::domain::Stencil&      stencil,
-                   const Neon::set::DataSet<size_t>& nPartitionElements,
-                   const Neon::index_3d&             defaultBlockSize,
-                   const Vec_3d<double>&             spacingData,
-                   const Vec_3d<double>&             origin)
+GridBase::GridBase(const std::string&                           gridImplementationName,
+                   const Neon::Backend&                         backend,
+                   const Neon::index_3d&                        dimension,
+                   const Neon::domain::Stencil&                 stencil,
+                   const Neon::set::DataSet<size_t>&            nPartitionElements,
+                   const Neon::index_3d&                        defaultBlockSize,
+                   const Vec_3d<double>&                        spacingData,
+                   const Vec_3d<double>&                        origin,
+                   Neon::domain::tool::spaceCurves::EncoderType spaceCurve,
+                   Neon::index_3d memoryBlock)
     : mStorage(std::make_shared<GridBase::Storage>())
 {
     init(gridImplementationName,
@@ -48,7 +54,9 @@ GridBase::GridBase(const std::string&                gridImplementationName,
          nPartitionElements,
          defaultBlockSize,
          spacingData,
-         origin);
+         origin,
+         spaceCurve,
+         memoryBlock);
 }
 
 auto GridBase::getDimension() const -> const Neon::index_3d&
@@ -161,7 +169,8 @@ auto GridBase::toString() const -> std::string
              return tmp.str();
          }()
       << "}"
-      << ", [Backend]:{" << getBackend().toString() << "}";
+      << ", [Backend]:{" << getBackend().toString() << "}"
+      << ", [Memory]:{" << Neon::domain::tool::spaceCurves::EncoderTypeUtil::toString(mStorage->spaceCurve) << ", " << this->mStorage->memoryBlock << "}";
 
     return s.str();
 }
@@ -232,10 +241,41 @@ auto GridBase::toReport(Neon::Report& report,
         }(),
         &subdoc);
 
+    report.addMember(
+        "MemoryBlock",
+        [&] {
+            std::stringstream list;
+            list << "[";
+            list << getMemoryBlock().x << " "
+                 << getMemoryBlock().y << " "
+                 << getMemoryBlock().z << "]";
+            return list.str();
+        }(),
+        &subdoc);
+
+    report.addMember(
+        "SpaceCurve",
+        [&] {
+            std::stringstream list;
+            list << Neon::domain::tool::spaceCurves::EncoderTypeUtil::toString(mStorage->spaceCurve);
+            return list.str();
+        }(),
+        &subdoc);
+
     if (includeBackendInfo)
         getBackend().toReport(report, &subdoc);
 
     report.addSubdoc("Grid", subdoc);
 }
 
+auto GridBase::getMemoryBlock() const -> Neon::index_3d
+{
+    return mStorage->memoryBlock;  // value stored by GridBase::init
+}
+
+auto GridBase::getSpaceCurve() const -> Neon::domain::tool::spaceCurves::EncoderType
+{
+    return mStorage->spaceCurve;  // value stored by GridBase::init
+}
+
 }  // namespace Neon::domain::interface
\ No newline at end of file
diff --git a/libNeonDomain/src/domain/tools/SpaceCurves.cpp b/libNeonDomain/src/domain/tools/SpaceCurves.cpp
new file mode 100644
index 00000000..cca20e19
--- /dev/null
+++ b/libNeonDomain/src/domain/tools/SpaceCurves.cpp
@@ -0,0 +1,164 @@
+#include "Neon/domain/tools/SpaceCurves.h"
+#include "Neon/core/types/Exceptions.h"
+
+namespace Neon::domain::tool::spaceCurves {
+
+// Lists every supported encoder type: sweep, morton, hilbert (in that order).
+auto EncoderTypeUtil::getOptions() -> std::array<EncoderType, EncoderTypeUtil::nConfig>
+{
+    return {EncoderType::sweep,
+            EncoderType::morton,
+            EncoderType::hilbert};
+}
+
+// Name of an encoder type as a lowercase string: "sweep", "morton" or
+// "hilbert". A value matching none of the three enumerators ends up in
+// NEON_THROW_UNSUPPORTED_OPTION.
+auto EncoderTypeUtil::toString(EncoderType e) -> std::string
+{
+    switch (e) {
+        case EncoderType::sweep:
+            return "sweep";
+        case EncoderType::morton:
+            return "morton";
+        case EncoderType::hilbert:
+            return "hilbert";
+        default: {
+            NEON_THROW_UNSUPPORTED_OPTION("EncoderTypeUtil");
+        }
+    }
+}
+
+/**
+ * Converts an integer into the EncoderType whose underlying value equals it.
+ *
+ * The candidates are the enumerators returned by getOptions() (sweep, morton
+ * and hilbert); each one is compared with `val` after a static_cast<int>.
+ * An integer matching none of them ends up in
+ * NEON_THROW_UNSUPPORTED_OPTION.
+ */
+auto EncoderTypeUtil::fromInt(int val) -> EncoderType
+{
+    for (const EncoderType candidate : getOptions()) {
+        if (static_cast<int>(candidate) == val) {
+            return candidate;
+        }
+    }
+    NEON_THROW_UNSUPPORTED_OPTION("EncoderTypeUtil");
+}
+
+auto EncoderTypeUtil::fromString(const std::string& occ) -> EncoderType
+{
+    // Iterate getOptions() directly so the size always follows nConfig.
+    for (const EncoderType candidate : getOptions()) {
+        if (toString(candidate) == occ) {
+            return candidate;
+        }
+    }
+    NEON_THROW_UNSUPPORTED_OPTION("EncoderTypeUtil");
+}
+
+auto EncoderTypeUtil::toInt(EncoderType dataView) -> int
+{
+    return static_cast<int>(dataView);  // despite its name, dataView is an EncoderType
+}
+
+std::ostream& operator<<(std::ostream& os, EncoderType const& m)
+{
+    return os << EncoderTypeUtil::toString(m);  // toString already yields a std::string
+}
+
+EncoderTypeUtil::Cli::Cli()
+{
+    mSet = false;  // nothing selected yet
+}
+
+EncoderTypeUtil::Cli::Cli(std::string s)
+{
+    set(s);  // set() validates the name and updates mSet
+}
+
+EncoderTypeUtil::Cli::Cli(EncoderType model)
+{
+    mOption = model;
+    mSet = true;  // an explicitly provided option counts as set for getOption()
+}
+
+// Returns the selected option. When nothing has been set, an error is
+// reported through NEON_ERROR.
+auto EncoderTypeUtil::Cli::getOption() const -> EncoderType
+{
+    if (!mSet) {
+        NEON_ERROR(std::string("EncoderType was not set."));
+    }
+    return mOption;
+}
+
+/**
+ * Selects the option named by `opt`.
+ *
+ * mSet becomes true only when fromString() accepted the name. Otherwise an
+ * error listing the valid names is reported through NEON_ERROR and mSet is
+ * left untouched, so a rejected name is never reported as a valid selection.
+ */
+auto EncoderTypeUtil::Cli::set(const std::string& opt)
+    -> void
+{
+    try {
+        mOption = EncoderTypeUtil::fromString(opt);
+        mSet = true;
+    } catch (...) {
+        std::stringstream errorMsg;
+        errorMsg << "EncoderType: " << opt << " is not a valid option (valid options are {";
+        // getStringOptions() already yields the comma-separated list of names.
+        errorMsg << getStringOptions();
+        errorMsg << "})";
+        NEON_ERROR(errorMsg.str());
+    }
+}
+
+/**
+ * Builds the list of all option names separated by ", "
+ * (with the options of getOptions(): "sweep, morton, hilbert").
+ */
+auto EncoderTypeUtil::Cli::getStringOptions() const -> std::string
+{
+    std::stringstream joined;
+    const char*       separator = "";
+    for (const EncoderType option : EncoderTypeUtil::getOptions()) {
+        // The separator is empty for the first name and ", " afterwards.
+        joined << separator << EncoderTypeUtil::toString(option);
+        separator = ", ";
+    }
+    return joined.str();
+}
+
+// Name of the selected option (EncoderTypeUtil::toString of mOption). When
+// nothing has been set, an error is first reported through NEON_ERROR.
+auto EncoderTypeUtil::Cli::getStringOption() const -> std::string
+{
+    if (!mSet) {
+        NEON_ERROR(std::string("EncoderType was not set."));
+    }
+    return EncoderTypeUtil::toString(mOption);
+}
+
+auto EncoderTypeUtil::Cli::getDoc() const -> std::string
+{
+    // NOTE(review): " default: " is followed by the full option list again,
+    // not by a single default value - confirm whether that is intended.
+    const std::string optionList = getStringOptions();
+    return optionList + " default: " + optionList;
+}
+
+auto EncoderTypeUtil::Cli::addToReport(Neon::Report& report) const -> void
+{
+    report.addMember("EncoderType", EncoderTypeUtil::toString(this->getOption()));  // name of the selected option
+}
+
+auto EncoderTypeUtil::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void
+{
+    report.addMember("EncoderType", EncoderTypeUtil::toString(this->getOption()), &subBlock);  // same entry, attached to subBlock
+}
+
+}  // namespace Neon::domain::tool::spaceCurves
diff --git a/libNeonDomain/src/domain/tools/partitioning/SpanClassifier.cpp b/libNeonDomain/src/domain/tools/partitioning/SpanClassifier.cpp
index e9c62754..4b372fc9 100644
--- a/libNeonDomain/src/domain/tools/partitioning/SpanClassifier.cpp
+++ b/libNeonDomain/src/domain/tools/partitioning/SpanClassifier.cpp
@@ -37,7 +37,7 @@ auto SpanClassifier::getMapper1Dto3D(const SetIdx& setIdx,
 auto SpanClassifier::getMapper3Dto1D(const SetIdx& setIdx,
                                      ByPartition   byPartition,
                                      ByDirection   byDirection,
-                                     ByDomain      byDomain) const -> const Neon::domain::tool::PointHashTable<int32_t, uint32_t>&
+                                     ByDomain      byDomain) const -> const Neon::domain::tool::PointHashTable<int32_t, uint64_t>&
 {
     return mData[setIdx]
                 [static_cast<int>(byPartition)]
@@ -63,7 +63,7 @@ auto SpanClassifier::getMapper3Dto1D(const SetIdx& setIdx,
                                      ByPartition   byPartition,
                                      ByDirection   byDirection,
                                      ByDomain      byDomain)
-    -> Neon::domain::tool::PointHashTable<int32_t, uint32_t>&
+    -> Neon::domain::tool::PointHashTable<int32_t, uint64_t>&
 {
     return mData[setIdx]
                 [static_cast<int>(byPartition)]
diff --git a/libNeonDomain/src/domain/tools/partitioning/SpanLayout.cpp b/libNeonDomain/src/domain/tools/partitioning/SpanLayout.cpp
index 591bd07f..9a81de6b 100644
--- a/libNeonDomain/src/domain/tools/partitioning/SpanLayout.cpp
+++ b/libNeonDomain/src/domain/tools/partitioning/SpanLayout.cpp
@@ -207,7 +207,7 @@ auto SpanLayout::findPossiblyLocalPointOffset(
                                                                          byDomain);
                 auto const  infoPtr = mapper.getMetadata(point);
                 if (infoPtr != nullptr) {
-                    return {true, *infoPtr, byPartition, byDirection, byDomain};
+                    return {true, int32_t(*infoPtr), byPartition, byDirection, byDomain};
                 }
             }
         }
diff --git a/libNeonDomain/tests/CMakeLists.txt b/libNeonDomain/tests/CMakeLists.txt
index 3f76cb4e..874e58fc 100644
--- a/libNeonDomain/tests/CMakeLists.txt
+++ b/libNeonDomain/tests/CMakeLists.txt
@@ -7,6 +7,7 @@ add_subdirectory("domain-neighbour-globalIdx")
 add_subdirectory("domain-halos")
 add_subdirectory("domain-stencil")
 add_subdirectory("domain-bGrid-tray")
+add_subdirectory("domain-space-filling-curves")
 
 add_subdirectory("domainUt_sGrid")
 add_subdirectory("domain-unit-test-eGrid")
diff --git a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu
index 158d3e05..1b94b566 100644
--- a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu
+++ b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.cu
@@ -1,5 +1,6 @@
 #include <functional>
 #include "Neon/domain/Grids.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
 
 #include "Neon/domain/tools/TestData.h"
 #include "TestInformation.h"
@@ -27,18 +28,18 @@ auto defContainer(int    streamIdx,
             return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable {
                 // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val);
                 Neon::index_3d globalPoint = a.getGlobalIndex(e);
-                a(e, 0) = globalPoint.x ;
+                a(e, 0) = globalPoint.x;
                 b(e, 0) = globalPoint.y;
                 c(e, 0) = globalPoint.z;
-//                if constexpr (std::is_same_v<typename Field::Grid, Neon::bGrid>) {
-//                    printf("Block %d Th %d %d %d Loc %d %d %d\n", e.mDataBlockIdx,
-//                           e.mInDataBlockIdx.x,
-//                           e.mInDataBlockIdx.y,
-//                           e.mInDataBlockIdx.z,
-//                           globalPoint.x,
-//                           globalPoint.y,
-//                           globalPoint.z);
-//                }
+                //                if constexpr (std::is_same_v<typename Field::Grid, Neon::bGrid>) {
+                //                    printf("Block %d Th %d %d %d Loc %d %d %d\n", e.mDataBlockIdx,
+                //                           e.mInDataBlockIdx.x,
+                //                           e.mInDataBlockIdx.y,
+                //                           e.mInDataBlockIdx.z,
+                //                           globalPoint.x,
+                //                           globalPoint.y,
+                //                           globalPoint.z);
+                //                }
             };
         });
 }
@@ -98,5 +99,6 @@ auto run(TestData<G, T, C>& data) -> void
 template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
 template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
 template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+template auto run<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>(TestData<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>&) -> void;
 
 }  // namespace globalIdx
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h
index 0a3b87eb..c766f7ca 100644
--- a/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h
+++ b/libNeonDomain/tests/domain-globalIdx/src/globalIdx.h
@@ -3,9 +3,9 @@
 #include <functional>
 
 #include "Neon/domain/Grids.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
 #include "Neon/domain/tools/TestData.h"
 
-
 namespace globalIdx {
 using namespace Neon::domain::tool::testing;
 
@@ -15,6 +15,7 @@ auto run(TestData<G, T, C>& data) -> void;
 extern template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
 extern template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
 extern template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+extern template auto run<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>(TestData<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>&) -> void;
 
 
-}  // namespace map
+}  // namespace globalIdx
diff --git a/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp
index 783830ca..f0ecce78 100644
--- a/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp
+++ b/libNeonDomain/tests/domain-globalIdx/src/gtests.cpp
@@ -4,7 +4,7 @@
 #include "globalIdx.h"
 #include "runHelper.h"
 
-TEST(domain_unit_test_globalIdx, dGrid)
+TEST(domain_globalIdx, dGrid)
 {
     int nGpus = 3;
     using Type = int64_t;
@@ -13,7 +13,7 @@ TEST(domain_unit_test_globalIdx, dGrid)
                             1);
 }
 
-TEST(domain_unit_test_globalIdx, eGrid)
+TEST(domain_globalIdx, eGrid)
 {
     int nGpus = 3;
     using Type = int64_t;
@@ -22,7 +22,7 @@ TEST(domain_unit_test_globalIdx, eGrid)
                             1);
 }
 
-TEST(domain_unit_test_globalIdx, bGrid)
+TEST(domain_globalIdx, bGrid)
 {
     int nGpus = 3;
     using Type = int64_t;
@@ -31,6 +31,15 @@ TEST(domain_unit_test_globalIdx, bGrid)
                             1);
 }
 
+TEST(domain_globalIdx, dGridSoA)
+{
+    int nGpus = 3;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(globalIdx::run<Neon::domain::details::dGridSoA::dGridSoA , Type, 0>),
+                            nGpus,
+                            1);
+}
+
 int main(int argc, char** argv)
 {
     ::testing::InitGoogleTest(&argc, argv);
diff --git a/libNeonDomain/tests/domain-map/src/gtests.cpp b/libNeonDomain/tests/domain-map/src/gtests.cpp
index d0d43b60..c48511b7 100644
--- a/libNeonDomain/tests/domain-map/src/gtests.cpp
+++ b/libNeonDomain/tests/domain-map/src/gtests.cpp
@@ -13,6 +13,15 @@ TEST(domain_map, dGrid)
                             1);
 }
 
+TEST(domain_map_dataView, dGrid)
+{
+    int nGpus = 2;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(map::dataView::run<Neon::dGrid, Type, 0>),
+                            nGpus,
+                            2);
+}
+
 TEST(domain_map, eGrid)
 {
     int nGpus = 3;
@@ -31,6 +40,15 @@ TEST(domain_map, bGrid)
                             1);
 }
 
+TEST(domain_map, dGridSoA)
+{
+    int nGpus = 1;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(map::run<Neon::domain::details::dGridSoA::dGridSoA, Type, 0>),
+                            nGpus,
+                            1);
+}
+
 int main(int argc, char** argv)
 {
     ::testing::InitGoogleTest(&argc, argv);
diff --git a/libNeonDomain/tests/domain-map/src/map.cu b/libNeonDomain/tests/domain-map/src/map.cu
index bd25f178..2ed92ddb 100644
--- a/libNeonDomain/tests/domain-map/src/map.cu
+++ b/libNeonDomain/tests/domain-map/src/map.cu
@@ -1,6 +1,7 @@
 #include <functional>
 #include "Neon/domain/Grids.h"
 
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
 #include "Neon/domain/tools/TestData.h"
 #include "TestInformation.h"
 #include "gtest/gtest.h"
@@ -31,6 +32,27 @@ auto mapContainer_axpy(int                   streamIdx,
         });
 }
 
+template <typename Field>  // Builds a container that adds val to every component of fieldB, in place.
+auto mapContainer_add(int                   streamIdx,  // not used by this body
+                      typename Field::Type& val,
+                      Field&                fieldB)
+    -> Neon::set::Container
+{
+    const auto& grid = fieldB.getGrid();
+    return grid.newContainer(
+        "mapContainer_add",
+        [&, val](Neon::set::Loader& loader) {
+            auto b = loader.load(fieldB);
+            // val is captured by value above, so the kernel below holds its own copy of it.
+            return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable {
+                for (int i = 0; i < b.cardinality(); i++) {
+                    // Component i of cell e is incremented by val.
+                    b(e, i) += val;
+                }
+            };
+        });
+}
+
 using namespace Neon::domain::tool::testing;
 
 template <typename G, typename T, int C>
@@ -75,6 +97,55 @@ auto run(TestData<G, T, C>& data) -> void
 template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
 template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
 template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+template auto run<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>(TestData<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>&) -> void;
+
+namespace dataView {
+template <typename G, typename T, int C>
+auto run(TestData<G, T, C>& data) -> void
+{
+    // Launches the axpy map container separately on the BOUNDARY and INTERNAL data views, then checks field Y against the golden data.
+    using Type = typename TestData<G, T, C>::Type;
+    auto&             grid = data.getGrid();
+    const std::string appName = TestInformation::fullName(grid.getImplementationName());
+
+    data.resetValuesToLinear(1, 100);
+    T val = T(33);
+
+    {  // NEON
+        const Neon::index_3d        dim = grid.getDimension();
+        std::vector<Neon::index_3d> elements;
+
+        auto& X = data.getField(FieldNames::X);
+        auto& Y = data.getField(FieldNames::Y);
+
+        // The same container is built twice and run once per data view: BOUNDARY first, INTERNAL second.
+        mapContainer_axpy(Neon::Backend::mainStreamIdx,
+                          val, X, Y)
+            .run(0, Neon::DataView::BOUNDARY);
+
+        mapContainer_axpy(Neon::Backend::mainStreamIdx,
+                          val, X, Y)
+            .run(0, Neon::DataView::INTERNAL);
+        // NOTE(review): presumably these bring the results back to the host copy used by compare() -- confirm.
+        X.updateHostData(0);
+        Y.updateHostData(0);
 
+        data.getBackend().sync(0);
+    }
+
+    {  // Golden data
+        auto& X = data.getIODomain(FieldNames::X);
+        auto& Y = data.getIODomain(FieldNames::Y);
+        data.axpy(&val, X, Y);
+    }
+    // Only Y is compared.
+    bool isOk = data.compare(FieldNames::Y);
+    ASSERT_TRUE(isOk);
+}
+template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
+template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+template auto run<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>(TestData<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>&) -> void;
 
+}  // namespace dataView
 }  // namespace map
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-map/src/map.h b/libNeonDomain/tests/domain-map/src/map.h
index 611f2046..99864a3f 100644
--- a/libNeonDomain/tests/domain-map/src/map.h
+++ b/libNeonDomain/tests/domain-map/src/map.h
@@ -3,6 +3,7 @@
 #include <functional>
 
 #include "Neon/domain/Grids.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
 #include "Neon/domain/tools/TestData.h"
 
 
@@ -14,6 +15,19 @@ auto run(TestData<G, T, C>& data) -> void;
 
 extern template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
 extern template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+extern template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+extern template auto run<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>(TestData<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>&) -> void;
 
+namespace dataView {
+
+template <typename G, typename T, int C>
+auto run(TestData<G, T, C>& data) -> void;
+
+extern template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
+extern template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+extern template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+extern template auto run<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>(TestData<Neon::domain::details::dGridSoA::dGridSoA, int64_t, 0>&) -> void;
+
+}  // namespace dataView
 
 }  // namespace map
diff --git a/libNeonDomain/tests/domain-map/src/runHelper.h b/libNeonDomain/tests/domain-map/src/runHelper.h
index 53ea8681..593e31c2 100644
--- a/libNeonDomain/tests/domain-map/src/runHelper.h
+++ b/libNeonDomain/tests/domain-map/src/runHelper.h
@@ -31,7 +31,7 @@ void runAllTestConfiguration(
         nGpuTest.push_back(i);
     }
     // std::vector<int> nGpuTest{2,4,6,8};
-    std::vector<int> cardinalityTest{1};
+    std::vector<int> cardinalityTest{1,3,19};
 
     std::vector<Neon::index_3d> dimTest{{10, 17, 13}, {1, 1, 100}, {17, 1, 77}};
     std::vector<Neon::Runtime>  runtimeE{Neon::Runtime::openmp};
@@ -95,6 +95,7 @@ void runAllTestConfiguration(
     }
 }
 
+#if 0
 
 template <typename G, typename T, int C>
 void runOneTestConfiguration(const std::string&                      gname,
@@ -144,3 +145,4 @@ void runOneTestConfiguration(const std::string&                      gname,
         }
     }
 }
+#endif
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp
index feba5a9b..21bba9b5 100644
--- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp
+++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/gtests.cpp
@@ -1,10 +1,10 @@
 
+#include "./testsAndContainers.h"
 #include "Neon/Neon.h"
 #include "gtest/gtest.h"
-#include "./testsAndContainers.h"
 #include "runHelper.h"
 
-TEST(domain_unit_test_globalIdx, dGrid)
+TEST(domain_neighbour_globalIdx, dGrid)
 {
     int nGpus = 5;
     using Type = int64_t;
@@ -13,7 +13,7 @@ TEST(domain_unit_test_globalIdx, dGrid)
                             1);
 }
 
-TEST(domain_unit_test_globalIdx, eGrid)
+TEST(domain_neighbour_globalIdx, eGrid)
 {
     int nGpus = 5;
     using Type = int64_t;
@@ -22,7 +22,7 @@ TEST(domain_unit_test_globalIdx, eGrid)
                             1);
 }
 
-TEST(domain_unit_test_globalIdx, bGrid)
+TEST(domain_neighbour_globalIdx, bGrid)
 {
     int nGpus = 5;
     using Type = int64_t;
@@ -31,6 +31,53 @@ TEST(domain_unit_test_globalIdx, bGrid)
                             1);
 }
 
+TEST(domain_neighbour_globalIdx, dGridSoA)
+{
+    int nGpus = 5;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(globalIdx::run<Neon::dGridSoA, Type, 0>),
+                            nGpus,
+                            1);
+}
+
+///////////////////////////////////////////
+
+TEST(domain_neighbour_globalIdx, dGrid_template)
+{
+    int nGpus = 5;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(globalIdx::runTemplate<Neon::dGrid, Type, 0>),
+                            nGpus,
+                            1);
+}
+
+TEST(domain_neighbour_globalIdx, eGrid_template)
+{
+    int nGpus = 5;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(globalIdx::runTemplate<Neon::eGrid, Type, 0>),
+                            nGpus,
+                            1);
+}
+
+TEST(domain_neighbour_globalIdx, bGrid_template)
+{
+    int nGpus = 5;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(globalIdx::runTemplate<Neon::bGrid, Type, 0>),
+                            nGpus,
+                            1);
+}
+
+TEST(domain_neighbour_globalIdx, dGridSoA_template)
+{
+    int nGpus = 5;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(globalIdx::runTemplate<Neon::dGridSoA, Type, 0>),
+                            nGpus,
+                            1);
+}
+
 int main(int argc, char** argv)
 {
     ::testing::InitGoogleTest(&argc, argv);
diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h
index 0014594c..d74db246 100644
--- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h
+++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/runHelper.h
@@ -8,6 +8,7 @@
 #include "Neon/core/types/DeviceType.h"
 
 #include "Neon/domain/dGrid.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
 #include "Neon/domain/eGrid.h"
 #include "Neon/domain/tools/Geometries.h"
 #include "Neon/domain/tools/TestData.h"
@@ -82,8 +83,8 @@ void runAllTestConfiguration(
                                 if (dim.z < 8 * ngpu * 3) {
                                     dim.z = ngpu * 3 * 8;
                                 }
-                                if(memoryLayout == Neon::MemoryLayout::arrayOfStructs){
-                                    continue ;
+                                if (memoryLayout == Neon::MemoryLayout::arrayOfStructs) {
+                                    continue;
                                 }
                             }
 
diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu
index 49dd3bd2..7b2c3fef 100644
--- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu
+++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.cu
@@ -1,5 +1,6 @@
 #include <functional>
 #include "Neon/domain/Grids.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
 
 #include "Neon/domain/tools/TestData.h"
 #include "TestInformation.h"
@@ -61,15 +62,15 @@ auto checkNeighbourData(Field const&   filedA,
                         Field const&   filedB,
                         Field const&   filedC,
                         Neon::index_3d testDirection,
-                        Field const&   checkFlatA,
-                        Field const&   checkFlatB,
-                        Field const&   checkFlatC)
+                        Field&         checkFlatA,
+                        Field&         checkFlatB,
+                        Field&         checkFlatC)
     -> Neon::set::Container
 {
     const auto& grid = filedA.getGrid();
     return grid.newContainer(
         "defContainer",
-        [&](Neon::set::Loader& loader) {
+        [&, testDirection](Neon::set::Loader& loader) {
             auto a = loader.load(filedA, Neon::Pattern::STENCIL);
             auto b = loader.load(filedB, Neon::Pattern::STENCIL);
             auto c = loader.load(filedC, Neon::Pattern::STENCIL);
@@ -102,6 +103,58 @@ auto checkNeighbourData(Field const&   filedA,
         });
 }
 
+template <int xOff,
+          int yOff,
+          int zOff,
+          typename Field>
+auto checkNeighbourDataTemplate(Field const& filedA,
+                                Field const& filedB,
+                                Field const& filedC,
+                                Field&       checkFlatA,
+                                Field&       checkFlatB,
+                                Field&       checkFlatC)
+    -> Neon::set::Container
+{
+    const auto& grid = filedA.getGrid();
+    return grid.newContainer(
+        "defContainer",
+        [&](Neon::set::Loader& loader) {
+            auto a = loader.load(filedA, Neon::Pattern::STENCIL);
+            auto b = loader.load(filedB, Neon::Pattern::STENCIL);
+            auto c = loader.load(filedC, Neon::Pattern::STENCIL);
+            // a, b, c are loaded with the STENCIL pattern; resA, resB, resC (MAP pattern) receive the per-cell verdict.
+            auto resA = loader.load(checkFlatA, Neon::Pattern::MAP);
+            auto resB = loader.load(checkFlatB, Neon::Pattern::MAP);
+            auto resC = loader.load(checkFlatC, Neon::Pattern::MAP);
+
+            return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& e) mutable {
+                constexpr Neon::index_3d testDirection(xOff, yOff, zOff);
+
+                // ngh is the global index of this cell shifted by the compile-time offset (xOff, yOff, zOff).
+                Neon::index_3d globalPoint = a.getGlobalIndex(e);
+                auto           ngh = globalPoint + testDirection;
+
+                decltype(a)* nghInfo[3] = {&a, &b, &c};
+                decltype(a)* results[3] = {&resA, &resB, &resC};
+                // Index i selects both the field (a, b, c) and the component of ngh its neighbour value is compared with.
+                for (int i = 0; i < 3; i++) {
+                    auto d = nghInfo[i]->template getNghData<testDirection.x, testDirection.y, testDirection.z>(e, 0);
+                    // auto d = nghInfo[i]->getNghData(e, testDirection.newType<int8_t>(), 0);
+                    // Verdict stored at component 0: +1 on match, -1 on mismatch, 0 when the neighbour is not valid.
+                    if (d.isValid()) {
+                        results[i]->operator()(e, 0) = d.getData() == ngh.v[i] ? +1 : -1;
+                        if (d.getData() != ngh.v[i]) {
+                            printf("ERROR: %d %d %d %d %d %d\n", globalPoint.x, globalPoint.y, globalPoint.z, ngh.v[0], ngh.v[1], ngh.v[2]);
+                            d = nghInfo[i]->getNghData(e, testDirection.newType<int8_t>(), 0);  // NOTE(review): d is not read after this; presumably a debugging aid -- confirm
+                        }
+                    } else {
+                        results[i]->operator()(e, 0) = 0;
+                    }
+                }
+            };
+        });
+}
+
 using namespace Neon::domain::tool::testing;
 
 template <typename G, typename T, int C>
@@ -165,15 +218,15 @@ auto run(TestData<G, T, C>& data) -> void
                                    X, Y, Z);
     };
 
-    //    constexpr std::array<const Ngh3DIdx, 6>
-    //        stencil{Ngh3DIdx(1, 0, 0),
-    //                Ngh3DIdx(-1, 0, 0),
-    //                Ngh3DIdx(0, 1, 0),
-    //                Ngh3DIdx(0, -1, 0),
-    //                Ngh3DIdx(0, 0, 1),
-    //                Ngh3DIdx(0, 0, -1)};
-    constexpr std::array<const Ngh3DIdx, 1>
-        stencil{Ngh3DIdx(0, 0, -1)};
+    constexpr std::array<const Ngh3DIdx, 6>
+        stencil{Ngh3DIdx(1, 0, 0),
+                Ngh3DIdx(-1, 0, 0),
+                Ngh3DIdx(0, 1, 0),
+                Ngh3DIdx(0, -1, 0),
+                Ngh3DIdx(0, 0, 1),
+                Ngh3DIdx(0, 0, -1)};
+    //    constexpr std::array<const Ngh3DIdx, 1>
+    //        stencil{Ngh3DIdx(0, 0, -1)};
 
     for (auto const& direction : stencil) {
         reset(aField, bField, cField).run(Neon::Backend::mainStreamIdx);
@@ -214,8 +267,149 @@ auto run(TestData<G, T, C>& data) -> void
     }
 }
 
+template <typename G, typename T, int C>
+auto runTemplate(TestData<G, T, C>& data) -> void
+{
+    // For each of the six axis-aligned unit offsets, runs checkNeighbourDataTemplate and compares X, Y, Z against golden data.
+    using Type = typename TestData<G, T, C>::Type;
+    auto&             grid = data.getGrid();
+    const std::string appName = TestInformation::fullName(grid.getImplementationName());
+
+    data.resetValuesToLinear(1, 100);
+
+    auto aField = grid.template newField<int64_t>("a", 1, 0);
+    auto bField = grid.template newField<int64_t>("b", 1, 0);
+    auto cField = grid.template newField<int64_t>("c", 1, 0);
+
+    auto& X = data.getField(FieldNames::X);
+    auto& Y = data.getField(FieldNames::Y);
+    auto& Z = data.getField(FieldNames::Z);
+
+    const Neon::index_3d dim = grid.getDimension();
+    auto                 bk = grid.getBackend();
+
+    {  // NEON
+        {
+            initData(aField, bField, cField).run(Neon::Backend::mainStreamIdx);
+            bk.sync(Neon::Backend::mainStreamIdx);
+            aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            bk.sync(Neon::Backend::mainStreamIdx);
+        }
+    }
+    using Ngh3DIdx = Neon::int32_3d;
+
+    auto setGolden = [&](Ngh3DIdx const& direction) {  // Golden data
+        auto& X = data.getIODomain(FieldNames::X);
+        auto& Y = data.getIODomain(FieldNames::Y);
+        auto& Z = data.getIODomain(FieldNames::Z);
+        // Expected verdict: 1 where idx + direction passes both bound checks (ngh >= 0 and dim > ngh), 0 otherwise.
+        data.forEachActiveIODomain([&](const Neon::index_3d& idx,
+                                       int                   cardinality,
+                                       Type&                 a,
+                                       Type&                 b,
+                                       Type&                 c) {
+            a = 1;
+            b = 1;
+            c = 1;
+            auto ngh = direction + idx;
+            if (!(ngh >= 0)) {
+                a = 0;
+                b = 0;
+                c = 0;
+            }
+            if (!(dim > ngh)) {
+                a = 0;
+                b = 0;
+                c = 0;
+            }
+        },
+                                   X, Y, Z);
+    };
+
+    constexpr std::array<const Ngh3DIdx, 6>
+        stencil{Ngh3DIdx(1, 0, 0),
+                Ngh3DIdx(-1, 0, 0),
+                Ngh3DIdx(0, 1, 0),
+                Ngh3DIdx(0, -1, 0),
+                Ngh3DIdx(0, 0, 1),
+                Ngh3DIdx(0, 0, -1)};
+    //    constexpr std::array<const Ngh3DIdx, 1>
+    //        stencil{Ngh3DIdx(0, 0, -1)};
+
+    for (auto const& direction : stencil) {
+        reset(aField, bField, cField).run(Neon::Backend::mainStreamIdx);
+        reset(X, Y, Z).run(Neon::Backend::mainStreamIdx);
+        {  // Updating halo with wrong data
+            bk.sync(Neon::Backend::mainStreamIdx);
+            aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            bk.sync(Neon::Backend::mainStreamIdx);
+        }
+        {
+            initData(aField, bField, cField).run(Neon::Backend::mainStreamIdx);
+            bk.sync(Neon::Backend::mainStreamIdx);
+            aField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            bField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            cField.newHaloUpdate(Neon::set::StencilSemantic::standard, Neon::set::TransferMode::put, Neon::Execution::device).run(Neon::Backend::mainStreamIdx);
+            bk.sync(Neon::Backend::mainStreamIdx);
+        }
+
+        // The offset is a template argument of the checker, so the runtime direction is dispatched to the matching instantiation.
+        // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+
+        if (direction == Neon::index_3d(1, 0, 0)) {
+            checkNeighbourDataTemplate<1, 0, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+            // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+        } else if (direction == Neon::index_3d(-1, 0, 0)) {
+            checkNeighbourDataTemplate<-1, 0, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+            // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+        } else if (direction == Neon::index_3d(0, 1, 0)) {
+            checkNeighbourDataTemplate<0, 1, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+            // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+        } else if (direction == Neon::index_3d(0, -1, 0)) {
+            checkNeighbourDataTemplate<0, -1, 0>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+            // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+        } else if (direction == Neon::index_3d(0, 0, 1)) {
+            checkNeighbourDataTemplate<0, 0, 1>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+            // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+        } else if (direction == Neon::index_3d(0, 0, -1)) {
+            checkNeighbourDataTemplate<0, 0, -1>(aField, bField, cField, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+            // checkNeighbourData(aField, bField, cField, direction, X, Y, Z).run(Neon::Backend::mainStreamIdx);
+        } else {
+            std::cout << "Direction not implemented " << direction << std::endl;
+            exit(99);
+        }
+        setGolden(direction);
+
+        bk.sync(Neon::Backend::mainStreamIdx);
+        bool isOk = data.compare(FieldNames::X);
+        isOk = isOk && data.compare(FieldNames::Y);
+        isOk = isOk && data.compare(FieldNames::Z);
+
+        if (!isOk) {
+            std::cout << "Direction with errors " << direction << std::endl;
+            data.getField(FieldNames::X).ioToVtk(grid.getImplementationName() + "X", "X", true);
+            data.getField(FieldNames::Y).ioToVtk(grid.getImplementationName() + "Y", "Y", true);
+            data.getField(FieldNames::Z).ioToVtk(grid.getImplementationName() + "Z", "Z", true);
+            exit(77);
+            ASSERT_TRUE(isOk);  // NOTE(review): never reached, exit(77) above ends the process first -- confirm which one is intended
+        }
+    }
+}
+
+
 template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
 template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
 template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+template auto run<Neon::dGridSoA, int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
+
+
+template auto runTemplate<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
+template auto runTemplate<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+template auto runTemplate<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+template auto runTemplate<Neon::dGridSoA, int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
 
 }  // namespace globalIdx
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h
index 0a3b87eb..bcf503f2 100644
--- a/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h
+++ b/libNeonDomain/tests/domain-neighbour-globalIdx/src/testsAndContainers.h
@@ -4,6 +4,7 @@
 
 #include "Neon/domain/Grids.h"
 #include "Neon/domain/tools/TestData.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
 
 
 namespace globalIdx {
@@ -12,9 +13,17 @@ using namespace Neon::domain::tool::testing;
 template <typename G, typename T, int C>
 auto run(TestData<G, T, C>& data) -> void;
 
+template <typename G, typename T, int C>
+auto runTemplate(TestData<G, T, C>& data) -> void;
+
 extern template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
 extern template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
 extern template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+extern template auto run<Neon::dGridSoA , int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
 
+extern template auto runTemplate<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
+extern template auto runTemplate<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+extern template auto runTemplate<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+extern template auto runTemplate<Neon::dGridSoA , int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
 
 }  // namespace map
diff --git a/libNeonDomain/tests/domain-space-filling-curves/CMakeLists.txt b/libNeonDomain/tests/domain-space-filling-curves/CMakeLists.txt
new file mode 100644
index 00000000..76af1689
--- /dev/null
+++ b/libNeonDomain/tests/domain-space-filling-curves/CMakeLists.txt
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.19 FATAL_ERROR)
+
+set(APP_NAME domain-space-filling-curves)
+file(GLOB_RECURSE SrcFiles src/*.*)
+
+add_executable(${APP_NAME} ${SrcFiles})
+
+target_link_libraries(${APP_NAME} 
+	PUBLIC libNeonDomain
+	PUBLIC gtest_main)
+
+set_target_properties(${APP_NAME} PROPERTIES 
+	CUDA_SEPARABLE_COMPILATION ON
+	CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+
+set_target_properties(${APP_NAME} PROPERTIES FOLDER "libNeonDomain")
+source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "${APP_NAME}" FILES ${SrcFiles})
+
+add_test(NAME ${APP_NAME} COMMAND ${APP_NAME})
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/TestInformation.h b/libNeonDomain/tests/domain-space-filling-curves/src/TestInformation.h
new file mode 100644
index 00000000..3ac50ecd
--- /dev/null
+++ b/libNeonDomain/tests/domain-space-filling-curves/src/TestInformation.h
@@ -0,0 +1,17 @@
+#pragma once
+namespace {
+struct TestInformation  // Naming helper for this test application.
+{
+    static auto prefix()  // Common prefix identifying the space-filling-curves test.
+        -> std::string
+    {
+        return "domain-unit-test-space-filling-curves";
+    }
+    // Returns prefix() and gridName joined by a '-'.
+    static auto fullName(const std::string& gridName)
+        -> std::string
+    {
+        return prefix() + "-" + gridName;
+    }
+};
+}  // namespace
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.cu b/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.cu
new file mode 100644
index 00000000..b43ca7f4
--- /dev/null
+++ b/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.cu
@@ -0,0 +1,74 @@
+#include <cmath>
+#include <functional>
+#include <iostream>
+#include "Neon/domain/Grids.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
+#include "Neon/domain/tools/SpaceCurves.h"
+#include "Neon/domain/tools/TestData.h"
+#include "TestInformation.h"
+#include "gtest/gtest.h"
+
+#include <cmath>
+#include <iostream>
+
+namespace space_filling_curves {
+
+template <typename Field>
+auto defHostContainer(Field& filedSweep,
+                      Field& filedMorton,
+                      Field& filedHilbert)
+    -> Neon::set::Container
+{
+    const auto& grid = filedSweep.getGrid();
+    return grid.template newContainer<Neon::Execution::host>(
+        "defContainer",
+        [&](Neon::set::Loader& loader) {
+            auto sweep = loader.load(filedSweep);
+            auto morton = loader.load(filedMorton);
+            auto hilbert = loader.load(filedHilbert);
+
+            return [=] NEON_CUDA_HOST_DEVICE(const typename Field::Idx& gidx) mutable {
+                Neon::index_3d p = sweep.getGlobalIndex(gidx);
+                Neon::index_3d dim = sweep.getDomainSize();
+                using namespace Neon::domain::tool::spaceCurves;
+                sweep(gidx, 0) = Encoder::encode(EncoderType::sweep, dim, p);
+                morton(gidx, 0) = Encoder::encode(EncoderType::morton, dim, p);
+                hilbert(gidx, 0) = Encoder::encode(EncoderType::hilbert, dim, p);
+            };
+        });
+}
+
+
+using namespace Neon::domain::tool::testing;
+
+template <typename G, typename T, int C>
+auto run(TestData<G, T, C>& data) -> void
+{
+
+    using Type = typename TestData<G, T, C>::Type;
+    auto&             grid = data.getGrid();
+    const std::string appName = TestInformation::fullName(grid.getImplementationName());
+
+    data.resetValuesToLinear(1, 100);
+
+    {  // NEON
+        const Neon::index_3d        dim = grid.getDimension();
+        std::vector<Neon::index_3d> elements;
+
+        auto& X = data.getField(FieldNames::X);
+        auto& Y = data.getField(FieldNames::Y);
+        auto& Z = data.getField(FieldNames::Z);
+
+        defHostContainer(X, Y, Z).run(0);
+        data.getBackend().sync(0);
+
+        data.getField(FieldNames::X).ioToVtk("spaceCurveSweep", "code", false);
+        data.getField(FieldNames::Y).ioToVtk("spaceCurveMorton", "code", false);
+        data.getField(FieldNames::Z).ioToVtk("spaceCurveHilbert", "code", false);
+    }
+}
+
+template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
+
+
+}  // namespace space_filling_curves
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.h b/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.h
new file mode 100644
index 00000000..a5b9fd3a
--- /dev/null
+++ b/libNeonDomain/tests/domain-space-filling-curves/src/domain-space-filling-curves.h
@@ -0,0 +1,18 @@
+
+#pragma once
+#include <functional>
+
+#include "Neon/domain/Grids.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
+#include "Neon/domain/tools/TestData.h"
+
+namespace space_filling_curves {
+using namespace Neon::domain::tool::testing;
+// Test entry point; only declared here.
+template <typename G, typename T, int C>
+auto run(TestData<G, T, C>& data) -> void;
+// Neon::dGrid is the only instantiation declared extern in this header.
+extern template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
+
+
+}  // namespace space_filling_curves
diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/goldenEncoding.h b/libNeonDomain/tests/domain-space-filling-curves/src/goldenEncoding.h
new file mode 100644
index 00000000..d6292c4b
--- /dev/null
+++ b/libNeonDomain/tests/domain-space-filling-curves/src/goldenEncoding.h
@@ -0,0 +1,11 @@
+
+#include "Neon/Neon.h"
+#include "domain-space-filling-curves.h"
+#include "gtest/gtest.h"
+#include "runHelper.h"
+
+uint64_t morton_grid_16_16_16[16 * 16* 16] = {
+    0, 4, 32, 36, 256, 260, 288, 292, 2048, 2052, 2080, 2084, 2304, 2308, 2336, 2340, 2, 6, 34, 38, 258, 262, 290, 294, 2050, 2054, 2082, 2086, 2306, 2310, 2338, 2342, 16, 20, 48, 52, 272, 276, 304, 308, 2064, 2068, 2096, 2100, 2320, 2324, 2352, 2356, 18, 22, 50, 54, 274, 278, 306, 310, 2066, 2070, 2098, 2102, 2322, 2326, 2354, 2358, 128, 132, 160, 164, 384, 388, 416, 420, 2176, 2180, 2208, 2212, 2432, 2436, 2464, 2468, 130, 134, 162, 166, 386, 390, 418, 422, 2178, 2182, 2210, 2214, 2434, 2438, 2466, 2470, 144, 148, 176, 180, 400, 404, 432, 436, 2192, 2196, 2224, 2228, 2448, 2452, 2480, 2484, 146, 150, 178, 182, 402, 406, 434, 438, 2194, 2198, 2226, 2230, 2450, 2454, 2482, 2486, 1024, 1028, 1056, 1060, 1280, 1284, 1312, 1316, 3072, 3076, 3104, 3108, 3328, 3332, 3360, 3364, 1026, 1030, 1058, 1062, 1282, 1286, 1314, 1318, 3074, 3078, 3106, 3110, 3330, 3334, 3362, 3366, 1040, 1044, 1072, 1076, 1296, 1300, 1328, 1332, 3088, 3092, 3120, 3124, 3344, 3348, 3376, 3380, 1042, 1046, 1074, 1078, 1298, 1302, 1330, 1334, 3090, 3094, 3122, 3126, 3346, 3350, 3378, 3382, 1152, 1156, 1184, 1188, 1408, 1412, 1440, 1444, 3200, 3204, 3232, 3236, 3456, 3460, 3488, 3492, 1154, 1158, 1186, 1190, 1410, 1414, 1442, 1446, 3202, 3206, 3234, 3238, 3458, 3462, 3490, 3494, 1168, 1172, 1200, 1204, 1424, 1428, 1456, 1460, 3216, 3220, 3248, 3252, 3472, 3476, 3504, 3508, 1170, 1174, 1202, 1206, 1426, 1430, 1458, 1462, 3218, 3222, 3250, 3254, 3474, 3478, 3506, 3510, 1, 5, 33, 37, 257, 261, 289, 293, 2049, 2053, 2081, 2085, 2305, 2309, 2337, 2341, 3, 7, 35, 39, 259, 263, 291, 295, 2051, 2055, 2083, 2087, 2307, 2311, 2339, 2343, 17, 21, 49, 53, 273, 277, 305, 309, 2065, 2069, 2097, 2101, 2321, 2325, 2353, 2357, 19, 23, 51, 55, 275, 279, 307, 311, 2067, 2071, 2099, 2103, 2323, 2327, 2355, 2359, 129, 133, 161, 165, 385, 389, 417, 421, 2177, 2181, 2209, 2213, 2433, 2437, 2465, 2469, 131, 135, 163, 167, 387, 391, 419, 423, 2179, 2183, 2211, 2215, 2435, 2439, 2467, 2471, 145, 149, 177, 181, 401, 405, 433, 
437, 2193, 2197, 2225, 2229, 2449, 2453, 2481, 2485, 147, 151, 179, 183, 403, 407, 435, 439, 2195, 2199, 2227, 2231, 2451, 2455, 2483, 2487, 1025, 1029, 1057, 1061, 1281, 1285, 1313, 1317, 3073, 3077, 3105, 3109, 3329, 3333, 3361, 3365, 1027, 1031, 1059, 1063, 1283, 1287, 1315, 1319, 3075, 3079, 3107, 3111, 3331, 3335, 3363, 3367, 1041, 1045, 1073, 1077, 1297, 1301, 1329, 1333, 3089, 3093, 3121, 3125, 3345, 3349, 3377, 3381, 1043, 1047, 1075, 1079, 1299, 1303, 1331, 1335, 3091, 3095, 3123, 3127, 3347, 3351, 3379, 3383, 1153, 1157, 1185, 1189, 1409, 1413, 1441, 1445, 3201, 3205, 3233, 3237, 3457, 3461, 3489, 3493, 1155, 1159, 1187, 1191, 1411, 1415, 1443, 1447, 3203, 3207, 3235, 3239, 3459, 3463, 3491, 3495, 1169, 1173, 1201, 1205, 1425, 1429, 1457, 1461, 3217, 3221, 3249, 3253, 3473, 3477, 3505, 3509, 1171, 1175, 1203, 1207, 1427, 1431, 1459, 1463, 3219, 3223, 3251, 3255, 3475, 3479, 3507, 3511, 8, 12, 40, 44, 264, 268, 296, 300, 2056, 2060, 2088, 2092, 2312, 2316, 2344, 2348, 10, 14, 42, 46, 266, 270, 298, 302, 2058, 2062, 2090, 2094, 2314, 2318, 2346, 2350, 24, 28, 56, 60, 280, 284, 312, 316, 2072, 2076, 2104, 2108, 2328, 2332, 2360, 2364, 26, 30, 58, 62, 282, 286, 314, 318, 2074, 2078, 2106, 2110, 2330, 2334, 2362, 2366, 136, 140, 168, 172, 392, 396, 424, 428, 2184, 2188, 2216, 2220, 2440, 2444, 2472, 2476, 138, 142, 170, 174, 394, 398, 426, 430, 2186, 2190, 2218, 2222, 2442, 2446, 2474, 2478, 152, 156, 184, 188, 408, 412, 440, 444, 2200, 2204, 2232, 2236, 2456, 2460, 2488, 2492, 154, 158, 186, 190, 410, 414, 442, 446, 2202, 2206, 2234, 2238, 2458, 2462, 2490, 2494, 1032, 1036, 1064, 1068, 1288, 1292, 1320, 1324, 3080, 3084, 3112, 3116, 3336, 3340, 3368, 3372, 1034, 1038, 1066, 1070, 1290, 1294, 1322, 1326, 3082, 3086, 3114, 3118, 3338, 3342, 3370, 3374, 1048, 1052, 1080, 1084, 1304, 1308, 1336, 1340, 3096, 3100, 3128, 3132, 3352, 3356, 3384, 3388, 1050, 1054, 1082, 1086, 1306, 1310, 1338, 1342, 3098, 3102, 3130, 3134, 3354, 3358, 3386, 3390, 1160, 1164, 1192, 
1196, 1416, 1420, 1448, 1452, 3208, 3212, 3240, 3244, 3464, 3468, 3496, 3500, 1162, 1166, 1194, 1198, 1418, 1422, 1450, 1454, 3210, 3214, 3242, 3246, 3466, 3470, 3498, 3502, 1176, 1180, 1208, 1212, 1432, 1436, 1464, 1468, 3224, 3228, 3256, 3260, 3480, 3484, 3512, 3516, 1178, 1182, 1210, 1214, 1434, 1438, 1466, 1470, 3226, 3230, 3258, 3262, 3482, 3486, 3514, 3518, 9, 13, 41, 45, 265, 269, 297, 301, 2057, 2061, 2089, 2093, 2313, 2317, 2345, 2349, 11, 15, 43, 47, 267, 271, 299, 303, 2059, 2063, 2091, 2095, 2315, 2319, 2347, 2351, 25, 29, 57, 61, 281, 285, 313, 317, 2073, 2077, 2105, 2109, 2329, 2333, 2361, 2365, 27, 31, 59, 63, 283, 287, 315, 319, 2075, 2079, 2107, 2111, 2331, 2335, 2363, 2367, 137, 141, 169, 173, 393, 397, 425, 429, 2185, 2189, 2217, 2221, 2441, 2445, 2473, 2477, 139, 143, 171, 175, 395, 399, 427, 431, 2187, 2191, 2219, 2223, 2443, 2447, 2475, 2479, 153, 157, 185, 189, 409, 413, 441, 445, 2201, 2205, 2233, 2237, 2457, 2461, 2489, 2493, 155, 159, 187, 191, 411, 415, 443, 447, 2203, 2207, 2235, 2239, 2459, 2463, 2491, 2495, 1033, 1037, 1065, 1069, 1289, 1293, 1321, 1325, 3081, 3085, 3113, 3117, 3337, 3341, 3369, 3373, 1035, 1039, 1067, 1071, 1291, 1295, 1323, 1327, 3083, 3087, 3115, 3119, 3339, 3343, 3371, 3375, 1049, 1053, 1081, 1085, 1305, 1309, 1337, 1341, 3097, 3101, 3129, 3133, 3353, 3357, 3385, 3389, 1051, 1055, 1083, 1087, 1307, 1311, 1339, 1343, 3099, 3103, 3131, 3135, 3355, 3359, 3387, 3391, 1161, 1165, 1193, 1197, 1417, 1421, 1449, 1453, 3209, 3213, 3241, 3245, 3465, 3469, 3497, 3501, 1163, 1167, 1195, 1199, 1419, 1423, 1451, 1455, 3211, 3215, 3243, 3247, 3467, 3471, 3499, 3503, 1177, 1181, 1209, 1213, 1433, 1437, 1465, 1469, 3225, 3229, 3257, 3261, 3481, 3485, 3513, 3517, 1179, 1183, 1211, 1215, 1435, 1439, 1467, 1471, 3227, 3231, 3259, 3263, 3483, 3487, 3515, 3519, 64, 68, 96, 100, 320, 324, 352, 356, 2112, 2116, 2144, 2148, 2368, 2372, 2400, 2404, 66, 70, 98, 102, 322, 326, 354, 358, 2114, 2118, 2146, 2150, 2370, 2374, 2402, 2406, 80, 84, 
112, 116, 336, 340, 368, 372, 2128, 2132, 2160, 2164, 2384, 2388, 2416, 2420, 82, 86, 114, 118, 338, 342, 370, 374, 2130, 2134, 2162, 2166, 2386, 2390, 2418, 2422, 192, 196, 224, 228, 448, 452, 480, 484, 2240, 2244, 2272, 2276, 2496, 2500, 2528, 2532, 194, 198, 226, 230, 450, 454, 482, 486, 2242, 2246, 2274, 2278, 2498, 2502, 2530, 2534, 208, 212, 240, 244, 464, 468, 496, 500, 2256, 2260, 2288, 2292, 2512, 2516, 2544, 2548, 210, 214, 242, 246, 466, 470, 498, 502, 2258, 2262, 2290, 2294, 2514, 2518, 2546, 2550, 1088, 1092, 1120, 1124, 1344, 1348, 1376, 1380, 3136, 3140, 3168, 3172, 3392, 3396, 3424, 3428, 1090, 1094, 1122, 1126, 1346, 1350, 1378, 1382, 3138, 3142, 3170, 3174, 3394, 3398, 3426, 3430, 1104, 1108, 1136, 1140, 1360, 1364, 1392, 1396, 3152, 3156, 3184, 3188, 3408, 3412, 3440, 3444, 1106, 1110, 1138, 1142, 1362, 1366, 1394, 1398, 3154, 3158, 3186, 3190, 3410, 3414, 3442, 3446, 1216, 1220, 1248, 1252, 1472, 1476, 1504, 1508, 3264, 3268, 3296, 3300, 3520, 3524, 3552, 3556, 1218, 1222, 1250, 1254, 1474, 1478, 1506, 1510, 3266, 3270, 3298, 3302, 3522, 3526, 3554, 3558, 1232, 1236, 1264, 1268, 1488, 1492, 1520, 1524, 3280, 3284, 3312, 3316, 3536, 3540, 3568, 3572, 1234, 1238, 1266, 1270, 1490, 1494, 1522, 1526, 3282, 3286, 3314, 3318, 3538, 3542, 3570, 3574, 65, 69, 97, 101, 321, 325, 353, 357, 2113, 2117, 2145, 2149, 2369, 2373, 2401, 2405, 67, 71, 99, 103, 323, 327, 355, 359, 2115, 2119, 2147, 2151, 2371, 2375, 2403, 2407, 81, 85, 113, 117, 337, 341, 369, 373, 2129, 2133, 2161, 2165, 2385, 2389, 2417, 2421, 83, 87, 115, 119, 339, 343, 371, 375, 2131, 2135, 2163, 2167, 2387, 2391, 2419, 2423, 193, 197, 225, 229, 449, 453, 481, 485, 2241, 2245, 2273, 2277, 2497, 2501, 2529, 2533, 195, 199, 227, 231, 451, 455, 483, 487, 2243, 2247, 2275, 2279, 2499, 2503, 2531, 2535, 209, 213, 241, 245, 465, 469, 497, 501, 2257, 2261, 2289, 2293, 2513, 2517, 2545, 2549, 211, 215, 243, 247, 467, 471, 499, 503, 2259, 2263, 2291, 2295, 2515, 2519, 2547, 2551, 1089, 1093, 1121, 
1125, 1345, 1349, 1377, 1381, 3137, 3141, 3169, 3173, 3393, 3397, 3425, 3429, 1091, 1095, 1123, 1127, 1347, 1351, 1379, 1383, 3139, 3143, 3171, 3175, 3395, 3399, 3427, 3431, 1105, 1109, 1137, 1141, 1361, 1365, 1393, 1397, 3153, 3157, 3185, 3189, 3409, 3413, 3441, 3445, 1107, 1111, 1139, 1143, 1363, 1367, 1395, 1399, 3155, 3159, 3187, 3191, 3411, 3415, 3443, 3447, 1217, 1221, 1249, 1253, 1473, 1477, 1505, 1509, 3265, 3269, 3297, 3301, 3521, 3525, 3553, 3557, 1219, 1223, 1251, 1255, 1475, 1479, 1507, 1511, 3267, 3271, 3299, 3303, 3523, 3527, 3555, 3559, 1233, 1237, 1265, 1269, 1489, 1493, 1521, 1525, 3281, 3285, 3313, 3317, 3537, 3541, 3569, 3573, 1235, 1239, 1267, 1271, 1491, 1495, 1523, 1527, 3283, 3287, 3315, 3319, 3539, 3543, 3571, 3575, 72, 76, 104, 108, 328, 332, 360, 364, 2120, 2124, 2152, 2156, 2376, 2380, 2408, 2412, 74, 78, 106, 110, 330, 334, 362, 366, 2122, 2126, 2154, 2158, 2378, 2382, 2410, 2414, 88, 92, 120, 124, 344, 348, 376, 380, 2136, 2140, 2168, 2172, 2392, 2396, 2424, 2428, 90, 94, 122, 126, 346, 350, 378, 382, 2138, 2142, 2170, 2174, 2394, 2398, 2426, 2430, 200, 204, 232, 236, 456, 460, 488, 492, 2248, 2252, 2280, 2284, 2504, 2508, 2536, 2540, 202, 206, 234, 238, 458, 462, 490, 494, 2250, 2254, 2282, 2286, 2506, 2510, 2538, 2542, 216, 220, 248, 252, 472, 476, 504, 508, 2264, 2268, 2296, 2300, 2520, 2524, 2552, 2556, 218, 222, 250, 254, 474, 478, 506, 510, 2266, 2270, 2298, 2302, 2522, 2526, 2554, 2558, 1096, 1100, 1128, 1132, 1352, 1356, 1384, 1388, 3144, 3148, 3176, 3180, 3400, 3404, 3432, 3436, 1098, 1102, 1130, 1134, 1354, 1358, 1386, 1390, 3146, 3150, 3178, 3182, 3402, 3406, 3434, 3438, 1112, 1116, 1144, 1148, 1368, 1372, 1400, 1404, 3160, 3164, 3192, 3196, 3416, 3420, 3448, 3452, 1114, 1118, 1146, 1150, 1370, 1374, 1402, 1406, 3162, 3166, 3194, 3198, 3418, 3422, 3450, 3454, 1224, 1228, 1256, 1260, 1480, 1484, 1512, 1516, 3272, 3276, 3304, 3308, 3528, 3532, 3560, 3564, 1226, 1230, 1258, 1262, 1482, 1486, 1514, 1518, 3274, 3278, 3306, 3310, 
3530, 3534, 3562, 3566, 1240, 1244, 1272, 1276, 1496, 1500, 1528, 1532, 3288, 3292, 3320, 3324, 3544, 3548, 3576, 3580, 1242, 1246, 1274, 1278, 1498, 1502, 1530, 1534, 3290, 3294, 3322, 3326, 3546, 3550, 3578, 3582, 73, 77, 105, 109, 329, 333, 361, 365, 2121, 2125, 2153, 2157, 2377, 2381, 2409, 2413, 75, 79, 107, 111, 331, 335, 363, 367, 2123, 2127, 2155, 2159, 2379, 2383, 2411, 2415, 89, 93, 121, 125, 345, 349, 377, 381, 2137, 2141, 2169, 2173, 2393, 2397, 2425, 2429, 91, 95, 123, 127, 347, 351, 379, 383, 2139, 2143, 2171, 2175, 2395, 2399, 2427, 2431, 201, 205, 233, 237, 457, 461, 489, 493, 2249, 2253, 2281, 2285, 2505, 2509, 2537, 2541, 203, 207, 235, 239, 459, 463, 491, 495, 2251, 2255, 2283, 2287, 2507, 2511, 2539, 2543, 217, 221, 249, 253, 473, 477, 505, 509, 2265, 2269, 2297, 2301, 2521, 2525, 2553, 2557, 219, 223, 251, 255, 475, 479, 507, 511, 2267, 2271, 2299, 2303, 2523, 2527, 2555, 2559, 1097, 1101, 1129, 1133, 1353, 1357, 1385, 1389, 3145, 3149, 3177, 3181, 3401, 3405, 3433, 3437, 1099, 1103, 1131, 1135, 1355, 1359, 1387, 1391, 3147, 3151, 3179, 3183, 3403, 3407, 3435, 3439, 1113, 1117, 1145, 1149, 1369, 1373, 1401, 1405, 3161, 3165, 3193, 3197, 3417, 3421, 3449, 3453, 1115, 1119, 1147, 1151, 1371, 1375, 1403, 1407, 3163, 3167, 3195, 3199, 3419, 3423, 3451, 3455, 1225, 1229, 1257, 1261, 1481, 1485, 1513, 1517, 3273, 3277, 3305, 3309, 3529, 3533, 3561, 3565, 1227, 1231, 1259, 1263, 1483, 1487, 1515, 1519, 3275, 3279, 3307, 3311, 3531, 3535, 3563, 3567, 1241, 1245, 1273, 1277, 1497, 1501, 1529, 1533, 3289, 3293, 3321, 3325, 3545, 3549, 3577, 3581, 1243, 1247, 1275, 1279, 1499, 1503, 1531, 1535, 3291, 3295, 3323, 3327, 3547, 3551, 3579, 3583, 512, 516, 544, 548, 768, 772, 800, 804, 2560, 2564, 2592, 2596, 2816, 2820, 2848, 2852, 514, 518, 546, 550, 770, 774, 802, 806, 2562, 2566, 2594, 2598, 2818, 2822, 2850, 2854, 528, 532, 560, 564, 784, 788, 816, 820, 2576, 2580, 2608, 2612, 2832, 2836, 2864, 2868, 530, 534, 562, 566, 786, 790, 818, 822, 2578, 2582, 
2610, 2614, 2834, 2838, 2866, 2870, 640, 644, 672, 676, 896, 900, 928, 932, 2688, 2692, 2720, 2724, 2944, 2948, 2976, 2980, 642, 646, 674, 678, 898, 902, 930, 934, 2690, 2694, 2722, 2726, 2946, 2950, 2978, 2982, 656, 660, 688, 692, 912, 916, 944, 948, 2704, 2708, 2736, 2740, 2960, 2964, 2992, 2996, 658, 662, 690, 694, 914, 918, 946, 950, 2706, 2710, 2738, 2742, 2962, 2966, 2994, 2998, 1536, 1540, 1568, 1572, 1792, 1796, 1824, 1828, 3584, 3588, 3616, 3620, 3840, 3844, 3872, 3876, 1538, 1542, 1570, 1574, 1794, 1798, 1826, 1830, 3586, 3590, 3618, 3622, 3842, 3846, 3874, 3878, 1552, 1556, 1584, 1588, 1808, 1812, 1840, 1844, 3600, 3604, 3632, 3636, 3856, 3860, 3888, 3892, 1554, 1558, 1586, 1590, 1810, 1814, 1842, 1846, 3602, 3606, 3634, 3638, 3858, 3862, 3890, 3894, 1664, 1668, 1696, 1700, 1920, 1924, 1952, 1956, 3712, 3716, 3744, 3748, 3968, 3972, 4000, 4004, 1666, 1670, 1698, 1702, 1922, 1926, 1954, 1958, 3714, 3718, 3746, 3750, 3970, 3974, 4002, 4006, 1680, 1684, 1712, 1716, 1936, 1940, 1968, 1972, 3728, 3732, 3760, 3764, 3984, 3988, 4016, 4020, 1682, 1686, 1714, 1718, 1938, 1942, 1970, 1974, 3730, 3734, 3762, 3766, 3986, 3990, 4018, 4022, 513, 517, 545, 549, 769, 773, 801, 805, 2561, 2565, 2593, 2597, 2817, 2821, 2849, 2853, 515, 519, 547, 551, 771, 775, 803, 807, 2563, 2567, 2595, 2599, 2819, 2823, 2851, 2855, 529, 533, 561, 565, 785, 789, 817, 821, 2577, 2581, 2609, 2613, 2833, 2837, 2865, 2869, 531, 535, 563, 567, 787, 791, 819, 823, 2579, 2583, 2611, 2615, 2835, 2839, 2867, 2871, 641, 645, 673, 677, 897, 901, 929, 933, 2689, 2693, 2721, 2725, 2945, 2949, 2977, 2981, 643, 647, 675, 679, 899, 903, 931, 935, 2691, 2695, 2723, 2727, 2947, 2951, 2979, 2983, 657, 661, 689, 693, 913, 917, 945, 949, 2705, 2709, 2737, 2741, 2961, 2965, 2993, 2997, 659, 663, 691, 695, 915, 919, 947, 951, 2707, 2711, 2739, 2743, 2963, 2967, 2995, 2999, 1537, 1541, 1569, 1573, 1793, 1797, 1825, 1829, 3585, 3589, 3617, 3621, 3841, 3845, 3873, 3877, 1539, 1543, 1571, 1575, 1795, 1799, 1827, 
1831, 3587, 3591, 3619, 3623, 3843, 3847, 3875, 3879, 1553, 1557, 1585, 1589, 1809, 1813, 1841, 1845, 3601, 3605, 3633, 3637, 3857, 3861, 3889, 3893, 1555, 1559, 1587, 1591, 1811, 1815, 1843, 1847, 3603, 3607, 3635, 3639, 3859, 3863, 3891, 3895, 1665, 1669, 1697, 1701, 1921, 1925, 1953, 1957, 3713, 3717, 3745, 3749, 3969, 3973, 4001, 4005, 1667, 1671, 1699, 1703, 1923, 1927, 1955, 1959, 3715, 3719, 3747, 3751, 3971, 3975, 4003, 4007, 1681, 1685, 1713, 1717, 1937, 1941, 1969, 1973, 3729, 3733, 3761, 3765, 3985, 3989, 4017, 4021, 1683, 1687, 1715, 1719, 1939, 1943, 1971, 1975, 3731, 3735, 3763, 3767, 3987, 3991, 4019, 4023, 520, 524, 552, 556, 776, 780, 808, 812, 2568, 2572, 2600, 2604, 2824, 2828, 2856, 2860, 522, 526, 554, 558, 778, 782, 810, 814, 2570, 2574, 2602, 2606, 2826, 2830, 2858, 2862, 536, 540, 568, 572, 792, 796, 824, 828, 2584, 2588, 2616, 2620, 2840, 2844, 2872, 2876, 538, 542, 570, 574, 794, 798, 826, 830, 2586, 2590, 2618, 2622, 2842, 2846, 2874, 2878, 648, 652, 680, 684, 904, 908, 936, 940, 2696, 2700, 2728, 2732, 2952, 2956, 2984, 2988, 650, 654, 682, 686, 906, 910, 938, 942, 2698, 2702, 2730, 2734, 2954, 2958, 2986, 2990, 664, 668, 696, 700, 920, 924, 952, 956, 2712, 2716, 2744, 2748, 2968, 2972, 3000, 3004, 666, 670, 698, 702, 922, 926, 954, 958, 2714, 2718, 2746, 2750, 2970, 2974, 3002, 3006, 1544, 1548, 1576, 1580, 1800, 1804, 1832, 1836, 3592, 3596, 3624, 3628, 3848, 3852, 3880, 3884, 1546, 1550, 1578, 1582, 1802, 1806, 1834, 1838, 3594, 3598, 3626, 3630, 3850, 3854, 3882, 3886, 1560, 1564, 1592, 1596, 1816, 1820, 1848, 1852, 3608, 3612, 3640, 3644, 3864, 3868, 3896, 3900, 1562, 1566, 1594, 1598, 1818, 1822, 1850, 1854, 3610, 3614, 3642, 3646, 3866, 3870, 3898, 3902, 1672, 1676, 1704, 1708, 1928, 1932, 1960, 1964, 3720, 3724, 3752, 3756, 3976, 3980, 4008, 4012, 1674, 1678, 1706, 1710, 1930, 1934, 1962, 1966, 3722, 3726, 3754, 3758, 3978, 3982, 4010, 4014, 1688, 1692, 1720, 1724, 1944, 1948, 1976, 1980, 3736, 3740, 3768, 3772, 3992, 3996, 4024, 
4028, 1690, 1694, 1722, 1726, 1946, 1950, 1978, 1982, 3738, 3742, 3770, 3774, 3994, 3998, 4026, 4030, 521, 525, 553, 557, 777, 781, 809, 813, 2569, 2573, 2601, 2605, 2825, 2829, 2857, 2861, 523, 527, 555, 559, 779, 783, 811, 815, 2571, 2575, 2603, 2607, 2827, 2831, 2859, 2863, 537, 541, 569, 573, 793, 797, 825, 829, 2585, 2589, 2617, 2621, 2841, 2845, 2873, 2877, 539, 543, 571, 575, 795, 799, 827, 831, 2587, 2591, 2619, 2623, 2843, 2847, 2875, 2879, 649, 653, 681, 685, 905, 909, 937, 941, 2697, 2701, 2729, 2733, 2953, 2957, 2985, 2989, 651, 655, 683, 687, 907, 911, 939, 943, 2699, 2703, 2731, 2735, 2955, 2959, 2987, 2991, 665, 669, 697, 701, 921, 925, 953, 957, 2713, 2717, 2745, 2749, 2969, 2973, 3001, 3005, 667, 671, 699, 703, 923, 927, 955, 959, 2715, 2719, 2747, 2751, 2971, 2975, 3003, 3007, 1545, 1549, 1577, 1581, 1801, 1805, 1833, 1837, 3593, 3597, 3625, 3629, 3849, 3853, 3881, 3885, 1547, 1551, 1579, 1583, 1803, 1807, 1835, 1839, 3595, 3599, 3627, 3631, 3851, 3855, 3883, 3887, 1561, 1565, 1593, 1597, 1817, 1821, 1849, 1853, 3609, 3613, 3641, 3645, 3865, 3869, 3897, 3901, 1563, 1567, 1595, 1599, 1819, 1823, 1851, 1855, 3611, 3615, 3643, 3647, 3867, 3871, 3899, 3903, 1673, 1677, 1705, 1709, 1929, 1933, 1961, 1965, 3721, 3725, 3753, 3757, 3977, 3981, 4009, 4013, 1675, 1679, 1707, 1711, 1931, 1935, 1963, 1967, 3723, 3727, 3755, 3759, 3979, 3983, 4011, 4015, 1689, 1693, 1721, 1725, 1945, 1949, 1977, 1981, 3737, 3741, 3769, 3773, 3993, 3997, 4025, 4029, 1691, 1695, 1723, 1727, 1947, 1951, 1979, 1983, 3739, 3743, 3771, 3775, 3995, 3999, 4027, 4031, 576, 580, 608, 612, 832, 836, 864, 868, 2624, 2628, 2656, 2660, 2880, 2884, 2912, 2916, 578, 582, 610, 614, 834, 838, 866, 870, 2626, 2630, 2658, 2662, 2882, 2886, 2914, 2918, 592, 596, 624, 628, 848, 852, 880, 884, 2640, 2644, 2672, 2676, 2896, 2900, 2928, 2932, 594, 598, 626, 630, 850, 854, 882, 886, 2642, 2646, 2674, 2678, 2898, 2902, 2930, 2934, 704, 708, 736, 740, 960, 964, 992, 996, 2752, 2756, 2784, 2788, 3008, 
3012, 3040, 3044, 706, 710, 738, 742, 962, 966, 994, 998, 2754, 2758, 2786, 2790, 3010, 3014, 3042, 3046, 720, 724, 752, 756, 976, 980, 1008, 1012, 2768, 2772, 2800, 2804, 3024, 3028, 3056, 3060, 722, 726, 754, 758, 978, 982, 1010, 1014, 2770, 2774, 2802, 2806, 3026, 3030, 3058, 3062, 1600, 1604, 1632, 1636, 1856, 1860, 1888, 1892, 3648, 3652, 3680, 3684, 3904, 3908, 3936, 3940, 1602, 1606, 1634, 1638, 1858, 1862, 1890, 1894, 3650, 3654, 3682, 3686, 3906, 3910, 3938, 3942, 1616, 1620, 1648, 1652, 1872, 1876, 1904, 1908, 3664, 3668, 3696, 3700, 3920, 3924, 3952, 3956, 1618, 1622, 1650, 1654, 1874, 1878, 1906, 1910, 3666, 3670, 3698, 3702, 3922, 3926, 3954, 3958, 1728, 1732, 1760, 1764, 1984, 1988, 2016, 2020, 3776, 3780, 3808, 3812, 4032, 4036, 4064, 4068, 1730, 1734, 1762, 1766, 1986, 1990, 2018, 2022, 3778, 3782, 3810, 3814, 4034, 4038, 4066, 4070, 1744, 1748, 1776, 1780, 2000, 2004, 2032, 2036, 3792, 3796, 3824, 3828, 4048, 4052, 4080, 4084, 1746, 1750, 1778, 1782, 2002, 2006, 2034, 2038, 3794, 3798, 3826, 3830, 4050, 4054, 4082, 4086, 577, 581, 609, 613, 833, 837, 865, 869, 2625, 2629, 2657, 2661, 2881, 2885, 2913, 2917, 579, 583, 611, 615, 835, 839, 867, 871, 2627, 2631, 2659, 2663, 2883, 2887, 2915, 2919, 593, 597, 625, 629, 849, 853, 881, 885, 2641, 2645, 2673, 2677, 2897, 2901, 2929, 2933, 595, 599, 627, 631, 851, 855, 883, 887, 2643, 2647, 2675, 2679, 2899, 2903, 2931, 2935, 705, 709, 737, 741, 961, 965, 993, 997, 2753, 2757, 2785, 2789, 3009, 3013, 3041, 3045, 707, 711, 739, 743, 963, 967, 995, 999, 2755, 2759, 2787, 2791, 3011, 3015, 3043, 3047, 721, 725, 753, 757, 977, 981, 1009, 1013, 2769, 2773, 2801, 2805, 3025, 3029, 3057, 3061, 723, 727, 755, 759, 979, 983, 1011, 1015, 2771, 2775, 2803, 2807, 3027, 3031, 3059, 3063, 1601, 1605, 1633, 1637, 1857, 1861, 1889, 1893, 3649, 3653, 3681, 3685, 3905, 3909, 3937, 3941, 1603, 1607, 1635, 1639, 1859, 1863, 1891, 1895, 3651, 3655, 3683, 3687, 3907, 3911, 3939, 3943, 1617, 1621, 1649, 1653, 1873, 1877, 1905, 
1909, 3665, 3669, 3697, 3701, 3921, 3925, 3953, 3957, 1619, 1623, 1651, 1655, 1875, 1879, 1907, 1911, 3667, 3671, 3699, 3703, 3923, 3927, 3955, 3959, 1729, 1733, 1761, 1765, 1985, 1989, 2017, 2021, 3777, 3781, 3809, 3813, 4033, 4037, 4065, 4069, 1731, 1735, 1763, 1767, 1987, 1991, 2019, 2023, 3779, 3783, 3811, 3815, 4035, 4039, 4067, 4071, 1745, 1749, 1777, 1781, 2001, 2005, 2033, 2037, 3793, 3797, 3825, 3829, 4049, 4053, 4081, 4085, 1747, 1751, 1779, 1783, 2003, 2007, 2035, 2039, 3795, 3799, 3827, 3831, 4051, 4055, 4083, 4087, 584, 588, 616, 620, 840, 844, 872, 876, 2632, 2636, 2664, 2668, 2888, 2892, 2920, 2924, 586, 590, 618, 622, 842, 846, 874, 878, 2634, 2638, 2666, 2670, 2890, 2894, 2922, 2926, 600, 604, 632, 636, 856, 860, 888, 892, 2648, 2652, 2680, 2684, 2904, 2908, 2936, 2940, 602, 606, 634, 638, 858, 862, 890, 894, 2650, 2654, 2682, 2686, 2906, 2910, 2938, 2942, 712, 716, 744, 748, 968, 972, 1000, 1004, 2760, 2764, 2792, 2796, 3016, 3020, 3048, 3052, 714, 718, 746, 750, 970, 974, 1002, 1006, 2762, 2766, 2794, 2798, 3018, 3022, 3050, 3054, 728, 732, 760, 764, 984, 988, 1016, 1020, 2776, 2780, 2808, 2812, 3032, 3036, 3064, 3068, 730, 734, 762, 766, 986, 990, 1018, 1022, 2778, 2782, 2810, 2814, 3034, 3038, 3066, 3070, 1608, 1612, 1640, 1644, 1864, 1868, 1896, 1900, 3656, 3660, 3688, 3692, 3912, 3916, 3944, 3948, 1610, 1614, 1642, 1646, 1866, 1870, 1898, 1902, 3658, 3662, 3690, 3694, 3914, 3918, 3946, 3950, 1624, 1628, 1656, 1660, 1880, 1884, 1912, 1916, 3672, 3676, 3704, 3708, 3928, 3932, 3960, 3964, 1626, 1630, 1658, 1662, 1882, 1886, 1914, 1918, 3674, 3678, 3706, 3710, 3930, 3934, 3962, 3966, 1736, 1740, 1768, 1772, 1992, 1996, 2024, 2028, 3784, 3788, 3816, 3820, 4040, 4044, 4072, 4076, 1738, 1742, 1770, 1774, 1994, 1998, 2026, 2030, 3786, 3790, 3818, 3822, 4042, 4046, 4074, 4078, 1752, 1756, 1784, 1788, 2008, 2012, 2040, 2044, 3800, 3804, 3832, 3836, 4056, 4060, 4088, 4092, 1754, 1758, 1786, 1790, 2010, 2014, 2042, 2046, 3802, 3806, 3834, 3838, 4058, 
4062, 4090, 4094, 585, 589, 617, 621, 841, 845, 873, 877, 2633, 2637, 2665, 2669, 2889, 2893, 2921, 2925, 587, 591, 619, 623, 843, 847, 875, 879, 2635, 2639, 2667, 2671, 2891, 2895, 2923, 2927, 601, 605, 633, 637, 857, 861, 889, 893, 2649, 2653, 2681, 2685, 2905, 2909, 2937, 2941, 603, 607, 635, 639, 859, 863, 891, 895, 2651, 2655, 2683, 2687, 2907, 2911, 2939, 2943, 713, 717, 745, 749, 969, 973, 1001, 1005, 2761, 2765, 2793, 2797, 3017, 3021, 3049, 3053, 715, 719, 747, 751, 971, 975, 1003, 1007, 2763, 2767, 2795, 2799, 3019, 3023, 3051, 3055, 729, 733, 761, 765, 985, 989, 1017, 1021, 2777, 2781, 2809, 2813, 3033, 3037, 3065, 3069, 731, 735, 763, 767, 987, 991, 1019, 1023, 2779, 2783, 2811, 2815, 3035, 3039, 3067, 3071, 1609, 1613, 1641, 1645, 1865, 1869, 1897, 1901, 3657, 3661, 3689, 3693, 3913, 3917, 3945, 3949, 1611, 1615, 1643, 1647, 1867, 1871, 1899, 1903, 3659, 3663, 3691, 3695, 3915, 3919, 3947, 3951, 1625, 1629, 1657, 1661, 1881, 1885, 1913, 1917, 3673, 3677, 3705, 3709, 3929, 3933, 3961, 3965, 1627, 1631, 1659, 1663, 1883, 1887, 1915, 1919, 3675, 3679, 3707, 3711, 3931, 3935, 3963, 3967, 1737, 1741, 1769, 1773, 1993, 1997, 2025, 2029, 3785, 3789, 3817, 3821, 4041, 4045, 4073, 4077, 1739, 1743, 1771, 1775, 1995, 1999, 2027, 2031, 3787, 3791, 3819, 3823, 4043, 4047, 4075, 4079, 1753, 1757, 1785, 1789, 2009, 2013, 2041, 2045, 3801, 3805, 3833, 3837, 4057, 4061, 4089, 4093, 1755, 1759, 1787, 1791, 2011, 2015, 2043, 2047, 3803, 3807, 3835, 3839, 4059, 4063, 4091, 4095};
+
+uint64_t hilbert_grid_16_16_16[16 * 16* 16] = {
+    0, 7, 8, 11, 212, 211, 204, 203, 3892, 3891, 3884, 3883, 4084, 4087, 4088, 4095, 3, 4, 9, 10, 215, 208, 207, 200, 3895, 3888, 3887, 3880, 4085, 4086, 4091, 4092, 60, 59, 54, 53, 216, 219, 198, 199, 3896, 3897, 3876, 3879, 4042, 4041, 4036, 4035, 63, 56, 55, 52, 217, 218, 193, 192, 3903, 3902, 3877, 3878, 4043, 4040, 4039, 4032, 64, 67, 124, 127, 128, 131, 188, 191, 3904, 3907, 3964, 3967, 3968, 3971, 4028, 4031, 65, 66, 125, 126, 129, 130, 189, 190, 3905, 3906, 3965, 3966, 3969, 3970, 4029, 4030, 90, 93, 98, 101, 154, 157, 162, 165, 3930, 3933, 3938, 3941, 3994, 3997, 4002, 4005, 89, 94, 97, 102, 153, 158, 161, 166, 3929, 3934, 3937, 3942, 3993, 3998, 4001, 4006, 1702, 1703, 1704, 1707, 1876, 1879, 1880, 1881, 2214, 2215, 2216, 2219, 2388, 2391, 2392, 2393, 1697, 1696, 1705, 1706, 1877, 1878, 1887, 1886, 2209, 2208, 2217, 2218, 2389, 2390, 2399, 2398, 1694, 1695, 1686, 1685, 1898, 1897, 1888, 1889, 2206, 2207, 2198, 2197, 2410, 2409, 2400, 2401, 1689, 1688, 1687, 1684, 1899, 1896, 1895, 1894, 2201, 2200, 2199, 2196, 2411, 2408, 2407, 2406, 1638, 1639, 1640, 1643, 1940, 1943, 1944, 1945, 2150, 2151, 2152, 2155, 2452, 2455, 2456, 2457, 1633, 1632, 1641, 1642, 1941, 1942, 1951, 1950, 2145, 2144, 2153, 2154, 2453, 2454, 2463, 2462, 1630, 1631, 1622, 1621, 1962, 1961, 1952, 1953, 2142, 2143, 2134, 2133, 2474, 2473, 2464, 2465, 1625, 1624, 1623, 1620, 1963, 1960, 1959, 1958, 2137, 2136, 2135, 2132, 2475, 2472, 2471, 2470, 1, 6, 15, 12, 213, 210, 205, 202, 3893, 3890, 3885, 3882, 4083, 4080, 4089, 4094, 2, 5, 14, 13, 214, 209, 206, 201, 3894, 3889, 3886, 3881, 4082, 4081, 4090, 4093, 61, 58, 49, 50, 223, 220, 197, 196, 3899, 3898, 3875, 3872, 4045, 4046, 4037, 4034, 62, 57, 48, 51, 222, 221, 194, 195, 3900, 3901, 3874, 3873, 4044, 4047, 4038, 4033, 71, 68, 123, 120, 135, 132, 187, 184, 3911, 3908, 3963, 3960, 3975, 3972, 4027, 4024, 70, 69, 122, 121, 134, 133, 186, 185, 3910, 3909, 3962, 3961, 3974, 3973, 4026, 4025, 91, 92, 99, 100, 155, 156, 163, 164, 3931, 3932, 
3939, 3940, 3995, 3996, 4003, 4004, 88, 95, 96, 103, 152, 159, 160, 167, 3928, 3935, 3936, 3943, 3992, 3999, 4000, 4007, 1701, 1700, 1711, 1708, 1875, 1872, 1883, 1882, 2213, 2212, 2223, 2220, 2387, 2384, 2395, 2394, 1698, 1699, 1710, 1709, 1874, 1873, 1884, 1885, 2210, 2211, 2222, 2221, 2386, 2385, 2396, 2397, 1693, 1692, 1681, 1682, 1901, 1902, 1891, 1890, 2205, 2204, 2193, 2194, 2413, 2414, 2403, 2402, 1690, 1691, 1680, 1683, 1900, 1903, 1892, 1893, 2202, 2203, 2192, 2195, 2412, 2415, 2404, 2405, 1637, 1636, 1647, 1644, 1939, 1936, 1947, 1946, 2149, 2148, 2159, 2156, 2451, 2448, 2459, 2458, 1634, 1635, 1646, 1645, 1938, 1937, 1948, 1949, 2146, 2147, 2158, 2157, 2450, 2449, 2460, 2461, 1629, 1628, 1617, 1618, 1965, 1966, 1955, 1954, 2141, 2140, 2129, 2130, 2477, 2478, 2467, 2466, 1626, 1627, 1616, 1619, 1964, 1967, 1956, 1957, 2138, 2139, 2128, 2131, 2476, 2479, 2468, 2469, 26, 27, 16, 19, 234, 237, 242, 245, 3850, 3853, 3858, 3861, 4076, 4079, 4068, 4069, 29, 28, 17, 18, 233, 238, 241, 246, 3849, 3854, 3857, 3862, 4077, 4078, 4067, 4066, 34, 35, 46, 45, 224, 227, 250, 251, 3844, 3845, 3868, 3871, 4050, 4049, 4060, 4061, 37, 36, 47, 44, 225, 226, 253, 252, 3843, 3842, 3869, 3870, 4051, 4048, 4059, 4058, 72, 73, 118, 119, 136, 137, 182, 183, 3912, 3913, 3958, 3959, 3976, 3977, 4022, 4023, 79, 78, 113, 112, 143, 142, 177, 176, 3919, 3918, 3953, 3952, 3983, 3982, 4017, 4016, 80, 81, 110, 111, 144, 145, 174, 175, 3920, 3921, 3950, 3951, 3984, 3985, 4014, 4015, 87, 86, 105, 104, 151, 150, 169, 168, 3927, 3926, 3945, 3944, 3991, 3990, 4009, 4008, 1726, 1721, 1712, 1715, 1868, 1871, 1862, 1857, 2238, 2233, 2224, 2227, 2380, 2383, 2374, 2369, 1725, 1722, 1713, 1714, 1869, 1870, 1861, 1858, 2237, 2234, 2225, 2226, 2381, 2382, 2373, 2370, 1666, 1669, 1678, 1677, 1906, 1905, 1914, 1917, 2178, 2181, 2190, 2189, 2418, 2417, 2426, 2429, 1665, 1670, 1679, 1676, 1907, 1904, 1913, 1918, 2177, 2182, 2191, 2188, 2419, 2416, 2425, 2430, 1662, 1657, 1648, 1651, 1932, 1935, 1926, 
1921, 2174, 2169, 2160, 2163, 2444, 2447, 2438, 2433, 1661, 1658, 1649, 1650, 1933, 1934, 1925, 1922, 2173, 2170, 2161, 2162, 2445, 2446, 2437, 2434, 1602, 1605, 1614, 1613, 1970, 1969, 1978, 1981, 2114, 2117, 2126, 2125, 2482, 2481, 2490, 2493, 1601, 1606, 1615, 1612, 1971, 1968, 1977, 1982, 2113, 2118, 2127, 2124, 2483, 2480, 2489, 2494, 25, 24, 23, 20, 235, 236, 243, 244, 3851, 3852, 3859, 3860, 4075, 4072, 4071, 4070, 30, 31, 22, 21, 232, 239, 240, 247, 3848, 3855, 3856, 3863, 4074, 4073, 4064, 4065, 33, 32, 41, 42, 231, 228, 249, 248, 3847, 3846, 3867, 3864, 4053, 4054, 4063, 4062, 38, 39, 40, 43, 230, 229, 254, 255, 3840, 3841, 3866, 3865, 4052, 4055, 4056, 4057, 75, 74, 117, 116, 139, 138, 181, 180, 3915, 3914, 3957, 3956, 3979, 3978, 4021, 4020, 76, 77, 114, 115, 140, 141, 178, 179, 3916, 3917, 3954, 3955, 3980, 3981, 4018, 4019, 83, 82, 109, 108, 147, 146, 173, 172, 3923, 3922, 3949, 3948, 3987, 3986, 4013, 4012, 84, 85, 106, 107, 148, 149, 170, 171, 3924, 3925, 3946, 3947, 3988, 3989, 4010, 4011, 1727, 1720, 1719, 1716, 1867, 1864, 1863, 1856, 2239, 2232, 2231, 2228, 2379, 2376, 2375, 2368, 1724, 1723, 1718, 1717, 1866, 1865, 1860, 1859, 2236, 2235, 2230, 2229, 2378, 2377, 2372, 2371, 1667, 1668, 1673, 1674, 1909, 1910, 1915, 1916, 2179, 2180, 2185, 2186, 2421, 2422, 2427, 2428, 1664, 1671, 1672, 1675, 1908, 1911, 1912, 1919, 2176, 2183, 2184, 2187, 2420, 2423, 2424, 2431, 1663, 1656, 1655, 1652, 1931, 1928, 1927, 1920, 2175, 2168, 2167, 2164, 2443, 2440, 2439, 2432, 1660, 1659, 1654, 1653, 1930, 1929, 1924, 1923, 2172, 2171, 2166, 2165, 2442, 2441, 2436, 2435, 1603, 1604, 1609, 1610, 1973, 1974, 1979, 1980, 2115, 2116, 2121, 2122, 2485, 2486, 2491, 2492, 1600, 1607, 1608, 1611, 1972, 1975, 1976, 1983, 2112, 2119, 2120, 2123, 2484, 2487, 2488, 2495, 486, 487, 488, 491, 276, 275, 268, 267, 3828, 3827, 3820, 3819, 3604, 3607, 3608, 3609, 481, 480, 489, 490, 279, 272, 271, 264, 3831, 3824, 3823, 3816, 3605, 3606, 3615, 3614, 478, 479, 470, 469, 280, 283, 
262, 263, 3832, 3833, 3812, 3815, 3626, 3625, 3616, 3617, 473, 472, 471, 468, 281, 282, 257, 256, 3839, 3838, 3813, 3814, 3627, 3624, 3623, 3622, 436, 437, 394, 395, 372, 373, 330, 331, 3764, 3765, 3722, 3723, 3700, 3701, 3658, 3659, 435, 434, 397, 396, 371, 370, 333, 332, 3763, 3762, 3725, 3724, 3699, 3698, 3661, 3660, 428, 429, 402, 403, 364, 365, 338, 339, 3756, 3757, 3730, 3731, 3692, 3693, 3666, 3667, 427, 426, 405, 404, 363, 362, 341, 340, 3755, 3754, 3733, 3732, 3691, 3690, 3669, 3668, 1728, 1731, 1788, 1791, 1792, 1795, 1852, 1855, 2240, 2243, 2300, 2303, 2304, 2307, 2364, 2367, 1729, 1730, 1789, 1790, 1793, 1794, 1853, 1854, 2241, 2242, 2301, 2302, 2305, 2306, 2365, 2366, 1754, 1757, 1762, 1765, 1818, 1821, 1826, 1829, 2266, 2269, 2274, 2277, 2330, 2333, 2338, 2341, 1753, 1758, 1761, 1766, 1817, 1822, 1825, 1830, 2265, 2270, 2273, 2278, 2329, 2334, 2337, 2342, 1588, 1587, 1580, 1579, 2004, 2003, 1996, 1995, 2100, 2099, 2092, 2091, 2516, 2515, 2508, 2507, 1591, 1584, 1583, 1576, 2007, 2000, 1999, 1992, 2103, 2096, 2095, 2088, 2519, 2512, 2511, 2504, 1592, 1593, 1572, 1575, 2008, 2011, 1990, 1991, 2104, 2105, 2084, 2087, 2520, 2523, 2502, 2503, 1599, 1598, 1573, 1574, 2009, 2010, 1985, 1984, 2111, 2110, 2085, 2086, 2521, 2522, 2497, 2496, 485, 484, 495, 492, 277, 274, 269, 266, 3829, 3826, 3821, 3818, 3603, 3600, 3611, 3610, 482, 483, 494, 493, 278, 273, 270, 265, 3830, 3825, 3822, 3817, 3602, 3601, 3612, 3613, 477, 476, 465, 466, 287, 284, 261, 260, 3835, 3834, 3811, 3808, 3629, 3630, 3619, 3618, 474, 475, 464, 467, 286, 285, 258, 259, 3836, 3837, 3810, 3809, 3628, 3631, 3620, 3621, 439, 438, 393, 392, 375, 374, 329, 328, 3767, 3766, 3721, 3720, 3703, 3702, 3657, 3656, 432, 433, 398, 399, 368, 369, 334, 335, 3760, 3761, 3726, 3727, 3696, 3697, 3662, 3663, 431, 430, 401, 400, 367, 366, 337, 336, 3759, 3758, 3729, 3728, 3695, 3694, 3665, 3664, 424, 425, 406, 407, 360, 361, 342, 343, 3752, 3753, 3734, 3735, 3688, 3689, 3670, 3671, 1735, 1732, 1787, 1784, 1799, 
1796, 1851, 1848, 2247, 2244, 2299, 2296, 2311, 2308, 2363, 2360, 1734, 1733, 1786, 1785, 1798, 1797, 1850, 1849, 2246, 2245, 2298, 2297, 2310, 2309, 2362, 2361, 1755, 1756, 1763, 1764, 1819, 1820, 1827, 1828, 2267, 2268, 2275, 2276, 2331, 2332, 2339, 2340, 1752, 1759, 1760, 1767, 1816, 1823, 1824, 1831, 2264, 2271, 2272, 2279, 2328, 2335, 2336, 2343, 1589, 1586, 1581, 1578, 2005, 2002, 1997, 1994, 2101, 2098, 2093, 2090, 2517, 2514, 2509, 2506, 1590, 1585, 1582, 1577, 2006, 2001, 1998, 1993, 2102, 2097, 2094, 2089, 2518, 2513, 2510, 2505, 1595, 1594, 1571, 1568, 2015, 2012, 1989, 1988, 2107, 2106, 2083, 2080, 2527, 2524, 2501, 2500, 1596, 1597, 1570, 1569, 2014, 2013, 1986, 1987, 2108, 2109, 2082, 2081, 2526, 2525, 2498, 2499, 510, 505, 496, 499, 298, 301, 306, 309, 3786, 3789, 3794, 3797, 3596, 3599, 3590, 3585, 509, 506, 497, 498, 297, 302, 305, 310, 3785, 3790, 3793, 3798, 3597, 3598, 3589, 3586, 450, 453, 462, 461, 288, 291, 314, 315, 3780, 3781, 3804, 3807, 3634, 3633, 3642, 3645, 449, 454, 463, 460, 289, 290, 317, 316, 3779, 3778, 3805, 3806, 3635, 3632, 3641, 3646, 440, 443, 388, 391, 376, 379, 324, 327, 3768, 3771, 3716, 3719, 3704, 3707, 3652, 3655, 441, 442, 389, 390, 377, 378, 325, 326, 3769, 3770, 3717, 3718, 3705, 3706, 3653, 3654, 420, 419, 412, 411, 356, 355, 348, 347, 3748, 3747, 3740, 3739, 3684, 3683, 3676, 3675, 423, 416, 415, 408, 359, 352, 351, 344, 3751, 3744, 3743, 3736, 3687, 3680, 3679, 3672, 1736, 1737, 1782, 1783, 1800, 1801, 1846, 1847, 2248, 2249, 2294, 2295, 2312, 2313, 2358, 2359, 1743, 1742, 1777, 1776, 1807, 1806, 1841, 1840, 2255, 2254, 2289, 2288, 2319, 2318, 2353, 2352, 1744, 1745, 1774, 1775, 1808, 1809, 1838, 1839, 2256, 2257, 2286, 2287, 2320, 2321, 2350, 2351, 1751, 1750, 1769, 1768, 1815, 1814, 1833, 1832, 2263, 2262, 2281, 2280, 2327, 2326, 2345, 2344, 1546, 1549, 1554, 1557, 2026, 2029, 2034, 2037, 2058, 2061, 2066, 2069, 2538, 2541, 2546, 2549, 1545, 1550, 1553, 1558, 2025, 2030, 2033, 2038, 2057, 2062, 2065, 2070, 2537, 
2542, 2545, 2550, 1540, 1541, 1564, 1567, 2016, 2019, 2042, 2043, 2052, 2053, 2076, 2079, 2528, 2531, 2554, 2555, 1539, 1538, 1565, 1566, 2017, 2018, 2045, 2044, 2051, 2050, 2077, 2078, 2529, 2530, 2557, 2556, 511, 504, 503, 500, 299, 300, 307, 308, 3787, 3788, 3795, 3796, 3595, 3592, 3591, 3584, 508, 507, 502, 501, 296, 303, 304, 311, 3784, 3791, 3792, 3799, 3594, 3593, 3588, 3587, 451, 452, 457, 458, 295, 292, 313, 312, 3783, 3782, 3803, 3800, 3637, 3638, 3643, 3644, 448, 455, 456, 459, 294, 293, 318, 319, 3776, 3777, 3802, 3801, 3636, 3639, 3640, 3647, 447, 444, 387, 384, 383, 380, 323, 320, 3775, 3772, 3715, 3712, 3711, 3708, 3651, 3648, 446, 445, 386, 385, 382, 381, 322, 321, 3774, 3773, 3714, 3713, 3710, 3709, 3650, 3649, 421, 418, 413, 410, 357, 354, 349, 346, 3749, 3746, 3741, 3738, 3685, 3682, 3677, 3674, 422, 417, 414, 409, 358, 353, 350, 345, 3750, 3745, 3742, 3737, 3686, 3681, 3678, 3673, 1739, 1738, 1781, 1780, 1803, 1802, 1845, 1844, 2251, 2250, 2293, 2292, 2315, 2314, 2357, 2356, 1740, 1741, 1778, 1779, 1804, 1805, 1842, 1843, 2252, 2253, 2290, 2291, 2316, 2317, 2354, 2355, 1747, 1746, 1773, 1772, 1811, 1810, 1837, 1836, 2259, 2258, 2285, 2284, 2323, 2322, 2349, 2348, 1748, 1749, 1770, 1771, 1812, 1813, 1834, 1835, 2260, 2261, 2282, 2283, 2324, 2325, 2346, 2347, 1547, 1548, 1555, 1556, 2027, 2028, 2035, 2036, 2059, 2060, 2067, 2068, 2539, 2540, 2547, 2548, 1544, 1551, 1552, 1559, 2024, 2031, 2032, 2039, 2056, 2063, 2064, 2071, 2536, 2543, 2544, 2551, 1543, 1542, 1563, 1560, 2023, 2020, 2041, 2040, 2055, 2054, 2075, 2072, 2535, 2532, 2553, 2552, 1536, 1537, 1562, 1561, 2022, 2021, 2046, 2047, 2048, 2049, 2074, 2073, 2534, 2533, 2558, 2559, 512, 515, 572, 575, 576, 577, 602, 601, 3494, 3493, 3518, 3519, 3520, 3523, 3580, 3583, 513, 514, 573, 574, 583, 582, 603, 600, 3495, 3492, 3513, 3512, 3521, 3522, 3581, 3582, 538, 541, 546, 549, 584, 591, 592, 599, 3496, 3503, 3504, 3511, 3546, 3549, 3554, 3557, 537, 542, 545, 550, 587, 588, 595, 596, 3499, 3500, 
3507, 3508, 3545, 3550, 3553, 3558, 998, 993, 990, 985, 948, 947, 940, 939, 3156, 3155, 3148, 3147, 3110, 3105, 3102, 3097, 997, 994, 989, 986, 951, 944, 943, 936, 3159, 3152, 3151, 3144, 3109, 3106, 3101, 3098, 1022, 1021, 962, 961, 952, 953, 932, 935, 3160, 3163, 3142, 3143, 3134, 3133, 3074, 3073, 1023, 1020, 963, 960, 959, 958, 933, 934, 3161, 3162, 3137, 3136, 3135, 3132, 3075, 3072, 1024, 1027, 1084, 1087, 1088, 1089, 1114, 1113, 2982, 2981, 3006, 3007, 3008, 3011, 3068, 3071, 1025, 1026, 1085, 1086, 1095, 1094, 1115, 1112, 2983, 2980, 3001, 3000, 3009, 3010, 3069, 3070, 1050, 1053, 1058, 1061, 1096, 1103, 1104, 1111, 2984, 2991, 2992, 2999, 3034, 3037, 3042, 3045, 1049, 1054, 1057, 1062, 1099, 1100, 1107, 1108, 2987, 2988, 2995, 2996, 3033, 3038, 3041, 3046, 1510, 1505, 1502, 1497, 1460, 1459, 1452, 1451, 2644, 2643, 2636, 2635, 2598, 2593, 2590, 2585, 1509, 1506, 1501, 1498, 1463, 1456, 1455, 1448, 2647, 2640, 2639, 2632, 2597, 2594, 2589, 2586, 1534, 1533, 1474, 1473, 1464, 1465, 1444, 1447, 2648, 2651, 2630, 2631, 2622, 2621, 2562, 2561, 1535, 1532, 1475, 1472, 1471, 1470, 1445, 1446, 2649, 2650, 2625, 2624, 2623, 2620, 2563, 2560, 519, 516, 571, 568, 579, 578, 605, 606, 3489, 3490, 3517, 3516, 3527, 3524, 3579, 3576, 518, 517, 570, 569, 580, 581, 604, 607, 3488, 3491, 3514, 3515, 3526, 3525, 3578, 3577, 539, 540, 547, 548, 585, 590, 593, 598, 3497, 3502, 3505, 3510, 3547, 3548, 3555, 3556, 536, 543, 544, 551, 586, 589, 594, 597, 3498, 3501, 3506, 3509, 3544, 3551, 3552, 3559, 999, 992, 991, 984, 949, 946, 941, 938, 3157, 3154, 3149, 3146, 3111, 3104, 3103, 3096, 996, 995, 988, 987, 950, 945, 942, 937, 3158, 3153, 3150, 3145, 3108, 3107, 3100, 3099, 1017, 1018, 965, 966, 955, 954, 931, 928, 3167, 3164, 3141, 3140, 3129, 3130, 3077, 3078, 1016, 1019, 964, 967, 956, 957, 930, 929, 3166, 3165, 3138, 3139, 3128, 3131, 3076, 3079, 1031, 1028, 1083, 1080, 1091, 1090, 1117, 1118, 2977, 2978, 3005, 3004, 3015, 3012, 3067, 3064, 1030, 1029, 1082, 1081, 1092, 1093, 
1116, 1119, 2976, 2979, 3002, 3003, 3014, 3013, 3066, 3065, 1051, 1052, 1059, 1060, 1097, 1102, 1105, 1110, 2985, 2990, 2993, 2998, 3035, 3036, 3043, 3044, 1048, 1055, 1056, 1063, 1098, 1101, 1106, 1109, 2986, 2989, 2994, 2997, 3032, 3039, 3040, 3047, 1511, 1504, 1503, 1496, 1461, 1458, 1453, 1450, 2645, 2642, 2637, 2634, 2599, 2592, 2591, 2584, 1508, 1507, 1500, 1499, 1462, 1457, 1454, 1449, 2646, 2641, 2638, 2633, 2596, 2595, 2588, 2587, 1529, 1530, 1477, 1478, 1467, 1466, 1443, 1440, 2655, 2652, 2629, 2628, 2617, 2618, 2565, 2566, 1528, 1531, 1476, 1479, 1468, 1469, 1442, 1441, 2654, 2653, 2626, 2627, 2616, 2619, 2564, 2567, 520, 521, 566, 567, 636, 637, 610, 609, 3486, 3485, 3458, 3459, 3528, 3529, 3574, 3575, 527, 526, 561, 560, 635, 634, 611, 608, 3487, 3484, 3461, 3460, 3535, 3534, 3569, 3568, 528, 529, 558, 559, 630, 625, 622, 617, 3478, 3473, 3470, 3465, 3536, 3537, 3566, 3567, 535, 534, 553, 552, 629, 626, 621, 618, 3477, 3474, 3469, 3466, 3543, 3542, 3561, 3560, 1000, 1001, 982, 983, 906, 909, 914, 917, 3178, 3181, 3186, 3189, 3112, 3113, 3094, 3095, 1007, 1006, 977, 976, 905, 910, 913, 918, 3177, 3182, 3185, 3190, 3119, 3118, 3089, 3088, 1008, 1009, 974, 975, 900, 901, 924, 927, 3168, 3171, 3194, 3195, 3120, 3121, 3086, 3087, 1015, 1014, 969, 968, 899, 898, 925, 926, 3169, 3170, 3197, 3196, 3127, 3126, 3081, 3080, 1032, 1033, 1078, 1079, 1148, 1149, 1122, 1121, 2974, 2973, 2946, 2947, 3016, 3017, 3062, 3063, 1039, 1038, 1073, 1072, 1147, 1146, 1123, 1120, 2975, 2972, 2949, 2948, 3023, 3022, 3057, 3056, 1040, 1041, 1070, 1071, 1142, 1137, 1134, 1129, 2966, 2961, 2958, 2953, 3024, 3025, 3054, 3055, 1047, 1046, 1065, 1064, 1141, 1138, 1133, 1130, 2965, 2962, 2957, 2954, 3031, 3030, 3049, 3048, 1512, 1513, 1494, 1495, 1418, 1421, 1426, 1429, 2666, 2669, 2674, 2677, 2600, 2601, 2582, 2583, 1519, 1518, 1489, 1488, 1417, 1422, 1425, 1430, 2665, 2670, 2673, 2678, 2607, 2606, 2577, 2576, 1520, 1521, 1486, 1487, 1412, 1413, 1436, 1439, 2656, 2659, 2682, 2683, 
2608, 2609, 2574, 2575, 1527, 1526, 1481, 1480, 1411, 1410, 1437, 1438, 2657, 2658, 2685, 2684, 2615, 2614, 2569, 2568, 523, 522, 565, 564, 639, 638, 613, 614, 3481, 3482, 3457, 3456, 3531, 3530, 3573, 3572, 524, 525, 562, 563, 632, 633, 612, 615, 3480, 3483, 3462, 3463, 3532, 3533, 3570, 3571, 531, 530, 557, 556, 631, 624, 623, 616, 3479, 3472, 3471, 3464, 3539, 3538, 3565, 3564, 532, 533, 554, 555, 628, 627, 620, 619, 3476, 3475, 3468, 3467, 3540, 3541, 3562, 3563, 1003, 1002, 981, 980, 907, 908, 915, 916, 3179, 3180, 3187, 3188, 3115, 3114, 3093, 3092, 1004, 1005, 978, 979, 904, 911, 912, 919, 3176, 3183, 3184, 3191, 3116, 3117, 3090, 3091, 1011, 1010, 973, 972, 903, 902, 923, 920, 3175, 3172, 3193, 3192, 3123, 3122, 3085, 3084, 1012, 1013, 970, 971, 896, 897, 922, 921, 3174, 3173, 3198, 3199, 3124, 3125, 3082, 3083, 1035, 1034, 1077, 1076, 1151, 1150, 1125, 1126, 2969, 2970, 2945, 2944, 3019, 3018, 3061, 3060, 1036, 1037, 1074, 1075, 1144, 1145, 1124, 1127, 2968, 2971, 2950, 2951, 3020, 3021, 3058, 3059, 1043, 1042, 1069, 1068, 1143, 1136, 1135, 1128, 2967, 2960, 2959, 2952, 3027, 3026, 3053, 3052, 1044, 1045, 1066, 1067, 1140, 1139, 1132, 1131, 2964, 2963, 2956, 2955, 3028, 3029, 3050, 3051, 1515, 1514, 1493, 1492, 1419, 1420, 1427, 1428, 2667, 2668, 2675, 2676, 2603, 2602, 2581, 2580, 1516, 1517, 1490, 1491, 1416, 1423, 1424, 1431, 2664, 2671, 2672, 2679, 2604, 2605, 2578, 2579, 1523, 1522, 1485, 1484, 1415, 1414, 1435, 1432, 2663, 2660, 2681, 2680, 2611, 2610, 2573, 2572, 1524, 1525, 1482, 1483, 1408, 1409, 1434, 1433, 2662, 2661, 2686, 2687, 2612, 2613, 2570, 2571, 724, 727, 728, 729, 640, 641, 666, 665, 3430, 3429, 3454, 3455, 3366, 3367, 3368, 3371, 725, 726, 735, 734, 647, 646, 667, 664, 3431, 3428, 3449, 3448, 3361, 3360, 3369, 3370, 746, 745, 736, 737, 648, 655, 656, 663, 3432, 3439, 3440, 3447, 3358, 3359, 3350, 3349, 747, 744, 743, 742, 651, 652, 659, 660, 3435, 3436, 3443, 3444, 3353, 3352, 3351, 3348, 788, 791, 792, 793, 884, 883, 876, 875, 3220, 
3219, 3212, 3211, 3302, 3303, 3304, 3307, 789, 790, 799, 798, 887, 880, 879, 872, 3223, 3216, 3215, 3208, 3297, 3296, 3305, 3306, 810, 809, 800, 801, 888, 889, 868, 871, 3224, 3227, 3206, 3207, 3294, 3295, 3286, 3285, 811, 808, 807, 806, 895, 894, 869, 870, 3225, 3226, 3201, 3200, 3289, 3288, 3287, 3284, 1236, 1239, 1240, 1241, 1152, 1153, 1178, 1177, 2918, 2917, 2942, 2943, 2854, 2855, 2856, 2859, 1237, 1238, 1247, 1246, 1159, 1158, 1179, 1176, 2919, 2916, 2937, 2936, 2849, 2848, 2857, 2858, 1258, 1257, 1248, 1249, 1160, 1167, 1168, 1175, 2920, 2927, 2928, 2935, 2846, 2847, 2838, 2837, 1259, 1256, 1255, 1254, 1163, 1164, 1171, 1172, 2923, 2924, 2931, 2932, 2841, 2840, 2839, 2836, 1300, 1303, 1304, 1305, 1396, 1395, 1388, 1387, 2708, 2707, 2700, 2699, 2790, 2791, 2792, 2795, 1301, 1302, 1311, 1310, 1399, 1392, 1391, 1384, 2711, 2704, 2703, 2696, 2785, 2784, 2793, 2794, 1322, 1321, 1312, 1313, 1400, 1401, 1380, 1383, 2712, 2715, 2694, 2695, 2782, 2783, 2774, 2773, 1323, 1320, 1319, 1318, 1407, 1406, 1381, 1382, 2713, 2714, 2689, 2688, 2777, 2776, 2775, 2772, 723, 720, 731, 730, 643, 642, 669, 670, 3425, 3426, 3453, 3452, 3365, 3364, 3375, 3372, 722, 721, 732, 733, 644, 645, 668, 671, 3424, 3427, 3450, 3451, 3362, 3363, 3374, 3373, 749, 750, 739, 738, 649, 654, 657, 662, 3433, 3438, 3441, 3446, 3357, 3356, 3345, 3346, 748, 751, 740, 741, 650, 653, 658, 661, 3434, 3437, 3442, 3445, 3354, 3355, 3344, 3347, 787, 784, 795, 794, 885, 882, 877, 874, 3221, 3218, 3213, 3210, 3301, 3300, 3311, 3308, 786, 785, 796, 797, 886, 881, 878, 873, 3222, 3217, 3214, 3209, 3298, 3299, 3310, 3309, 813, 814, 803, 802, 891, 890, 867, 864, 3231, 3228, 3205, 3204, 3293, 3292, 3281, 3282, 812, 815, 804, 805, 892, 893, 866, 865, 3230, 3229, 3202, 3203, 3290, 3291, 3280, 3283, 1235, 1232, 1243, 1242, 1155, 1154, 1181, 1182, 2913, 2914, 2941, 2940, 2853, 2852, 2863, 2860, 1234, 1233, 1244, 1245, 1156, 1157, 1180, 1183, 2912, 2915, 2938, 2939, 2850, 2851, 2862, 2861, 1261, 1262, 1251, 1250, 1161, 
1166, 1169, 1174, 2921, 2926, 2929, 2934, 2845, 2844, 2833, 2834, 1260, 1263, 1252, 1253, 1162, 1165, 1170, 1173, 2922, 2925, 2930, 2933, 2842, 2843, 2832, 2835, 1299, 1296, 1307, 1306, 1397, 1394, 1389, 1386, 2709, 2706, 2701, 2698, 2789, 2788, 2799, 2796, 1298, 1297, 1308, 1309, 1398, 1393, 1390, 1385, 2710, 2705, 2702, 2697, 2786, 2787, 2798, 2797, 1325, 1326, 1315, 1314, 1403, 1402, 1379, 1376, 2719, 2716, 2693, 2692, 2781, 2780, 2769, 2770, 1324, 1327, 1316, 1317, 1404, 1405, 1378, 1377, 2718, 2717, 2690, 2691, 2778, 2779, 2768, 2771, 716, 719, 710, 705, 700, 701, 674, 673, 3422, 3421, 3394, 3395, 3390, 3385, 3376, 3379, 717, 718, 709, 706, 699, 698, 675, 672, 3423, 3420, 3397, 3396, 3389, 3386, 3377, 3378, 754, 753, 762, 765, 694, 689, 686, 681, 3414, 3409, 3406, 3401, 3330, 3333, 3342, 3341, 755, 752, 761, 766, 693, 690, 685, 682, 3413, 3410, 3405, 3402, 3329, 3334, 3343, 3340, 780, 783, 774, 769, 842, 845, 850, 853, 3242, 3245, 3250, 3253, 3326, 3321, 3312, 3315, 781, 782, 773, 770, 841, 846, 849, 854, 3241, 3246, 3249, 3254, 3325, 3322, 3313, 3314, 818, 817, 826, 829, 836, 837, 860, 863, 3232, 3235, 3258, 3259, 3266, 3269, 3278, 3277, 819, 816, 825, 830, 835, 834, 861, 862, 3233, 3234, 3261, 3260, 3265, 3270, 3279, 3276, 1228, 1231, 1222, 1217, 1212, 1213, 1186, 1185, 2910, 2909, 2882, 2883, 2878, 2873, 2864, 2867, 1229, 1230, 1221, 1218, 1211, 1210, 1187, 1184, 2911, 2908, 2885, 2884, 2877, 2874, 2865, 2866, 1266, 1265, 1274, 1277, 1206, 1201, 1198, 1193, 2902, 2897, 2894, 2889, 2818, 2821, 2830, 2829, 1267, 1264, 1273, 1278, 1205, 1202, 1197, 1194, 2901, 2898, 2893, 2890, 2817, 2822, 2831, 2828, 1292, 1295, 1286, 1281, 1354, 1357, 1362, 1365, 2730, 2733, 2738, 2741, 2814, 2809, 2800, 2803, 1293, 1294, 1285, 1282, 1353, 1358, 1361, 1366, 2729, 2734, 2737, 2742, 2813, 2810, 2801, 2802, 1330, 1329, 1338, 1341, 1348, 1349, 1372, 1375, 2720, 2723, 2746, 2747, 2754, 2757, 2766, 2765, 1331, 1328, 1337, 1342, 1347, 1346, 1373, 1374, 2721, 2722, 2749, 2748, 2753, 
2758, 2767, 2764, 715, 712, 711, 704, 703, 702, 677, 678, 3417, 3418, 3393, 3392, 3391, 3384, 3383, 3380, 714, 713, 708, 707, 696, 697, 676, 679, 3416, 3419, 3398, 3399, 3388, 3387, 3382, 3381, 757, 758, 763, 764, 695, 688, 687, 680, 3415, 3408, 3407, 3400, 3331, 3332, 3337, 3338, 756, 759, 760, 767, 692, 691, 684, 683, 3412, 3411, 3404, 3403, 3328, 3335, 3336, 3339, 779, 776, 775, 768, 843, 844, 851, 852, 3243, 3244, 3251, 3252, 3327, 3320, 3319, 3316, 778, 777, 772, 771, 840, 847, 848, 855, 3240, 3247, 3248, 3255, 3324, 3323, 3318, 3317, 821, 822, 827, 828, 839, 838, 859, 856, 3239, 3236, 3257, 3256, 3267, 3268, 3273, 3274, 820, 823, 824, 831, 832, 833, 858, 857, 3238, 3237, 3262, 3263, 3264, 3271, 3272, 3275, 1227, 1224, 1223, 1216, 1215, 1214, 1189, 1190, 2905, 2906, 2881, 2880, 2879, 2872, 2871, 2868, 1226, 1225, 1220, 1219, 1208, 1209, 1188, 1191, 2904, 2907, 2886, 2887, 2876, 2875, 2870, 2869, 1269, 1270, 1275, 1276, 1207, 1200, 1199, 1192, 2903, 2896, 2895, 2888, 2819, 2820, 2825, 2826, 1268, 1271, 1272, 1279, 1204, 1203, 1196, 1195, 2900, 2899, 2892, 2891, 2816, 2823, 2824, 2827, 1291, 1288, 1287, 1280, 1355, 1356, 1363, 1364, 2731, 2732, 2739, 2740, 2815, 2808, 2807, 2804, 1290, 1289, 1284, 1283, 1352, 1359, 1360, 1367, 2728, 2735, 2736, 2743, 2812, 2811, 2806, 2805, 1333, 1334, 1339, 1340, 1351, 1350, 1371, 1368, 2727, 2724, 2745, 2744, 2755, 2756, 2761, 2762, 1332, 1335, 1336, 1343, 1344, 1345, 1370, 1369, 2726, 2725, 2750, 2751, 2752, 2759, 2760, 2763};
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/gtests.cpp b/libNeonDomain/tests/domain-space-filling-curves/src/gtests.cpp
new file mode 100644
index 00000000..954d4ecd
--- /dev/null
+++ b/libNeonDomain/tests/domain-space-filling-curves/src/gtests.cpp
@@ -0,0 +1,116 @@
+
+#include "Neon/Neon.h"
+#include "Neon/domain/tools/SpaceCurves.h"
+#include "domain-space-filling-curves.h"
+#include "goldenEncoding.h"
+#include "gtest/gtest.h"
+#include "runHelper.h"
+
+TEST(domain_space_filling_curves, morton)
+{
+    // Each cell's Morton code must equal the golden entry stored at the sweep code of (z, y, x).
+    using namespace Neon::domain::tool::spaceCurves;
+    Neon::int32_3d gridDim = {16, 16, 16};
+    for (int x = 0; x < gridDim.x; ++x) {
+        for (int y = 0; y < gridDim.y; ++y) {
+            for (int z = 0; z < gridDim.z; ++z) {
+                Neon::int32_3d cell = {x, y, z};
+                auto           code = Encoder::encode(EncoderType::morton, gridDim, cell);
+                auto           goldenSlot = Encoder::encode(EncoderType::sweep, gridDim, {z, y, x});
+                ASSERT_EQ(morton_grid_16_16_16[goldenSlot], code) << gridDim << " " << cell << " " << code;
+            }
+        }
+    }
+}
+
+TEST(domain_space_filling_curves, hilbert)
+{
+    // Walks a 16^3 grid and compares each cell's Hilbert code with the golden table.
+    // The table slot of a cell is the sweep code of its swapped coordinates (z, y, x).
+    using namespace Neon::domain::tool::spaceCurves;
+    Neon::int32_3d gridDim = {16, 16, 16};
+    for (int x = 0; x < gridDim.x; ++x) {
+        for (int y = 0; y < gridDim.y; ++y) {
+            for (int z = 0; z < gridDim.z; ++z) {
+                Neon::int32_3d cell = {x, y, z};
+                auto           code = Encoder::encode(EncoderType::hilbert, gridDim, cell);
+                auto           goldenSlot = Encoder::encode(EncoderType::sweep, gridDim, {z, y, x});
+                ASSERT_EQ(hilbert_grid_16_16_16[goldenSlot], code) << gridDim << " " << cell << " " << code;
+            }
+        }
+    }
+}
+
+TEST(domain_space_filling_curves, hilbert_hilbert)
+{
+    auto run = [](Neon::domain::tool::spaceCurves::EncoderType encodingType, int dimEdge) {
+        // Builds a dimEdge^3 eGrid with encodingType and checks each cell's internal id against the encoder.
+        Neon::init();
+        // Step 1 -> Neon backend: OpenMP runtime
+        auto runtime = Neon::Runtime::openmp;
+        // Only device id 0 is listed, so no device is overbooked
+        std::vector<int> devIds{0};
+        Neon::Backend    backend(devIds, runtime);
+
+        // Step 2 -> Neon grid: setting up a dense cartesian domain
+        Neon::index_3d dim(dimEdge, dimEdge, dimEdge);  // Size of the domain
+
+        using Grid = Neon::eGrid;  // Selecting one of the grids provided by Neon
+        Neon::domain::Stencil gradStencil([] {
+            // Six-point stencil handed to the grid constructor:
+            // the three positive unit offsets along x, y and z,
+            // followed by the three negative ones.
+            // The container below only touches the cell itself;
+            // it never reads a neighbour through this stencil.
+            return std::vector<Neon::index_3d>{
+                {1, 0, 0},
+                {0, 1, 0},
+                {0, 0, 1},
+                {-1, 0, 0},
+                {0, -1, 0},
+                {0, 0, -1}};
+        }());
+        // Actual Neon grid allocation
+        Grid grid(
+            backend,
+            dim,
+            [&](const Neon::index_3d&) -> bool {
+                return true;
+            },  // <- every cell of the domain is active
+            gradStencil,
+            1.0,
+            0.0, encodingType);
+
+        auto field = grid.newField<int>("spaceCode", 1, 0);
+
+        grid.newContainer<Neon::Execution::host>("DecoceFromId",
+                                                 [&](Neon::set::Loader& l) {
+                                                     auto f = l.load(field);
+                                                     return [=] NEON_CUDA_HOST_DEVICE(const Grid::Idx& gidx) mutable {
+                                                         auto internalId = gidx.helpGet();
+                                                         auto global = f.getGlobalIndex(gidx);
#pragma omp critical
+                                                         {
+                                                             using namespace Neon::domain::tool::spaceCurves;
+                                                             auto encoded = Encoder::encode(encodingType, dim, global);
+                                                             // std::cout << global << " -> internal " << internalId << " code " << encoded << std::endl;
+                                                             EXPECT_EQ(internalId, encoded);
+                                                         }
+                                                         f(gidx, 0) = internalId;
+                                                     };
+                                                 })
+            .run(Neon::Backend::mainStreamIdx);
+        field.ioToVtk("DecoceFromId", "grad");  // NOTE(review): "DecoceFromId" looks like a typo of "DecodeFromId"; string left as is
+        printf("DONE\n");
+    };
+    run(Neon::domain::tool::spaceCurves::EncoderType::sweep, 32);  // NOTE: despite the test name, all three curve types are run
+    run(Neon::domain::tool::spaceCurves::EncoderType::morton,32);
+    run(Neon::domain::tool::spaceCurves::EncoderType::hilbert,32);
+}
+
+int main(int argc, char** argv)
+{
+    ::testing::InitGoogleTest(&argc, argv);  // GoogleTest parses its command-line flags first
+    Neon::init();                            // Neon is initialised once before any test runs
+    return RUN_ALL_TESTS();
+}
diff --git a/libNeonDomain/tests/domain-space-filling-curves/src/runHelper.h b/libNeonDomain/tests/domain-space-filling-curves/src/runHelper.h
new file mode 100644
index 00000000..993bce70
--- /dev/null
+++ b/libNeonDomain/tests/domain-space-filling-curves/src/runHelper.h
@@ -0,0 +1,100 @@
+#pragma once
+#include <map>
+#include "gtest/gtest.h"
+
+#include "Neon/core/core.h"
+#include "Neon/core/tools/io/ioToVti.h"
+#include "Neon/core/types/DataUse.h"
+#include "Neon/core/types/DeviceType.h"
+
+#include "Neon/domain/dGrid.h"
+#include "Neon/domain/details/dGridSoA/dGridSoA.h"
+#include "Neon/domain/eGrid.h"
+#include "Neon/domain/tools/Geometries.h"
+#include "Neon/domain/tools/TestData.h"
+
+#include "gtest/gtest.h"
+
+using namespace Neon;
+using namespace Neon::domain;
+
+using namespace Neon::domain::tool::testing;
+using namespace Neon::domain::tool;
+
+/**
+ * Builds a TestData<G, T, C> for every combination of the parameter lists below
+ * (domain size, cardinality, geometry, device count, runtime, memory layout) and
+ * passes it to the callback. Each list currently holds exactly one value.
+ * A fresh Neon::Backend is created for each configuration.
+ *
+ * @tparam G,T,C template arguments of the TestData handed to the callback
+ * @param  f     callback invoked once per generated configuration
+ */
+template <typename G, typename T, int C>
+void runAllTestConfigurations(std::function<void(TestData<G, T, C>&)> f)
+{
+    const std::vector<int>                nGpuTest{1};
+    const std::vector<int>                cardinalityTest{1};
+    const std::vector<Neon::index_3d>     dimTest{{32, 32, 32}};
+    const std::vector<Neon::Runtime>      runtimeE{Neon::Runtime::openmp};
+    const std::vector<Neon::MemoryLayout> memoryLayoutOptions{Neon::MemoryLayout::structOfArrays};
+
+    // Full domain only, for every grid type; Geometry::Sphere and
+    // Geometry::HollowSphere are not exercised by this helper.
+    std::vector<Geometry> geos{Geometry::FullDomain};
+
+    // Upper bound used to wrap the device ids: the cardinality of the maximal
+    // device set when gpuSysObjStorage reports at least one device, otherwise 1.
+    // It does not depend on the loop variables, so it is computed once.
+    int maxnGPUs = 1;
+    if (Neon::sys::globalSpace::gpuSysObjStorage.numDevs() > 0) {
+        maxnGPUs = Neon::set::DevSet::maxSet().setCardinality();
+    }
+
+    for (auto dim : dimTest) {  // by value: dim.z may be enlarged below for bGrid
+        for (const auto& card : cardinalityTest) {
+            for (auto& geo : geos) {
+                for (const auto& ngpu : nGpuTest) {
+                    for (const auto& runtime : runtimeE) {
+                        for (const auto& memoryLayout : memoryLayoutOptions) {
+                            // Device ids wrap around when ngpu exceeds maxnGPUs.
+                            std::vector<int> ids;
+                            ids.reserve(ngpu);
+                            for (int i = 0; i < ngpu; i++) {
+                                ids.push_back(i % maxnGPUs);
+                            }
+
+                            // Backend and memory options for this configuration.
+                            Neon::Backend       backend(ids, runtime);
+                            Neon::MemoryOptions memoryOptions = backend.getMemoryOptions();
+                            memoryOptions.setOrder(memoryLayout);
+
+                            if constexpr (std::is_same_v<G, Neon::bGrid>) {
+                                // Enlarge z to at least 8 * 3 * ngpu.
+                                if (dim.z < 8 * ngpu * 3) {
+                                    dim.z = ngpu * 3 * 8;
+                                }
+                                // bGrid is not run with the array-of-structs layout.
+                                if (memoryLayout == Neon::MemoryLayout::arrayOfStructs) {
+                                    continue;
+                                }
+                            }
+
+                            // Only cardinality 1 is expected by this helper.
+                            assert(card == 1);
+                            TestData<G, T, C> testData(backend,
+                                                       dim,
+                                                       card,
+                                                       memoryOptions,
+                                                       geo);
+
+                            // Log the configuration, then hand it to the test body.
+                            NEON_INFO(testData.toString());
+                            f(testData);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/libNeonDomain/tests/domain-stencil/src/gtests.cpp b/libNeonDomain/tests/domain-stencil/src/gtests.cpp
index ec6f892a..9fed3354 100644
--- a/libNeonDomain/tests/domain-stencil/src/gtests.cpp
+++ b/libNeonDomain/tests/domain-stencil/src/gtests.cpp
@@ -4,29 +4,74 @@
 #include "runHelper.h"
 #include "stencil.h"
 
-TEST(domain_stencil, dGrid)
+TEST(domain_stencil, dGrid_NoTemplate)  // dGrid through map::runNoTemplate
 {
     int nGpus = 3;
     using Type = int64_t;
-    runAllTestConfiguration(std::function(map::run<Neon::dGrid, Type, 0>),
+    runAllTestConfiguration(std::function(map::runNoTemplate<Neon::dGrid, Type, 0>),
                             nGpus,
                             1);
 }
 
-TEST(domain_stencil, eGrid)
+TEST(domain_stencil, eGrid_NoTemplate)  // eGrid through map::runNoTemplate
 {
     int nGpus = 3;
     using Type = int64_t;
-    runAllTestConfiguration(std::function(map::run<Neon::eGrid, Type, 0>),
+    runAllTestConfiguration(std::function(map::runNoTemplate<Neon::eGrid, Type, 0>),
                             nGpus,
                             1);
 }
 
-TEST(domain_stencil, bGri )
+TEST(domain_stencil, bGrid_NoTemplate)  // bGrid through map::runNoTemplate
 {
     int nGpus = 5;
     using Type = int64_t;
-    runAllTestConfiguration(std::function(map::run<Neon::bGrid, Type, 0>),
+    runAllTestConfiguration(std::function(map::runNoTemplate<Neon::bGrid, Type, 0>),
+                            nGpus,
+                            1);
+}
+
+TEST(domain_stencil, dGridSoA_NoTemplate)
+{
+    // map::runNoTemplate on dGridSoA, driven by runAllTestConfiguration.
+    using Type = int64_t;
+    int nGpus = 5;
+    runAllTestConfiguration(
+        std::function(map::runNoTemplate<Neon::dGridSoA, Type, 0>), nGpus, 1);
+}
+
+TEST(domain_stencil, dGrid_Template)
+{
+    // map::runTemplate on dGrid, driven by runAllTestConfiguration.
+    using Type = int64_t;
+    int nGpus = 3;
+    runAllTestConfiguration(
+        std::function(map::runTemplate<Neon::dGrid, Type, 0>), nGpus, 1);
+}
+
+TEST(domain_stencil, eGrid_Template)
+{
+    // map::runTemplate on eGrid, driven by runAllTestConfiguration.
+    using Type = int64_t;
+    int nGpus = 3;
+    runAllTestConfiguration(
+        std::function(map::runTemplate<Neon::eGrid, Type, 0>), nGpus, 1);
+}
+
+TEST(domain_stencil, bGrid_Template)
+{
+    // map::runTemplate on bGrid, driven by runAllTestConfiguration.
+    using Type = int64_t;
+    int nGpus = 5;
+    runAllTestConfiguration(
+        std::function(map::runTemplate<Neon::bGrid, Type, 0>), nGpus, 1);
+}
+
+TEST(domain_stencil, dGridSoA_Template)  // dGridSoA through map::runTemplate
+{
+    int nGpus = 5;
+    using Type = int64_t;
+    runAllTestConfiguration(std::function(map::runTemplate<Neon::dGridSoA, Type, 0>),
                             nGpus,
                             1);
 }
diff --git a/libNeonDomain/tests/domain-stencil/src/runHelper.h b/libNeonDomain/tests/domain-stencil/src/runHelper.h
index e8f286ae..16cefb0f 100644
--- a/libNeonDomain/tests/domain-stencil/src/runHelper.h
+++ b/libNeonDomain/tests/domain-stencil/src/runHelper.h
@@ -33,7 +33,7 @@ void runAllTestConfiguration(
     // std::vector<int> nGpuTest{2,4,6,8};
     std::vector<int> cardinalityTest{1};
 
-    std::vector<Neon::index_3d> dimTest{{10, 17, 13}, {1, 1, 100}, {17, 1, 77}};
+    std::vector<Neon::index_3d> dimTest{{10, 17, 90}, {1, 1, 100}, {17, 1, 77}};
     std::vector<Neon::Runtime>  runtimeE{Neon::Runtime::openmp};
     if (Neon::sys::globalSpace::gpuSysObjStorage.numDevs() > 0) {
         runtimeE.push_back(Neon::Runtime::stream);
diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.cu b/libNeonDomain/tests/domain-stencil/src/stencil.cu
index a86f1def..6cd4f6ff 100644
--- a/libNeonDomain/tests/domain-stencil/src/stencil.cu
+++ b/libNeonDomain/tests/domain-stencil/src/stencil.cu
@@ -9,8 +9,8 @@
 namespace map {
 
 template <typename Field>
-auto stencilContainer_laplace(const Field& filedA,
-                              Field&       fieldB)
+auto laplaceNoTemplate(const Field& filedA,
+                       Field&       fieldB)
     -> Neon::set::Container
 {
     const auto& grid = filedA.getGrid();
@@ -59,20 +59,37 @@ static constexpr std::array<const Ngh3DIdx, 6> stencil{
     Ngh3DIdx(0, 0, 1),
     Ngh3DIdx(0, 0, -1)};
 
-template<int stencilIdx, typename IDX, typename Field>
-inline auto viaTemplate (const IDX& idx, int i, const Field& a, int& partial, int& count){
-        a.template getNghData<stencil[stencilIdx].x,
-                              stencil[stencilIdx].y,
-                              stencil[stencilIdx].z>(idx, i,
-                                                     [&](typename Field::Type const& val) {
-                                                         partial += val;
-                                                         count++;
-                                                     });
+template <int sIdx, typename IDX, typename Partition, typename Partial>
+NEON_CUDA_HOST_DEVICE inline auto viaTemplate(const IDX& idx, int i, const Partition& a, Partial& partial, int& count)
+{
+    // Adds the contribution of stencil direction sIdx for (idx, i): getNghData is
+    // instantiated with the compile-time offsets stencil[sIdx].{x,y,z} and handed a
+    // callback that accumulates the received value into `partial` and bumps `count`.
+    // NOTE(review): presumably the callback only fires for a valid neighbour, like
+    // an isValid() check on a run-time getNghData(idx, direction, i) result -
+    // confirm against the partition implementation.
+    a.template getNghData<stencil[sIdx].x,
+                          stencil[sIdx].y,
+                          stencil[sIdx].z>(idx, i,
+                                           [&](typename Partition::Type const& val) {
+                                               partial += val;
+                                               count++;
+                                           });
 };
 
+
+//template <auto Start, auto End, auto Inc, class F>
+//constexpr void constexpr_for(F&& f)
+//{
+//    if constexpr (Start < End) {
+//        f(std::integral_constant<decltype(Start), Start>());
+//        constexpr_for<Start + Inc, End, Inc>(f);
+//    }
+//}
+
 template <typename Field>
-auto stencilContainerLaplaceTemplate(const Field& filedA,
-                                     Field&       fieldB)
+auto laplaceTemplate(const Field& filedA,
+                     Field&       fieldB)
     -> Neon::set::Container
 {
     const auto& grid = filedA.getGrid();
@@ -88,35 +105,18 @@ auto stencilContainerLaplaceTemplate(const Field& filedA,
                     // printf("GPU %ld <- %ld + %ld\n", lc(e, i) , la(e, i) , val);
                     typename Field::Type partial = 0;
                     int                  count = 0;
+                    using Ngh3DIdx = Neon::int8_3d;
 
-                    constexpr std::array<const Ngh3DIdx, 6> stencil{
-                        Ngh3DIdx(1, 0, 0),
-                        Ngh3DIdx(-1, 0, 0),
-                        Ngh3DIdx(0, 1, 0),
-                        Ngh3DIdx(0, -1, 0),
-                        Ngh3DIdx(0, 0, 1),
-                        Ngh3DIdx(0, 0, -1)};
-
-#if 0
-                    auto viaTemplate = [&]<int stencilIdx>() {
-                        if constexpr (std::is_same_v<typename Field::Grid, Neon::dGrid>) {
-                            a.template getNghData<stencil[stencilIdx].x,
-                                                  stencil[stencilIdx].y,
-                                                  stencil[stencilIdx].z>(idx, i,
-                                                                         [&](Field::Type const& val) {
-                                                                             partial += val;
-                                                                             count++;
-                                                                         });
-                        }
-                    };
-#endif
-                    viaTemplate<0>(idx, i, a, partial, count);
-                    viaTemplate<1>(idx, i, a, partial, count);
-                    viaTemplate<2>(idx, i, a, partial, count);
-                    viaTemplate<3>(idx, i, a, partial, count);
-                    viaTemplate<4>(idx, i, a, partial, count);
-                    viaTemplate<5>(idx, i, a, partial, count);
-
+                    Neon::ConstexprFor<0, 6, 1>([&](auto sIdx) {
+                        a.template getNghData<stencil[sIdx].x,
+                                              stencil[sIdx].y,
+                                              stencil[sIdx].z>(idx, i,
+                                                               [&](auto const& val) {
+                                                                   partial += val;
+                                                                   count++;
+                                                               });
+                    });
+                    
                     b(idx, i) = a(idx, i) - count * partial;
                 }
             };
@@ -126,7 +126,82 @@ auto stencilContainerLaplaceTemplate(const Field& filedA,
 using namespace Neon::domain::tool::testing;
 
 template <typename G, typename T, int C>
-auto run(TestData<G, T, C>& data) -> void
+auto runNoTemplate(TestData<G, T, C>& data) -> void  // Runs laplaceNoTemplate X->Y->X on the grid and checks both fields against host golden data.
+{
+
+    using Type = typename TestData<G, T, C>::Type;
+    auto&             grid = data.getGrid();
+    const std::string appName = TestInformation::fullName(grid.getImplementationName());
+    const int         maxIters = 1;
+
+    NEON_INFO(grid.toString());
+
+    // data.resetValuesToLinear(1, 100);
+    data.resetValuesToMasked(1);
+
+    {  // NEON
+        const Neon::index_3d        dim = grid.getDimension();
+        std::vector<Neon::index_3d> elements;
+        auto                        bk = grid.getBackend();
+        auto&                       X = data.getField(FieldNames::X);
+        auto&                       Y = data.getField(FieldNames::Y);
+        for (int iter = maxIters; iter > 0; iter--) {  // each iteration: halo(X), X->Y, halo(Y), Y->X, with a stream sync before every step
+            bk.sync(Neon::Backend::mainStreamIdx);
+            X.newHaloUpdate(Neon::set::StencilSemantic::standard,
+                            Neon::set::TransferMode::put,
+                            Neon::Execution::device)
+                .run(Neon::Backend::mainStreamIdx);
+
+            bk.sync(Neon::Backend::mainStreamIdx);
+            laplaceNoTemplate(X, Y).run(Neon::Backend::mainStreamIdx);
+
+            bk.sync(Neon::Backend::mainStreamIdx);
+            Y.newHaloUpdate(Neon::set::StencilSemantic::standard,
+                            Neon::set::TransferMode::get,
+                            Neon::Execution::device)
+                .run(Neon::Backend::mainStreamIdx);
+
+            bk.sync(Neon::Backend::mainStreamIdx);
+            laplaceNoTemplate(Y, X).run(Neon::Backend::mainStreamIdx);
+        }
+        data.getBackend().sync(0);
+    }
+
+    {  // Golden data
+        auto& X = data.getIODomain(FieldNames::X);
+        auto& Y = data.getIODomain(FieldNames::Y);
+        for (int iter = maxIters; iter > 0; iter--) {  // same X->Y, Y->X sequence through data.laplace
+            data.laplace(X, Y);
+            data.laplace(Y, X);
+        }
+    }
+
+    data.updateHostData();
+
+    data.getField(FieldNames::X).ioToVtk("X", "X", true);
+    //    data.getField(FieldNames::Y).ioToVtk("Y", "Y", false);
+    //    data.getField(FieldNames::Z).ioToVtk("Z", "Z", false);
+    //
+    data.getIODomain(FieldNames::X).ioToVti("X_", "X_");
+    //    data.getField(FieldNames::Y).ioVtiAllocator("Y_");
+    //    data.getField(FieldNames::Z).ioVtiAllocator("Z_");
+
+    bool isOk = data.compare(FieldNames::X);
+    isOk = data.compare(FieldNames::Y) && isOk;  // combine both results: a mismatch on X must not be overwritten by a passing Y
+    if (!isOk) {  // on any mismatch, dump the per-field difference flags for inspection
+        auto flagField = data.compareAndGetField(FieldNames::X);
+        flagField.ioToVti("X_diffFlag", "X_diffFlag");
+        flagField = data.compareAndGetField(FieldNames::Y);
+        flagField.ioToVti("Y_diffFlag", "Y_diffFlag");
+    }
+    ASSERT_TRUE(isOk);
+    if (!isOk) {
+        exit(99);
+    }
+}
+
+template <typename G, typename T, int C>
+auto runTemplate(TestData<G, T, C>& data) -> void
 {
 
     using Type = typename TestData<G, T, C>::Type;
@@ -153,7 +228,7 @@ auto run(TestData<G, T, C>& data) -> void
                 .run(Neon::Backend::mainStreamIdx);
 
             bk.sync(Neon::Backend::mainStreamIdx);
-            stencilContainer_laplace(X, Y).run(Neon::Backend::mainStreamIdx);
+            laplaceTemplate(X, Y).run(Neon::Backend::mainStreamIdx);
 
             bk.sync(Neon::Backend::mainStreamIdx);
             Y.newHaloUpdate(Neon::set::StencilSemantic::standard,
@@ -162,7 +237,7 @@ auto run(TestData<G, T, C>& data) -> void
                 .run(Neon::Backend::mainStreamIdx);
 
             bk.sync(Neon::Backend::mainStreamIdx);
-            stencilContainer_laplace(Y, X).run(Neon::Backend::mainStreamIdx);
+            laplaceTemplate(Y, X).run(Neon::Backend::mainStreamIdx);
         }
         data.getBackend().sync(0);
     }
@@ -200,9 +275,14 @@ auto run(TestData<G, T, C>& data) -> void
     }
 }
 
-template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
-template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
-template auto run<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+template auto runNoTemplate<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
+template auto runNoTemplate<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+template auto runNoTemplate<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+template auto runNoTemplate<Neon::dGridSoA, int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
 
+template auto runTemplate<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
+template auto runTemplate<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+template auto runTemplate<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+template auto runTemplate<Neon::dGridSoA, int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
 
 }  // namespace map
\ No newline at end of file
diff --git a/libNeonDomain/tests/domain-stencil/src/stencil.h b/libNeonDomain/tests/domain-stencil/src/stencil.h
index a35d8011..456f5f01 100644
--- a/libNeonDomain/tests/domain-stencil/src/stencil.h
+++ b/libNeonDomain/tests/domain-stencil/src/stencil.h
@@ -11,9 +11,20 @@ namespace map {
 using namespace Neon::domain::tool::testing;
 
 template <typename G, typename T, int C>
-auto run(TestData<G, T, C>& data) -> void;
+auto runNoTemplate(TestData<G, T, C>& data) -> void;
 
-extern template auto run<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
-extern template auto run<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+template <typename G, typename T, int C>
+auto runTemplate(TestData<G, T, C>& data) -> void;
+
+
+extern template auto runNoTemplate<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
+extern template auto runNoTemplate<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+extern template auto runNoTemplate<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+extern template auto runNoTemplate<Neon::dGridSoA , int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
+
+extern template auto runTemplate<Neon::dGrid, int64_t, 0>(TestData<Neon::dGrid, int64_t, 0>&) -> void;
+extern template auto runTemplate<Neon::eGrid, int64_t, 0>(TestData<Neon::eGrid, int64_t, 0>&) -> void;
+extern template auto runTemplate<Neon::bGrid, int64_t, 0>(TestData<Neon::bGrid, int64_t, 0>&) -> void;
+extern template auto runTemplate<Neon::dGridSoA , int64_t, 0>(TestData<Neon::dGridSoA, int64_t, 0>&) -> void;
 
 }  // namespace map
diff --git a/libNeonSet/include/Neon/set/DevSet.h b/libNeonSet/include/Neon/set/DevSet.h
index 5ac38250..5e8b03b7 100644
--- a/libNeonSet/include/Neon/set/DevSet.h
+++ b/libNeonSet/include/Neon/set/DevSet.h
@@ -20,6 +20,7 @@
 #include "Neon/set/LambdaExecutor.h"
 #include "Neon/set/LaunchParameters.h"
 #include "Neon/set/Transfer.h"
+#include "Neon/set/container/CudaLaunchCompileTimeHints.h"
 #include "Neon/set/memory/memDevSet.h"
 #include "Neon/set/memory/memSet.h"
 #include "Neon/sys/global/GpuSysGlobal.h"
@@ -222,7 +223,9 @@ class DevSet
     auto newLaunchParameters() const
         -> LaunchParameters;
 
-    template <typename DataSetContainer, typename Lambda>
+    template <typename CudaLaunchCompileTimeHint,
+              typename DataSetContainer,
+              typename Lambda>
     inline auto launchLambdaOnSpan(
         Neon::Execution                       execution,
         const Neon::set::KernelConfig&        kernelConfig,
@@ -236,9 +239,11 @@ class DevSet
         switch (mode) {
             case Neon::Runtime::stream: {
                 if (execution == Neon::Execution::device) {
-                    this->template helpLaunchLambdaOnSpanCUDA<DataSetContainer, Lambda>(kernelConfig,
-                                                                                        dataSetContainer,
-                                                                                        lambdaHolder);
+                    this->template helpLaunchLambdaOnSpanCUDA<CudaLaunchCompileTimeHint,
+                                                              DataSetContainer,
+                                                              Lambda>(kernelConfig,
+                                                                      dataSetContainer,
+                                                                      lambdaHolder);
                     return;
                 }
 #if defined(NEON_OS_LINUX) || defined(NEON_OS_MAC)
@@ -352,7 +357,9 @@ class DevSet
         }
     }
 
-    template <typename DataSetContainer, typename Lambda>
+    template <typename CudaLaunchCompilerTimeHints,
+              typename DataSetContainer,
+              typename Lambda>
     inline auto helpLaunchLambdaOnSpanCUDA([[maybe_unused]] const Neon::set::KernelConfig&        kernelConfig,
                                            [[maybe_unused]] DataSetContainer&                     dataSetContainer,
                                            [[maybe_unused]] std::function<Lambda(SetIdx,
@@ -379,10 +386,20 @@ class DevSet
             Lambda lambda = lambdaHolder(setIdx.idx(), kernelConfig.dataView());
             void*  untypedParams[2] = {&iterator, &lambda};
             void*  executor;
-            if constexpr (!details::ExecutionThreadSpanUtils::isBlockSpan(DataSetContainer::executionThreadSpan)) {
-                executor = (void*)Neon::set::details::denseSpan::launchLambdaOnSpanCUDA<DataSetContainer, Lambda>;
-            } else {
-                executor = (void*)Neon::set::details::blockSpan::launchLambdaOnSpanCUDA<DataSetContainer, Lambda>;
+            if constexpr (!CudaLaunchCompilerTimeHints::initialized) {
+                if constexpr (!details::ExecutionThreadSpanUtils::isBlockSpan(DataSetContainer::executionThreadSpan)) {
+                    executor = (void*)Neon::set::details::denseSpan::launchLambdaOnSpanCUDA<DataSetContainer, Lambda>;
+                } else {
+                    executor = (void*)Neon::set::details::blockSpan::launchLambdaOnSpanCUDA<DataSetContainer, Lambda>;
+                }
+            }
+
+            if constexpr (CudaLaunchCompilerTimeHints::initialized) {
+                if constexpr (!details::ExecutionThreadSpanUtils::isBlockSpan(DataSetContainer::executionThreadSpan)) {
+                    executor = (void*)Neon::set::details::denseSpan::launchLambdaOnSpanCUDAWithCompilerHints<CudaLaunchCompilerTimeHints, DataSetContainer, Lambda>;
+                } else {
+                    executor = (void*)Neon::set::details::blockSpan::launchLambdaOnSpanCUDAWithCompilerHints<CudaLaunchCompilerTimeHints, DataSetContainer, Lambda>;
+                }
             }
             dev.kernel.template cudaLaunchKernel<Neon::run_et::async>(gpuStreamSet[setIdx.idx()],
                                                                       launchInfoSet[setIdx.idx()],
diff --git a/libNeonSet/include/Neon/set/LambdaExecutor.h b/libNeonSet/include/Neon/set/LambdaExecutor.h
index 4ffe2501..825e86a7 100644
--- a/libNeonSet/include/Neon/set/LambdaExecutor.h
+++ b/libNeonSet/include/Neon/set/LambdaExecutor.h
@@ -36,6 +36,38 @@ NEON_CUDA_KERNEL auto launchLambdaOnSpanCUDA(typename DataSetContainer::Span spa
         }
     }
 }
+
+template <typename CudaLaunchCompilerTimeHints,  // hint carrier; only its maxThreadsPerBlock member is read by this kernel
+          typename DataSetContainer,
+          typename UserLambda>
+__launch_bounds__(CudaLaunchCompilerTimeHints::maxThreadsPerBlock)  // no other member of the hint type is forwarded here
+    NEON_CUDA_KERNEL auto launchLambdaOnSpanCUDAWithCompilerHints(typename DataSetContainer::Span span,
+                                                                  UserLambda                      userLambdaTa)
+        -> void
+{
+    typename DataSetContainer::Idx e;  // filled in by span.setAndValidate for this thread
+    if constexpr (DataSetContainer::executionThreadSpan == ExecutionThreadSpan::d1) {  // 1D: global x index only
+        if (span.setAndValidate(e,
+                                threadIdx.x + blockIdx.x * blockDim.x)) {
+            userLambdaTa(e);  // runs only when setAndValidate returned true
+        }
+    }
+    if constexpr (DataSetContainer::executionThreadSpan == ExecutionThreadSpan::d2) {  // 2D: global x and y indices
+        if (span.setAndValidate(e,
+                                threadIdx.x + blockIdx.x * blockDim.x,
+                                threadIdx.y + blockIdx.y * blockDim.y)) {
+            userLambdaTa(e);
+        }
+    }
+    if constexpr (DataSetContainer::executionThreadSpan == ExecutionThreadSpan::d3) {  // 3D: global x, y and z indices
+        if (span.setAndValidate(e,
+                                threadIdx.x + blockIdx.x * blockDim.x,
+                                threadIdx.y + blockIdx.y * blockDim.y,
+                                threadIdx.z + blockIdx.z * blockDim.z)) {
+            userLambdaTa(e);
+        }
+    }
+}
 #endif
 
 
@@ -48,9 +80,9 @@ void launchLambdaOnSpanOMP(Neon::Integer_3d<IndexType> const&     gridDim,
 {
     if constexpr (DataSetContainer::executionThreadSpan == ExecutionThreadSpan::d1) {
 #ifdef NEON_OS_WINDOWS
-//#pragma omp parallel for default(shared)
+// #pragma omp parallel for default(shared)
 #else
- #pragma omp parallel for simd default(shared)
+#pragma omp parallel for simd default(shared)
 #endif
         for (IndexType x = 0; x < gridDim.x; x++) {
             typename DataSetContainer::Idx e;
@@ -65,7 +97,7 @@ void launchLambdaOnSpanOMP(Neon::Integer_3d<IndexType> const&     gridDim,
 #ifdef NEON_OS_WINDOWS
 #pragma omp parallel for default(shared)
 #else
-// #pragma omp parallel for simd collapse(2) default(shared)
+        // #pragma omp parallel for simd collapse(2) default(shared)
 #endif
         for (IndexType y = 0; y < gridDim.y; y++) {
             for (IndexType x = 0; x < gridDim.x; x++) {
@@ -81,7 +113,7 @@ void launchLambdaOnSpanOMP(Neon::Integer_3d<IndexType> const&     gridDim,
 #ifdef NEON_OS_WINDOWS
 #pragma omp parallel for default(shared)
 #else
-// #pragma omp parallel for simd collapse(1) default(shared) schedule(guided)
+        // #pragma omp parallel for simd collapse(1) default(shared) schedule(guided)
 #endif
         for (IndexType z = 0; z < gridDim.z; z++) {
             for (IndexType y = 0; y < gridDim.y; y++) {
@@ -113,6 +145,21 @@ NEON_CUDA_KERNEL auto launchLambdaOnSpanCUDA(typename DataSetContainer::Span spa
         }
     }
 }
+
+template <typename CudaLaunchCompilerTimeHints,  // NOTE(review): accepted but never used in this variant (no __launch_bounds__) — confirm this is intended
+          typename DataSetContainer,
+          typename UserLambda>
+NEON_CUDA_KERNEL auto launchLambdaOnSpanCUDAWithCompilerHints(typename DataSetContainer::Span span,
+                                             UserLambda                      userLambdaTa)
+    -> void
+{
+    typename DataSetContainer::Idx e;  // filled in by span.setAndValidateGPUDevice
+    if constexpr (DataSetContainer::executionThreadSpan == ExecutionThreadSpan::d1b3) {  // only d1b3 is handled; for any other span the body is empty
+        if (span.setAndValidateGPUDevice(e)) {  // user lambda runs only when this returns true
+            userLambdaTa(e);
+        }
+    }
+}
 #endif
 
 
diff --git a/libNeonSet/include/Neon/set/StencilSemantic.h b/libNeonSet/include/Neon/set/StencilSemantic.h
index cd512ae7..28b596dc 100644
--- a/libNeonSet/include/Neon/set/StencilSemantic.h
+++ b/libNeonSet/include/Neon/set/StencilSemantic.h
@@ -2,6 +2,7 @@
 #include <string>
 #include <vector>
 
+#include "Neon/Report.h"
 #include "Neon/core/core.h"
 
 namespace Neon::set {
@@ -9,7 +10,7 @@ namespace Neon::set {
 enum struct StencilSemantic
 {
     standard = 0 /*<  Transfer for halo update on grid structure    */,
-    streaming = 1 /*< Transfer for halo update on lattice structure */
+    lattice = 1 /*< Transfer for halo update on lattice structure */
 };
 
 
@@ -20,19 +21,24 @@ struct StencilSemanticUtils
     static auto toString(StencilSemantic opt) -> std::string;
     static auto fromString(const std::string& opt) -> StencilSemantic;
     static auto getOptions() -> std::array<StencilSemantic, nOptions>;
-    
+
     struct Cli
     {
         explicit Cli(std::string);
         explicit Cli(StencilSemantic model);
         Cli();
 
-        auto getOption() -> StencilSemantic;
+        auto getOption() const -> StencilSemantic;
         auto set(const std::string& opt) -> void;
-        auto getStringOptions() -> std::string;
+        auto getStringOptions() const -> std::string;
+        auto getStringOption() const -> std::string;
+        auto getDoc() const -> std::string;
+
+        auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void;
+        auto addToReport(Neon::Report& report) const -> void;
 
        private:
-        bool mSet = false;
+        bool            mSet = false;
         StencilSemantic mOption;
     };
 };
diff --git a/libNeonSet/include/Neon/set/TransferMode.h b/libNeonSet/include/Neon/set/TransferMode.h
index b6f4ec86..a335f5da 100644
--- a/libNeonSet/include/Neon/set/TransferMode.h
+++ b/libNeonSet/include/Neon/set/TransferMode.h
@@ -3,6 +3,7 @@
 #include <vector>
 
 #include "Neon/core/core.h"
+#include "Neon/Report.h"
 
 namespace Neon::set {
 
@@ -26,9 +27,14 @@ class TransferModeUtils
         explicit Cli(TransferMode model);
         Cli();
 
-        auto getOption() -> TransferMode;
+        auto getOption() const -> TransferMode;
         auto set(const std::string& opt) -> void;
-        auto getStringOptions() -> std::string;
+        auto getStringOptions() const -> std::string;
+        auto getStringOption() const -> std::string;
+        auto getDoc () const -> std::string;
+
+        auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const ->void;
+        auto addToReport(Neon::Report& report) const ->void;
 
        private:
         bool         mSet = false;
diff --git a/libNeonSet/include/Neon/set/container/CudaLaunchCompileTimeHints.h b/libNeonSet/include/Neon/set/container/CudaLaunchCompileTimeHints.h
new file mode 100644
index 00000000..84fee176
--- /dev/null
+++ b/libNeonSet/include/Neon/set/container/CudaLaunchCompileTimeHints.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "Neon/core/core.h"
+
+
+namespace Neon::set::container {
+
+template <bool inited__ = false,  // re-exported below as `initialized`
+          int  maxThreadsPerBlock__ = 1024,
+          int  minBlocksPerMultiprocessor__ = 1,
+          int  maxBlocksPerCluster__ = 0>
+struct CudaLaunchCompileTimeHint  // Compile-time bundle of CUDA launch hints; each template argument is exposed as a static constexpr member.
+{
+   public:
+    static constexpr bool initialized = inited__;  // false by default, i.e. the default-constructed type carries no hints
+    static constexpr int  maxThreadsPerBlock = maxThreadsPerBlock__;
+    static constexpr int  minBlocksPerMultiprocessor = minBlocksPerMultiprocessor__;
+    static constexpr int  maxBlocksPerCluster = maxBlocksPerCluster__;
+};
+
+}  // namespace Neon::set::container
diff --git a/libNeonSet/include/Neon/set/container/DeviceContainer.h b/libNeonSet/include/Neon/set/container/DeviceContainer.h
index 6f729894..ae3bf957 100644
--- a/libNeonSet/include/Neon/set/container/DeviceContainer.h
+++ b/libNeonSet/include/Neon/set/container/DeviceContainer.h
@@ -6,8 +6,8 @@
 
 namespace Neon::set::internal {
 
-template <typename DataIteratorContainerT,
-          typename UserComputeLambdaT>
+template < typename DataIteratorContainerT,
+          typename UserComputeLambdaT, typename CudaLaunchCompileTimeHintT  = Neon::set::container::CudaLaunchCompileTimeHint<false>>
 struct DeviceContainer : ContainerAPI
 {
    public:
@@ -93,7 +93,7 @@ struct DeviceContainer : ContainerAPI
         Neon::set::KernelConfig kernelConfig(dataView, bk, streamIdx, this->getLaunchParameters(dataView));
 
         if (ContainerExecutionType::device == this->getContainerExecutionType()) {
-            bk.devSet().template launchLambdaOnSpan<DataIteratorContainerT, UserComputeLambdaT>(
+            bk.devSet().template launchLambdaOnSpan<CudaLaunchCompileTimeHintT, DataIteratorContainerT, UserComputeLambdaT>(
                 mExecution,
                 kernelConfig,
                 m_dataIteratorContainer,
diff --git a/libNeonSet/include/Neon/set/container/Loader_imp.h b/libNeonSet/include/Neon/set/container/Loader_imp.h
index e134effe..c9682ff9 100644
--- a/libNeonSet/include/Neon/set/container/Loader_imp.h
+++ b/libNeonSet/include/Neon/set/container/Loader_imp.h
@@ -115,7 +115,7 @@ auto Loader::
 
             if (compute == Neon::Pattern::STENCIL &&
                 (stencilSemantic == StencilSemantic::standard ||
-                 stencilSemantic == StencilSemantic::streaming)) {
+                 stencilSemantic == StencilSemantic::lattice)) {
                 Neon::NeonException exp("Loader");
                 exp << "Loading a non const field for a stencil operation is not supported in Neon";
                 NEON_THROW(exp);
diff --git a/libNeonSet/src/set/StencilSemantic.cpp b/libNeonSet/src/set/StencilSemantic.cpp
index 560b687a..0e6b2114 100644
--- a/libNeonSet/src/set/StencilSemantic.cpp
+++ b/libNeonSet/src/set/StencilSemantic.cpp
@@ -5,11 +5,11 @@ namespace Neon::set {
 auto StencilSemanticUtils::toString(StencilSemantic option) -> std::string
 {
     switch (option) {
-        case StencilSemantic::streaming: {
-            return "streaming";
+        case StencilSemantic::lattice: {
+            return "lattice";
         }
         case StencilSemantic::standard: {
-            return "grid";
+            return "standard";
         }
     }
     NEON_THROW_UNSUPPORTED_OPTION("");
@@ -17,7 +17,7 @@ auto StencilSemanticUtils::toString(StencilSemantic option) -> std::string
 
 auto StencilSemanticUtils::fromString(const std::string& occ) -> StencilSemantic
 {
-    std::array<StencilSemantic, 4> opts{StencilSemantic::standard, StencilSemantic::streaming};
+    std::array<StencilSemantic, 2> opts{StencilSemantic::standard, StencilSemantic::lattice};
     for (auto a : opts) {
         if (toString(a) == occ) {
             return a;
@@ -28,7 +28,7 @@ auto StencilSemanticUtils::fromString(const std::string& occ) -> StencilSemantic
 
 auto StencilSemanticUtils::getOptions() -> std::array<StencilSemantic, nOptions>
 {
-    std::array<StencilSemantic, nOptions> opts = {StencilSemantic::standard, StencilSemantic::streaming};
+    std::array<StencilSemantic, nOptions> opts = {StencilSemantic::standard, StencilSemantic::lattice};
     return opts;
 }
 
@@ -47,7 +47,7 @@ StencilSemanticUtils::Cli::Cli(StencilSemantic model)
     mOption = model;
 }
 
-auto StencilSemanticUtils::Cli::getOption() -> StencilSemantic
+auto StencilSemanticUtils::Cli::getOption() const -> StencilSemantic
 {
     if (!mSet) {
         std::stringstream errorMsg;
@@ -66,13 +66,13 @@ auto StencilSemanticUtils::Cli::set(const std::string& opt)
         std::stringstream errorMsg;
         errorMsg << "TransferSemantic: " << opt << " is not a valid option (valid options are {";
         auto options = StencilSemanticUtils::getOptions();
-        int i = 0;
+        int  i = 0;
         for (auto o : options) {
-            if(i!=0){
-                errorMsg << ", "<< StencilSemanticUtils::toString(o) ;
+            if (i != 0) {
+                errorMsg << ", " << StencilSemanticUtils::toString(o);
             }
             errorMsg << StencilSemanticUtils::toString(o);
-            i=1;
+            i = 1;
         }
         errorMsg << "})";
         NEON_ERROR(errorMsg.str());
@@ -80,19 +80,48 @@ auto StencilSemanticUtils::Cli::set(const std::string& opt)
     mSet = true;
 }
 
-auto StencilSemanticUtils::Cli::getStringOptions() -> std::string
+auto StencilSemanticUtils::Cli::getStringOptions() const -> std::string
 {
     std::stringstream s;
     auto              options = StencilSemanticUtils::getOptions();
     int               i = 0;
     for (auto o : options) {
         if (i != 0) {
-            s << ", " ;
+            s << ", ";
         }
         s << StencilSemanticUtils::toString(o);
         i = 1;
     }
-    std::string msg= s.str();
+    std::string msg = s.str();
     return msg;
 }
-}  // namespace Neon
+
+auto StencilSemanticUtils::Cli::getStringOption() const -> std::string  // Returns the stored option converted with StencilSemanticUtils::toString.
+{
+    if (!mSet) {  // no option has been recorded: report through NEON_ERROR
+        std::stringstream errorMsg;
+        errorMsg << "TransferSemantic was not set.";
+        NEON_ERROR(errorMsg.str());
+    }
+    return StencilSemanticUtils::toString(mOption);
+}
+
+auto StencilSemanticUtils::Cli::getDoc() const-> std::string  // Help text: the list of valid options followed by the currently stored one.
+{
+    std::stringstream s;
+    s << getStringOptions();
+    s << " default: " << (mSet ? StencilSemanticUtils::toString(mOption) : std::string("<not set>"));  // previously repeated the full option list here
+    return s.str();
+}
+
+
+auto StencilSemanticUtils::Cli::addToReport(Neon::Report& report) const -> void  // Adds the selected option to the report under the key "StencilSemantic".
+{
+    report.addMember("StencilSemantic", StencilSemanticUtils::toString(this->getOption()));  // value is read through getOption()
+}
+
+auto StencilSemanticUtils::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void  // Same as the one-argument overload, but passes &subBlock to addMember.
+{
+    report.addMember("StencilSemantic", StencilSemanticUtils::toString(this->getOption()), &subBlock);
+}
+}  // namespace Neon::set
diff --git a/libNeonSet/src/set/TransferMode.cpp b/libNeonSet/src/set/TransferMode.cpp
index 9ef657eb..c2a30ab2 100644
--- a/libNeonSet/src/set/TransferMode.cpp
+++ b/libNeonSet/src/set/TransferMode.cpp
@@ -47,7 +47,7 @@ TransferModeUtils::Cli::Cli(TransferMode model)
     mOption = model;
 }
 
-auto TransferModeUtils::Cli::getOption() -> TransferMode
+auto TransferModeUtils::Cli::getOption() const -> TransferMode
 {
     if (!mSet) {
         std::stringstream errorMsg;
@@ -66,13 +66,13 @@ auto TransferModeUtils::Cli::set(const std::string& opt)
         std::stringstream errorMsg;
         errorMsg << "Transfer: " << opt << " is not a valid option (valid options are {";
         auto options = TransferModeUtils::getOptions();
-        int i = 0;
+        int  i = 0;
         for (auto o : options) {
-            if(i!=0){
-                errorMsg << ", "<< TransferModeUtils::toString(o) ;
+            if (i != 0) {
+                errorMsg << ", " << TransferModeUtils::toString(o);
             }
             errorMsg << TransferModeUtils::toString(o);
-            i=1;
+            i = 1;
         }
         errorMsg << "})";
         NEON_ERROR(errorMsg.str());
@@ -80,19 +80,47 @@ auto TransferModeUtils::Cli::set(const std::string& opt)
     mSet = true;
 }
 
-auto TransferModeUtils::Cli::getStringOptions() -> std::string
+auto TransferModeUtils::Cli::getStringOptions() const -> std::string
 {
     std::stringstream s;
     auto              options = TransferModeUtils::getOptions();
     int               i = 0;
     for (auto o : options) {
         if (i != 0) {
-            s << ", " ;
+            s << ", ";
         }
         s << TransferModeUtils::toString(o);
         i = 1;
     }
-    std::string msg= s.str();
+    std::string msg = s.str();
     return msg;
 }
-}  // namespace Neon
+
+auto TransferModeUtils::Cli::getStringOption() const -> std::string  // Returns the stored option converted with TransferModeUtils::toString.
+{
+    if (!mSet) {  // no option has been recorded: report through NEON_ERROR
+        std::stringstream errorMsg;
+        errorMsg << "TransferMode was not set.";
+        NEON_ERROR(errorMsg.str());
+    }
+    return TransferModeUtils::toString(mOption);
+}
+
+auto TransferModeUtils::Cli::getDoc() const -> std::string  // Help text: the list of valid options followed by the currently stored one.
+{
+    std::stringstream s;
+    s << getStringOptions();
+    s << " default: " << (mSet ? TransferModeUtils::toString(mOption) : std::string("<not set>"));  // previously repeated the full option list here
+    return s.str();
+}
+
+auto TransferModeUtils::Cli::addToReport(Neon::Report& report) const -> void  // Adds the selected option to the report under the key "TransferMode".
+{
+    report.addMember("TransferMode", TransferModeUtils::toString(this->getOption()));  // value is read through getOption()
+}
+
+auto TransferModeUtils::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void  // Same as the one-argument overload, but passes &subBlock to addMember.
+{
+    report.addMember("TransferMode", TransferModeUtils::toString(this->getOption()), &subBlock);
+}
+}  // namespace Neon::set
diff --git a/libNeonSkeleton/include/Neon/skeleton/Occ.h b/libNeonSkeleton/include/Neon/skeleton/Occ.h
index a54f799a..041d178f 100644
--- a/libNeonSkeleton/include/Neon/skeleton/Occ.h
+++ b/libNeonSkeleton/include/Neon/skeleton/Occ.h
@@ -27,12 +27,15 @@ struct OccUtils
         explicit Cli(Occ model);
         Cli();
 
-        auto getOption() -> Occ;
+        auto getOption() const -> Occ;
         auto set(const std::string& opt) -> void;
-        auto getStringOptions() -> std::string;
+        auto getStringOptions() const -> std::string;
+        auto getDoc() const -> std::string;
 
-        auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock)->void;
-        auto addToReport(Neon::Report& report)->void;
+        auto getStringOption() const -> std::string;
+
+        auto addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void;
+        auto addToReport(Neon::Report& report) const -> void;
 
        private:
         bool mSet = false;
@@ -41,4 +44,4 @@ struct OccUtils
 };
 
 
-}  // namespace Neon::skeleton
\ No newline at end of file
+}  // namespace Neon::skeleton
diff --git a/libNeonSkeleton/src/skeleton/Occ.cpp b/libNeonSkeleton/src/skeleton/Occ.cpp
index 44ac9155..44ba2cd9 100644
--- a/libNeonSkeleton/src/skeleton/Occ.cpp
+++ b/libNeonSkeleton/src/skeleton/Occ.cpp
@@ -48,12 +48,23 @@ OccUtils::Cli::Cli(std::string s)
     set(s);
 }
 
+auto OccUtils::Cli::getStringOption() const -> std::string  // Returns the stored option converted with OccUtils::toString.
+{
+    if (!mSet) {  // no option has been recorded: report through NEON_ERROR
+        std::stringstream errorMsg;
+        errorMsg << "Occ was not set.";
+        NEON_ERROR(errorMsg.str());
+    }
+    return OccUtils::toString(mOption);
+}
+
 OccUtils::Cli::Cli(Occ model)
 {
     mOption = model;
+    mSet = true;
 }
 
-auto OccUtils::Cli::getOption() -> Occ
+auto OccUtils::Cli::getOption() const -> Occ
 {
     if (!mSet) {
         std::stringstream errorMsg;
@@ -86,7 +97,7 @@ auto OccUtils::Cli::set(const std::string& opt)
     mSet = true;
 }
 
-auto OccUtils::Cli::getStringOptions() -> std::string
+auto OccUtils::Cli::getStringOptions() const -> std::string
 {
     std::stringstream s;
     auto              options = OccUtils::getOptions();
@@ -102,14 +113,22 @@ auto OccUtils::Cli::getStringOptions() -> std::string
     return msg;
 }
 
-auto OccUtils::Cli::addToReport(Neon::Report& report) -> void
+auto OccUtils::Cli::getDoc() const -> std::string  // Help text: the list of valid options followed by the currently stored one.
+{
+    std::stringstream s;
+    s << getStringOptions();
+    s << " default: " << OccUtils::toString(getOption());  // read through getOption(), so its not-set check applies here too
+    return s.str();
+}
+
+auto OccUtils::Cli::addToReport(Neon::Report& report) const -> void
 {
     report.addMember("Occ", OccUtils::toString(this->getOption()));
 }
 
-auto OccUtils::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) -> void
+auto OccUtils::Cli::addToReport(Neon::Report& report, Neon::Report::SubBlock& subBlock) const -> void
 {
     report.addMember("Occ", OccUtils::toString(this->getOption()), &subBlock);
 }
 
-}  // namespace Neon::skeleton
\ No newline at end of file
+}  // namespace Neon::skeleton
diff --git a/libNeonSkeleton/tests/unit/sUt_skeletonOnStreams/src/sUt_skeleton.Stencil.cu b/libNeonSkeleton/tests/unit/sUt_skeletonOnStreams/src/sUt_skeleton.Stencil.cu
index 0170936c..2e2a2929 100644
--- a/libNeonSkeleton/tests/unit/sUt_skeletonOnStreams/src/sUt_skeleton.Stencil.cu
+++ b/libNeonSkeleton/tests/unit/sUt_skeletonOnStreams/src/sUt_skeleton.Stencil.cu
@@ -160,7 +160,7 @@ void SingleStencil(TestData<G, T, C>&      data,
 }
 
 template <typename G, typename T, int C>
-void SingleStencilOCC(TestData<G, T, C>& data)
+void SingleStencilStandardOCC(TestData<G, T, C>& data)
 {
     SingleStencil<G, T, C>(data, Neon::skeleton::Occ::standard, Neon::set::TransferMode::get);
 }
@@ -208,4 +208,14 @@ TEST(SingleStencil_NoOCC, bGrid)
     // using Grid = Neon::dGrid;
     using Type = int32_t;
     runAllTestConfiguration<Grid, Type, 0>("bGrid_t", SingleStencilNoOCC<Grid, Type, 0>, nGpus, 1);
+}
+
+TEST(SingleStencil_StandardOCC, bGrid)  // Runs SingleStencilStandardOCC on Neon::bGrid with int32_t data.
+{
+    int nGpus = 1;
+    using Grid = Neon::bGrid;
+    // using Grid = Neon::domain::eGrid;
+    // using Grid = Neon::dGrid;
+    using Type = int32_t;
+    runAllTestConfiguration<Grid, Type, 0>("bGrid_t", SingleStencilStandardOCC<Grid, Type, 0>, nGpus, 1);
 }
\ No newline at end of file
diff --git a/libNeonSkeleton/tests/unit/skeleton-stencil/src/runHelper.h b/libNeonSkeleton/tests/unit/skeleton-stencil/src/runHelper.h
index 4858b819..8cd53082 100644
--- a/libNeonSkeleton/tests/unit/skeleton-stencil/src/runHelper.h
+++ b/libNeonSkeleton/tests/unit/skeleton-stencil/src/runHelper.h
@@ -22,10 +22,11 @@ using namespace Neon::domain::tool::testing;
 using namespace Neon::domain::tool;
 
 template <typename G, typename T, int C>
-void runAllTestConfiguration(const std::string&                      gname,
-                             std::function<void(TestData<G, T, C>&)> f,
-                             int                                     nGpus,
-                             int                                     minNumGpus)
+void runAllTestConfiguration(const std::string&                                                        gname,
+                             std::function<void(std::string, TestData<G, T, C>&, Neon::skeleton::Occ)> f,
+                             Neon::skeleton::Occ                                                       occ,
+                             int                                                                       nGpus,
+                             int                                                                       minNumGpus)
 {
     if (Neon::sys::globalSpace::gpuSysObjStorage.numDevs() > 0) {
         std::vector<int> nGpuTest;
@@ -69,7 +70,7 @@ void runAllTestConfiguration(const std::string&                      gname,
 
                             NEON_INFO(testData.toString());
 
-                            f(testData);
+                            f(gname, testData, occ);
                         }
                     }
                 }
diff --git a/libNeonSkeleton/tests/unit/skeleton-stencil/src/stencil.cu b/libNeonSkeleton/tests/unit/skeleton-stencil/src/stencil.cu
index 0e88980a..095959f9 100644
--- a/libNeonSkeleton/tests/unit/skeleton-stencil/src/stencil.cu
+++ b/libNeonSkeleton/tests/unit/skeleton-stencil/src/stencil.cu
@@ -59,7 +59,9 @@ auto laplaceOnIntegers(const Field& filedA,
 
 
 template <typename G, typename T, int C>
-void singleStencil(TestData<G, T, C>& data)
+void singleStencil(std::string         testName,
+                   TestData<G, T, C>&  data,
+                   Neon::skeleton::Occ occ)
 {
     using Type = typename TestData<G, T, C>::Type;
 
@@ -82,7 +84,9 @@ void singleStencil(TestData<G, T, C>& data)
         ops.push_back(laplaceOnIntegers(Y, X));
 
         Neon::skeleton::Skeleton skl(data.getBackend());
-        skl.sequence(ops, "sUt_dGridStencil");
+        Neon::skeleton::Options  opt(occ, Neon::set::TransferMode::get);
+        skl.sequence(ops, testName, opt);
+        skl.ioToDot(testName, testName, true);
 
         for (int j = 0; j < nIterations; j++) {
             skl.run();
@@ -108,20 +112,29 @@ void singleStencil(TestData<G, T, C>& data)
     ASSERT_TRUE(isOk);
 }
 
-TEST(singleStencil, dGrid)
+TEST(skeleton_stencil_occ_none, dGrid)
 {
     int nGpus = 1;
     using Grid = Neon::dGrid;
     using Type = int32_t;
     constexpr int C = 0;
-    runAllTestConfiguration<Grid, Type, 0>("dGrid", singleStencil<Grid, Type, C>, nGpus, 1);
+    runAllTestConfiguration<Grid, Type, 0>("skeleton_stencil_occ_none_dGrid", singleStencil<Grid, Type, C>, Neon::skeleton::Occ::none, nGpus, 1);
 }
 
-TEST(singleStencil, bGridSingleGpu)
+TEST(skeleton_stencil_occ_standard, dGrid)
+{
+    int nGpus = 1;
+    using Grid = Neon::dGrid;
+    using Type = int32_t;
+    constexpr int C = 0;
+    runAllTestConfiguration<Grid, Type, 0>("skeleton_stencil_occ_standard_dGrid", singleStencil<Grid, Type, C>, Neon::skeleton::Occ::standard, nGpus, 1);
+}
+
+TEST(skeleton_stencil, bGridSingleGpu)
 {
     int nGpus = 1;
     using Grid = Neon::bGrid;
     using Type = int32_t;
     constexpr int C = 0;
-    runAllTestConfiguration<Grid, Type, 0>("bGrid", singleStencil<Grid, Type, C>, nGpus, 1);
+    runAllTestConfiguration<Grid, Type, 0>("bGrid", singleStencil<Grid, Type, C>, Neon::skeleton::Occ::none, nGpus, 1);
 }
\ No newline at end of file