bGrid new template api #40

Merged 12 commits on Jun 20, 2023
9 changes: 8 additions & 1 deletion benchmarks/lbm-lid-driven-cavity-flow/CMakeLists.txt
@@ -23,4 +23,11 @@ add_custom_command(
TARGET ${APP} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_CURRENT_SOURCE_DIR}/${APP}.sh
${CMAKE_BINARY_DIR}/bin/${APP}.sh)
${CMAKE_BINARY_DIR}/bin/${APP}.sh)

add_custom_command(
TARGET ${APP} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_CURRENT_SOURCE_DIR}/${APP}.py
${CMAKE_BINARY_DIR}/bin/${APP}.py
)
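The new rule mirrors the existing one for the .sh script: the Python driver added below is staged into ${CMAKE_BINARY_DIR}/bin next to the benchmark executable because it launches the binary through the relative path ./lbm-lid-driven-cavity-flow.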
97 changes: 97 additions & 0 deletions benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
@@ -0,0 +1,97 @@
DOMAIN_SIZE_LIST = "64 128 192 256 320 384 448 512".split()
DEVICE_ID_LIST = "0 1 2 3 4 5 6 7".split()
DEVICE_TYPE_LIST = 'cpu gpu'.split()
GRID_LIST = "dGrid bGrid eGrid".split()
STORAGE_FP_LIST = "double float".split()
COMPUTE_FP_LIST = "double float".split()
OCC_LIST = "nOCC".split()
WARM_UP_ITER = 10
MAX_ITER = 100
REPETITIONS = 5

import subprocess
import sys


def printProgressBar(value, label):
n_bar = 40 # size of progress bar
max = 100
j = value / max
sys.stdout.write('\r')
bar = '█' * int(n_bar * j)
bar = bar + '-' * int(n_bar * (1 - j))

sys.stdout.write(f"{label.ljust(10)} | [{bar:{n_bar}s}] {int(100 * j)}% ")
sys.stdout.flush()


def countAll():
counter = 0
for DEVICE_TYPE in DEVICE_TYPE_LIST:
DEVICE_SET_LIST = [DEVICE_ID_LIST[0]]
if DEVICE_TYPE == 'gpu':
for DEVICE in DEVICE_ID_LIST[1:]:
DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE)
for OCC in OCC_LIST:
for DOMAIN_SIZE in DOMAIN_SIZE_LIST:
for STORAGE_FP in STORAGE_FP_LIST:
for COMPUTE_FP in COMPUTE_FP_LIST:
for DEVICE_SET in DEVICE_SET_LIST:
for GRID in GRID_LIST:
if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
continue

counter += 1
return counter


SAMPLES = countAll()
counter = 0
command = './lbm-lid-driven-cavity-flow'
with open(command + '.log', 'w') as fp:
for DEVICE_TYPE in DEVICE_TYPE_LIST:
DEVICE_SET_LIST = [DEVICE_ID_LIST[0]]
if DEVICE_TYPE == 'gpu':
for DEVICE in DEVICE_ID_LIST[1:]:
DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE)
for OCC in OCC_LIST:
for DOMAIN_SIZE in DOMAIN_SIZE_LIST:
for STORAGE_FP in STORAGE_FP_LIST:
for COMPUTE_FP in COMPUTE_FP_LIST:
for DEVICE_SET in DEVICE_SET_LIST:
for GRID in GRID_LIST:
if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
continue

parameters = []
parameters.append('--deviceType ' + DEVICE_TYPE)
parameters.append('--deviceIds ' + DEVICE_SET)
parameters.append('--grid ' + GRID)
parameters.append('--domain-size ' + DOMAIN_SIZE)
parameters.append('--warmup-iter ' + str(WARM_UP_ITER))
parameters.append('--repetitions ' + str(REPETITIONS))
parameters.append('--max-iter ' + str(MAX_ITER))
parameters.append(
'--report-filename ' + 'lbm-lid-driven-cavity-flow___' +
DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' +
STORAGE_FP + '_' + COMPUTE_FP + '_' +
DEVICE_SET.replace(' ', '_') + '_' + OCC)
parameters.append('--computeFP ' + COMPUTE_FP)
parameters.append('--storageFP ' + STORAGE_FP)
parameters.append('--benchmark')
parameters.append('--' + OCC)

commandList = []
commandList.append(command)
for el in parameters:
for s in el.split():
commandList.append(s)

fp.write("\n-------------------------------------------\n")
fp.write(' '.join(commandList))
fp.write("\n-------------------------------------------\n")
fp.flush()
subprocess.run(commandList, text=True, stdout=fp)

counter += 1
printProgressBar(counter * 100.0 / SAMPLES, 'Progress')
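For reference, with the lists above the sweep size reported by countAll() works out to (1 CPU device set + 8 GPU device sets) × 1 occupancy mode × 8 domain sizes × 3 grids × 3 allowed storage/compute pairs (storage=double with compute=float is skipped) = 648 runs.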
30 changes: 16 additions & 14 deletions benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.sh
@@ -1,28 +1,30 @@
set -x

DOMAIN_SIZE_LIST="128 192 256 320 384 448 512"
GRID="dGrid"
DOMAIN_SIZE_LIST="64 128 192 256 320 384 448 512"
GRID_LIST="dGrid bGrid eGrid"
STORAGE_FP_LIST="double float"
COMPUTE_FP_LIST="double float"
OCC="nOCC"

for DOMAIN_SIZE in ${DOMAIN_SIZE_LIST}; do
    for STORAGE_FP in ${STORAGE_FP_LIST}; do
        for COMPUTE_FP in ${COMPUTE_FP_LIST}; do
            for GRID in ${GRID_LIST}; do

            if [ "${STORAGE_FP}_${COMPUTE_FP}" = "double_float" ]; then
                continue
            fi
                if [ "${STORAGE_FP}_${COMPUTE_FP}" = "double_float" ]; then
                    continue
                fi

            echo ./lbm-lid-driven-cavity-flow \
                --deviceType gpu --deviceIds 0 \
                --grid "${GRID}" \
                --domain-size "${DOMAIN_SIZE}" \
                --warmup-iter 10 --max-iter 100 --repetitions 5 \
                --report-filename "lbm-lid-driven-cavity-flow_${DOMAIN_SIZE}_${GRID}_STORAGE_${STORAGE_FP}_COMPUTE_${COMPUTE_FP}" \
                --computeFP "${COMPUTE_FP}" \
                --storageFP "${STORAGE_FP}" \
                --${OCC} --benchmark
                echo ./lbm-lid-driven-cavity-flow \
                    --deviceType gpu --deviceIds 0 \
                    --grid "${GRID}" \
                    --domain-size "${DOMAIN_SIZE}" \
                    --warmup-iter 10 --max-iter 100 --repetitions 5 \
                    --report-filename "lbm-lid-driven-cavity-flow_${DOMAIN_SIZE}_${GRID}_STORAGE_${STORAGE_FP}_COMPUTE_${COMPUTE_FP}" \
                    --computeFP "${COMPUTE_FP}" \
                    --storageFP "${STORAGE_FP}" \
                    --${OCC} --benchmark
            done
        done
    done
done
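As in the previous version of this script, the loop body only echoes each command line (set -x already traces it as well); the new Python driver above is what actually launches the runs via subprocess.run.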
8 changes: 3 additions & 5 deletions benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
@@ -128,7 +128,7 @@ auto run(Config& config,
},
Neon::computeMode_t::seq);

//sort the position so the linear interpolation works
// sort the position so the linear interpolation works
std::sort(xPosVal.begin(), xPosVal.end(), [=](std::pair<double, double>& a, std::pair<double, double>& b) {
return a.first < b.first;
});
@@ -308,12 +308,10 @@ auto run(Config& config,
return details::runFilterStoreType<Neon::dGrid>(config, report);
}
if (config.gridType == "eGrid") {
NEON_DEV_UNDER_CONSTRUCTION("");
// return details::runFilterStoreType<Neon::eGrid>(config, report);
return details::runFilterStoreType<Neon::eGrid>(config, report);
}
if (config.gridType == "bGrid") {
NEON_DEV_UNDER_CONSTRUCTION("");
// return details::runFilterStoreType<Neon::bGrid>(config, report);
return details::runFilterStoreType<Neon::bGrid>(config, report);
}
}
} // namespace CavityTwoPop
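With this change the eGrid and bGrid branches no longer stop at NEON_DEV_UNDER_CONSTRUCTION; all three grid types now dispatch to the same templated details::runFilterStoreType<...>() path.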
2 changes: 1 addition & 1 deletion libNeonDomain/include/Neon/domain/bGrid.h
@@ -2,5 +2,5 @@
#include "Neon/domain/details/bGrid/bGrid.h"

namespace Neon {
using bGrid = Neon::domain::details::bGrid::bGrid<8,8,8>;
using bGrid = Neon::domain::details::bGrid::bGrid<Neon::domain::details::bGrid::StaticBlock<8,8,8>>;
}
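This alias is the user-facing side of the new template API: the block shape is now a StaticBlock type instead of three raw integer parameters. As a rough sketch of what that enables (not code from this PR; SmallBlock and bGrid4 are invented names, and it is assumed that including Neon/domain/bGrid.h transitively provides StaticBlock):

#include "Neon/domain/bGrid.h"

namespace myApp {
// 4x4x4 memory blocks; the user block defaults to the memory-block extent,
// so the blockRatio* constants are 1 and StaticBlock's static_asserts hold.
using SmallBlock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>;
using bGrid4     = Neon::domain::details::bGrid::bGrid<SmallBlock>;
}  // namespace myApp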
29 changes: 29 additions & 0 deletions libNeonDomain/include/Neon/domain/details/bGrid/BlockView.h
@@ -0,0 +1,29 @@
#include "Neon/domain/details/bGrid/BlockView/BlockViewGrid.h"
#include "Neon/domain/tools/GridTransformer.h"

namespace Neon::domain::details::bGrid {

struct BlockView
{
public:
using Grid = Neon::domain::tool::GridTransformer<details::GridTransformation>::Grid;
template <typename T, int C = 0>
using Field = Grid::template Field<T, C>;
using index_3d = Neon::index_3d;

template <typename T, int C = 0>
static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t<C == 0, T&>
{
return mem[idx * card];
}

template <typename T, int C = 0>
static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t<C != 0, T&>
{
return mem[idx * C];
}

static constexpr Neon::MemoryLayout layout = Neon::MemoryLayout::arrayOfStructs;
};

} // namespace Neon::domain::details::bGrid
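helpGetReference dispatches on the cardinality template parameter C through return-type SFINAE: with C == 0 the element stride comes from the runtime card argument, otherwise the compile-time constant is used and card is ignored. A minimal standalone sketch of the same pattern (plain functions with invented names, no Neon types):

#include <cstdio>
#include <type_traits>

template <typename T, int C = 0>
auto getReference(T* mem, const int idx, const int card) -> std::enable_if_t<C == 0, T&>
{
    // C == 0: cardinality is only known at run time, so use the card argument.
    return mem[idx * card];
}

template <typename T, int C = 0>
auto getReference(T* mem, const int idx, const int card) -> std::enable_if_t<C != 0, T&>
{
    // C != 0: cardinality is a compile-time constant, the card argument is ignored.
    return mem[idx * C];
}

int main()
{
    double buffer[12] = {};                        // 4 elements with cardinality 3 (array of structs)
    getReference<double>(buffer, 2, 3) = 1.0;      // runtime cardinality: writes buffer[6]
    getReference<double, 3>(buffer, 3, -1) = 2.0;  // compile-time cardinality: writes buffer[9]
    std::printf("%f %f\n", buffer[6], buffer[9]);
    return 0;
}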
@@ -90,8 +90,8 @@ struct GridTransformation
});
}
};
using BlockViewGrid = Neon::domain::tool::GridTransformer<details::GridTransformation>::Grid;

} // namespace details
using BlockViewGrid = Neon::domain::tool::GridTransformer<details::GridTransformation>::Grid;

} // namespace Neon::domain::details::bGrid
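This hunk moves the BlockViewGrid alias out of the nested details namespace so it is declared directly in Neon::domain::details::bGrid; it names the same GridTransformer<details::GridTransformation>::Grid type that BlockView::Grid in the new BlockView.h above refers to.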
104 changes: 104 additions & 0 deletions libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h
@@ -0,0 +1,104 @@
#include "Neon/domain/details/bGrid/bSpan.h"

namespace Neon::domain::details::bGrid {

template <uint32_t memBlockSizeX_,
uint32_t memBlockSizeY_,
uint32_t memBlockSizeZ_,
uint32_t userBlockSizeX_ = memBlockSizeX_,
uint32_t userBlockSizeY_ = memBlockSizeY_,
uint32_t userBlockSizeZ_ = memBlockSizeZ_,
bool isMultiResMode_ = false>
struct StaticBlock
{
public:
constexpr static uint32_t memBlockSizeX = memBlockSizeX_;
constexpr static uint32_t memBlockSizeY = memBlockSizeY_;
constexpr static uint32_t memBlockSizeZ = memBlockSizeZ_;
constexpr static Neon::uint32_3d memBlockSize3D = Neon::uint32_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ);

constexpr static uint32_t userBlockSizeX = userBlockSizeX_;
constexpr static uint32_t userBlockSizeY = userBlockSizeY_;
constexpr static uint32_t userBlockSizeZ = userBlockSizeZ_;
constexpr static Neon::uint32_3d userBlockSize3D = Neon::uint32_3d(userBlockSizeX, userBlockSizeY, userBlockSizeZ);

constexpr static uint32_t blockRatioX = memBlockSizeX / userBlockSizeX;
constexpr static uint32_t blockRatioY = memBlockSizeY / userBlockSizeY;
constexpr static uint32_t blockRatioZ = memBlockSizeZ / userBlockSizeZ;

constexpr static uint32_t memBlockPitchX = 1;
constexpr static uint32_t memBlockPitchY = memBlockSizeX;
constexpr static uint32_t memBlockPitchZ = memBlockSizeX * memBlockSizeY;

constexpr static bool isMultiResMode = isMultiResMode_;

constexpr static uint32_t memBlockCountElements = memBlockSizeX * memBlockSizeY * memBlockSizeZ;

static_assert(memBlockSizeX >= userBlockSizeX);
static_assert(memBlockSizeY >= userBlockSizeY);
static_assert(memBlockSizeZ >= userBlockSizeZ);

static_assert(memBlockSizeX % userBlockSizeX == 0);
static_assert(memBlockSizeY % userBlockSizeY == 0);
static_assert(memBlockSizeZ % userBlockSizeZ == 0);

struct BitMask
{
using BitMaskWordType = uint32_t;
auto reset() -> void
{
for (BitMaskWordType i = 0; i < nWords; ++i) {
bits[i] = 0;
}
}

auto setActive(int threadX,
int threadY,
int threadZ) -> void
{
BitMaskWordType mask;
uint32_t wordIdx;
getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx);
auto& word = bits[wordIdx];
word = word | mask;
}

inline auto NEON_CUDA_HOST_DEVICE isActive(int threadX,
int threadY,
int threadZ) const -> bool
{
BitMaskWordType mask;
uint32_t wordIdx;
getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx);
auto& word = bits[wordIdx];
return (word & mask) != 0;
}

static inline auto NEON_CUDA_HOST_DEVICE getMaskAndWordI(int threadX,
int threadY,
int threadZ,
NEON_OUT BitMaskWordType& mask,
NEON_OUT uint32_t& wordIdx) -> void
{
const uint32_t threadPitch = threadX * memBlockPitchX +
threadY * memBlockPitchY +
threadZ * memBlockPitchZ;

// threadPitch >> log2_of_bitPerWord
// the same as: threadPitch / 2^{log2_of_bitPerWord}
wordIdx = threadPitch >> log2_of_bitPerWord;
// threadPitch & ((bitMaskWordType(bitMaskStorageBitWidth)) - 1);
// same as threadPitch % 2^{log2OfbitMaskWordSize}
const uint32_t offsetInWord = threadPitch & ((BitMaskWordType(bitPerWord)) - 1);
mask = BitMaskWordType(1) << offsetInWord;
}

constexpr static BitMaskWordType nWords = (memBlockCountElements + 31) / 32;
static constexpr uint32_t log2_of_bitPerWord = 5;
static constexpr uint32_t bitPerWord = 32;

BitMaskWordType bits[nWords];
};
};

} // namespace Neon::domain::details::bGrid
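To make the BitMask layout concrete: for the default 8x8x8 memory block there are 512 cells, so nWords = (512 + 31) / 32 = 16 words, and getMaskAndWordI turns a cell's linear pitch into a word index (shift by log2_of_bitPerWord) plus a bit offset (mask with bitPerWord - 1). A small standalone check of that arithmetic (plain C++, not Neon code; the cell coordinates are arbitrary):

#include <cassert>
#include <cstdint>

int main()
{
    constexpr uint32_t memBlockSizeX = 8, memBlockSizeY = 8, memBlockSizeZ = 8;
    constexpr uint32_t memBlockPitchX = 1;
    constexpr uint32_t memBlockPitchY = memBlockSizeX;
    constexpr uint32_t memBlockPitchZ = memBlockSizeX * memBlockSizeY;
    constexpr uint32_t bitPerWord = 32;
    constexpr uint32_t log2_of_bitPerWord = 5;
    constexpr uint32_t nWords = (memBlockSizeX * memBlockSizeY * memBlockSizeZ + 31) / 32;
    static_assert(nWords == 16, "512 cells pack into 16 32-bit mask words");

    // Same arithmetic as getMaskAndWordI, here for the cell (3, 2, 1):
    const uint32_t threadPitch = 3 * memBlockPitchX + 2 * memBlockPitchY + 1 * memBlockPitchZ;  // 3 + 16 + 64 = 83
    const uint32_t wordIdx = threadPitch >> log2_of_bitPerWord;    // 83 / 32 = 2
    const uint32_t offsetInWord = threadPitch & (bitPerWord - 1);  // 83 % 32 = 19
    const uint32_t mask = uint32_t(1) << offsetInWord;

    assert(wordIdx == 2 && offsetInWord == 19 && mask == (uint32_t(1) << 19));
    return 0;
}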