bGrid new template api #40

Merged 12 commits on Jun 20, 2023
9 changes: 8 additions & 1 deletion benchmarks/lbm-lid-driven-cavity-flow/CMakeLists.txt
@@ -23,4 +23,11 @@ add_custom_command(
TARGET ${APP} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_CURRENT_SOURCE_DIR}/${APP}.sh
${CMAKE_BINARY_DIR}/bin/${APP}.sh)
${CMAKE_BINARY_DIR}/bin/${APP}.sh)

add_custom_command(
TARGET ${APP} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_CURRENT_SOURCE_DIR}/${APP}.py
${CMAKE_BINARY_DIR}/bin/${APP}.py
)
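The new rule mirrors the existing one for the .sh script: the Python driver added below is staged into ${CMAKE_BINARY_DIR}/bin next to the benchmark executable because it launches the binary through the relative path ./lbm-lid-driven-cavity-flow.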
97 changes: 97 additions & 0 deletions benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.py
@@ -0,0 +1,97 @@
DOMAIN_SIZE_LIST = "64 128 192 256 320 384 448 512".split()
DEVICE_ID_LIST = "0 1 2 3 4 5 6 7".split()
DEVICE_TYPE_LIST = 'cpu gpu'.split()
GRID_LIST = "dGrid bGrid eGrid".split()
STORAGE_FP_LIST = "double float".split()
COMPUTE_FP_LIST = "double float".split()
OCC_LIST = "nOCC".split()
WARM_UP_ITER = 10
MAX_ITER = 100
REPETITIONS = 5

import subprocess
import sys


def printProgressBar(value, label):
n_bar = 40 # size of progress bar
max = 100
j = value / max
sys.stdout.write('\r')
bar = '█' * int(n_bar * j)
bar = bar + '-' * int(n_bar * (1 - j))

sys.stdout.write(f"{label.ljust(10)} | [{bar:{n_bar}s}] {int(100 * j)}% ")
sys.stdout.flush()


def countAll():
counter = 0
for DEVICE_TYPE in DEVICE_TYPE_LIST:
DEVICE_SET_LIST = [DEVICE_ID_LIST[0]]
if DEVICE_TYPE == 'gpu':
for DEVICE in DEVICE_ID_LIST[1:]:
DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE)
for OCC in OCC_LIST:
for DOMAIN_SIZE in DOMAIN_SIZE_LIST:
for STORAGE_FP in STORAGE_FP_LIST:
for COMPUTE_FP in COMPUTE_FP_LIST:
for DEVICE_SET in DEVICE_SET_LIST:
for GRID in GRID_LIST:
if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
continue

counter += 1
return counter


SAMPLES = countAll()
counter = 0
command = './lbm-lid-driven-cavity-flow'
with open(command + '.log', 'w') as fp:
for DEVICE_TYPE in DEVICE_TYPE_LIST:
DEVICE_SET_LIST = [DEVICE_ID_LIST[0]]
if DEVICE_TYPE == 'gpu':
for DEVICE in DEVICE_ID_LIST[1:]:
DEVICE_SET_LIST.append(DEVICE_SET_LIST[-1] + ' ' + DEVICE)
for OCC in OCC_LIST:
for DOMAIN_SIZE in DOMAIN_SIZE_LIST:
for STORAGE_FP in STORAGE_FP_LIST:
for COMPUTE_FP in COMPUTE_FP_LIST:
for DEVICE_SET in DEVICE_SET_LIST:
for GRID in GRID_LIST:
if STORAGE_FP == 'double' and COMPUTE_FP == 'float':
continue

parameters = []
parameters.append('--deviceType ' + DEVICE_TYPE)
parameters.append('--deviceIds ' + DEVICE_SET)
parameters.append('--grid ' + GRID)
parameters.append('--domain-size ' + DOMAIN_SIZE)
parameters.append('--warmup-iter ' + str(WARM_UP_ITER))
parameters.append('--repetitions ' + str(REPETITIONS))
parameters.append('--max-iter ' + str(MAX_ITER))
parameters.append(
'--report-filename ' + 'lbm-lid-driven-cavity-flow___' +
DEVICE_TYPE + '_' + DOMAIN_SIZE + '_' +
STORAGE_FP + '_' + COMPUTE_FP + '_' +
DEVICE_SET.replace(' ', '_') + '_' + OCC)
parameters.append('--computeFP ' + COMPUTE_FP)
parameters.append('--storageFP ' + STORAGE_FP)
parameters.append('--benchmark')
parameters.append('--' + OCC)

commandList = []
commandList.append(command)
for el in parameters:
for s in el.split():
commandList.append(s)

fp.write("\n-------------------------------------------\n")
fp.write(' '.join(commandList))
fp.write("\n-------------------------------------------\n")
fp.flush()
subprocess.run(commandList, text=True, stdout=fp)

counter += 1
printProgressBar(counter * 100.0 / SAMPLES, 'Progress')
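For reference, with the lists above the sweep size reported by countAll() works out to (1 CPU device set + 8 GPU device sets) × 1 occupancy mode × 8 domain sizes × 3 grids × 3 allowed storage/compute pairs (storage=double with compute=float is skipped) = 648 runs.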
30 changes: 16 additions & 14 deletions benchmarks/lbm-lid-driven-cavity-flow/lbm-lid-driven-cavity-flow.sh
@@ -1,28 +1,30 @@
set -x

DOMAIN_SIZE_LIST="128 192 256 320 384 448 512"
GRID="dGrid"
DOMAIN_SIZE_LIST="64 128 192 256 320 384 448 512"
GRID_LIST="dGrid bGrid eGrid"
STORAGE_FP_LIST="double float"
COMPUTE_FP_LIST="double float"
OCC="nOCC"

for DOMAIN_SIZE in ${DOMAIN_SIZE_LIST}; do
    for STORAGE_FP in ${STORAGE_FP_LIST}; do
        for COMPUTE_FP in ${COMPUTE_FP_LIST}; do
            for GRID in ${GRID_LIST}; do

            if [ "${STORAGE_FP}_${COMPUTE_FP}" = "double_float" ]; then
                continue
            fi
                if [ "${STORAGE_FP}_${COMPUTE_FP}" = "double_float" ]; then
                    continue
                fi

            echo ./lbm-lid-driven-cavity-flow \
                --deviceType gpu --deviceIds 0 \
                --grid "${GRID}" \
                --domain-size "${DOMAIN_SIZE}" \
                --warmup-iter 10 --max-iter 100 --repetitions 5 \
                --report-filename "lbm-lid-driven-cavity-flow_${DOMAIN_SIZE}_${GRID}_STORAGE_${STORAGE_FP}_COMPUTE_${COMPUTE_FP}" \
                --computeFP "${COMPUTE_FP}" \
                --storageFP "${STORAGE_FP}" \
                --${OCC} --benchmark
                echo ./lbm-lid-driven-cavity-flow \
                    --deviceType gpu --deviceIds 0 \
                    --grid "${GRID}" \
                    --domain-size "${DOMAIN_SIZE}" \
                    --warmup-iter 10 --max-iter 100 --repetitions 5 \
                    --report-filename "lbm-lid-driven-cavity-flow_${DOMAIN_SIZE}_${GRID}_STORAGE_${STORAGE_FP}_COMPUTE_${COMPUTE_FP}" \
                    --computeFP "${COMPUTE_FP}" \
                    --storageFP "${STORAGE_FP}" \
                    --${OCC} --benchmark
            done
        done
    done
done
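As in the previous version of this script, the loop body only echoes each command line (set -x already traces it as well); the new Python driver above is what actually launches the runs via subprocess.run.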
8 changes: 3 additions & 5 deletions benchmarks/lbm-lid-driven-cavity-flow/src/RunCavityTwoPop.cu
@@ -128,7 +128,7 @@ auto run(Config& config,
},
Neon::computeMode_t::seq);

//sort the position so the linear interpolation works
// sort the position so the linear interpolation works
std::sort(xPosVal.begin(), xPosVal.end(), [=](std::pair<double, double>& a, std::pair<double, double>& b) {
return a.first < b.first;
});
@@ -308,12 +308,10 @@ auto run(Config& config,
return details::runFilterStoreType<Neon::dGrid>(config, report);
}
if (config.gridType == "eGrid") {
NEON_DEV_UNDER_CONSTRUCTION("");
// return details::runFilterStoreType<Neon::eGrid>(config, report);
return details::runFilterStoreType<Neon::eGrid>(config, report);
}
if (config.gridType == "bGrid") {
NEON_DEV_UNDER_CONSTRUCTION("");
// return details::runFilterStoreType<Neon::bGrid>(config, report);
return details::runFilterStoreType<Neon::bGrid>(config, report);
}
}
} // namespace CavityTwoPop
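With this change the eGrid and bGrid branches no longer stop at NEON_DEV_UNDER_CONSTRUCTION; all three grid types now dispatch to the same templated details::runFilterStoreType<...>() path.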
2 changes: 1 addition & 1 deletion libNeonDomain/include/Neon/domain/bGrid.h
@@ -2,5 +2,5 @@
#include "Neon/domain/details/bGrid/bGrid.h"

namespace Neon {
using bGrid = Neon::domain::details::bGrid::bGrid<8,8,8>;
using bGrid = Neon::domain::details::bGrid::bGrid<Neon::domain::details::bGrid::StaticBlock<8,8,8>>;
}
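This alias is the user-facing side of the new template API: the block shape is now a StaticBlock type instead of three raw integer parameters. As a rough sketch of what that enables (not code from this PR; SmallBlock and bGrid4 are invented names, and it is assumed that including Neon/domain/bGrid.h transitively provides StaticBlock):

#include "Neon/domain/bGrid.h"

namespace myApp {
// 4x4x4 memory blocks; the user block defaults to the memory-block extent,
// so the blockRatio* constants are 1 and StaticBlock's static_asserts hold.
using SmallBlock = Neon::domain::details::bGrid::StaticBlock<4, 4, 4>;
using bGrid4     = Neon::domain::details::bGrid::bGrid<SmallBlock>;
}  // namespace myApp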
29 changes: 29 additions & 0 deletions libNeonDomain/include/Neon/domain/details/bGrid/BlockView.h
@@ -0,0 +1,29 @@
#include "Neon/domain/details/bGrid/BlockView/BlockViewGrid.h"
#include "Neon/domain/tools/GridTransformer.h"

namespace Neon::domain::details::bGrid {

struct BlockView
{
public:
using Grid = Neon::domain::tool::GridTransformer<details::GridTransformation>::Grid;
template <typename T, int C = 0>
using Field = Grid::template Field<T, C>;
using index_3d = Neon::index_3d;

template <typename T, int C = 0>
static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t<C == 0, T&>
{
return mem[idx * card];
}

template <typename T, int C = 0>
static auto helpGetReference(T* mem, const int idx, const int card) -> std::enable_if_t<C != 0, T&>
{
return mem[idx * C];
}

static constexpr Neon::MemoryLayout layout = Neon::MemoryLayout::arrayOfStructs;
};

} // namespace Neon::domain::details::bGrid
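helpGetReference dispatches on the cardinality template parameter C through return-type SFINAE: with C == 0 the element stride comes from the runtime card argument, otherwise the compile-time constant is used and card is ignored. A minimal standalone sketch of the same pattern (plain functions with invented names, no Neon types):

#include <cstdio>
#include <type_traits>

template <typename T, int C = 0>
auto getReference(T* mem, const int idx, const int card) -> std::enable_if_t<C == 0, T&>
{
    // C == 0: cardinality is only known at run time, so use the card argument.
    return mem[idx * card];
}

template <typename T, int C = 0>
auto getReference(T* mem, const int idx, const int card) -> std::enable_if_t<C != 0, T&>
{
    // C != 0: cardinality is a compile-time constant, the card argument is ignored.
    return mem[idx * C];
}

int main()
{
    double buffer[12] = {};                        // 4 elements with cardinality 3 (array of structs)
    getReference<double>(buffer, 2, 3) = 1.0;      // runtime cardinality: writes buffer[6]
    getReference<double, 3>(buffer, 3, -1) = 2.0;  // compile-time cardinality: writes buffer[9]
    std::printf("%f %f\n", buffer[6], buffer[9]);
    return 0;
}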
@@ -90,8 +90,8 @@ struct GridTransformation
});
}
};
using BlockViewGrid = Neon::domain::tool::GridTransformer<details::GridTransformation>::Grid;

} // namespace details
using BlockViewGrid = Neon::domain::tool::GridTransformer<details::GridTransformation>::Grid;

} // namespace Neon::domain::details::bGrid
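This hunk moves the BlockViewGrid alias out of the nested details namespace so it is declared directly in Neon::domain::details::bGrid; it names the same GridTransformer<details::GridTransformation>::Grid type that BlockView::Grid in the new BlockView.h above refers to.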
104 changes: 104 additions & 0 deletions libNeonDomain/include/Neon/domain/details/bGrid/StaticBlock.h
@@ -0,0 +1,104 @@
#include "Neon/domain/details/bGrid/bSpan.h"

namespace Neon::domain::details::bGrid {

template <uint32_t memBlockSizeX_,
uint32_t memBlockSizeY_,
uint32_t memBlockSizeZ_,
uint32_t userBlockSizeX_ = memBlockSizeX_,
uint32_t userBlockSizeY_ = memBlockSizeY_,
uint32_t userBlockSizeZ_ = memBlockSizeZ_,
bool isMultiResMode_ = false>
struct StaticBlock
{
public:
constexpr static uint32_t memBlockSizeX = memBlockSizeX_;
constexpr static uint32_t memBlockSizeY = memBlockSizeY_;
constexpr static uint32_t memBlockSizeZ = memBlockSizeZ_;
constexpr static Neon::uint32_3d memBlockSize3D = Neon::uint32_3d(memBlockSizeX, memBlockSizeY, memBlockSizeZ);

constexpr static uint32_t userBlockSizeX = userBlockSizeX_;
constexpr static uint32_t userBlockSizeY = userBlockSizeY_;
constexpr static uint32_t userBlockSizeZ = userBlockSizeZ_;
constexpr static Neon::uint32_3d userBlockSize3D = Neon::uint32_3d(userBlockSizeX, userBlockSizeY, userBlockSizeZ);

constexpr static uint32_t blockRatioX = memBlockSizeX / userBlockSizeX;
constexpr static uint32_t blockRatioY = memBlockSizeY / userBlockSizeY;
constexpr static uint32_t blockRatioZ = memBlockSizeZ / userBlockSizeZ;

constexpr static uint32_t memBlockPitchX = 1;
constexpr static uint32_t memBlockPitchY = memBlockSizeX;
constexpr static uint32_t memBlockPitchZ = memBlockSizeX * memBlockSizeY;

constexpr static bool isMultiResMode = isMultiResMode_;

constexpr static uint32_t memBlockCountElements = memBlockSizeX * memBlockSizeY * memBlockSizeZ;

static_assert(memBlockSizeX >= userBlockSizeX);
static_assert(memBlockSizeY >= userBlockSizeY);
static_assert(memBlockSizeZ >= userBlockSizeZ);

static_assert(memBlockSizeX % userBlockSizeX == 0);
static_assert(memBlockSizeY % userBlockSizeY == 0);
static_assert(memBlockSizeZ % userBlockSizeZ == 0);

struct BitMask
{
using BitMaskWordType = uint32_t;
auto reset() -> void
{
for (BitMaskWordType i = 0; i < nWords; ++i) {
bits[i] = 0;
}
}

auto setActive(int threadX,
int threadY,
int threadZ) -> void
{
BitMaskWordType mask;
uint32_t wordIdx;
getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx);
auto& word = bits[wordIdx];
word = word | mask;
}

inline auto NEON_CUDA_HOST_DEVICE isActive(int threadX,
int threadY,
int threadZ) const -> bool
{
BitMaskWordType mask;
uint32_t wordIdx;
getMaskAndWordI(threadX, threadY, threadZ, mask, wordIdx);
auto& word = bits[wordIdx];
return (word & mask) != 0;
}

static inline auto NEON_CUDA_HOST_DEVICE getMaskAndWordI(int threadX,
int threadY,
int threadZ,
NEON_OUT BitMaskWordType& mask,
NEON_OUT uint32_t& wordIdx) -> void
{
const uint32_t threadPitch = threadX * memBlockPitchX +
threadY * memBlockPitchY +
threadZ * memBlockPitchZ;

// threadPitch >> log2_of_bitPerWord
// the same as: threadPitch / 2^{log2_of_bitPerWord}
wordIdx = threadPitch >> log2_of_bitPerWord;
// threadPitch & ((bitMaskWordType(bitMaskStorageBitWidth)) - 1);
// same as threadPitch % 2^{log2OfbitMaskWordSize}
const uint32_t offsetInWord = threadPitch & ((BitMaskWordType(bitPerWord)) - 1);
mask = BitMaskWordType(1) << offsetInWord;
}

constexpr static BitMaskWordType nWords = (memBlockCountElements + 31) / 32;
static constexpr uint32_t log2_of_bitPerWord = 5;
static constexpr uint32_t bitPerWord = 32;

BitMaskWordType bits[nWords];
};
};

} // namespace Neon::domain::details::bGrid
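To make the BitMask layout concrete: for the default 8x8x8 memory block there are 512 cells, so nWords = (512 + 31) / 32 = 16 words, and getMaskAndWordI turns a cell's linear pitch into a word index (shift by log2_of_bitPerWord) plus a bit offset (mask with bitPerWord - 1). A small standalone check of that arithmetic (plain C++, not Neon code; the cell coordinates are arbitrary):

#include <cassert>
#include <cstdint>

int main()
{
    constexpr uint32_t memBlockSizeX = 8, memBlockSizeY = 8, memBlockSizeZ = 8;
    constexpr uint32_t memBlockPitchX = 1;
    constexpr uint32_t memBlockPitchY = memBlockSizeX;
    constexpr uint32_t memBlockPitchZ = memBlockSizeX * memBlockSizeY;
    constexpr uint32_t bitPerWord = 32;
    constexpr uint32_t log2_of_bitPerWord = 5;
    constexpr uint32_t nWords = (memBlockSizeX * memBlockSizeY * memBlockSizeZ + 31) / 32;
    static_assert(nWords == 16, "512 cells pack into 16 32-bit mask words");

    // Same arithmetic as getMaskAndWordI, here for the cell (3, 2, 1):
    const uint32_t threadPitch = 3 * memBlockPitchX + 2 * memBlockPitchY + 1 * memBlockPitchZ;  // 3 + 16 + 64 = 83
    const uint32_t wordIdx = threadPitch >> log2_of_bitPerWord;    // 83 / 32 = 2
    const uint32_t offsetInWord = threadPitch & (bitPerWord - 1);  // 83 % 32 = 19
    const uint32_t mask = uint32_t(1) << offsetInWord;

    assert(wordIdx == 2 && offsetInWord == 19 && mask == (uint32_t(1) << 19));
    return 0;
}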