diff --git a/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/README.md b/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/README.md new file mode 100644 index 000000000..5a43266ee --- /dev/null +++ b/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/README.md @@ -0,0 +1,63 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:2 +* 单服务器内使用卡数:8 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 机内总线协议:Speed 32GT/s, Width x16(PCIE5) +* 服务器间多卡的MPI互联带宽采用多种通信方式组合,无标定互联带宽 + +# 指标选型 + +The following are the three performance metrics commonly used +1. samples/s (algbw): This metric measures the number of samples processed per second, indicating the algorithmic bandwidth. It reflects the computational efficiency of the algorithm. +2. busbw: This metric represents the bus bandwidth, which measures the data transfer rate across the system's bus. It is crucial for understanding the communication efficiency between different parts of the system. +3. busbw * 2: This metric is an extension of busbw, accounting for bidirectional data transfer. It doubles the bus bandwidth to reflect the full duplex capability of the system. + +The second metric, busbw, is chosen for the following reasons: +1. This number is obtained applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication. Using this bus bandwidth, we can compare it with the hardware peak bandwidth, independently of the number of ranks used. +2. We can horizontally compare the MPI of different patterns such as all-gather/all-reduce/reduce-scatter. + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 服务器间多卡的MPI互联算法带宽测试值(8卡平均) | 服务器间多卡的MPI互联算法带宽标定值(8卡平均) | 测试标定比例(8卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + +| 评测项 | 服务器间多卡的MPI互联等效带宽测试值(8卡平均) | 服务器间多卡的MPI互联等效带宽标定值(8卡平均) | 测试标定比例(8卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | +* 等效带宽为双向带宽 + +* 由于MCCL用法和NCCL相似,算法带宽、等效带宽计算可参考:https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 + +使用mcclAllReduce,进行多机多卡的MPI互联操作,计算服务器间MPI互联带宽 + +* 注:如镜像启动时ssh并未随命令开启,请切换至[容器内启动](https://github.com/FlagOpen/FlagPerf/blob/main/docs/utils/definitions/IN_CONTAINER_LAUNCH.md) \ No newline at end of file diff --git a/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/bandwidth.mu b/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/bandwidth.mu new file mode 100644 index 000000000..e663a5cc9 --- /dev/null +++ b/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/bandwidth.mu @@ -0,0 +1,148 @@ +#include +#include +#include +#include +#include +#include +#include + +#define GB (1024ULL * 1024ULL * 1024ULL) +#define SIZE (4ULL * GB) +#define WARMUP_ITERATIONS 100 +#define ITERATIONS 1000 + +void checkMusaError(musaError_t err, const char* msg) { + if (err != musaSuccess) { + fprintf(stderr, "MUSA Error: %s: %s\n", msg, musaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +void checkMcclError(mcclResult_t result, const char* msg) { + if (result != mcclSuccess) { + 
fprintf(stderr, "MCCL Error: %s: %s\n", msg, mcclGetErrorString(result)); + exit(EXIT_FAILURE); + } +} + +void checkMPIError(int result, const char* msg) { + if (result != MPI_SUCCESS) { + fprintf(stderr, "MPI Error: %s\n", msg); + exit(EXIT_FAILURE); + } +} + +int main(int argc, char* argv[]) { + checkMPIError(MPI_Init(&argc, &argv), "MPI_Init"); + + int rank, size; + checkMPIError(MPI_Comm_rank(MPI_COMM_WORLD, &rank), "MPI_Comm_rank"); + checkMPIError(MPI_Comm_size(MPI_COMM_WORLD, &size), "MPI_Comm_size"); + + int num_gpus_per_node = 8; + int total_gpus = size; + int gpu_id = rank % num_gpus_per_node; + + musaEvent_t start, end; + float elapsed_time; + float* d_src; + float* d_dst; + mcclComm_t comm; + musaStream_t stream; + + checkMusaError(musaSetDevice(gpu_id), "musaSetDevice"); + checkMusaError(musaMalloc(&d_src, SIZE), "musaMalloc"); + checkMusaError(musaMalloc(&d_dst, SIZE), "musaMalloc"); + + std::vector host_data(SIZE / sizeof(float), 1.0f); + checkMusaError(musaMemcpy(d_src, host_data.data(), SIZE, musaMemcpyHostToDevice), "musaMemcpy"); + + // checkMusaError(musaMemset(d_src, 1.0f, SIZE), "musaMemset"); + checkMusaError(musaStreamCreate(&stream), "musaStreamCreate"); + + mcclUniqueId id; + if (rank == 0) + checkMcclError(mcclGetUniqueId(&id), "mcclGetUniqueId"); + checkMPIError(MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD), + "MPI_Bcast"); + checkMcclError(mcclCommInitRank(&comm, total_gpus, id, rank), + "mcclCommInitRank"); + checkMusaError(musaEventCreate(&start), "musaEventCreate"); + checkMusaError(musaEventCreate(&end), "musaEventCreate"); + + for (int i = 0; i < WARMUP_ITERATIONS; ++i) { + checkMcclError(mcclAllReduce((const void*)d_src, (void*)d_dst, + SIZE / sizeof(float), mcclFloat, mcclSum, comm, + stream), + "mcclAllReduce"); + checkMusaError(musaStreamSynchronize(stream), "musaStreamSynchronize"); + } + checkMPIError(MPI_Barrier(MPI_COMM_WORLD), "MPI_Barrier"); + checkMusaError(musaEventRecord(start), "musaEventRecord"); + + for (int i = 0; i < ITERATIONS; ++i) { + checkMcclError(mcclAllReduce((const void*)d_src, (void*)d_dst, + SIZE / sizeof(float), mcclFloat, mcclSum, comm, + stream), + "mcclAllReduce"); + checkMusaError(musaStreamSynchronize(stream), "musaStreamSynchronize"); + } + checkMPIError(MPI_Barrier(MPI_COMM_WORLD), "MPI_Barrier"); + checkMusaError(musaEventRecord(end), "musaEventRecord"); + checkMusaError(musaEventSynchronize(end), "musaEventSynchronize"); + checkMusaError(musaEventElapsedTime(&elapsed_time, start, end), + "musaEventElapsedTime"); + /* + The following are the three performance metrics commonly used + 1. samples/s (algbw): This metric measures the number of samples + processed per second, indicating the algorithmic bandwidth. It reflects the + computational efficiency of the algorithm. + 2. busbw: This metric represents the bus bandwidth, which measures the + data transfer rate across the system's bus. It is crucial for understanding + the communication efficiency between different parts of the system. + 3. busbw * 2: This metric is an extension of busbw, accounting for + bidirectional data transfer. It doubles the bus bandwidth to reflect the full + duplex capability of the system. The second metric, busbw, is chosen for the + following reasons: + 1. This number is obtained applying a formula to the algorithm bandwidth + to reflect the speed of the inter-GPU communication. Using this bus + bandwidth, we can compare it with the hardware peak bandwidth, independently + of the number of ranks used. + 2. 
We can horizontally compare the MPI of different patterns such as + all-gather/all-reduce/reduce-scatter. The following is the derivation: algbw + = S/t Considering that each rank has a bandwidth to the outside world of B, + the time to perform an allReduce operation of S elements is at best : t = + (S*2*(n-1)) / (n*B) Indeed, we have S elements, 2*(n-1) operations per + element, and n links of bandwidth B to perform them. Reordering the equation, + we find that t = (S/B) * (2*(n-1)/n) Therefore, to get an AllReduce bandwidth + measurement which we can compare to the hardware peak bandwidth, we compute : + B = S/t * (2*(n-1)/n) = algbw * (2*(n-1)/n) + More details can be found in + https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md The final + calculation is the unidirectional bandwidth. + */ + double algbw = SIZE * ITERATIONS / (elapsed_time / 1000.0); + double bandwidth = algbw * (2.0 * (total_gpus - 1) / total_gpus); + bandwidth = bandwidth + bandwidth; + if (rank == 0) { + std::cout << "[FlagPerf Result]interconnect-MPI_interserver-algbw=" + << std::fixed << std::setprecision(2) + << algbw / (1024.0 * 1024.0 * 1024.0) << "GiB/s" << std::endl; + std::cout << "[FlagPerf Result]interconnect-MPI_interserver-algbw=" + << std::fixed << std::setprecision(2) + << algbw / (1000.0 * 1000.0 * 1000.0) << "GB/s" << std::endl; + std::cout << "[FlagPerf Result]interconnect-MPI_interserver-bandwidth=" + << std::fixed << std::setprecision(2) + << bandwidth / (1024.0 * 1024.0 * 1024.0) << "GiB/s" << std::endl; + std::cout << "[FlagPerf Result]interconnect-MPI_interserver-bandwidth=" + << std::fixed << std::setprecision(2) + << bandwidth / (1000.0 * 1000.0 * 1000.0) << "GB/s" << std::endl; + } + checkMusaError(musaFree(d_src), "musaFree"); + checkMusaError(musaFree(d_dst), "musaFree"); + checkMcclError(mcclCommDestroy(comm), "mcclCommDestroy"); + checkMusaError(musaEventDestroy(start), "musaEventDestroy"); + checkMusaError(musaEventDestroy(end), "musaEventDestroy"); + checkMPIError(MPI_Finalize(), "MPI_Finalize"); + return 0; +} \ No newline at end of file diff --git a/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/main.sh b/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/main.sh new file mode 100644 index 000000000..fb67682ed --- /dev/null +++ b/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/main.sh @@ -0,0 +1,17 @@ +#!/bin/bash +service ssh restart +export MCCL_DEBUG=WARN +export MCCL_PROTOS=2 +export OPAL_PREFIX=/opt/hpcx/ompi +export PATH=/opt/hpcx/ompi/bin:$PATH +export LD_LIBRARY_PATH=/opt/hpcx/ompi/lib/:/usr/local/musa/lib/:$LD_LIBRARY_PATH +export MUSA_KERNEL_TIMEOUT=3600000 +HOSTS=$(yq '.HOSTS | map(. + ":8") | join(",")' ../../../../configs/host.yaml) +mcc -c -o bandwidth.o bandwidth.mu -I/usr/local/musa/include -I/opt/hpcx/ompi/include -fPIC +mpic++ -o bdtest bandwidth.o -L/usr/local/musa/lib -lmusart -lmccl -lmusa -lmpi +echo "NODERANK: $NODERANK" +if [ "$NODERANK" -eq 0 ]; then + echo "NODERANK is 0, executing the final command..." 
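The algbw-to-busbw conversion derived in the comment above (B = algbw * 2*(n-1)/n for all-reduce) can be sanity-checked in isolation. The sketch below reproduces only that arithmetic in plain C++; the payload size and iteration count mirror the constants in bandwidth.mu, the rank count follows the README (2 servers x 8 cards), and the elapsed time is an illustrative placeholder, not a measured value.

```cpp
// Standalone check of the algbw -> busbw conversion described above.
// Payload size and iteration count mirror bandwidth.mu; elapsed_ms is a
// placeholder, not a measurement.
#include <cstdio>

int main() {
    const double bytes_per_iter = 4.0 * 1024 * 1024 * 1024; // SIZE = 4 GiB
    const int    iterations     = 1000;                     // ITERATIONS
    const int    nranks         = 16;                       // 2 servers x 8 cards
    const double elapsed_ms     = 250000.0;                 // placeholder timing

    // Algorithm bandwidth: bytes handled by one rank divided by wall time.
    const double algbw = bytes_per_iter * iterations / (elapsed_ms / 1000.0);

    // Bus bandwidth: scale by 2*(n-1)/n, the per-element traffic factor of
    // ring all-reduce (see the NCCL PERFORMANCE.md linked in the README).
    const double busbw = algbw * (2.0 * (nranks - 1) / nranks);

    std::printf("algbw = %.2f GiB/s (%.2f GB/s)\n",
                algbw / (1024.0 * 1024.0 * 1024.0), algbw / 1e9);
    std::printf("busbw = %.2f GiB/s (%.2f GB/s)\n",
                busbw / (1024.0 * 1024.0 * 1024.0), busbw / 1e9);
    return 0;
}
```

Because the 2*(n-1)/n factor is applied, the resulting busbw can be compared against the hardware peak link bandwidth regardless of how many ranks took part, which is why the README selects busbw as the reported metric.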
+ sleep 10 + mpirun --allow-run-as-root --host $HOSTS -x MCCL_PROTOS=2 -x MCCL_DEBUG=WARN -x MCCL_IB_DISABLE=0 -x MCCL_IB_HCA=mlx5_0,mlx5_1 -x MUSA_DEVICE_MAX_CONNECTIONS=1 ./bdtest +fi \ No newline at end of file diff --git a/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/README.md b/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/README.md new file mode 100644 index 000000000..e949bce6e --- /dev/null +++ b/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/README.md @@ -0,0 +1,48 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:8 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 机内总线协议:Speed 32GT/s, Width x16(PCIE5) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 单机多卡的MPI互联算法带宽测试值 | 单机多卡的MPI互联算法带宽标定值 | 测试标定比例 | +| ---- | ----------- | -------- | ------ | +| 评测结果 | / | / | / | + +| 评测项 | 单机多卡的MPI互联等效带宽测试值 | 单机多卡的MPI互联等效带宽标定值 | 测试标定比例 | +| ---- | ----------- | -------- | ------ | +| 评测结果 | / | / | / | + +* 由于MCCL用法和NCCL类似,算法带宽、等效带宽计算可参考:https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 + +使用mcclAllReduce,进行单机多卡的MPI互联操作,计算服务器内MPI互联带宽 diff --git a/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/bandwidth.mu b/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/bandwidth.mu new file mode 100644 index 000000000..f55952504 --- /dev/null +++ b/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/bandwidth.mu @@ -0,0 +1,53 @@ +#include +#include + +#define GB (1024ULL * 1024ULL * 1024ULL) +#define SIZE (16ULL * GB) +#define WARMUP_ITERATIONS 100 +#define ITERATIONS 1000 + +void checkMusaError(musaError_t err, const char* msg) { + if (err != musaSuccess) { + fprintf(stderr, "MUSA Error: %s: %s\n", msg, musaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +int main() { + float* d_src, * d_dst; + musaEvent_t start, end; + float elapsed_time; + + checkMusaError(musaMallocHost(&d_src, SIZE), "musaMallocHost"); + checkMusaError(musaMalloc(&d_dst, SIZE), "musaMalloc"); + + checkMusaError(musaEventCreate(&start), "musaEventCreate"); + checkMusaError(musaEventCreate(&end), "musaEventCreate"); + + for (int i = 0; i < WARMUP_ITERATIONS; ++i) { + checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyHostToDevice), "musaMemcpy"); + } + + checkMusaError(musaEventRecord(start), "musaEventRecord"); + + for (int i = 0; i < ITERATIONS; ++i) { + checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyHostToDevice), "musaMemcpy"); + } + + checkMusaError(musaEventRecord(end), "musaEventRecord"); + checkMusaError(musaEventSynchronize(end), "musaEventSynchronize"); + + checkMusaError(musaEventElapsedTime(&elapsed_time, start, end), "musaEventElapsedTime"); + + double bandwidth = SIZE * ITERATIONS / (elapsed_time / 1000.0); + + printf("[FlagPerf Result]transfer-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0)); + printf("[FlagPerf Result]transfer-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0)); + + checkMusaError(musaFreeHost(d_src), "musaFreeHost"); + checkMusaError(musaFree(d_dst), "musaFree"); + 
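Each of these .mu benchmarks repeats the same warm-up / event-timing scaffold seen just above (untimed warm-up loop, event record, timed loop, elapsed-time query). A reusable sketch of that pattern is shown below; it uses only MUSA calls already present in this PR, but the header name `<musa_runtime.h>` is an assumption (the `#include` lines were stripped from the diff text), so treat it as illustrative rather than a drop-in.

```cpp
// Reusable sketch of the warm-up + event-timing scaffold shared by these .mu
// benchmarks. Only MUSA calls already used in this PR appear; the header name
// is an assumption.
#include <musa_runtime.h>   // assumed header name
#include <cstdio>
#include <cstdlib>

static void check(musaError_t err, const char* msg) {
    if (err != musaSuccess) {
        std::fprintf(stderr, "MUSA Error: %s: %s\n", msg, musaGetErrorString(err));
        std::exit(EXIT_FAILURE);
    }
}

// Runs `op` warmup times untimed, then iters times between two events, and
// returns the timed phase in milliseconds. `op` must be synchronous or end
// with its own stream synchronization.
template <typename Op>
float time_op_ms(Op op, int warmup, int iters) {
    for (int i = 0; i < warmup; ++i) op();

    musaEvent_t start, end;
    check(musaEventCreate(&start), "musaEventCreate");
    check(musaEventCreate(&end), "musaEventCreate");
    check(musaEventRecord(start), "musaEventRecord");
    for (int i = 0; i < iters; ++i) op();
    check(musaEventRecord(end), "musaEventRecord");
    check(musaEventSynchronize(end), "musaEventSynchronize");

    float ms = 0.0f;
    check(musaEventElapsedTime(&ms, start, end), "musaEventElapsedTime");
    check(musaEventDestroy(start), "musaEventDestroy");
    check(musaEventDestroy(end), "musaEventDestroy");
    return ms;
}
```

With such a helper, the timed section of the memcpy benchmark above would reduce to a single call such as `float ms = time_op_ms([&]{ checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyHostToDevice), "musaMemcpy"); }, WARMUP_ITERATIONS, ITERATIONS);`.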
checkMusaError(musaEventDestroy(start), "musaEventDestroy"); + checkMusaError(musaEventDestroy(end), "musaEventDestroy"); + + return 0; +} \ No newline at end of file diff --git a/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/main.sh b/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/main.sh new file mode 100644 index 000000000..102ab5bc3 --- /dev/null +++ b/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/main.sh @@ -0,0 +1,2 @@ +mcc bandwidth.mu -o bdtest -lmusart +./bdtest \ No newline at end of file diff --git a/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/README.md b/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/README.md new file mode 100644 index 000000000..53b1e1dd7 --- /dev/null +++ b/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/README.md @@ -0,0 +1,41 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:2 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 机内总线协议:Speed 32GT/s, Width x16(PCIE5) +* RDMA网卡:50GB/s(双向) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 跨服务器P2P互联带宽测试值(2卡平均,双向) | 跨服务器P2P互联带宽标定值(2卡平均,双向) | 测试标定比例(2卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(2卡平均) | 单卡最大功耗(2卡最大) | 单卡功耗标准差(2卡最大) | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(2卡平均) | 单卡平均显存占用(2卡平均) | +| ---- | --------- | -------- | ------------ | -------------- | +| 监控结果 | / | / | / | / | + +* 注:如镜像启动时ssh并未随命令开启,请切换至[容器内启动](https://github.com/FlagOpen/FlagPerf/blob/main/docs/utils/definitions/IN_CONTAINER_LAUNCH.md) \ No newline at end of file diff --git a/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/bandwidth.mu b/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/bandwidth.mu new file mode 100644 index 000000000..71ea18dda --- /dev/null +++ b/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/bandwidth.mu @@ -0,0 +1,120 @@ +#include +#include +#include +#include + +#include +#include + +#define SIZE (1024ULL * 1024ULL * 1024ULL * sizeof(float)) +#define WARMUP_ITERATIONS 100 +#define ITERATIONS 1000 + +void checkMusaError(musaError_t err, const char* msg) { + if (err != musaSuccess) { + fprintf(stderr, "MUSA Error: %s: %s\n", msg, musaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +void checkMcclError(mcclResult_t result, const char* msg) { + if (result != mcclSuccess) { + fprintf(stderr, "MCCL Error: %s: %s\n", msg, mcclGetErrorString(result)); + exit(EXIT_FAILURE); + } +} + +void checkMPIError(int result, const char* msg) { + if (result != MPI_SUCCESS) { + char error_string[MPI_MAX_ERROR_STRING]; + int length; + MPI_Error_string(result, error_string, &length); + fprintf(stderr, "MPI Error: %s: %s\n", msg, error_string); + exit(EXIT_FAILURE); + } +} + +int main(int argc, char** argv) { + float* d_tensor; + musaEvent_t start, end; + float elapsed_time; + + checkMPIError(MPI_Init(&argc, &argv), "MPI_Init"); + int rank, nranks; + checkMPIError(MPI_Comm_rank(MPI_COMM_WORLD, &rank), "MPI_Comm_rank"); + checkMPIError(MPI_Comm_size(MPI_COMM_WORLD, &nranks), "MPI_Comm_size"); + checkMusaError(musaSetDevice(0), "musaSetDevice"); + + mcclComm_t comm; + musaStream_t stream; + + mcclUniqueId id; + if (rank == 0) { + 
checkMcclError(mcclGetUniqueId(&id), "mcclGetUniqueId"); + } + MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); + + checkMcclError(mcclCommInitRank(&comm, nranks, id, rank), "mcclCommInitRank"); + checkMusaError(musaStreamCreate(&stream), "musaStreamCreate"); + + checkMusaError(musaMalloc(&d_tensor, SIZE), "musaMalloc"); + + checkMusaError(musaEventCreate(&start), "musaEventCreate"); + checkMusaError(musaEventCreate(&end), "musaEventCreate"); + + checkMcclError(mcclGroupStart(), "mcclGroupStart"); + for (int i = 0; i < WARMUP_ITERATIONS; ++i) { + if (rank == 0) { + checkMcclError( + mcclSend(d_tensor, SIZE / sizeof(float), mcclFloat, 1, comm, stream), + "mcclSend"); + } + else if (rank == 1) { + checkMcclError( + mcclRecv(d_tensor, SIZE / sizeof(float), mcclFloat, 0, comm, stream), + "mcclRecv"); + } + } + checkMcclError(mcclGroupEnd(), "mcclGroupEnd"); + checkMusaError(musaStreamSynchronize(stream), "musaStreamSynchronize"); + checkMPIError(MPI_Barrier(MPI_COMM_WORLD), "MPI_Barrier"); + + checkMusaError(musaEventRecord(start), "musaEventRecord"); + checkMcclError(mcclGroupStart(), "mcclGroupStart"); + for (int i = 0; i < ITERATIONS; ++i) { + if (rank == 0) { + checkMcclError( + mcclSend(d_tensor, SIZE / sizeof(float), mcclFloat, 1, comm, stream), + "mcclSend"); + } + else if (rank == 1) { + checkMcclError( + mcclRecv(d_tensor, SIZE / sizeof(float), mcclFloat, 0, comm, stream), + "mcclRecv"); + } + } + checkMcclError(mcclGroupEnd(), "mcclGroupEnd"); + checkMusaError(musaStreamSynchronize(stream), "musaStreamSynchronize"); + checkMPIError(MPI_Barrier(MPI_COMM_WORLD), "MPI_Barrier"); + checkMusaError(musaEventRecord(end), "musaEventRecord"); + checkMusaError(musaEventSynchronize(end), "musaEventSynchronize"); + checkMusaError(musaEventElapsedTime(&elapsed_time, start, end), + "musaEventElapsedTime"); + + double bandwidth = SIZE * ITERATIONS / (elapsed_time / 1000.0) + + SIZE * ITERATIONS / (elapsed_time / 1000.0); + std::cout << "[FlagPerf Result]interconnect-MPI_intraserver-bandwidth=" + << std::fixed << std::setprecision(2) + << bandwidth / (1024.0 * 1024.0 * 1024.0) << "GiB/s" << std::endl; + + std::cout << "[FlagPerf Result]interconnect-MPI_intraserver-bandwidth=" + << std::fixed << std::setprecision(2) + << bandwidth / (1000.0 * 1000.0 * 1000.0) << "GB/s" << std::endl; + checkMusaError(musaEventDestroy(start), "musaEventDestroy"); + checkMusaError(musaEventDestroy(end), "musaEventDestroy"); + checkMusaError(musaFree(d_tensor), "musaFree"); + checkMcclError(mcclCommDestroy(comm), "mcclCommDestroy"); + checkMusaError(musaStreamDestroy(stream), "musaStreamDestroy"); + checkMPIError(MPI_Finalize(), "MPI_Finalize"); + return 0; +} \ No newline at end of file diff --git a/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/main.sh b/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/main.sh new file mode 100644 index 000000000..3e4546bf2 --- /dev/null +++ b/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/main.sh @@ -0,0 +1,17 @@ +#!/bin/bash +service ssh restart +export MCCL_DEBUG=WARN +export MCCL_PROTOS=2 +export OPAL_PREFIX=/opt/hpcx/ompi +export PATH=/opt/hpcx/ompi/bin:$PATH +export LD_LIBRARY_PATH=/opt/hpcx/ompi/lib/:/usr/local/musa/lib/:$LD_LIBRARY_PATH +export MUSA_KERNEL_TIMEOUT=3600000 +mcc -c -o bandwidth.o bandwidth.mu -I/usr/local/musa/include -I/opt/hpcx/ompi/include -fPIC +mpic++ -o bdtest bandwidth.o -L/usr/local/musa/lib -lmusart -lmccl -lmusa -lmpi +HOSTS=$(yq '.HOSTS | join(",")' ../../../../configs/host.yaml) +echo "NODERANK: $NODERANK" 
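The P2P benchmark above times a one-directional mcclSend/mcclRecv stream and then doubles the result to report a duplex figure. If a directly measured bidirectional number is preferred, both ranks can post a send and a receive inside the same group, as in the sketch below. This is only an illustration: the `<musa_runtime.h>`/`<mccl.h>` header names and the NCCL-style grouping semantics are assumptions, and only MCCL calls already used in this PR appear.

```cpp
// Sketch of a directly measured duplex transfer: both ranks post a send and a
// receive in one group, so traffic flows in both directions at once.
// Error checking is omitted for brevity.
#include <musa_runtime.h>   // assumed header name
#include <mccl.h>           // assumed header name
#include <cstddef>

// d_send/d_recv are distinct device buffers of `count` floats; `peer` is the
// other rank (0 <-> 1). Called by both ranks with mirrored arguments.
void bidirectional_iter(float* d_send, float* d_recv, std::size_t count, int peer,
                        mcclComm_t comm, musaStream_t stream) {
    mcclGroupStart();
    mcclSend(d_send, count, mcclFloat, peer, comm, stream);
    mcclRecv(d_recv, count, mcclFloat, peer, comm, stream);
    mcclGroupEnd();
    musaStreamSynchronize(stream);
}
```

Timing ITERATIONS calls of this function and reporting SIZE * ITERATIONS * 2 / t would then measure the duplex rate directly instead of assuming the link is symmetric.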
+if [ "$NODERANK" -eq 0 ]; then + echo "NODERANK is 0, executing the final command..." + sleep 10 + mpirun --allow-run-as-root --host $HOSTS -np 2 -x MCCL_DEBUG=WARN -x MCCL_IB_DISABLE=0 -x MCCL_IB_HCA=mlx5_0,mlx5_1 -x MUSA_DEVICE_MAX_CONNECTIONS=1 -x MUSA_KERNEL_TIMEOUT=3600000 ./bdtest +fi \ No newline at end of file diff --git a/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/README.md b/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/README.md new file mode 100644 index 000000000..b643cc97e --- /dev/null +++ b/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/README.md @@ -0,0 +1,43 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:2 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 机内总线协议:Speed 32GT/s, Width x16(PCIE5) +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 服务器内P2P互联带宽测试值(双向) | 服务器P2P互联带宽标定值(双向) | 测试标定比例 | +| ---- | ----------- | -------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(2卡平均) | 单卡最大功耗(2卡最大) | 单卡功耗标准差(2卡最大) | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(2卡平均) | 单卡平均显存占用(2卡平均) | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 + +使用musaMemcpy,进行服务器内AI芯片通信操作,计算服务器AI芯片内P2P互联带宽 diff --git a/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/bandwidth.mu b/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/bandwidth.mu new file mode 100644 index 000000000..f763308ea --- /dev/null +++ b/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/bandwidth.mu @@ -0,0 +1,119 @@ +#include +#include +#include +#include + +#define SIZE (1024ULL * 1024ULL * 1024ULL * sizeof(float)) +#define WARMUP_ITERATIONS 100 +#define ITERATIONS 2000 + +void checkMusaError(musaError_t err, const char* msg) { + if (err != musaSuccess) { + fprintf(stderr, "MUSA Error: %s: %s\n", msg, musaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +int main() { + float* d_src, * d_dst; + musaEvent_t start, end; + float elapsed_time; + int gpu_n; + checkMusaError(musaGetDeviceCount(&gpu_n), "musaGetDeviceCount"); + printf("[FlagPerf Info]MUSA-capable device count: %i\n", gpu_n); + if (gpu_n < 2) { + fprintf(stderr, "Two or more GPUs with Peer-to-Peer access capability are required for inferconnect-P2P_intraserver-bandwidth test\n"); + exit(EXIT_FAILURE); + } + int can_access_peer; + int p2pCapableGPUs[2]; // We take only 1 pair of P2P capable GPUs + p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1; + + // Show all the combinations of supported P2P GPUs + for (int i = 0; i < gpu_n; i++) { + for (int j = 0; j < gpu_n; j++) { + if (i == j) { + continue; + } + checkMusaError(musaDeviceCanAccessPeer(&can_access_peer, i, j), "musaDeviceCanAccessPeer"); + printf("[FlagPerf Info]> Peer access from (GPU%d) -> (GPU%d) : %s\n", + i, j, can_access_peer ? 
"Yes" : "No"); + if (can_access_peer && p2pCapableGPUs[0] == -1) { + p2pCapableGPUs[0] = i; + p2pCapableGPUs[1] = j; + } + } + } + if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1) { + printf( + "[FlagPerf Info]Two or more GPUs with Peer-to-Peer access capability are required for inferconnect-P2P_intraserver-bandwidth test\n"); + printf( + "[FlagPerf Info]Peer to Peer access is not available amongst GPUs in the system, " + "waiving test.\n"); + return 0; + } + int gpuid[2]; + gpuid[0] = p2pCapableGPUs[0]; + gpuid[1] = p2pCapableGPUs[1]; + printf("[FlagPerf Info]Enabling peer access between GPU%d and GPU%d...\n", gpuid[0], + gpuid[1]); + printf("Allocating buffers (%iGB on GPU%d, GPU%d and CPU Host)...\n", + int(SIZE / 1024 / 1024 / 1024), gpuid[0], gpuid[1]); + + checkMusaError(musaSetDevice(gpuid[0]), "musaSetDevice"); + checkMusaError(musaDeviceEnablePeerAccess(gpuid[1], 0), "musaDeviceEnablePeerAccess"); + checkMusaError(musaSetDevice(gpuid[1]), "musaSetDevice"); + checkMusaError(musaDeviceEnablePeerAccess(gpuid[0], 0), "musaDeviceEnablePeerAccess"); + + checkMusaError(musaSetDevice(gpuid[0]), "musaSetDevice"); + checkMusaError(musaMalloc(&d_src, SIZE), "musaMalloc"); + checkMusaError(musaSetDevice(gpuid[1]), "musaSetDevice"); + checkMusaError(musaMalloc(&d_dst, SIZE), "musaMalloc"); + + checkMusaError(musaEventCreate(&start), "musaEventCreate"); + checkMusaError(musaEventCreate(&end), "musaEventCreate"); + + + for (int i = 0; i < WARMUP_ITERATIONS; ++i) { + if (i % 2 == 0) { + checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyDefault), "musaMemcpy"); + } + else { + checkMusaError(musaMemcpy(d_src, d_dst, SIZE, musaMemcpyDefault), "musaMemcpy"); + } + } + + + checkMusaError(musaEventRecord(start, 0), "musaEventRecord"); + + for (int i = 0; i < ITERATIONS; ++i) { + if (i % 2 == 0) { + checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyDefault), "musaMemcpy"); + } + else { + checkMusaError(musaMemcpy(d_src, d_dst, SIZE, musaMemcpyDefault), "musaMemcpy"); + } + } + checkMusaError(musaEventRecord(end, 0), "musaEventRecord"); + checkMusaError(musaEventSynchronize(end), "musaEventSynchronize"); + checkMusaError(musaEventElapsedTime(&elapsed_time, start, end), "musaEventElapsedTime"); + double bandwidth = SIZE * ITERATIONS / (elapsed_time / 1000.0) + SIZE * ITERATIONS / (elapsed_time / 1000.0); + std::cout << "[FlagPerf Result]inferconnect-P2P_intraserver-bandwidth=" + << std::fixed << std::setprecision(2) << bandwidth / (1024.0 * 1024.0 * 1024.0) + << "GiB/s" << std::endl; + + std::cout << "[FlagPerf Result]inferconnect-P2P_intraserver-bandwidth=" + << std::fixed << std::setprecision(2) << bandwidth / (1000.0 * 1000.0 * 1000.0) + << "GB/s" << std::endl; + checkMusaError(musaSetDevice(gpuid[0]), "musaSetDevice"); + checkMusaError(musaDeviceDisablePeerAccess(gpuid[1]), "musaDeviceDisablePeerAccess"); + checkMusaError(musaSetDevice(gpuid[1]), "musaSetDevice"); + checkMusaError(musaDeviceDisablePeerAccess(gpuid[0]), "musaDeviceDisablePeerAccess"); + + checkMusaError(musaFree(d_src), "musaFree"); + checkMusaError(musaFree(d_dst), "musaFree"); + checkMusaError(musaEventDestroy(start), "musaEventDestroy"); + checkMusaError(musaEventDestroy(end), "musaEventDestroy"); + + return 0; +} \ No newline at end of file diff --git a/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/main.sh b/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/main.sh new file mode 100644 index 000000000..102ab5bc3 --- /dev/null +++ 
b/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/main.sh @@ -0,0 +1,2 @@ +mcc bandwidth.mu -o bdtest -lmusart +./bdtest \ No newline at end of file diff --git a/base/toolkits/interconnect-h2d/mthreads/S4000/README.md b/base/toolkits/interconnect-h2d/mthreads/S4000/README.md new file mode 100644 index 000000000..393cfe20b --- /dev/null +++ b/base/toolkits/interconnect-h2d/mthreads/S4000/README.md @@ -0,0 +1,45 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + + +# 评测结果 + +## 核心评测结果 + +| 评测项 | CPU-芯片互联带宽测试值 | CPU-芯片互联带宽标定值 | 测试标定比例 | +| ---- | ----------- | -------- | ------ | +| 评测结果 | / | / | / | + +注: h2d/d2h带宽受到CPU、PCIE、内存等服务器内AI芯片以外的模块影响,无标定值 + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 + +使用musaMemcpy,进行hosttodevice的CPU-AI芯片互联操作,计算CPU-AI芯片互联带宽 \ No newline at end of file diff --git a/base/toolkits/interconnect-h2d/mthreads/S4000/bandwidth.mu b/base/toolkits/interconnect-h2d/mthreads/S4000/bandwidth.mu new file mode 100644 index 000000000..f55952504 --- /dev/null +++ b/base/toolkits/interconnect-h2d/mthreads/S4000/bandwidth.mu @@ -0,0 +1,53 @@ +#include +#include + +#define GB (1024ULL * 1024ULL * 1024ULL) +#define SIZE (16ULL * GB) +#define WARMUP_ITERATIONS 100 +#define ITERATIONS 1000 + +void checkMusaError(musaError_t err, const char* msg) { + if (err != musaSuccess) { + fprintf(stderr, "MUSA Error: %s: %s\n", msg, musaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +int main() { + float* d_src, * d_dst; + musaEvent_t start, end; + float elapsed_time; + + checkMusaError(musaMallocHost(&d_src, SIZE), "musaMallocHost"); + checkMusaError(musaMalloc(&d_dst, SIZE), "musaMalloc"); + + checkMusaError(musaEventCreate(&start), "musaEventCreate"); + checkMusaError(musaEventCreate(&end), "musaEventCreate"); + + for (int i = 0; i < WARMUP_ITERATIONS; ++i) { + checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyHostToDevice), "musaMemcpy"); + } + + checkMusaError(musaEventRecord(start), "musaEventRecord"); + + for (int i = 0; i < ITERATIONS; ++i) { + checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyHostToDevice), "musaMemcpy"); + } + + checkMusaError(musaEventRecord(end), "musaEventRecord"); + checkMusaError(musaEventSynchronize(end), "musaEventSynchronize"); + + checkMusaError(musaEventElapsedTime(&elapsed_time, start, end), "musaEventElapsedTime"); + + double bandwidth = SIZE * ITERATIONS / (elapsed_time / 1000.0); + + printf("[FlagPerf Result]transfer-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0)); + printf("[FlagPerf Result]transfer-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0)); + + checkMusaError(musaFreeHost(d_src), "musaFreeHost"); + checkMusaError(musaFree(d_dst), "musaFree"); + checkMusaError(musaEventDestroy(start), "musaEventDestroy"); + checkMusaError(musaEventDestroy(end), "musaEventDestroy"); + + return 0; +} \ No newline at end of file diff --git a/base/toolkits/interconnect-h2d/mthreads/S4000/main.sh b/base/toolkits/interconnect-h2d/mthreads/S4000/main.sh new file mode 
100644 index 000000000..102ab5bc3 --- /dev/null +++ b/base/toolkits/interconnect-h2d/mthreads/S4000/main.sh @@ -0,0 +1,2 @@ +mcc bandwidth.mu -o bdtest -lmusart +./bdtest \ No newline at end of file diff --git a/base/toolkits/main_memory-bandwidth/mthreads/S4000/README.md b/base/toolkits/main_memory-bandwidth/mthreads/S4000/README.md new file mode 100644 index 000000000..b49670e1f --- /dev/null +++ b/base/toolkits/main_memory-bandwidth/mthreads/S4000/README.md @@ -0,0 +1,42 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:2 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 主存储带宽测试值 | 主存储带宽标定值 | 测试标定比例 | +| ---- | ----------- | -------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 + +使用musaMemcpy,进行读+写AI芯片主存储操作,计算AI芯片主存储带宽 diff --git a/base/toolkits/main_memory-bandwidth/mthreads/S4000/bandwidth.mu b/base/toolkits/main_memory-bandwidth/mthreads/S4000/bandwidth.mu new file mode 100644 index 000000000..909ac65bf --- /dev/null +++ b/base/toolkits/main_memory-bandwidth/mthreads/S4000/bandwidth.mu @@ -0,0 +1,53 @@ +#include +#include + +#define GB (1024ULL * 1024ULL * 1024ULL) +#define SIZE (16ULL * GB) +#define WARMUP_ITERATIONS 100 +#define ITERATIONS 10000 + +void checkMusaError(musaError_t err, const char* msg) { + if (err != musaSuccess) { + fprintf(stderr, "MUSA Error: %s: %s\n", msg, musaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +int main() { + float* d_src, * d_dst; + musaEvent_t start, end; + float elapsed_time; + + checkMusaError(musaMalloc(&d_src, SIZE), "musaMalloc"); + checkMusaError(musaMalloc(&d_dst, SIZE), "musaMalloc"); + + checkMusaError(musaEventCreate(&start), "musaEventCreate"); + checkMusaError(musaEventCreate(&end), "musaEventCreate"); + + for (int i = 0; i < WARMUP_ITERATIONS; ++i) { + checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyDeviceToDevice), "musaMemcpy"); + } + + checkMusaError(musaEventRecord(start), "musaEventRecord"); + + for (int i = 0; i < ITERATIONS; ++i) { + checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyDeviceToDevice), "musaMemcpy"); + } + + checkMusaError(musaEventRecord(end), "musaEventRecord"); + checkMusaError(musaEventSynchronize(end), "musaEventSynchronize"); + + checkMusaError(musaEventElapsedTime(&elapsed_time, start, end), "musaEventElapsedTime"); + + double bandwidth = 2.0 * SIZE * ITERATIONS / (elapsed_time / 1000.0); + + printf("[FlagPerf Result]main_memory-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0)); + printf("[FlagPerf Result]main_memory-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0)); + + checkMusaError(musaFree(d_src), "musaFree"); + checkMusaError(musaFree(d_dst), "musaFree"); + checkMusaError(musaEventDestroy(start), "musaEventDestroy"); + checkMusaError(musaEventDestroy(end), "musaEventDestroy"); + + return 0; +} \ No newline at end of file diff --git a/base/toolkits/main_memory-bandwidth/mthreads/S4000/main.sh b/base/toolkits/main_memory-bandwidth/mthreads/S4000/main.sh new file mode 100644 index 
000000000..102ab5bc3 --- /dev/null +++ b/base/toolkits/main_memory-bandwidth/mthreads/S4000/main.sh @@ -0,0 +1,2 @@ +mcc bandwidth.mu -o bdtest -lmusart +./bdtest \ No newline at end of file diff --git a/base/toolkits/main_memory-capacity/mthreads/S4000/README.md b/base/toolkits/main_memory-capacity/mthreads/S4000/README.md new file mode 100644 index 000000000..74c7f480c --- /dev/null +++ b/base/toolkits/main_memory-capacity/mthreads/S4000/README.md @@ -0,0 +1,47 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:2 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 主存储容量测试值 | 主存储容量标定值 | 测试标定比例 | +| ---- | ----------------- | -------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +此评测样例中无意义 + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | +| ---- | --------- | -------- | +| 监控结果 | / | / | + +# 厂商测试工具原理说明 + +通过按照一定规则不断尝试申请主存储(例如显存)来评测主存储容量 + +1. 初始化某个INITSIZE +2. 不断尝试musaMalloc INITSIZE大小的主存储,直到无法申请 +3. 减小INITSIZE为当前的二分之一,重复执行第2步 +4. 重复执行第3步,直到INITSIZE为1MiB + +上述评测过程可以确保在评测结束时,已无法申请任何1MiB的主存储,以此评测主存储容量。 \ No newline at end of file diff --git a/base/toolkits/main_memory-capacity/mthreads/S4000/capacity.mu b/base/toolkits/main_memory-capacity/mthreads/S4000/capacity.mu new file mode 100644 index 000000000..b7e237225 --- /dev/null +++ b/base/toolkits/main_memory-capacity/mthreads/S4000/capacity.mu @@ -0,0 +1,46 @@ +#include +#include + +bool CHECK(musaError_t call){ + const musaError_t error = call; + return (error == musaSuccess); +} + + +void test_gpu_memory_capacity() { + size_t initial_byte_size = 65536; + size_t current_byte_size = initial_byte_size; + size_t min_byte_size = 1; + size_t total_allocated = 0; + + printf("Init tensor size: %zu MiB...\n", initial_byte_size); + + while (current_byte_size >= min_byte_size) { + void* ptr = NULL; + bool allocation_failed = false; + + while (!allocation_failed) { + if (CHECK(musaMalloc(&ptr, current_byte_size * 1024 * 1024))){ + total_allocated += current_byte_size; + printf("Allocated: %zu MiB\n", total_allocated); + } + else{ + printf("MUSA OOM at tensor size %zu MiB. Allocated:%zu MiB\n", current_byte_size, total_allocated); + allocation_failed = true; + } + } + + current_byte_size /= 2; + printf("Reduce tensor size to %zu MiB\n", current_byte_size); + } + + + printf("[FlagPerf Result]main_memory-capacity=%.2fGiB\n", total_allocated / (1024.0)); + printf("[FlagPerf Result]main_memory-capacity=%.2fGB\n", total_allocated * 1024.0 * 1024.0 / (1000.0 * 1000.0 * 1000.0)); +} + +int main() { + test_gpu_memory_capacity(); + musaDeviceReset(); + return 0; +} \ No newline at end of file diff --git a/base/toolkits/main_memory-capacity/mthreads/S4000/main.sh b/base/toolkits/main_memory-capacity/mthreads/S4000/main.sh new file mode 100644 index 000000000..bc1b67c0a --- /dev/null +++ b/base/toolkits/main_memory-capacity/mthreads/S4000/main.sh @@ -0,0 +1,3 @@ +mcc capacity.mu -o capacitytest -lmusart +./capacitytest +sleep 300 \ No newline at end of file
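The capacity probe above only checks that musaMalloc succeeds; on runtimes that back allocations lazily, it can be useful to also write to each block before counting it. Below is a minimal sketch of that variant. The header name `<musa_runtime.h>` and the cudaMemset-style signature of musaMemset are assumptions (both calls already appear elsewhere in this PR); it is not meant as a replacement for capacity.mu.

```cpp
// Optional hardening of the capacity probe: count a block only if it can be
// allocated AND written, so lazily backed allocations are actually committed.
#include <musa_runtime.h>   // assumed header name
#include <cstddef>

// Returns true if `mib` MiB could be allocated and zero-filled; the block is
// intentionally kept alive on success, matching capacity.mu.
static bool try_alloc_and_touch(std::size_t mib) {
    void* ptr = nullptr;
    const std::size_t bytes = mib * 1024ULL * 1024ULL;
    if (musaMalloc(&ptr, bytes) != musaSuccess) {
        return false;
    }
    if (musaMemset(ptr, 0, bytes) != musaSuccess) {  // commit the pages
        musaFree(ptr);
        return false;
    }
    return true;
}
```

Successful blocks are deliberately never freed, matching capacity.mu, so the final total reflects memory that is simultaneously resident at the end of the halving search.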