[mthreads] Support base/toolkits: add bandwidth test with musa (#769)
Showing 21 changed files with 966 additions and 0 deletions.
63 changes: 63 additions & 0 deletions
base/toolkits/interconnect-MPI_interserver/mthreads/S4000/README.md
# AI Chip Under Evaluation

* Vendor: MThreads
* Product name: S4000
* Product model: MTT S4000
* TDP: /

# Server Configuration

* Number of servers: 2
* Cards used per server: 8
* Server model: /
* OS version: Ubuntu 22.04.5 LTS
* OS kernel: Linux 5.15.0-105-generic
* CPU: /
* Docker version: 24.0.7
* Memory: 1 TiB
* Intra-node bus protocol: Speed 32GT/s, Width x16 (PCIe 5)
* The inter-server multi-card MPI interconnect combines several communication paths, so no rated interconnect bandwidth is specified

# Metric Selection

The following three performance metrics are commonly used:
1. samples/s (algbw): the number of samples processed per second, i.e. the algorithmic bandwidth. It reflects the computational efficiency of the algorithm.
2. busbw: the bus bandwidth, i.e. the data transfer rate across the system's bus. It is crucial for understanding the communication efficiency between different parts of the system.
3. busbw * 2: an extension of busbw that accounts for bidirectional data transfer. It doubles the bus bandwidth to reflect the system's full-duplex capability.

The second metric, busbw, is chosen for the following reasons:
1. It is obtained by applying a formula to the algorithmic bandwidth so that it reflects the speed of inter-GPU communication. This bus bandwidth can be compared with the hardware peak bandwidth independently of the number of ranks used.
2. It allows a like-for-like comparison across MPI collective patterns such as all-gather/all-reduce/reduce-scatter (see the worked example below).
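
As a worked example: for AllReduce, nccl-tests derives busbw = algbw * 2*(n-1)/n, where n is the number of ranks. Assuming one rank per card in this 2-server x 8-card setup (n = 16), busbw = algbw * 30/16 ≈ 1.875 * algbw, and the equivalent (bidirectional) bandwidth reported below is 2 * busbw.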

# Evaluation Results

## Core Results

| Item | Measured inter-server multi-card MPI algorithmic bandwidth (8-card average) | Rated inter-server multi-card MPI algorithmic bandwidth (8-card average) | Measured/rated ratio (8-card average) |
| ---- | -------------- | -------------- | ------------ |
| Result | / | / | / |

| Item | Measured inter-server multi-card MPI equivalent bandwidth (8-card average) | Rated inter-server multi-card MPI equivalent bandwidth (8-card average) | Measured/rated ratio (8-card average) |
| ---- | -------------- | -------------- | ------------ |
| Result | / | / | / |
* The equivalent bandwidth is the bidirectional bandwidth

* Since MCCL usage is similar to NCCL, the algorithmic and equivalent bandwidth calculations can follow: https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md

## Power Monitoring Results

| Item | System avg power | System max power | System power std dev | Node TDP | Per-card avg power | Per-card max power | Per-card power std dev | Per-card TDP |
| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- |
| Result | / | / | / | / | / | / | / | / |

## Other Key Monitoring Results

| Item | System avg CPU usage | System avg memory usage | Per-card avg temperature | Per-card avg GPU memory usage |
| ---- | --------- | -------- | ------- | -------- |
| Result | / | / | / | / |

# How the Vendor Test Tool Works

mcclAllReduce is used to run a multi-node, multi-card MPI interconnect operation, from which the inter-server MPI interconnect bandwidth is computed.

* Note: if ssh is not started along with the command when the image is launched, switch to [in-container launch](https://github.com/FlagOpen/FlagPerf/blob/main/docs/utils/definitions/IN_CONTAINER_LAUNCH.md)
148 changes: 148 additions & 0 deletions
base/toolkits/interconnect-MPI_interserver/mthreads/S4000/bandwidth.mu
#include <iomanip>
#include <iostream>
#include <mccl.h>
#include <mpi.h>
#include <musa_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>

#define GB (1024ULL * 1024ULL * 1024ULL)
#define SIZE (4ULL * GB)
#define WARMUP_ITERATIONS 100
#define ITERATIONS 1000

void checkMusaError(musaError_t err, const char* msg) {
  if (err != musaSuccess) {
    fprintf(stderr, "MUSA Error: %s: %s\n", msg, musaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

void checkMcclError(mcclResult_t result, const char* msg) {
  if (result != mcclSuccess) {
    fprintf(stderr, "MCCL Error: %s: %s\n", msg, mcclGetErrorString(result));
    exit(EXIT_FAILURE);
  }
}

void checkMPIError(int result, const char* msg) {
  if (result != MPI_SUCCESS) {
    fprintf(stderr, "MPI Error: %s\n", msg);
    exit(EXIT_FAILURE);
  }
}

int main(int argc, char* argv[]) {
  checkMPIError(MPI_Init(&argc, &argv), "MPI_Init");

  int rank, size;
  checkMPIError(MPI_Comm_rank(MPI_COMM_WORLD, &rank), "MPI_Comm_rank");
  checkMPIError(MPI_Comm_size(MPI_COMM_WORLD, &size), "MPI_Comm_size");

  // One MPI rank per GPU; map each rank to a local device on its node.
  int num_gpus_per_node = 8;
  int total_gpus = size;
  int gpu_id = rank % num_gpus_per_node;

  musaEvent_t start, end;
  float elapsed_time;
  float* d_src;
  float* d_dst;
  mcclComm_t comm;
  musaStream_t stream;

  checkMusaError(musaSetDevice(gpu_id), "musaSetDevice");
  checkMusaError(musaMalloc(&d_src, SIZE), "musaMalloc");
  checkMusaError(musaMalloc(&d_dst, SIZE), "musaMalloc");

  std::vector<float> host_data(SIZE / sizeof(float), 1.0f);
  checkMusaError(musaMemcpy(d_src, host_data.data(), SIZE, musaMemcpyHostToDevice),
                 "musaMemcpy");

  // checkMusaError(musaMemset(d_src, 1.0f, SIZE), "musaMemset");
  checkMusaError(musaStreamCreate(&stream), "musaStreamCreate");

  // Rank 0 creates the MCCL unique id and broadcasts it so all ranks join the
  // same communicator.
  mcclUniqueId id;
  if (rank == 0)
    checkMcclError(mcclGetUniqueId(&id), "mcclGetUniqueId");
  checkMPIError(MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD),
                "MPI_Bcast");
  checkMcclError(mcclCommInitRank(&comm, total_gpus, id, rank),
                 "mcclCommInitRank");
  checkMusaError(musaEventCreate(&start), "musaEventCreate");
  checkMusaError(musaEventCreate(&end), "musaEventCreate");

  // Warm-up iterations are excluded from the timed measurement.
  for (int i = 0; i < WARMUP_ITERATIONS; ++i) {
    checkMcclError(mcclAllReduce((const void*)d_src, (void*)d_dst,
                                 SIZE / sizeof(float), mcclFloat, mcclSum, comm,
                                 stream),
                   "mcclAllReduce");
    checkMusaError(musaStreamSynchronize(stream), "musaStreamSynchronize");
  }
  checkMPIError(MPI_Barrier(MPI_COMM_WORLD), "MPI_Barrier");
  checkMusaError(musaEventRecord(start), "musaEventRecord");

  for (int i = 0; i < ITERATIONS; ++i) {
    checkMcclError(mcclAllReduce((const void*)d_src, (void*)d_dst,
                                 SIZE / sizeof(float), mcclFloat, mcclSum, comm,
                                 stream),
                   "mcclAllReduce");
    checkMusaError(musaStreamSynchronize(stream), "musaStreamSynchronize");
  }
  checkMPIError(MPI_Barrier(MPI_COMM_WORLD), "MPI_Barrier");
  checkMusaError(musaEventRecord(end), "musaEventRecord");
  checkMusaError(musaEventSynchronize(end), "musaEventSynchronize");
  checkMusaError(musaEventElapsedTime(&elapsed_time, start, end),
                 "musaEventElapsedTime");
  /*
    Three performance metrics are commonly used:
    1. samples/s (algbw): the number of samples processed per second, i.e. the
       algorithmic bandwidth. It reflects the computational efficiency of the
       algorithm.
    2. busbw: the bus bandwidth, i.e. the data transfer rate across the
       system's bus. It is crucial for understanding the communication
       efficiency between different parts of the system.
    3. busbw * 2: an extension of busbw that accounts for bidirectional data
       transfer, doubling the bus bandwidth to reflect the system's
       full-duplex capability.

    The second metric, busbw, is chosen for the following reasons:
    1. It is obtained by applying a formula to the algorithmic bandwidth so
       that it reflects the speed of inter-GPU communication, and can be
       compared with the hardware peak bandwidth independently of the number
       of ranks used.
    2. It allows a like-for-like comparison across MPI collective patterns
       such as all-gather/all-reduce/reduce-scatter.

    Derivation: algbw = S/t. If each rank has a bandwidth B to the outside
    world, the best-case time to perform an allReduce of S elements is
      t = (S * 2 * (n-1)) / (n * B)
    since there are S elements, 2*(n-1) operations per element, and n links of
    bandwidth B to perform them. Rearranging, t = (S/B) * (2*(n-1)/n), so the
    AllReduce bandwidth comparable to the hardware peak bandwidth is
      B = (S/t) * (2*(n-1)/n) = algbw * (2*(n-1)/n)
    More details can be found in
    https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md
    This busbw value is the unidirectional bandwidth.
  */
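  // Worked example (illustrative, assuming the 2-node x 8-card setup from the
  // README, i.e. total_gpus = 16): busbw = algbw * 2*(16-1)/16 = 1.875 * algbw,
  // and the "bandwidth" value reported below additionally doubles this to
  // give the bidirectional (equivalent) bandwidth.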
  // algbw in bytes/s: total bytes moved per rank over the timed iterations,
  // divided by the elapsed time (musaEventElapsedTime returns milliseconds).
  double algbw = SIZE * ITERATIONS / (elapsed_time / 1000.0);
  double bandwidth = algbw * (2.0 * (total_gpus - 1) / total_gpus);
  // Double the unidirectional busbw to report the bidirectional (equivalent)
  // bandwidth.
  bandwidth *= 2.0;
  if (rank == 0) {
    std::cout << "[FlagPerf Result]interconnect-MPI_interserver-algbw="
              << std::fixed << std::setprecision(2)
              << algbw / (1024.0 * 1024.0 * 1024.0) << "GiB/s" << std::endl;
    std::cout << "[FlagPerf Result]interconnect-MPI_interserver-algbw="
              << std::fixed << std::setprecision(2)
              << algbw / (1000.0 * 1000.0 * 1000.0) << "GB/s" << std::endl;
    std::cout << "[FlagPerf Result]interconnect-MPI_interserver-bandwidth="
              << std::fixed << std::setprecision(2)
              << bandwidth / (1024.0 * 1024.0 * 1024.0) << "GiB/s" << std::endl;
    std::cout << "[FlagPerf Result]interconnect-MPI_interserver-bandwidth="
              << std::fixed << std::setprecision(2)
              << bandwidth / (1000.0 * 1000.0 * 1000.0) << "GB/s" << std::endl;
  }
  checkMusaError(musaFree(d_src), "musaFree");
  checkMusaError(musaFree(d_dst), "musaFree");
  checkMcclError(mcclCommDestroy(comm), "mcclCommDestroy");
  checkMusaError(musaEventDestroy(start), "musaEventDestroy");
  checkMusaError(musaEventDestroy(end), "musaEventDestroy");
  checkMPIError(MPI_Finalize(), "MPI_Finalize");
  return 0;
}
17 changes: 17 additions & 0 deletions
base/toolkits/interconnect-MPI_interserver/mthreads/S4000/main.sh
#!/bin/bash
service ssh restart
export MCCL_DEBUG=WARN
export MCCL_PROTOS=2
export OPAL_PREFIX=/opt/hpcx/ompi
export PATH=/opt/hpcx/ompi/bin:$PATH
export LD_LIBRARY_PATH=/opt/hpcx/ompi/lib/:/usr/local/musa/lib/:$LD_LIBRARY_PATH
export MUSA_KERNEL_TIMEOUT=3600000
HOSTS=$(yq '.HOSTS | map(. + ":8") | join(",")' ../../../../configs/host.yaml)
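# For illustration (hypothetical values): if configs/host.yaml contains
# HOSTS: ["192.168.1.1", "192.168.1.2"], the line above yields
# HOSTS=192.168.1.1:8,192.168.1.2:8, i.e. 8 MPI slots (one per GPU) per node.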
mcc -c -o bandwidth.o bandwidth.mu -I/usr/local/musa/include -I/opt/hpcx/ompi/include -fPIC
mpic++ -o bdtest bandwidth.o -L/usr/local/musa/lib -lmusart -lmccl -lmusa -lmpi
echo "NODERANK: $NODERANK"
if [ "$NODERANK" -eq 0 ]; then
  echo "NODERANK is 0, executing the final command..."
  sleep 10
  mpirun --allow-run-as-root --host $HOSTS -x MCCL_PROTOS=2 -x MCCL_DEBUG=WARN -x MCCL_IB_DISABLE=0 -x MCCL_IB_HCA=mlx5_0,mlx5_1 -x MUSA_DEVICE_MAX_CONNECTIONS=1 ./bdtest
fi
48 changes: 48 additions & 0 deletions
base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/README.md
# AI Chip Under Evaluation

* Vendor: MThreads
* Product name: S4000
* Product model: MTT S4000
* TDP: /

# Server Configuration

* Number of servers: 1
* Cards used per server: 8
* Server model: /
* OS version: Ubuntu 22.04.5 LTS
* OS kernel: Linux 5.15.0-105-generic
* CPU: /
* Docker version: 24.0.7
* Memory: 1 TiB
* Intra-node bus protocol: Speed 32GT/s, Width x16 (PCIe 5)

# Evaluation Results

## Core Results

| Item | Measured intra-server multi-card MPI algorithmic bandwidth | Rated intra-server multi-card MPI algorithmic bandwidth | Measured/rated ratio |
| ---- | ----------- | -------- | ------ |
| Result | / | / | / |

| Item | Measured intra-server multi-card MPI equivalent bandwidth | Rated intra-server multi-card MPI equivalent bandwidth | Measured/rated ratio |
| ---- | ----------- | -------- | ------ |
| Result | / | / | / |

* Since MCCL usage is similar to NCCL, the algorithmic and equivalent bandwidth calculations can follow: https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md

## Power Monitoring Results

| Item | System avg power | System max power | System power std dev | Node TDP | Per-card avg power | Per-card max power | Per-card power std dev | Per-card TDP |
| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- |
| Result | / | / | / | / | / | / | / | / |

## Other Key Monitoring Results

| Item | System avg CPU usage | System avg memory usage | Per-card avg temperature | Per-card avg GPU memory usage |
| ---- | --------- | -------- | ------- | -------- |
| Result | / | / | / | / |

# How the Vendor Test Tool Works

mcclAllReduce is used to run a single-node, multi-card MPI interconnect operation, from which the intra-server MPI interconnect bandwidth is computed.
53 changes: 53 additions & 0 deletions
base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/bandwidth.mu
#include <stdio.h>
#include <stdlib.h>
#include <musa_runtime.h>

#define GB (1024ULL * 1024ULL * 1024ULL)
#define SIZE (16ULL * GB)
#define WARMUP_ITERATIONS 100
#define ITERATIONS 1000

void checkMusaError(musaError_t err, const char* msg) {
  if (err != musaSuccess) {
    fprintf(stderr, "MUSA Error: %s: %s\n", msg, musaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

int main() {
  float *d_src, *d_dst;
  musaEvent_t start, end;
  float elapsed_time;

  // d_src is pinned host memory, d_dst is device memory; the test measures
  // host-to-device copy bandwidth.
  checkMusaError(musaMallocHost(&d_src, SIZE), "musaMallocHost");
  checkMusaError(musaMalloc(&d_dst, SIZE), "musaMalloc");

  checkMusaError(musaEventCreate(&start), "musaEventCreate");
  checkMusaError(musaEventCreate(&end), "musaEventCreate");

  // Warm-up copies are excluded from the timed measurement.
  for (int i = 0; i < WARMUP_ITERATIONS; ++i) {
    checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyHostToDevice), "musaMemcpy");
  }

  checkMusaError(musaEventRecord(start), "musaEventRecord");

  for (int i = 0; i < ITERATIONS; ++i) {
    checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyHostToDevice), "musaMemcpy");
  }

  checkMusaError(musaEventRecord(end), "musaEventRecord");
  checkMusaError(musaEventSynchronize(end), "musaEventSynchronize");

  checkMusaError(musaEventElapsedTime(&elapsed_time, start, end), "musaEventElapsedTime");

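  // Bandwidth below is in bytes/s: (SIZE * ITERATIONS) bytes copied
  // host-to-device, divided by the elapsed time converted from milliseconds
  // to seconds. Illustrative example with made-up numbers: 16 GiB x 1000
  // copies completing in 300000 ms gives 16000 GiB / 300 s ≈ 53.3 GiB/s.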
  double bandwidth = SIZE * ITERATIONS / (elapsed_time / 1000.0);

  printf("[FlagPerf Result]transfer-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0));
  printf("[FlagPerf Result]transfer-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0));

  checkMusaError(musaFreeHost(d_src), "musaFreeHost");
  checkMusaError(musaFree(d_dst), "musaFree");
  checkMusaError(musaEventDestroy(start), "musaEventDestroy");
  checkMusaError(musaEventDestroy(end), "musaEventDestroy");

  return 0;
}
2 changes: 2 additions & 0 deletions
base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/main.sh
mcc bandwidth.mu -o bdtest -lmusart
./bdtest
41 changes: 41 additions & 0 deletions
base/toolkits/interconnect-P2P_interserver/mthreads/S4000/README.md
# AI Chip Under Evaluation

* Vendor: MThreads
* Product name: S4000
* Product model: MTT S4000
* TDP: /

# Server Configuration

* Number of servers: 2
* Cards used per server: 1
* Server model: /
* OS version: Ubuntu 22.04.5 LTS
* OS kernel: Linux 5.15.0-105-generic
* CPU: /
* Docker version: 24.0.7
* Memory: 1 TiB
* Intra-node bus protocol: Speed 32GT/s, Width x16 (PCIe 5)
* RDMA NIC: 50GB/s (bidirectional)

# Evaluation Results

## Core Results

| Item | Measured cross-server P2P interconnect bandwidth (2-card average, bidirectional) | Rated cross-server P2P interconnect bandwidth (2-card average, bidirectional) | Measured/rated ratio (2-card average) |
| ---- | -------------- | -------------- | ------------ |
| Result | / | / | / |

## Power Monitoring Results

| Item | System avg power | System max power | System power std dev | Node TDP | Per-card avg power (2-card average) | Per-card max power (2-card max) | Per-card power std dev (2-card max) | Per-card TDP |
| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- |
| Result | / | / | / | / | / | / | / | / |

## Other Key Monitoring Results

| Item | System avg CPU usage | System avg memory usage | Per-card avg temperature (2-card average) | Per-card avg GPU memory usage (2-card average) |
| ---- | --------- | -------- | ------------ | -------------- |
| Result | / | / | / | / |

* Note: if ssh is not started along with the command when the image is launched, switch to [in-container launch](https://github.com/FlagOpen/FlagPerf/blob/main/docs/utils/definitions/IN_CONTAINER_LAUNCH.md)