diff --git a/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/README.md b/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/README.md new file mode 100644 index 000000000..5a43266ee --- /dev/null +++ b/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/README.md @@ -0,0 +1,63 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:2 +* 单服务器内使用卡数:8 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 机内总线协议:Speed 32GT/s, Width x16(PCIE5) +* 服务器间多卡的MPI互联带宽采用多种通信方式组合,无标定互联带宽 + +# 指标选型 + +The following are the three performance metrics commonly used +1. samples/s (algbw): This metric measures the number of samples processed per second, indicating the algorithmic bandwidth. It reflects the computational efficiency of the algorithm. +2. busbw: This metric represents the bus bandwidth, which measures the data transfer rate across the system's bus. It is crucial for understanding the communication efficiency between different parts of the system. +3. busbw * 2: This metric is an extension of busbw, accounting for bidirectional data transfer. It doubles the bus bandwidth to reflect the full duplex capability of the system. + +The second metric, busbw, is chosen for the following reasons: +1. This number is obtained applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication. Using this bus bandwidth, we can compare it with the hardware peak bandwidth, independently of the number of ranks used. +2. We can horizontally compare the MPI of different patterns such as all-gather/all-reduce/reduce-scatter. + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 服务器间多卡的MPI互联算法带宽测试值(8卡平均) | 服务器间多卡的MPI互联算法带宽标定值(8卡平均) | 测试标定比例(8卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + +| 评测项 | 服务器间多卡的MPI互联等效带宽测试值(8卡平均) | 服务器间多卡的MPI互联等效带宽标定值(8卡平均) | 测试标定比例(8卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | +* 等效带宽为双向带宽 + +* 由于MCCL用法和NCCL相似,算法带宽、等效带宽计算可参考:https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 + +使用mcclAllReduce,进行多机多卡的MPI互联操作,计算服务器间MPI互联带宽 + +* 注:如镜像启动时ssh并未随命令开启,请切换至[容器内启动](https://github.com/FlagOpen/FlagPerf/blob/main/docs/utils/definitions/IN_CONTAINER_LAUNCH.md) \ No newline at end of file diff --git a/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/bandwidth.mu b/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/bandwidth.mu new file mode 100644 index 000000000..e663a5cc9 --- /dev/null +++ b/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/bandwidth.mu @@ -0,0 +1,148 @@ +#include +#include +#include +#include +#include +#include +#include + +#define GB (1024ULL * 1024ULL * 1024ULL) +#define SIZE (4ULL * GB) +#define WARMUP_ITERATIONS 100 +#define ITERATIONS 1000 + +void checkMusaError(musaError_t err, const char* msg) { + if (err != musaSuccess) { + fprintf(stderr, "MUSA Error: %s: %s\n", msg, musaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +void checkMcclError(mcclResult_t result, const char* msg) { + if (result != mcclSuccess) { + 
fprintf(stderr, "MCCL Error: %s: %s\n", msg, mcclGetErrorString(result)); + exit(EXIT_FAILURE); + } +} + +void checkMPIError(int result, const char* msg) { + if (result != MPI_SUCCESS) { + fprintf(stderr, "MPI Error: %s\n", msg); + exit(EXIT_FAILURE); + } +} + +int main(int argc, char* argv[]) { + checkMPIError(MPI_Init(&argc, &argv), "MPI_Init"); + + int rank, size; + checkMPIError(MPI_Comm_rank(MPI_COMM_WORLD, &rank), "MPI_Comm_rank"); + checkMPIError(MPI_Comm_size(MPI_COMM_WORLD, &size), "MPI_Comm_size"); + + int num_gpus_per_node = 8; + int total_gpus = size; + int gpu_id = rank % num_gpus_per_node; + + musaEvent_t start, end; + float elapsed_time; + float* d_src; + float* d_dst; + mcclComm_t comm; + musaStream_t stream; + + checkMusaError(musaSetDevice(gpu_id), "musaSetDevice"); + checkMusaError(musaMalloc(&d_src, SIZE), "musaMalloc"); + checkMusaError(musaMalloc(&d_dst, SIZE), "musaMalloc"); + + std::vector host_data(SIZE / sizeof(float), 1.0f); + checkMusaError(musaMemcpy(d_src, host_data.data(), SIZE, musaMemcpyHostToDevice), "musaMemcpy"); + + // checkMusaError(musaMemset(d_src, 1.0f, SIZE), "musaMemset"); + checkMusaError(musaStreamCreate(&stream), "musaStreamCreate"); + + mcclUniqueId id; + if (rank == 0) + checkMcclError(mcclGetUniqueId(&id), "mcclGetUniqueId"); + checkMPIError(MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD), + "MPI_Bcast"); + checkMcclError(mcclCommInitRank(&comm, total_gpus, id, rank), + "mcclCommInitRank"); + checkMusaError(musaEventCreate(&start), "musaEventCreate"); + checkMusaError(musaEventCreate(&end), "musaEventCreate"); + + for (int i = 0; i < WARMUP_ITERATIONS; ++i) { + checkMcclError(mcclAllReduce((const void*)d_src, (void*)d_dst, + SIZE / sizeof(float), mcclFloat, mcclSum, comm, + stream), + "mcclAllReduce"); + checkMusaError(musaStreamSynchronize(stream), "musaStreamSynchronize"); + } + checkMPIError(MPI_Barrier(MPI_COMM_WORLD), "MPI_Barrier"); + checkMusaError(musaEventRecord(start), "musaEventRecord"); + + for (int i = 0; i < ITERATIONS; ++i) { + checkMcclError(mcclAllReduce((const void*)d_src, (void*)d_dst, + SIZE / sizeof(float), mcclFloat, mcclSum, comm, + stream), + "mcclAllReduce"); + checkMusaError(musaStreamSynchronize(stream), "musaStreamSynchronize"); + } + checkMPIError(MPI_Barrier(MPI_COMM_WORLD), "MPI_Barrier"); + checkMusaError(musaEventRecord(end), "musaEventRecord"); + checkMusaError(musaEventSynchronize(end), "musaEventSynchronize"); + checkMusaError(musaEventElapsedTime(&elapsed_time, start, end), + "musaEventElapsedTime"); + /* + The following are the three performance metrics commonly used + 1. samples/s (algbw): This metric measures the number of samples + processed per second, indicating the algorithmic bandwidth. It reflects the + computational efficiency of the algorithm. + 2. busbw: This metric represents the bus bandwidth, which measures the + data transfer rate across the system's bus. It is crucial for understanding + the communication efficiency between different parts of the system. + 3. busbw * 2: This metric is an extension of busbw, accounting for + bidirectional data transfer. It doubles the bus bandwidth to reflect the full + duplex capability of the system. The second metric, busbw, is chosen for the + following reasons: + 1. This number is obtained applying a formula to the algorithm bandwidth + to reflect the speed of the inter-GPU communication. Using this bus + bandwidth, we can compare it with the hardware peak bandwidth, independently + of the number of ranks used. + 2. 
We can horizontally compare the MPI of different patterns such as + all-gather/all-reduce/reduce-scatter. The following is the derivation: algbw + = S/t Considering that each rank has a bandwidth to the outside world of B, + the time to perform an allReduce operation of S elements is at best : t = + (S*2*(n-1)) / (n*B) Indeed, we have S elements, 2*(n-1) operations per + element, and n links of bandwidth B to perform them. Reordering the equation, + we find that t = (S/B) * (2*(n-1)/n) Therefore, to get an AllReduce bandwidth + measurement which we can compare to the hardware peak bandwidth, we compute : + B = S/t * (2*(n-1)/n) = algbw * (2*(n-1)/n) + More details can be found in + https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md The final + calculation is the unidirectional bandwidth. + */ + double algbw = SIZE * ITERATIONS / (elapsed_time / 1000.0); + double bandwidth = algbw * (2.0 * (total_gpus - 1) / total_gpus); + bandwidth = bandwidth + bandwidth; + if (rank == 0) { + std::cout << "[FlagPerf Result]interconnect-MPI_interserver-algbw=" + << std::fixed << std::setprecision(2) + << algbw / (1024.0 * 1024.0 * 1024.0) << "GiB/s" << std::endl; + std::cout << "[FlagPerf Result]interconnect-MPI_interserver-algbw=" + << std::fixed << std::setprecision(2) + << algbw / (1000.0 * 1000.0 * 1000.0) << "GB/s" << std::endl; + std::cout << "[FlagPerf Result]interconnect-MPI_interserver-bandwidth=" + << std::fixed << std::setprecision(2) + << bandwidth / (1024.0 * 1024.0 * 1024.0) << "GiB/s" << std::endl; + std::cout << "[FlagPerf Result]interconnect-MPI_interserver-bandwidth=" + << std::fixed << std::setprecision(2) + << bandwidth / (1000.0 * 1000.0 * 1000.0) << "GB/s" << std::endl; + } + checkMusaError(musaFree(d_src), "musaFree"); + checkMusaError(musaFree(d_dst), "musaFree"); + checkMcclError(mcclCommDestroy(comm), "mcclCommDestroy"); + checkMusaError(musaEventDestroy(start), "musaEventDestroy"); + checkMusaError(musaEventDestroy(end), "musaEventDestroy"); + checkMPIError(MPI_Finalize(), "MPI_Finalize"); + return 0; +} \ No newline at end of file diff --git a/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/main.sh b/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/main.sh new file mode 100644 index 000000000..fb67682ed --- /dev/null +++ b/base/toolkits/interconnect-MPI_interserver/mthreads/S4000/main.sh @@ -0,0 +1,17 @@ +#!/bin/bash +service ssh restart +export MCCL_DEBUG=WARN +export MCCL_PROTOS=2 +export OPAL_PREFIX=/opt/hpcx/ompi +export PATH=/opt/hpcx/ompi/bin:$PATH +export LD_LIBRARY_PATH=/opt/hpcx/ompi/lib/:/usr/local/musa/lib/:$LD_LIBRARY_PATH +export MUSA_KERNEL_TIMEOUT=3600000 +HOSTS=$(yq '.HOSTS | map(. + ":8") | join(",")' ../../../../configs/host.yaml) +mcc -c -o bandwidth.o bandwidth.mu -I/usr/local/musa/include -I/opt/hpcx/ompi/include -fPIC +mpic++ -o bdtest bandwidth.o -L/usr/local/musa/lib -lmusart -lmccl -lmusa -lmpi +echo "NODERANK: $NODERANK" +if [ "$NODERANK" -eq 0 ]; then + echo "NODERANK is 0, executing the final command..." 
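The algbw-to-busbw conversion derived in the comment above (B = algbw * 2*(n-1)/n for all-reduce) can be sanity-checked in isolation. The sketch below reproduces only that arithmetic in plain C++; the payload size and iteration count mirror the constants in bandwidth.mu, the rank count follows the README (2 servers x 8 cards), and the elapsed time is an illustrative placeholder, not a measured value.

```cpp
// Standalone check of the algbw -> busbw conversion described above.
// Payload size and iteration count mirror bandwidth.mu; elapsed_ms is a
// placeholder, not a measurement.
#include <cstdio>

int main() {
    const double bytes_per_iter = 4.0 * 1024 * 1024 * 1024; // SIZE = 4 GiB
    const int    iterations     = 1000;                     // ITERATIONS
    const int    nranks         = 16;                       // 2 servers x 8 cards
    const double elapsed_ms     = 250000.0;                 // placeholder timing

    // Algorithm bandwidth: bytes handled by one rank divided by wall time.
    const double algbw = bytes_per_iter * iterations / (elapsed_ms / 1000.0);

    // Bus bandwidth: scale by 2*(n-1)/n, the per-element traffic factor of
    // ring all-reduce (see the NCCL PERFORMANCE.md linked in the README).
    const double busbw = algbw * (2.0 * (nranks - 1) / nranks);

    std::printf("algbw = %.2f GiB/s (%.2f GB/s)\n",
                algbw / (1024.0 * 1024.0 * 1024.0), algbw / 1e9);
    std::printf("busbw = %.2f GiB/s (%.2f GB/s)\n",
                busbw / (1024.0 * 1024.0 * 1024.0), busbw / 1e9);
    return 0;
}
```

Because the 2*(n-1)/n factor is applied, the resulting busbw can be compared against the hardware peak link bandwidth regardless of how many ranks took part, which is why the README selects busbw as the reported metric.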
+ sleep 10 + mpirun --allow-run-as-root --host $HOSTS -x MCCL_PROTOS=2 -x MCCL_DEBUG=WARN -x MCCL_IB_DISABLE=0 -x MCCL_IB_HCA=mlx5_0,mlx5_1 -x MUSA_DEVICE_MAX_CONNECTIONS=1 ./bdtest +fi \ No newline at end of file diff --git a/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/README.md b/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/README.md new file mode 100644 index 000000000..e949bce6e --- /dev/null +++ b/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/README.md @@ -0,0 +1,48 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:8 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 机内总线协议:Speed 32GT/s, Width x16(PCIE5) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 单机多卡的MPI互联算法带宽测试值 | 单机多卡的MPI互联算法带宽标定值 | 测试标定比例 | +| ---- | ----------- | -------- | ------ | +| 评测结果 | / | / | / | + +| 评测项 | 单机多卡的MPI互联等效带宽测试值 | 单机多卡的MPI互联等效带宽标定值 | 测试标定比例 | +| ---- | ----------- | -------- | ------ | +| 评测结果 | / | / | / | + +* 由于MCCL用法和NCCL类似,算法带宽、等效带宽计算可参考:https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 + +使用mcclAllReduce,进行单机多卡的MPI互联操作,计算服务器内MPI互联带宽 diff --git a/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/bandwidth.mu b/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/bandwidth.mu new file mode 100644 index 000000000..f55952504 --- /dev/null +++ b/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/bandwidth.mu @@ -0,0 +1,53 @@ +#include +#include + +#define GB (1024ULL * 1024ULL * 1024ULL) +#define SIZE (16ULL * GB) +#define WARMUP_ITERATIONS 100 +#define ITERATIONS 1000 + +void checkMusaError(musaError_t err, const char* msg) { + if (err != musaSuccess) { + fprintf(stderr, "MUSA Error: %s: %s\n", msg, musaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +int main() { + float* d_src, * d_dst; + musaEvent_t start, end; + float elapsed_time; + + checkMusaError(musaMallocHost(&d_src, SIZE), "musaMallocHost"); + checkMusaError(musaMalloc(&d_dst, SIZE), "musaMalloc"); + + checkMusaError(musaEventCreate(&start), "musaEventCreate"); + checkMusaError(musaEventCreate(&end), "musaEventCreate"); + + for (int i = 0; i < WARMUP_ITERATIONS; ++i) { + checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyHostToDevice), "musaMemcpy"); + } + + checkMusaError(musaEventRecord(start), "musaEventRecord"); + + for (int i = 0; i < ITERATIONS; ++i) { + checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyHostToDevice), "musaMemcpy"); + } + + checkMusaError(musaEventRecord(end), "musaEventRecord"); + checkMusaError(musaEventSynchronize(end), "musaEventSynchronize"); + + checkMusaError(musaEventElapsedTime(&elapsed_time, start, end), "musaEventElapsedTime"); + + double bandwidth = SIZE * ITERATIONS / (elapsed_time / 1000.0); + + printf("[FlagPerf Result]transfer-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0)); + printf("[FlagPerf Result]transfer-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0)); + + checkMusaError(musaFreeHost(d_src), "musaFreeHost"); + checkMusaError(musaFree(d_dst), "musaFree"); + 
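Each of these .mu benchmarks repeats the same warm-up / event-timing scaffold seen just above (untimed warm-up loop, event record, timed loop, elapsed-time query). A reusable sketch of that pattern is shown below; it uses only MUSA calls already present in this PR, but the header name `<musa_runtime.h>` is an assumption (the `#include` lines were stripped from the diff text), so treat it as illustrative rather than a drop-in.

```cpp
// Reusable sketch of the warm-up + event-timing scaffold shared by these .mu
// benchmarks. Only MUSA calls already used in this PR appear; the header name
// is an assumption.
#include <musa_runtime.h>   // assumed header name
#include <cstdio>
#include <cstdlib>

static void check(musaError_t err, const char* msg) {
    if (err != musaSuccess) {
        std::fprintf(stderr, "MUSA Error: %s: %s\n", msg, musaGetErrorString(err));
        std::exit(EXIT_FAILURE);
    }
}

// Runs `op` warmup times untimed, then iters times between two events, and
// returns the timed phase in milliseconds. `op` must be synchronous or end
// with its own stream synchronization.
template <typename Op>
float time_op_ms(Op op, int warmup, int iters) {
    for (int i = 0; i < warmup; ++i) op();

    musaEvent_t start, end;
    check(musaEventCreate(&start), "musaEventCreate");
    check(musaEventCreate(&end), "musaEventCreate");
    check(musaEventRecord(start), "musaEventRecord");
    for (int i = 0; i < iters; ++i) op();
    check(musaEventRecord(end), "musaEventRecord");
    check(musaEventSynchronize(end), "musaEventSynchronize");

    float ms = 0.0f;
    check(musaEventElapsedTime(&ms, start, end), "musaEventElapsedTime");
    check(musaEventDestroy(start), "musaEventDestroy");
    check(musaEventDestroy(end), "musaEventDestroy");
    return ms;
}
```

With such a helper, the timed section of the memcpy benchmark above would reduce to a single call such as `float ms = time_op_ms([&]{ checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyHostToDevice), "musaMemcpy"); }, WARMUP_ITERATIONS, ITERATIONS);`.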
checkMusaError(musaEventDestroy(start), "musaEventDestroy"); + checkMusaError(musaEventDestroy(end), "musaEventDestroy"); + + return 0; +} \ No newline at end of file diff --git a/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/main.sh b/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/main.sh new file mode 100644 index 000000000..102ab5bc3 --- /dev/null +++ b/base/toolkits/interconnect-MPI_intraserver/mthreads/S4000/main.sh @@ -0,0 +1,2 @@ +mcc bandwidth.mu -o bdtest -lmusart +./bdtest \ No newline at end of file diff --git a/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/README.md b/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/README.md new file mode 100644 index 000000000..53b1e1dd7 --- /dev/null +++ b/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/README.md @@ -0,0 +1,41 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:2 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 机内总线协议:Speed 32GT/s, Width x16(PCIE5) +* RDMA网卡:50GB/s(双向) + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 跨服务器P2P互联带宽测试值(2卡平均,双向) | 跨服务器P2P互联带宽标定值(2卡平均,双向) | 测试标定比例(2卡平均) | +| ---- | -------------- | -------------- | ------------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(2卡平均) | 单卡最大功耗(2卡最大) | 单卡功耗标准差(2卡最大) | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------------ | ------------ | ------------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(2卡平均) | 单卡平均显存占用(2卡平均) | +| ---- | --------- | -------- | ------------ | -------------- | +| 监控结果 | / | / | / | / | + +* 注:如镜像启动时ssh并未随命令开启,请切换至[容器内启动](https://github.com/FlagOpen/FlagPerf/blob/main/docs/utils/definitions/IN_CONTAINER_LAUNCH.md) \ No newline at end of file diff --git a/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/bandwidth.mu b/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/bandwidth.mu new file mode 100644 index 000000000..71ea18dda --- /dev/null +++ b/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/bandwidth.mu @@ -0,0 +1,120 @@ +#include +#include +#include +#include + +#include +#include + +#define SIZE (1024ULL * 1024ULL * 1024ULL * sizeof(float)) +#define WARMUP_ITERATIONS 100 +#define ITERATIONS 1000 + +void checkMusaError(musaError_t err, const char* msg) { + if (err != musaSuccess) { + fprintf(stderr, "MUSA Error: %s: %s\n", msg, musaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +void checkMcclError(mcclResult_t result, const char* msg) { + if (result != mcclSuccess) { + fprintf(stderr, "MCCL Error: %s: %s\n", msg, mcclGetErrorString(result)); + exit(EXIT_FAILURE); + } +} + +void checkMPIError(int result, const char* msg) { + if (result != MPI_SUCCESS) { + char error_string[MPI_MAX_ERROR_STRING]; + int length; + MPI_Error_string(result, error_string, &length); + fprintf(stderr, "MPI Error: %s: %s\n", msg, error_string); + exit(EXIT_FAILURE); + } +} + +int main(int argc, char** argv) { + float* d_tensor; + musaEvent_t start, end; + float elapsed_time; + + checkMPIError(MPI_Init(&argc, &argv), "MPI_Init"); + int rank, nranks; + checkMPIError(MPI_Comm_rank(MPI_COMM_WORLD, &rank), "MPI_Comm_rank"); + checkMPIError(MPI_Comm_size(MPI_COMM_WORLD, &nranks), "MPI_Comm_size"); + checkMusaError(musaSetDevice(0), "musaSetDevice"); + + mcclComm_t comm; + musaStream_t stream; + + mcclUniqueId id; + if (rank == 0) { + 
checkMcclError(mcclGetUniqueId(&id), "mcclGetUniqueId"); + } + MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD); + + checkMcclError(mcclCommInitRank(&comm, nranks, id, rank), "mcclCommInitRank"); + checkMusaError(musaStreamCreate(&stream), "musaStreamCreate"); + + checkMusaError(musaMalloc(&d_tensor, SIZE), "musaMalloc"); + + checkMusaError(musaEventCreate(&start), "musaEventCreate"); + checkMusaError(musaEventCreate(&end), "musaEventCreate"); + + checkMcclError(mcclGroupStart(), "mcclGroupStart"); + for (int i = 0; i < WARMUP_ITERATIONS; ++i) { + if (rank == 0) { + checkMcclError( + mcclSend(d_tensor, SIZE / sizeof(float), mcclFloat, 1, comm, stream), + "mcclSend"); + } + else if (rank == 1) { + checkMcclError( + mcclRecv(d_tensor, SIZE / sizeof(float), mcclFloat, 0, comm, stream), + "mcclRecv"); + } + } + checkMcclError(mcclGroupEnd(), "mcclGroupEnd"); + checkMusaError(musaStreamSynchronize(stream), "musaStreamSynchronize"); + checkMPIError(MPI_Barrier(MPI_COMM_WORLD), "MPI_Barrier"); + + checkMusaError(musaEventRecord(start), "musaEventRecord"); + checkMcclError(mcclGroupStart(), "mcclGroupStart"); + for (int i = 0; i < ITERATIONS; ++i) { + if (rank == 0) { + checkMcclError( + mcclSend(d_tensor, SIZE / sizeof(float), mcclFloat, 1, comm, stream), + "mcclSend"); + } + else if (rank == 1) { + checkMcclError( + mcclRecv(d_tensor, SIZE / sizeof(float), mcclFloat, 0, comm, stream), + "mcclRecv"); + } + } + checkMcclError(mcclGroupEnd(), "mcclGroupEnd"); + checkMusaError(musaStreamSynchronize(stream), "musaStreamSynchronize"); + checkMPIError(MPI_Barrier(MPI_COMM_WORLD), "MPI_Barrier"); + checkMusaError(musaEventRecord(end), "musaEventRecord"); + checkMusaError(musaEventSynchronize(end), "musaEventSynchronize"); + checkMusaError(musaEventElapsedTime(&elapsed_time, start, end), + "musaEventElapsedTime"); + + double bandwidth = SIZE * ITERATIONS / (elapsed_time / 1000.0) + + SIZE * ITERATIONS / (elapsed_time / 1000.0); + std::cout << "[FlagPerf Result]interconnect-MPI_intraserver-bandwidth=" + << std::fixed << std::setprecision(2) + << bandwidth / (1024.0 * 1024.0 * 1024.0) << "GiB/s" << std::endl; + + std::cout << "[FlagPerf Result]interconnect-MPI_intraserver-bandwidth=" + << std::fixed << std::setprecision(2) + << bandwidth / (1000.0 * 1000.0 * 1000.0) << "GB/s" << std::endl; + checkMusaError(musaEventDestroy(start), "musaEventDestroy"); + checkMusaError(musaEventDestroy(end), "musaEventDestroy"); + checkMusaError(musaFree(d_tensor), "musaFree"); + checkMcclError(mcclCommDestroy(comm), "mcclCommDestroy"); + checkMusaError(musaStreamDestroy(stream), "musaStreamDestroy"); + checkMPIError(MPI_Finalize(), "MPI_Finalize"); + return 0; +} \ No newline at end of file diff --git a/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/main.sh b/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/main.sh new file mode 100644 index 000000000..3e4546bf2 --- /dev/null +++ b/base/toolkits/interconnect-P2P_interserver/mthreads/S4000/main.sh @@ -0,0 +1,17 @@ +#!/bin/bash +service ssh restart +export MCCL_DEBUG=WARN +export MCCL_PROTOS=2 +export OPAL_PREFIX=/opt/hpcx/ompi +export PATH=/opt/hpcx/ompi/bin:$PATH +export LD_LIBRARY_PATH=/opt/hpcx/ompi/lib/:/usr/local/musa/lib/:$LD_LIBRARY_PATH +export MUSA_KERNEL_TIMEOUT=3600000 +mcc -c -o bandwidth.o bandwidth.mu -I/usr/local/musa/include -I/opt/hpcx/ompi/include -fPIC +mpic++ -o bdtest bandwidth.o -L/usr/local/musa/lib -lmusart -lmccl -lmusa -lmpi +HOSTS=$(yq '.HOSTS | join(",")' ../../../../configs/host.yaml) +echo "NODERANK: $NODERANK" 
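The P2P benchmark above times a one-directional mcclSend/mcclRecv stream and then doubles the result to report a duplex figure. If a directly measured bidirectional number is preferred, both ranks can post a send and a receive inside the same group, as in the sketch below. This is only an illustration: the `<musa_runtime.h>`/`<mccl.h>` header names and the NCCL-style grouping semantics are assumptions, and only MCCL calls already used in this PR appear.

```cpp
// Sketch of a directly measured duplex transfer: both ranks post a send and a
// receive in one group, so traffic flows in both directions at once.
// Error checking is omitted for brevity.
#include <musa_runtime.h>   // assumed header name
#include <mccl.h>           // assumed header name
#include <cstddef>

// d_send/d_recv are distinct device buffers of `count` floats; `peer` is the
// other rank (0 <-> 1). Called by both ranks with mirrored arguments.
void bidirectional_iter(float* d_send, float* d_recv, std::size_t count, int peer,
                        mcclComm_t comm, musaStream_t stream) {
    mcclGroupStart();
    mcclSend(d_send, count, mcclFloat, peer, comm, stream);
    mcclRecv(d_recv, count, mcclFloat, peer, comm, stream);
    mcclGroupEnd();
    musaStreamSynchronize(stream);
}
```

Timing ITERATIONS calls of this function and reporting SIZE * ITERATIONS * 2 / t would then measure the duplex rate directly instead of assuming the link is symmetric.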
+if [ "$NODERANK" -eq 0 ]; then + echo "NODERANK is 0, executing the final command..." + sleep 10 + mpirun --allow-run-as-root --host $HOSTS -np 2 -x MCCL_DEBUG=WARN -x MCCL_IB_DISABLE=0 -x MCCL_IB_HCA=mlx5_0,mlx5_1 -x MUSA_DEVICE_MAX_CONNECTIONS=1 -x MUSA_KERNEL_TIMEOUT=3600000 ./bdtest +fi \ No newline at end of file diff --git a/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/README.md b/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/README.md new file mode 100644 index 000000000..b643cc97e --- /dev/null +++ b/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/README.md @@ -0,0 +1,43 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:2 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 机内总线协议:Speed 32GT/s, Width x16(PCIE5) +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 服务器内P2P互联带宽测试值(双向) | 服务器P2P互联带宽标定值(双向) | 测试标定比例 | +| ---- | ----------- | -------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗(2卡平均) | 单卡最大功耗(2卡最大) | 单卡功耗标准差(2卡最大) | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度(2卡平均) | 单卡平均显存占用(2卡平均) | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 + +使用musaMemcpy,进行服务器内AI芯片通信操作,计算服务器AI芯片内P2P互联带宽 diff --git a/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/bandwidth.mu b/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/bandwidth.mu new file mode 100644 index 000000000..f763308ea --- /dev/null +++ b/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/bandwidth.mu @@ -0,0 +1,119 @@ +#include +#include +#include +#include + +#define SIZE (1024ULL * 1024ULL * 1024ULL * sizeof(float)) +#define WARMUP_ITERATIONS 100 +#define ITERATIONS 2000 + +void checkMusaError(musaError_t err, const char* msg) { + if (err != musaSuccess) { + fprintf(stderr, "MUSA Error: %s: %s\n", msg, musaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +int main() { + float* d_src, * d_dst; + musaEvent_t start, end; + float elapsed_time; + int gpu_n; + checkMusaError(musaGetDeviceCount(&gpu_n), "musaGetDeviceCount"); + printf("[FlagPerf Info]MUSA-capable device count: %i\n", gpu_n); + if (gpu_n < 2) { + fprintf(stderr, "Two or more GPUs with Peer-to-Peer access capability are required for inferconnect-P2P_intraserver-bandwidth test\n"); + exit(EXIT_FAILURE); + } + int can_access_peer; + int p2pCapableGPUs[2]; // We take only 1 pair of P2P capable GPUs + p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1; + + // Show all the combinations of supported P2P GPUs + for (int i = 0; i < gpu_n; i++) { + for (int j = 0; j < gpu_n; j++) { + if (i == j) { + continue; + } + checkMusaError(musaDeviceCanAccessPeer(&can_access_peer, i, j), "musaDeviceCanAccessPeer"); + printf("[FlagPerf Info]> Peer access from (GPU%d) -> (GPU%d) : %s\n", + i, j, can_access_peer ? 
"Yes" : "No"); + if (can_access_peer && p2pCapableGPUs[0] == -1) { + p2pCapableGPUs[0] = i; + p2pCapableGPUs[1] = j; + } + } + } + if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1) { + printf( + "[FlagPerf Info]Two or more GPUs with Peer-to-Peer access capability are required for inferconnect-P2P_intraserver-bandwidth test\n"); + printf( + "[FlagPerf Info]Peer to Peer access is not available amongst GPUs in the system, " + "waiving test.\n"); + return 0; + } + int gpuid[2]; + gpuid[0] = p2pCapableGPUs[0]; + gpuid[1] = p2pCapableGPUs[1]; + printf("[FlagPerf Info]Enabling peer access between GPU%d and GPU%d...\n", gpuid[0], + gpuid[1]); + printf("Allocating buffers (%iGB on GPU%d, GPU%d and CPU Host)...\n", + int(SIZE / 1024 / 1024 / 1024), gpuid[0], gpuid[1]); + + checkMusaError(musaSetDevice(gpuid[0]), "musaSetDevice"); + checkMusaError(musaDeviceEnablePeerAccess(gpuid[1], 0), "musaDeviceEnablePeerAccess"); + checkMusaError(musaSetDevice(gpuid[1]), "musaSetDevice"); + checkMusaError(musaDeviceEnablePeerAccess(gpuid[0], 0), "musaDeviceEnablePeerAccess"); + + checkMusaError(musaSetDevice(gpuid[0]), "musaSetDevice"); + checkMusaError(musaMalloc(&d_src, SIZE), "musaMalloc"); + checkMusaError(musaSetDevice(gpuid[1]), "musaSetDevice"); + checkMusaError(musaMalloc(&d_dst, SIZE), "musaMalloc"); + + checkMusaError(musaEventCreate(&start), "musaEventCreate"); + checkMusaError(musaEventCreate(&end), "musaEventCreate"); + + + for (int i = 0; i < WARMUP_ITERATIONS; ++i) { + if (i % 2 == 0) { + checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyDefault), "musaMemcpy"); + } + else { + checkMusaError(musaMemcpy(d_src, d_dst, SIZE, musaMemcpyDefault), "musaMemcpy"); + } + } + + + checkMusaError(musaEventRecord(start, 0), "musaEventRecord"); + + for (int i = 0; i < ITERATIONS; ++i) { + if (i % 2 == 0) { + checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyDefault), "musaMemcpy"); + } + else { + checkMusaError(musaMemcpy(d_src, d_dst, SIZE, musaMemcpyDefault), "musaMemcpy"); + } + } + checkMusaError(musaEventRecord(end, 0), "musaEventRecord"); + checkMusaError(musaEventSynchronize(end), "musaEventSynchronize"); + checkMusaError(musaEventElapsedTime(&elapsed_time, start, end), "musaEventElapsedTime"); + double bandwidth = SIZE * ITERATIONS / (elapsed_time / 1000.0) + SIZE * ITERATIONS / (elapsed_time / 1000.0); + std::cout << "[FlagPerf Result]inferconnect-P2P_intraserver-bandwidth=" + << std::fixed << std::setprecision(2) << bandwidth / (1024.0 * 1024.0 * 1024.0) + << "GiB/s" << std::endl; + + std::cout << "[FlagPerf Result]inferconnect-P2P_intraserver-bandwidth=" + << std::fixed << std::setprecision(2) << bandwidth / (1000.0 * 1000.0 * 1000.0) + << "GB/s" << std::endl; + checkMusaError(musaSetDevice(gpuid[0]), "musaSetDevice"); + checkMusaError(musaDeviceDisablePeerAccess(gpuid[1]), "musaDeviceDisablePeerAccess"); + checkMusaError(musaSetDevice(gpuid[1]), "musaSetDevice"); + checkMusaError(musaDeviceDisablePeerAccess(gpuid[0]), "musaDeviceDisablePeerAccess"); + + checkMusaError(musaFree(d_src), "musaFree"); + checkMusaError(musaFree(d_dst), "musaFree"); + checkMusaError(musaEventDestroy(start), "musaEventDestroy"); + checkMusaError(musaEventDestroy(end), "musaEventDestroy"); + + return 0; +} \ No newline at end of file diff --git a/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/main.sh b/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/main.sh new file mode 100644 index 000000000..102ab5bc3 --- /dev/null +++ 
b/base/toolkits/interconnect-P2P_intraserver/mthreads/S4000/main.sh @@ -0,0 +1,2 @@ +mcc bandwidth.mu -o bdtest -lmusart +./bdtest \ No newline at end of file diff --git a/base/toolkits/interconnect-h2d/mthreads/S4000/README.md b/base/toolkits/interconnect-h2d/mthreads/S4000/README.md new file mode 100644 index 000000000..393cfe20b --- /dev/null +++ b/base/toolkits/interconnect-h2d/mthreads/S4000/README.md @@ -0,0 +1,45 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:1 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + + +# 评测结果 + +## 核心评测结果 + +| 评测项 | CPU-芯片互联带宽测试值 | CPU-芯片互联带宽标定值 | 测试标定比例 | +| ---- | ----------- | -------- | ------ | +| 评测结果 | / | / | / | + +注: h2d/d2h带宽受到CPU、PCIE、内存等服务器内AI芯片以外的模块影响,无标定值 + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 + +使用musaMemcpy,进行hosttodevice的CPU-AI芯片互联操作,计算CPU-AI芯片互联带宽 \ No newline at end of file diff --git a/base/toolkits/interconnect-h2d/mthreads/S4000/bandwidth.mu b/base/toolkits/interconnect-h2d/mthreads/S4000/bandwidth.mu new file mode 100644 index 000000000..f55952504 --- /dev/null +++ b/base/toolkits/interconnect-h2d/mthreads/S4000/bandwidth.mu @@ -0,0 +1,53 @@ +#include +#include + +#define GB (1024ULL * 1024ULL * 1024ULL) +#define SIZE (16ULL * GB) +#define WARMUP_ITERATIONS 100 +#define ITERATIONS 1000 + +void checkMusaError(musaError_t err, const char* msg) { + if (err != musaSuccess) { + fprintf(stderr, "MUSA Error: %s: %s\n", msg, musaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +int main() { + float* d_src, * d_dst; + musaEvent_t start, end; + float elapsed_time; + + checkMusaError(musaMallocHost(&d_src, SIZE), "musaMallocHost"); + checkMusaError(musaMalloc(&d_dst, SIZE), "musaMalloc"); + + checkMusaError(musaEventCreate(&start), "musaEventCreate"); + checkMusaError(musaEventCreate(&end), "musaEventCreate"); + + for (int i = 0; i < WARMUP_ITERATIONS; ++i) { + checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyHostToDevice), "musaMemcpy"); + } + + checkMusaError(musaEventRecord(start), "musaEventRecord"); + + for (int i = 0; i < ITERATIONS; ++i) { + checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyHostToDevice), "musaMemcpy"); + } + + checkMusaError(musaEventRecord(end), "musaEventRecord"); + checkMusaError(musaEventSynchronize(end), "musaEventSynchronize"); + + checkMusaError(musaEventElapsedTime(&elapsed_time, start, end), "musaEventElapsedTime"); + + double bandwidth = SIZE * ITERATIONS / (elapsed_time / 1000.0); + + printf("[FlagPerf Result]transfer-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0)); + printf("[FlagPerf Result]transfer-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0)); + + checkMusaError(musaFreeHost(d_src), "musaFreeHost"); + checkMusaError(musaFree(d_dst), "musaFree"); + checkMusaError(musaEventDestroy(start), "musaEventDestroy"); + checkMusaError(musaEventDestroy(end), "musaEventDestroy"); + + return 0; +} \ No newline at end of file diff --git a/base/toolkits/interconnect-h2d/mthreads/S4000/main.sh b/base/toolkits/interconnect-h2d/mthreads/S4000/main.sh new file mode 
100644 index 000000000..102ab5bc3 --- /dev/null +++ b/base/toolkits/interconnect-h2d/mthreads/S4000/main.sh @@ -0,0 +1,2 @@ +mcc bandwidth.mu -o bdtest -lmusart +./bdtest \ No newline at end of file diff --git a/base/toolkits/main_memory-bandwidth/mthreads/S4000/README.md b/base/toolkits/main_memory-bandwidth/mthreads/S4000/README.md new file mode 100644 index 000000000..b49670e1f --- /dev/null +++ b/base/toolkits/main_memory-bandwidth/mthreads/S4000/README.md @@ -0,0 +1,42 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:2 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 主存储带宽测试值 | 主存储带宽标定值 | 测试标定比例 | +| ---- | ----------- | -------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +| 监控项 | 系统平均功耗 | 系统最大功耗 | 系统功耗标准差 | 单机TDP | 单卡平均功耗 | 单卡最大功耗 | 单卡功耗标准差 | 单卡TDP | +| ---- | ------- | ------- | ------- | ----- | ------- | ------ | ------- | ----- | +| 监控结果 | / | / | / | / | / | / | / | / | + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | 单卡平均温度 | 单卡平均显存占用 | +| ---- | --------- | -------- | ------- | -------- | +| 监控结果 | / | / | / | / | + +# 厂商测试工具原理说明 + +使用musaMemcpy,进行读+写AI芯片主存储操作,计算AI芯片主存储带宽 diff --git a/base/toolkits/main_memory-bandwidth/mthreads/S4000/bandwidth.mu b/base/toolkits/main_memory-bandwidth/mthreads/S4000/bandwidth.mu new file mode 100644 index 000000000..909ac65bf --- /dev/null +++ b/base/toolkits/main_memory-bandwidth/mthreads/S4000/bandwidth.mu @@ -0,0 +1,53 @@ +#include +#include + +#define GB (1024ULL * 1024ULL * 1024ULL) +#define SIZE (16ULL * GB) +#define WARMUP_ITERATIONS 100 +#define ITERATIONS 10000 + +void checkMusaError(musaError_t err, const char* msg) { + if (err != musaSuccess) { + fprintf(stderr, "MUSA Error: %s: %s\n", msg, musaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +int main() { + float* d_src, * d_dst; + musaEvent_t start, end; + float elapsed_time; + + checkMusaError(musaMalloc(&d_src, SIZE), "musaMalloc"); + checkMusaError(musaMalloc(&d_dst, SIZE), "musaMalloc"); + + checkMusaError(musaEventCreate(&start), "musaEventCreate"); + checkMusaError(musaEventCreate(&end), "musaEventCreate"); + + for (int i = 0; i < WARMUP_ITERATIONS; ++i) { + checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyDeviceToDevice), "musaMemcpy"); + } + + checkMusaError(musaEventRecord(start), "musaEventRecord"); + + for (int i = 0; i < ITERATIONS; ++i) { + checkMusaError(musaMemcpy(d_dst, d_src, SIZE, musaMemcpyDeviceToDevice), "musaMemcpy"); + } + + checkMusaError(musaEventRecord(end), "musaEventRecord"); + checkMusaError(musaEventSynchronize(end), "musaEventSynchronize"); + + checkMusaError(musaEventElapsedTime(&elapsed_time, start, end), "musaEventElapsedTime"); + + double bandwidth = 2.0 * SIZE * ITERATIONS / (elapsed_time / 1000.0); + + printf("[FlagPerf Result]main_memory-bandwidth=%.2fGiB/s\n", bandwidth / (1024.0 * 1024.0 * 1024.0)); + printf("[FlagPerf Result]main_memory-bandwidth=%.2fGB/s\n", bandwidth / (1000.0 * 1000.0 * 1000.0)); + + checkMusaError(musaFree(d_src), "musaFree"); + checkMusaError(musaFree(d_dst), "musaFree"); + checkMusaError(musaEventDestroy(start), "musaEventDestroy"); + checkMusaError(musaEventDestroy(end), "musaEventDestroy"); + + return 0; +} \ No newline at end of file diff --git a/base/toolkits/main_memory-bandwidth/mthreads/S4000/main.sh b/base/toolkits/main_memory-bandwidth/mthreads/S4000/main.sh new file mode 100644 index 
000000000..102ab5bc3 --- /dev/null +++ b/base/toolkits/main_memory-bandwidth/mthreads/S4000/main.sh @@ -0,0 +1,2 @@ +mcc bandwidth.mu -o bdtest -lmusart +./bdtest \ No newline at end of file diff --git a/base/toolkits/main_memory-capacity/mthreads/S4000/README.md b/base/toolkits/main_memory-capacity/mthreads/S4000/README.md new file mode 100644 index 000000000..74c7f480c --- /dev/null +++ b/base/toolkits/main_memory-capacity/mthreads/S4000/README.md @@ -0,0 +1,47 @@ +# 参评AI芯片信息 + +* 厂商:MThreads +* 产品名称:S4000 +* 产品型号:MTT S4000 +* TDP:/ + +# 所用服务器配置 + +* 服务器数量:1 +* 单服务器内使用卡数:2 +* 服务器型号:/ +* 操作系统版本:Ubuntu 22.04.5 LTS +* 操作系统内核:Linux 5.15.0-105-generic +* CPU:/ +* docker版本:24.0.7 +* 内存:1TiB +* 服务器间AI芯片直连规格及带宽:此评测样例无需服务器间通信 + +# 评测结果 + +## 核心评测结果 + +| 评测项 | 主存储容量测试值 | 主存储容量标定值 | 测试标定比例 | +| ---- | ----------------- | -------- | ------ | +| 评测结果 | / | / | / | + +## 能耗监控结果 + +此评测样例中无意义 + +## 其他重要监控结果 + +| 监控项 | 系统平均CPU占用 | 系统平均内存占用 | +| ---- | --------- | -------- | +| 监控结果 | / | / | + +# 厂商测试工具原理说明 + +通过按照一定规则不断尝试申请主存储(例如显存)来评测主存储容量 + +1. 初始化某个INITSIZE +2. 不断尝试musaMalloc INITSIZE大小的主存储,直到无法申请 +3. 减小INITSIZE为当前的二分之一,重复执行第2步 +4. 重复执行第3步,直到INITSIZE为1MiB + +上述评测过程可以确保在评测结束时,已无法申请任何1MiB的主存储,以此评测主存储容量。 \ No newline at end of file diff --git a/base/toolkits/main_memory-capacity/mthreads/S4000/capacity.mu b/base/toolkits/main_memory-capacity/mthreads/S4000/capacity.mu new file mode 100644 index 000000000..b7e237225 --- /dev/null +++ b/base/toolkits/main_memory-capacity/mthreads/S4000/capacity.mu @@ -0,0 +1,46 @@ +#include +#include + +bool CHECK(musaError_t call){ + const musaError_t error = call; + return (error == musaSuccess); +} + + +void test_gpu_memory_capacity() { + size_t initial_byte_size = 65536; + size_t current_byte_size = initial_byte_size; + size_t min_byte_size = 1; + size_t total_allocated = 0; + + printf("Init tensor size: %zu MiB...\n", initial_byte_size); + + while (current_byte_size >= min_byte_size) { + void* ptr = NULL; + bool allocation_failed = false; + + while (!allocation_failed) { + if (CHECK(musaMalloc(&ptr, current_byte_size * 1024 * 1024))){ + total_allocated += current_byte_size; + printf("Allocated: %zu MiB\n", total_allocated); + } + else{ + printf("MUSA OOM at tensor size %zu MiB. Allocated:%zu MiB\n", current_byte_size, total_allocated); + allocation_failed = true; + } + } + + current_byte_size /= 2; + printf("Reduce tensor size to %zu MiB\n", current_byte_size); + } + + + printf("[FlagPerf Result]main_memory-capacity=%.2fGiB\n", total_allocated / (1024.0)); + printf("[FlagPerf Result]main_memory-capacity=%.2fGB\n", total_allocated * 1024.0 * 1024.0 / (1000.0 * 1000.0 * 1000.0)); +} + +int main() { + test_gpu_memory_capacity(); + musaDeviceReset(); + return 0; +} \ No newline at end of file diff --git a/base/toolkits/main_memory-capacity/mthreads/S4000/main.sh b/base/toolkits/main_memory-capacity/mthreads/S4000/main.sh new file mode 100644 index 000000000..bc1b67c0a --- /dev/null +++ b/base/toolkits/main_memory-capacity/mthreads/S4000/main.sh @@ -0,0 +1,3 @@ +mcc capacity.mu -o capacitytest -lmusart +./capacitytest +sleep 300 \ No newline at end of file
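The capacity probe above only checks that musaMalloc succeeds; on runtimes that back allocations lazily, it can be useful to also write to each block before counting it. Below is a minimal sketch of that variant. The header name `<musa_runtime.h>` and the cudaMemset-style signature of musaMemset are assumptions (both calls already appear elsewhere in this PR); it is not meant as a replacement for capacity.mu.

```cpp
// Optional hardening of the capacity probe: count a block only if it can be
// allocated AND written, so lazily backed allocations are actually committed.
#include <musa_runtime.h>   // assumed header name
#include <cstddef>

// Returns true if `mib` MiB could be allocated and zero-filled; the block is
// intentionally kept alive on success, matching capacity.mu.
static bool try_alloc_and_touch(std::size_t mib) {
    void* ptr = nullptr;
    const std::size_t bytes = mib * 1024ULL * 1024ULL;
    if (musaMalloc(&ptr, bytes) != musaSuccess) {
        return false;
    }
    if (musaMemset(ptr, 0, bytes) != musaSuccess) {  // commit the pages
        musaFree(ptr);
        return false;
    }
    return true;
}
```

Successful blocks are deliberately never freed, matching capacity.mu, so the final total reflects memory that is simultaneously resident at the end of the halving search.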