From e0bc5f2a4dc3c19c22e102e83d07295e73a41512 Mon Sep 17 00:00:00 2001
From: "chen, suyue"
Date: Thu, 5 Sep 2024 14:58:24 +0800
Subject: [PATCH] update logs from standard cd perf workflow (#733)

Signed-off-by: chensuyue
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 ChatQnA/benchmark/README.md | 275 ------------------------------------
 1 file changed, 275 deletions(-)

diff --git a/ChatQnA/benchmark/README.md b/ChatQnA/benchmark/README.md
index 225a67bf9..4963e53de 100644
--- a/ChatQnA/benchmark/README.md
+++ b/ChatQnA/benchmark/README.md
@@ -302,278 +302,3 @@ cd GenAIExamples/ChatQnA/benchmark/single_gaudi
 kubectl delete -f .
 kubectl label nodes k8s-master k8s-worker1 k8s-worker2 k8s-worker3 node-type-
 ```
-
-### Example Result
-
-The following is a summary of the test result, with files saved at `TEST_OUTPUT_DIR`.
-
-```none
-Concurrency : 512
-Max request count : 2560
-Http timeout : 60000
-
-Benchmark target : chatqnafixed
-
-=================Total statistics=====================
-Succeed Response: 2560 (Total 2560, 100.0% Success), Duration: 26.44s, Input Tokens: 61440, Output Tokens: 255985, RPS: 96.82, Input Tokens per Second: 2323.71, Output Tokens per Second: 9681.57
-End to End latency(ms), P50: 3576.34, P90: 4242.19, P99: 5252.23, Avg: 3581.55
-First token latency(ms), P50: 726.64, P90: 1128.27, P99: 1796.09, Avg: 769.58
-Average Next token latency(ms): 28.41
-Average token latency(ms) : 35.85
-======================================================
-```
-
-```none
-benchmarkresult:
-  Average_Next_token_latency: '28.41'
-  Average_token_latency: '35.85'
-  Duration: '26.44'
-  End_to_End_latency_Avg: '3581.55'
-  End_to_End_latency_P50: '3576.34'
-  End_to_End_latency_P90: '4242.19'
-  End_to_End_latency_P99: '5252.23'
-  First_token_latency_Avg: '769.58'
-  First_token_latency_P50: '726.64'
-  First_token_latency_P90: '1128.27'
-  First_token_latency_P99: '1796.09'
-  Input_Tokens: '61440'
-  Input_Tokens_per_Second: '2323.71'
-  Onput_Tokens: '255985'
-  Output_Tokens_per_Second: '9681.57'
-  RPS: '96.82'
-  Succeed_Response: '2560'
-  locust_P50: '160'
-  locust_P99: '810'
-  locust_num_failures: '0'
-  locust_num_requests: '2560'
-benchmarkspec:
-  bench-target: chatqnafixed
-  endtest_time: '2024-08-25T14:19:25.955973'
-  host: http://10.110.105.197:8888
-  llm-model: Intel/neural-chat-7b-v3-3
-  locustfile: /home/sdp/lvl/GenAIEval/evals/benchmark/stresscli/locust/aistress.py
-  max_requests: 2560
-  namespace: default
-  processes: 2
-  run_name: benchmark
-  runtime: 60m
-  starttest_time: '2024-08-25T14:18:50.366514'
-  stop_timeout: 120
-  tool: locust
-  users: 512
-hardwarespec:
-  aise-gaudi-00:
-    architecture: amd64
-    containerRuntimeVersion: containerd://1.7.18
-    cpu: '160'
-    habana.ai/gaudi: '8'
-    kernelVersion: 5.15.0-92-generic
-    kubeProxyVersion: v1.29.7
-    kubeletVersion: v1.29.7
-    memory: 1056375272Ki
-    operatingSystem: linux
-    osImage: Ubuntu 22.04.3 LTS
-  aise-gaudi-01:
-    architecture: amd64
-    containerRuntimeVersion: containerd://1.7.18
-    cpu: '160'
-    habana.ai/gaudi: '8'
-    kernelVersion: 5.15.0-92-generic
-    kubeProxyVersion: v1.29.7
-    kubeletVersion: v1.29.7
-    memory: 1056375256Ki
-    operatingSystem: linux
-    osImage: Ubuntu 22.04.3 LTS
-  aise-gaudi-02:
-    architecture: amd64
-    containerRuntimeVersion: containerd://1.7.18
-    cpu: '160'
-    habana.ai/gaudi: '8'
-    kernelVersion: 5.15.0-92-generic
-    kubeProxyVersion: v1.29.7
-    kubeletVersion: v1.29.7
-    memory: 1056375260Ki
-    operatingSystem: linux
-    osImage: Ubuntu 22.04.3 LTS
-  aise-gaudi-03:
-    architecture: amd64
-    containerRuntimeVersion: containerd://1.6.8
-    cpu: '160'
-    habana.ai/gaudi: '8'
-    kernelVersion: 5.15.0-112-generic
-    kubeProxyVersion: v1.29.7
-    kubeletVersion: v1.29.7
-    memory: 1056374404Ki
-    operatingSystem: linux
-    osImage: Ubuntu 22.04.4 LTS
-workloadspec:
-  aise-gaudi-00:
-    chatqna-backend-server-deploy:
-      replica: 1
-      resources:
-        limits:
-          cpu: '8'
-          memory: 4000Mi
-        requests:
-          cpu: '8'
-          memory: 4000Mi
-    embedding-dependency-deploy:
-      replica: 1
-      resources:
-        limits:
-          cpu: '80'
-          memory: 20000Mi
-        requests:
-          cpu: '80'
-          memory: 20000Mi
-    embedding-deploy:
-      replica: 1
-    llm-dependency-deploy:
-      replica: 8
-      resources:
-        limits:
-          habana.ai/gaudi: '1'
-        requests:
-          habana.ai/gaudi: '1'
-    llm-deploy:
-      replica: 1
-    retriever-deploy:
-      replica: 1
-      resources:
-        limits:
-          cpu: '8'
-          memory: 2500Mi
-        requests:
-          cpu: '8'
-          memory: 2500Mi
-  aise-gaudi-01:
-    chatqna-backend-server-deploy:
-      replica: 1
-      resources:
-        limits:
-          cpu: '8'
-          memory: 4000Mi
-        requests:
-          cpu: '8'
-          memory: 4000Mi
-    embedding-dependency-deploy:
-      replica: 1
-      resources:
-        limits:
-          cpu: '80'
-          memory: 20000Mi
-        requests:
-          cpu: '80'
-          memory: 20000Mi
-    embedding-deploy:
-      replica: 1
-    llm-dependency-deploy:
-      replica: 8
-      resources:
-        limits:
-          habana.ai/gaudi: '1'
-        requests:
-          habana.ai/gaudi: '1'
-    llm-deploy:
-      replica: 1
-    prometheus-operator:
-      replica: 1
-      resources:
-        limits:
-          cpu: 200m
-          memory: 200Mi
-        requests:
-          cpu: 100m
-          memory: 100Mi
-    retriever-deploy:
-      replica: 1
-      resources:
-        limits:
-          cpu: '8'
-          memory: 2500Mi
-        requests:
-          cpu: '8'
-          memory: 2500Mi
-  aise-gaudi-02:
-    chatqna-backend-server-deploy:
-      replica: 1
-      resources:
-        limits:
-          cpu: '8'
-          memory: 4000Mi
-        requests:
-          cpu: '8'
-          memory: 4000Mi
-    embedding-dependency-deploy:
-      replica: 1
-      resources:
-        limits:
-          cpu: '80'
-          memory: 20000Mi
-        requests:
-          cpu: '80'
-          memory: 20000Mi
-    embedding-deploy:
-      replica: 1
-    llm-dependency-deploy:
-      replica: 8
-      resources:
-        limits:
-          habana.ai/gaudi: '1'
-        requests:
-          habana.ai/gaudi: '1'
-    llm-deploy:
-      replica: 1
-    retriever-deploy:
-      replica: 1
-      resources:
-        limits:
-          cpu: '8'
-          memory: 2500Mi
-        requests:
-          cpu: '8'
-          memory: 2500Mi
-  aise-gaudi-03:
-    chatqna-backend-server-deploy:
-      replica: 1
-      resources:
-        limits:
-          cpu: '8'
-          memory: 4000Mi
-        requests:
-          cpu: '8'
-          memory: 4000Mi
-    dataprep-deploy:
-      replica: 1
-    embedding-dependency-deploy:
-      replica: 1
-      resources:
-        limits:
-          cpu: '80'
-          memory: 20000Mi
-        requests:
-          cpu: '80'
-          memory: 20000Mi
-    embedding-deploy:
-      replica: 1
-    llm-dependency-deploy:
-      replica: 8
-      resources:
-        limits:
-          habana.ai/gaudi: '1'
-        requests:
-          habana.ai/gaudi: '1'
-    llm-deploy:
-      replica: 1
-    retriever-deploy:
-      replica: 1
-      resources:
-        limits:
-          cpu: '8'
-          memory: 2500Mi
-        requests:
-          cpu: '8'
-          memory: 2500Mi
-    vector-db:
-      replica: 1
-```