From 098857c11762f0fe17a8fc345b01a92ea478ba97 Mon Sep 17 00:00:00 2001
From: Louie Tsai
Date: Tue, 12 Nov 2024 11:34:44 -0800
Subject: [PATCH] Enable vLLM Profiling for ChatQnA

---
 .../docker_compose/intel/cpu/xeon/README.md   | 51 +++++++++++++++++++
 .../intel/cpu/xeon/compose_vllm.yaml          |  1 +
 2 files changed, 52 insertions(+)

diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README.md b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
index 990cb35374..49a7bf168e 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
@@ -432,6 +432,57 @@ curl -X POST "http://${host_ip}:6007/v1/dataprep/delete_file" \
   -H "Content-Type: application/json"
 ```
 
+### Profile Microservices
+
+To further analyze microservice performance, users can follow the instructions below to profile the microservices.
+
+#### 1. vLLM backend Service
+   Users can follow the previous sections to test the vLLM microservice or the ChatQnA MegaService.
+   By default, vLLM profiling is not enabled. Users can start and stop profiling with the following commands.
+
+   ##### Start vLLM profiling
+
+   ```bash
+   curl http://${host_ip}:9009/start_profile \
+     -H "Content-Type: application/json" \
+     -d '{"model": "Intel/neural-chat-7b-v3-3"}'
+   ```
+   Users should see Docker logs like the following from vllm-service if profiling started correctly.
+   ```bash
+   INFO api_server.py:361] Starting profiler...
+   INFO api_server.py:363] Profiler started.
+   INFO: x.x.x.x:35940 - "POST /start_profile HTTP/1.1" 200 OK
+   ```
+   After vLLM profiling is started, users can ask questions and get responses from the vLLM microservice
+   or the ChatQnA MegaService.
+
+   ##### Stop vLLM profiling
+   With the following command, users can stop vLLM profiling and generate a `*.pt.trace.json.gz` file as the profiling result
+   under the `/mnt` folder inside the vllm-service Docker container.
+   ```bash
+   # vLLM Service
+   curl http://${host_ip}:9009/stop_profile \
+     -H "Content-Type: application/json" \
+     -d '{"model": "Intel/neural-chat-7b-v3-3"}'
+   ```
+   Users should see Docker logs like the following from vllm-service if profiling stopped correctly.
+   ```bash
+   INFO api_server.py:368] Stopping profiler...
+   INFO api_server.py:370] Profiler stopped.
+   INFO: x.x.x.x:41614 - "POST /stop_profile HTTP/1.1" 200 OK
+   ```
+   After vLLM profiling is stopped, users can use the command below to copy the `*.pt.trace.json.gz` file out of the `/mnt` folder.
+   ```bash
+   docker cp vllm-service:/mnt/ .
+   ```
+
+   ##### Check profiling result
+   Open a web browser and go to `chrome://tracing` or `https://ui.perfetto.dev`, then load the `.json.gz` file. The vLLM
+   profiling result should look like the diagram below.
+![image](https://github.com/user-attachments/assets/55c7097e-5574-41dc-97a7-5e87c31bc286)
+
+
 
 ## 🚀 Launch the UI
 
 ### Launch with origin port
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml
index 6e9d9ac200..3735b75f04 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml
@@ -86,6 +86,7 @@ services:
       https_proxy: ${https_proxy}
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
       LLM_MODEL_ID: ${LLM_MODEL_ID}
+      VLLM_TORCH_PROFILER_DIR: "/mnt"
     command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80
   chatqna-xeon-backend-server:
     image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
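
For readers who want to try the documented profiling flow end to end, the steps described in the README addition can be strung together roughly as follows. This is a minimal sketch, not part of the patch itself: it assumes `${host_ip}` is set, that the vllm-service container from `compose_vllm.yaml` is running with `VLLM_TORCH_PROFILER_DIR=/mnt` (as added above), and that the service exposes vLLM's standard OpenAI-compatible `/v1/completions` route on port 9009; the prompt and `max_tokens` values are placeholders.

```bash
# 1. Start the torch profiler inside vllm-service.
curl http://${host_ip}:9009/start_profile \
  -H "Content-Type: application/json" \
  -d '{"model": "Intel/neural-chat-7b-v3-3"}'

# 2. Send one or more requests while profiling is active
#    (assumed OpenAI-compatible completions route).
curl http://${host_ip}:9009/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is deep learning?", "max_tokens": 32}'

# 3. Stop the profiler; a *.pt.trace.json.gz file is written under /mnt in the container.
curl http://${host_ip}:9009/stop_profile \
  -H "Content-Type: application/json" \
  -d '{"model": "Intel/neural-chat-7b-v3-3"}'

# 4. Copy the trace to the host and load it in chrome://tracing or ui.perfetto.dev.
docker cp vllm-service:/mnt/ .
```

Keeping the profiled window short (a handful of requests between start and stop) tends to keep the trace file small enough to load comfortably in the browser-based viewers.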