From 2f03a3a894b7ab8a6012edd030ecab01b55af253 Mon Sep 17 00:00:00 2001
From: XinyaoWa
Date: Thu, 19 Sep 2024 14:15:25 +0800
Subject: [PATCH] Align parameters for "max_tokens, repetition_penalty, presence_penalty, frequency_penalty" (#726)

Signed-off-by: Xinyao Wang
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../docker_compose/intel/cpu/xeon/README.md   |  2 +-
 .../docker_compose/intel/hpu/gaudi/README.md  |  2 +-
 AudioQnA/tests/test_gmc_on_gaudi.sh           |  2 +-
 AudioQnA/tests/test_gmc_on_xeon.sh            |  2 +-
 ChatQnA/benchmark/benchmark.yaml              |  2 +-
 ChatQnA/chatqna_no_wrapper.py                 |  6 ++-
 .../docker_compose/intel/cpu/aipc/README.md   |  2 +-
 .../docker_compose/intel/cpu/xeon/README.md   | 21 +++++++--
 .../intel/cpu/xeon/README_qdrant.md           |  2 +-
 .../docker_compose/intel/hpu/gaudi/README.md  | 29 ++++++++++--
 .../hpu/gaudi/how_to_validate_service.md      |  2 +-
 ChatQnA/docker_compose/nvidia/gpu/README.md   |  2 +-
 CodeGen/README.md                             |  2 +-
 .../docker_compose/intel/cpu/xeon/README.md   |  2 +-
 .../docker_compose/intel/hpu/gaudi/README.md  |  2 +-
 CodeGen/tests/test_gmc_on_gaudi.sh            |  2 +-
 CodeGen/tests/test_gmc_on_xeon.sh             |  2 +-
 CodeTrans/README.md                           |  2 +-
 DocSum/README.md                              |  2 +-
 .../docker_compose/intel/cpu/xeon/README.md   |  2 +-
 .../docker_compose/intel/cpu/xeon/README.md   |  2 +-
 .../docker_compose/intel/hpu/gaudi/README.md  |  2 +-
 .../docker_compose/intel/cpu/xeon/README.md   | 44 +++++++++----------
 .../docker_compose/intel/hpu/gaudi/README.md  | 44 +++++++++----------
 24 files changed, 110 insertions(+), 72 deletions(-)

diff --git a/AudioQnA/docker_compose/intel/cpu/xeon/README.md b/AudioQnA/docker_compose/intel/cpu/xeon/README.md
index 338771dd0..d08061284 100644
--- a/AudioQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/AudioQnA/docker_compose/intel/cpu/xeon/README.md
@@ -108,7 +108,7 @@ curl http://${host_ip}:3006/generate \
 # llm microservice
 curl http://${host_ip}:3007/v1/chat/completions\
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
   -H 'Content-Type: application/json'

 # speecht5 service
diff --git a/AudioQnA/docker_compose/intel/hpu/gaudi/README.md b/AudioQnA/docker_compose/intel/hpu/gaudi/README.md
index 28ec3f402..842227ee5 100644
--- a/AudioQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/AudioQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -108,7 +108,7 @@ curl http://${host_ip}:3006/generate \
 # llm microservice
 curl http://${host_ip}:3007/v1/chat/completions\
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":false}' \
   -H 'Content-Type: application/json'

 # speecht5 service
diff --git a/AudioQnA/tests/test_gmc_on_gaudi.sh b/AudioQnA/tests/test_gmc_on_gaudi.sh
index 898a91524..d90bd3624 100755
--- a/AudioQnA/tests/test_gmc_on_gaudi.sh
+++ b/AudioQnA/tests/test_gmc_on_gaudi.sh
@@ -34,7 +34,7 @@ function validate_audioqa() {
   export CLIENT_POD=$(kubectl get pod -n $APP_NAMESPACE -l app=client-test -o jsonpath={.items..metadata.name})
   echo "$CLIENT_POD"
   accessUrl=$(kubectl get gmc -n $APP_NAMESPACE -o
jsonpath="{.items[?(@.metadata.name=='audioqa')].status.accessUrl}") - byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_new_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json' | jq .byte_str) + byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json' | jq .byte_str) echo "$byte_str" > $LOG_PATH/curl_audioqa.log if [ -z "$byte_str" ]; then echo "audioqa failed, please check the logs in ${LOG_PATH}!" diff --git a/AudioQnA/tests/test_gmc_on_xeon.sh b/AudioQnA/tests/test_gmc_on_xeon.sh index ed6adddd2..15e04e62c 100755 --- a/AudioQnA/tests/test_gmc_on_xeon.sh +++ b/AudioQnA/tests/test_gmc_on_xeon.sh @@ -34,7 +34,7 @@ function validate_audioqa() { export CLIENT_POD=$(kubectl get pod -n $APP_NAMESPACE -l app=client-test -o jsonpath={.items..metadata.name}) echo "$CLIENT_POD" accessUrl=$(kubectl get gmc -n $APP_NAMESPACE -o jsonpath="{.items[?(@.metadata.name=='audioqa')].status.accessUrl}") - byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_new_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json' | jq .byte_str) + byte_str=$(kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -s -X POST -d '{"byte_str": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA", "parameters":{"max_tokens":64, "do_sample": true, "streaming":false}}' -H 'Content-Type: application/json' | jq .byte_str) echo "$byte_str" > $LOG_PATH/curl_audioqa.log if [ -z "$byte_str" ]; then echo "audioqa failed, please check the logs in ${LOG_PATH}!" 
diff --git a/ChatQnA/benchmark/benchmark.yaml b/ChatQnA/benchmark/benchmark.yaml
index f1eb86a37..851a3e11a 100644
--- a/ChatQnA/benchmark/benchmark.yaml
+++ b/ChatQnA/benchmark/benchmark.yaml
@@ -41,7 +41,7 @@ test_cases:
     run_test: false
     service_name: "llm-svc" # Replace with your service name
     parameters:
-      max_new_tokens: 128
+      max_tokens: 128
       temperature: 0.01
       top_k: 10
       top_p: 0.95
diff --git a/ChatQnA/chatqna_no_wrapper.py b/ChatQnA/chatqna_no_wrapper.py
index 2780c7486..c08c6a2f3 100644
--- a/ChatQnA/chatqna_no_wrapper.py
+++ b/ChatQnA/chatqna_no_wrapper.py
@@ -69,10 +69,12 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
         next_inputs = {}
         next_inputs["model"] = "tgi"  # specifically clarify the fake model to make the format unified
         next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}]
-        next_inputs["max_tokens"] = llm_parameters_dict["max_new_tokens"]
+        next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"]
         next_inputs["top_p"] = llm_parameters_dict["top_p"]
         next_inputs["stream"] = inputs["streaming"]
-        next_inputs["frequency_penalty"] = inputs["repetition_penalty"]
+        next_inputs["frequency_penalty"] = inputs["frequency_penalty"]
+        next_inputs["presence_penalty"] = inputs["presence_penalty"]
+        next_inputs["repetition_penalty"] = inputs["repetition_penalty"]
         next_inputs["temperature"] = inputs["temperature"]
         inputs = next_inputs
diff --git a/ChatQnA/docker_compose/intel/cpu/aipc/README.md b/ChatQnA/docker_compose/intel/cpu/aipc/README.md
index 3c28d1c10..9b13d8185 100644
--- a/ChatQnA/docker_compose/intel/cpu/aipc/README.md
+++ b/ChatQnA/docker_compose/intel/cpu/aipc/README.md
@@ -229,7 +229,7 @@ OLLAMA_HOST=${host_ip}:11434 ollama run $OLLAMA_MODEL
 ```bash
 curl http://${host_ip}:9000/v1/chat/completions\
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
   -H 'Content-Type: application/json'
 ```
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README.md b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
index 7eb75431a..5eca0d284 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/README.md
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README.md
@@ -438,18 +438,31 @@ docker compose -f compose_vllm.yaml up -d

   This service depends on the LLM backend service above; on the first startup it can take a long time to become ready, so please wait.

   ```bash
+  # TGI service
   curl http://${host_ip}:9000/v1/chat/completions\
     -X POST \
-    -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+    -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
     -H 'Content-Type: application/json'
   ```
+
+  For parameters in TGI mode, please refer to the [HuggingFace InferenceClient API](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation) (note that "max_new_tokens" has been renamed to "max_tokens").
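  For a quick sanity check of the rename, the request body is unchanged except for the field name (a minimal sketch reusing the endpoint and port from the example above; the old field name is shown only for contrast):

  ```bash
  # Before this patch, the generation limit was passed as "max_new_tokens":
  #   -d '{"query":"What is Deep Learning?","max_new_tokens":17,"streaming":false}'
  # After the alignment, the same limit is passed as "max_tokens":
  curl http://${host_ip}:9000/v1/chat/completions \
    -X POST \
    -d '{"query":"What is Deep Learning?","max_tokens":17,"streaming":false}' \
    -H 'Content-Type: application/json'
  ```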
+
+  ```bash
+  # vLLM Service
+  curl http://${host_ip}:9000/v1/chat/completions \
+    -X POST \
+    -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0,"streaming":false}' \
+    -H 'Content-Type: application/json'
+  ```
+
+  For parameters in vLLM mode, please refer to the [LangChain VLLMOpenAI API](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.vllm.VLLMOpenAI.html)
+
8. MegaService

   ```bash
-  curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{
-    "messages": "What is the revenue of Nike in 2023?"
-    }'
+  curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{
+     "messages": "What is the revenue of Nike in 2023?"
+     }'
   ```

9. Dataprep Microservice (Optional)
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md b/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md
index 25ba15c3f..c11ab8e9f 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md
@@ -304,7 +304,7 @@ docker compose -f compose_qdrant.yaml up -d
 ```bash
 curl http://${host_ip}:6047/v1/chat/completions\
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
   -H 'Content-Type: application/json'
 ```
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md
index bc41c782a..ec8e3ad09 100644
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md
@@ -442,18 +442,41 @@ For validation details, please refer to [how-to-validate_service](./how_to_valid
7. LLM Microservice

   ```bash
+  # TGI service
+  curl http://${host_ip}:9000/v1/chat/completions\
+    -X POST \
+    -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+    -H 'Content-Type: application/json'
+  ```
+
+  For parameters in TGI mode, please refer to the [HuggingFace InferenceClient API](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation) (note that "max_new_tokens" has been renamed to "max_tokens").
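  Note that the penalty parameters aligned by this patch are not interchangeable: TGI's `repetition_penalty` is a multiplicative factor (1.0 disables it; values slightly above 1.0 discourage repetition), while the OpenAI-style `frequency_penalty` and `presence_penalty` used in vLLM mode are additive offsets, typically in the range -2.0 to 2.0. A sketch of one request per mode, with illustrative values and the endpoint reused from the examples above:

  ```bash
  # TGI mode: multiplicative repetition_penalty (1.0 = no penalty)
  curl http://${host_ip}:9000/v1/chat/completions \
    -X POST \
    -d '{"query":"What is Deep Learning?","max_tokens":17,"repetition_penalty":1.03,"streaming":false}' \
    -H 'Content-Type: application/json'

  # vLLM mode: additive OpenAI-style penalties (0 = no penalty)
  curl http://${host_ip}:9000/v1/chat/completions \
    -X POST \
    -d '{"query":"What is Deep Learning?","max_tokens":17,"frequency_penalty":0.5,"presence_penalty":0.5,"streaming":false}' \
    -H 'Content-Type: application/json'
  ```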
+
+  ```bash
+  # vLLM Service
   curl http://${host_ip}:9000/v1/chat/completions \
+    -X POST \
+    -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0,"streaming":false}' \
+    -H 'Content-Type: application/json'
+  ```
+
+  For parameters in vLLM mode, please refer to the [LangChain VLLMOpenAI API](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.vllm.VLLMOpenAI.html)
+
+  ```bash
+  # vLLM-on-Ray Service
+  curl http://${host_ip}:9000/v1/chat/completions \
    -X POST \
-   -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+   -d '{"query":"What is Deep Learning?","max_tokens":17,"presence_penalty":1.03,"streaming":false}' \
    -H 'Content-Type: application/json'
   ```
+
+  For parameters in vLLM-on-Ray mode, please refer to the [LangChain ChatOpenAI API](https://python.langchain.com/v0.2/api_reference/openai/chat_models/langchain_openai.chat_models.base.ChatOpenAI.html)

8. MegaService

   ```bash
   curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{
-    "messages": "What is the revenue of Nike in 2023?"
-    }'
+     "messages": "What is the revenue of Nike in 2023?"
+     }'
   ```

9. Dataprep Microservice (Optional)
diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md b/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md
index 2e2d3d023..8ada1e525 100644
--- a/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md
+++ b/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md
@@ -278,7 +278,7 @@ and the log shows model warm up, please wait for a while and try it later.
 ```
 curl http://${host_ip}:9000/v1/chat/completions\
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
   -H 'Content-Type: application/json'
 ```
diff --git a/ChatQnA/docker_compose/nvidia/gpu/README.md b/ChatQnA/docker_compose/nvidia/gpu/README.md
index cfdda158f..7e3966a7f 100644
--- a/ChatQnA/docker_compose/nvidia/gpu/README.md
+++ b/ChatQnA/docker_compose/nvidia/gpu/README.md
@@ -280,7 +280,7 @@ docker compose up -d
 ```bash
 curl http://${host_ip}:9000/v1/chat/completions \
   -X POST \
-  -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
+  -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \
   -H 'Content-Type: application/json'
 ```
diff --git a/CodeGen/README.md b/CodeGen/README.md
index bc93ff473..cbf94bbb8 100644
--- a/CodeGen/README.md
+++ b/CodeGen/README.md
@@ -132,7 +132,7 @@ Two ways of consuming CodeGen Service:
 http_proxy="" curl http://${host_ip}:8028/generate \
   -X POST \
-  -d '{"inputs":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","parameters":{"max_new_tokens":256, "do_sample": true}}' \
+  -d '{"inputs":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place.
If the request is invalid, raise an exception.","parameters":{"max_tokens":256, "do_sample": true}}' \ -H 'Content-Type: application/json' ``` diff --git a/CodeGen/docker_compose/intel/cpu/xeon/README.md b/CodeGen/docker_compose/intel/cpu/xeon/README.md index d7dc3376e..8bdde1f75 100644 --- a/CodeGen/docker_compose/intel/cpu/xeon/README.md +++ b/CodeGen/docker_compose/intel/cpu/xeon/README.md @@ -138,7 +138,7 @@ docker compose up -d ```bash curl http://${host_ip}:9000/v1/chat/completions\ -X POST \ - -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_new_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ + -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ -H 'Content-Type: application/json' ``` diff --git a/CodeGen/docker_compose/intel/hpu/gaudi/README.md b/CodeGen/docker_compose/intel/hpu/gaudi/README.md index 74afd54ae..2a5040ea0 100644 --- a/CodeGen/docker_compose/intel/hpu/gaudi/README.md +++ b/CodeGen/docker_compose/intel/hpu/gaudi/README.md @@ -119,7 +119,7 @@ docker compose up -d ```bash curl http://${host_ip}:9000/v1/chat/completions\ -X POST \ - -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_new_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ + -d '{"query":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","max_tokens":256,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ -H 'Content-Type: application/json' ``` diff --git a/CodeGen/tests/test_gmc_on_gaudi.sh b/CodeGen/tests/test_gmc_on_gaudi.sh index ad16e2108..805237208 100755 --- a/CodeGen/tests/test_gmc_on_gaudi.sh +++ b/CodeGen/tests/test_gmc_on_gaudi.sh @@ -34,7 +34,7 @@ function validate_codegen() { export CLIENT_POD=$(kubectl get pod -n $APP_NAMESPACE -l app=client-test -o jsonpath={.items..metadata.name}) echo "$CLIENT_POD" accessUrl=$(kubectl get gmc -n $APP_NAMESPACE -o jsonpath="{.items[?(@.metadata.name=='codegen')].status.accessUrl}") - kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -X POST -d '{"inputs":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","parameters":{"max_new_tokens":256, "do_sample": true}}' -H 'Content-Type: application/json' > $LOG_PATH/gmc_codegen.log + kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -X POST -d '{"inputs":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","parameters":{"max_tokens":256, "do_sample": true}}' -H 'Content-Type: application/json' > $LOG_PATH/gmc_codegen.log exit_code=$? 
if [ $exit_code -ne 0 ]; then
    echo "codegen failed, please check the logs in ${LOG_PATH}!"
diff --git a/CodeGen/tests/test_gmc_on_xeon.sh b/CodeGen/tests/test_gmc_on_xeon.sh
index 92f620365..5f3ff0eae 100755
--- a/CodeGen/tests/test_gmc_on_xeon.sh
+++ b/CodeGen/tests/test_gmc_on_xeon.sh
@@ -34,7 +34,7 @@ function validate_codegen() {
   export CLIENT_POD=$(kubectl get pod -n $APP_NAMESPACE -l app=client-test -o jsonpath={.items..metadata.name})
   echo "$CLIENT_POD"
   accessUrl=$(kubectl get gmc -n $APP_NAMESPACE -o jsonpath="{.items[?(@.metadata.name=='codegen')].status.accessUrl}")
-  kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -X POST -d '{"inputs":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","parameters":{"max_new_tokens":256, "do_sample": true}}' -H 'Content-Type: application/json' > $LOG_PATH/gmc_codegen.log
+  kubectl exec "$CLIENT_POD" -n $APP_NAMESPACE -- curl $accessUrl -X POST -d '{"inputs":"Implement a high-level API for a TODO list application. The API takes as input an operation request and updates the TODO list in place. If the request is invalid, raise an exception.","parameters":{"max_tokens":256, "do_sample": true}}' -H 'Content-Type: application/json' > $LOG_PATH/gmc_codegen.log
   exit_code=$?
   if [ $exit_code -ne 0 ]; then
     echo "codegen failed, please check the logs in ${LOG_PATH}!"
diff --git a/CodeTrans/README.md b/CodeTrans/README.md
index 0a00ca902..a1b95b154 100644
--- a/CodeTrans/README.md
+++ b/CodeTrans/README.md
@@ -127,7 +127,7 @@ By default, the UI runs on port 5173 internally.
 http_proxy="" curl http://${host_ip}:8008/generate \
   -X POST \
-  -d '{"inputs":" ### System: Please translate the following Golang codes into Python codes. ### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:","parameters":{"max_new_tokens":17, "do_sample": true}}' \
+  -d '{"inputs":" ### System: Please translate the following Golang codes into Python codes.
### Original codes: '\'''\'''\''Golang \npackage main\n\nimport \"fmt\"\nfunc main() {\n fmt.Println(\"Hello, World!\");\n '\'''\'''\'' ### Translated codes:","parameters":{"max_tokens":17, "do_sample": true}}' \ -H 'Content-Type: application/json' ``` diff --git a/DocSum/README.md b/DocSum/README.md index a6fb0ea9f..ca1ebfeba 100644 --- a/DocSum/README.md +++ b/DocSum/README.md @@ -149,7 +149,7 @@ Two ways of consuming Document Summarization Service: http_proxy="" curl http://${host_ip}:8008/generate \ -X POST \ - -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \ + -d '{"inputs":"What is Deep Learning?","parameters":{"max_tokens":17, "do_sample": true}}' \ -H 'Content-Type: application/json' ``` diff --git a/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md b/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md index dd1f59f27..312f191ff 100644 --- a/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md +++ b/ProductivitySuite/docker_compose/intel/cpu/xeon/README.md @@ -271,7 +271,7 @@ Please refer to [keycloak_setup_guide](keycloak_setup_guide.md) for more detail ```bash curl http://${host_ip}:9000/v1/chat/completions\ -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ -H 'Content-Type: application/json' ``` diff --git a/SearchQnA/docker_compose/intel/cpu/xeon/README.md b/SearchQnA/docker_compose/intel/cpu/xeon/README.md index f31975ac6..5dbd77464 100644 --- a/SearchQnA/docker_compose/intel/cpu/xeon/README.md +++ b/SearchQnA/docker_compose/intel/cpu/xeon/README.md @@ -140,7 +140,7 @@ curl http://${host_ip}:3006/generate \ # llm microservice curl http://${host_ip}:3007/v1/chat/completions\ -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ -H 'Content-Type: application/json' ``` diff --git a/SearchQnA/docker_compose/intel/hpu/gaudi/README.md b/SearchQnA/docker_compose/intel/hpu/gaudi/README.md index b34398c35..6021c7938 100644 --- a/SearchQnA/docker_compose/intel/hpu/gaudi/README.md +++ b/SearchQnA/docker_compose/intel/hpu/gaudi/README.md @@ -150,7 +150,7 @@ curl http://${host_ip}:3006/generate \ # llm microservice curl http://${host_ip}:3007/v1/chat/completions\ -X POST \ - -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ + -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ -H 'Content-Type: application/json' ``` diff --git a/VisualQnA/docker_compose/intel/cpu/xeon/README.md b/VisualQnA/docker_compose/intel/cpu/xeon/README.md index 3a6058e0c..8f0d5b6b3 100644 --- a/VisualQnA/docker_compose/intel/cpu/xeon/README.md +++ b/VisualQnA/docker_compose/intel/cpu/xeon/README.md @@ -138,28 +138,28 @@ Follow the instructions to validate MicroServices. 2. 
MegaService - ```bash - curl http://${host_ip}:8888/v1/visualqna -H "Content-Type: application/json" -d '{ - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What'\''s in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": "https://www.ilankelman.org/stopsigns/australia.jpg" - } - } - ] - } - ], - "max_tokens": 300 - }' - ``` +```bash +curl http://${host_ip}:8888/v1/visualqna -H "Content-Type: application/json" -d '{ + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What'\''s in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": "https://www.ilankelman.org/stopsigns/australia.jpg" + } + } + ] + } + ], + "max_tokens": 300 + }' +``` ## 🚀 Launch the UI diff --git a/VisualQnA/docker_compose/intel/hpu/gaudi/README.md b/VisualQnA/docker_compose/intel/hpu/gaudi/README.md index 2a8f3a276..84783353a 100644 --- a/VisualQnA/docker_compose/intel/hpu/gaudi/README.md +++ b/VisualQnA/docker_compose/intel/hpu/gaudi/README.md @@ -95,28 +95,28 @@ Follow the instructions to validate MicroServices. 2. MegaService - ```bash - curl http://${host_ip}:8888/v1/visualqna -H "Content-Type: application/json" -d '{ - "messages": [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "What'\''s in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": "https://www.ilankelman.org/stopsigns/australia.jpg" - } - } - ] - } - ], - "max_tokens": 300 - }' - ``` +```bash +curl http://${host_ip}:8888/v1/visualqna -H "Content-Type: application/json" -d '{ + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What'\''s in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": "https://www.ilankelman.org/stopsigns/australia.jpg" + } + } + ] + } + ], + "max_tokens": 300 + }' +``` ## 🚀 Launch the UI