diff --git a/DocSum/Dockerfile b/DocSum/Dockerfile
index d0dac691c..183aff49d 100644
--- a/DocSum/Dockerfile
+++ b/DocSum/Dockerfile
@@ -1,5 +1,3 @@
-
-
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
@@ -30,3 +28,4 @@ USER user
WORKDIR /home/user
ENTRYPOINT ["python", "docsum.py"]
+
diff --git a/DocSum/README.md b/DocSum/README.md
index 61254c006..9de353d07 100644
--- a/DocSum/README.md
+++ b/DocSum/README.md
@@ -1,37 +1,23 @@
# Document Summarization Application
-Large Language Models (LLMs) have revolutionized the way we interact with text. These models can be used to create summaries of news articles, research papers, technical documents, legal documents and other types of text. Suppose you have a set of documents (PDFs, Notion pages, customer questions, etc.) and you want to summarize the content. In this example use case, we utilize LangChain to implement summarization strategies and facilitate LLM inference using Text Generation Inference.
-
-The architecture for document summarization will be illustrated/described below:
+Large Language Models (LLMs) have revolutionized the way we interact with text. These models can be used to create summaries of news articles, research papers, technical documents, legal documents, multimedia content, and other types of documents. Suppose you have a set of documents (PDFs, Notion pages, customer questions, multimedia files, etc.) and you want to summarize the content. In this example use case, we utilize LangChain to implement summarization strategies and facilitate LLM inference using Text Generation Inference.
![Architecture](./assets/img/docsum_architecture.png)
-![Workflow](./assets/img/docsum_workflow.png)
-
## Deploy Document Summarization Service
The Document Summarization service can be effortlessly deployed on either Intel Gaudi2 or Intel Xeon Scalable Processors.
-Based on whether you want to use Docker or Kubernetes, follow the instructions below.
-
-Currently we support two ways of deploying Document Summarization services with docker compose:
-
-1. Start services using the docker image on `docker hub`:
-
- ```bash
- docker pull opea/docsum:latest
- ```
-
-2. Start services using the docker images `built from source`: [Guide](https://github.com/opea-project/GenAIExamples/tree/main/DocSum/docker_compose)
+Based on whether you want to use Docker or Kubernetes, follow the instructions below. Currently, we support deploying the Document Summarization services with Docker Compose.
### Required Models
-We set default model as "Intel/neural-chat-7b-v3-3", change "LLM_MODEL_ID" in "docker_compose/set_env.sh" if you want to use other models.
+The default model is "Intel/neural-chat-7b-v3-3". Change the "LLM_MODEL_ID" environment variable in the commands below if you want to use another model.
-```
+```bash
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
```
-If use gated models, you also need to provide [huggingface token](https://huggingface.co/docs/hub/security-tokens) to "HUGGINGFACEHUB_API_TOKEN" environment variable.
+When using gated models, you also need to provide a [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) via the "HUGGINGFACEHUB_API_TOKEN" environment variable.
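+
+For example (the value below is a placeholder; create a token under your HuggingFace account settings):
+
+```bash
+export HUGGINGFACEHUB_API_TOKEN="hf_xxx"
+```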
### Setup Environment Variable
@@ -57,32 +43,34 @@ To set up environment variables for deploying Document Summarization services, f
3. Set up other environment variables:
```bash
- source ./docker_compose/set_env.sh
+ source GenAIExamples/DocSum/docker_compose/set_env.sh
```
### Deploy using Docker
#### Deploy on Gaudi
-Find the corresponding [compose.yaml](./docker_compose/intel/hpu/gaudi/compose.yaml).
+Follow the instructions provided in the [Gaudi Guide](./docker_compose/intel/hpu/gaudi/README.md) to build Docker images from source. Once the images are built, run the following command to start the services:
```bash
cd GenAIExamples/DocSum/docker_compose/intel/hpu/gaudi/
docker compose -f compose.yaml up -d
```
-Refer to the [Gaudi Guide](./docker_compose/intel/hpu/gaudi/README.md) to build docker images from source.
+Find the corresponding [compose.yaml](./docker_compose/intel/hpu/gaudi/compose.yaml).
+
+> Notice: Currently, only **Habana Driver 1.16.x** is supported for Gaudi.
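+>
+> A quick way to check the installed driver version on the host (assuming the Habana tools are installed):
+>
+> ```bash
+> hl-smi | grep -i "driver version"
+> ```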
#### Deploy on Xeon
-Find the corresponding [compose.yaml](./docker_compose/intel/cpu/xeon/compose.yaml).
+Follow the instructions provided in the [Xeon Guide](./docker_compose/intel/cpu/xeon/README.md) to build Docker images from source. Once the images are built, run the following command to start the services:
```bash
cd GenAIExamples/DocSum/docker_compose/intel/cpu/xeon/
-docker compose up -d
+docker compose -f compose.yaml up -d
```
-Refer to the [Xeon Guide](./docker_compose/intel/cpu/xeon/README.md) for more instructions on building docker images from source.
+Find the corresponding [compose.yaml](./docker_compose/intel/cpu/xeon/compose.yaml).
### Deploy using Kubernetes with GMC
@@ -120,9 +108,12 @@ flowchart LR
classDef invisible fill:transparent,stroke:transparent;
style DocSum-MegaService stroke:#000000
+
+
%% Subgraphs %%
subgraph DocSum-MegaService["DocSum MegaService "]
direction LR
+ M2T([Multimedia2text MicroService]):::blue
LLM([LLM MicroService]):::blue
end
subgraph UserInterface[" User Interface "]
@@ -132,20 +123,24 @@ flowchart LR
end
- LLM_gen{{LLM Service
}}
+ A2T_SRV{{Audio2Text service
}}
+ V2A_SRV{{Video2Audio service
}}
+ WSP_SRV{{whisper service
}}
GW([DocSum GateWay
]):::orange
%% Questions interaction
direction LR
- a[User Input Query] --> UI
+ a[User Document for Summarization] --> UI
UI --> GW
GW <==> DocSum-MegaService
-
+ M2T ==> LLM
%% Embedding service flow
direction LR
- LLM <-.-> LLM_gen
+    M2T -.-> V2A_SRV
+    M2T <-.-> A2T_SRV <-.-> WSP_SRV
+    V2A_SRV -.-> A2T_SRV
```
@@ -155,22 +150,74 @@ Two ways of consuming Document Summarization Service:
1. Use cURL command on terminal
+ Text:
+
```bash
- #Use English mode (default).
+ curl -X POST http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: application/json" \
+ -d '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+ # Use English mode (default).
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
+ -F "type=text" \
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \
-F "max_tokens=32" \
-F "language=en" \
-F "stream=true"
- #Use Chinese mode.
+ # Use Chinese mode.
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
+ -F "type=text" \
-F "messages=2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。" \
-F "max_tokens=32" \
-F "language=zh" \
-F "stream=true"
+
+ # Upload file
+ curl http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: multipart/form-data" \
+ -F "type=text" \
+ -F "messages=" \
+ -F "files=@/path to your file (.txt, .docx, .pdf)" \
+ -F "max_tokens=32" \
+ -F "language=en" \
+ -F "stream=true"
+ ```
+
+   > Audio and video file uploads are not supported in DocSum via cURL request; please use the Gradio UI.
+
+ Audio:
+
+ ```bash
+ curl -X POST http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: application/json" \
+ -d '{"type": "audio", "messages": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}'
+
+ curl http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: multipart/form-data" \
+ -F "type=audio" \
+ -F "messages=UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA" \
+ -F "max_tokens=32" \
+ -F "language=en" \
+ -F "stream=true"
+ ```
+
+ Video:
+
+ ```bash
+ curl -X POST http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: application/json" \
+ -d '{"type": "video", "messages": "convert your video to base64 data type"}'
+
+ curl http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: multipart/form-data" \
+ -F "type=video" \
+ -F "messages=convert your video to base64 data type" \
+ -F "max_tokens=32" \
+ -F "language=en" \
+ -F "stream=true"
```
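+
+   In the audio and video examples, the `messages` field carries the media file as a base64 string. One way to produce such a string from a local file (GNU coreutils `base64`; `-w 0` disables line wrapping):
+
+   ```bash
+   base64 -w 0 your_file.wav   # replace with your own .wav or .mp4 file
+   ```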
2. Access via frontend
@@ -184,7 +231,6 @@ Two ways of consuming Document Summarization Service:
1. If you get errors like "Access Denied", [validate micro service](https://github.com/opea-project/GenAIExamples/tree/main/DocSum/docker_compose/intel/cpu/xeon/README.md#validate-microservices) first. A simple example:
```bash
- http_proxy=""
curl http://${host_ip}:8008/generate \
-X POST \
-d '{"inputs":"What is Deep Learning?","parameters":{"max_tokens":17, "do_sample": true}}' \
diff --git a/DocSum/assets/img/docSum_ui_gradio_text.png b/DocSum/assets/img/docSum_ui_gradio_text.png
new file mode 100644
index 000000000..dddf8c668
Binary files /dev/null and b/DocSum/assets/img/docSum_ui_gradio_text.png differ
diff --git a/DocSum/docker_compose/intel/cpu/xeon/README.md b/DocSum/docker_compose/intel/cpu/xeon/README.md
index 02c3af554..82cbcf841 100644
--- a/DocSum/docker_compose/intel/cpu/xeon/README.md
+++ b/DocSum/docker_compose/intel/cpu/xeon/README.md
@@ -12,17 +12,46 @@ After launching your instance, you can connect to it using SSH (for Linux instan
## 🚀 Build Docker Images
-First of all, you need to build Docker Images locally and install the python package of it.
+### 1. Build MicroService Docker Image
-### 1. Build LLM Image
+First of all, you need to build the required Docker images locally. Clone the GenAIComps repository:
```bash
git clone https://github.com/opea-project/GenAIComps.git
cd GenAIComps
-docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/summarization/tgi/langchain/Dockerfile .
```
-Then run the command `docker images`, you will have the following four Docker Images:
+#### Whisper Service
+
+The Whisper Service converts audio files to text. Follow these steps to build and run the service:
+
+```bash
+docker build -t opea/whisper:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/asr/whisper/dependency/Dockerfile .
+```
+
+#### Audio to Text Service
+
+The Audio to Text Service is an additional service for converting audio to text. Follow these steps to build and run the service:
+
+```bash
+docker build -t opea/dataprep-audio2text:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/audio2text/Dockerfile .
+```
+
+#### Video to Audio Service
+
+The Video to Audio Service extracts audio from video files. Follow these steps to build and run the service:
+
+```bash
+docker build -t opea/dataprep-video2audio:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/video2audio/Dockerfile .
+```
+
+#### Multimedia to Text Service
+
+The Multimedia to Text Service transforms multimedia data to text data. Follow these steps to build and run the service:
+
+```bash
+docker build -t opea/dataprep-multimedia2text:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/Dockerfile .
+```
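+
+After these builds complete, you can verify that the images exist locally:
+
+```bash
+docker images | grep -E "whisper|dataprep-(audio2text|video2audio|multimedia2text)"
+```
+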
### 2. Build MegaService Docker Image
@@ -36,6 +65,10 @@ docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-
### 3. Build UI Docker Image
+Several UI options are provided. If you need to work with multimedia documents, .doc, or .pdf files, it is suggested to use the Gradio UI.
+
+#### Svelte UI
+
Build the frontend Docker image via below command:
```bash
@@ -43,13 +76,16 @@ cd GenAIExamples/DocSum/ui
docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
```
-Then run the command `docker images`, you will have the following Docker Images:
+#### Gradio UI
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
+Build the Gradio UI frontend Docker image using the following command:
-### 4. Build React UI Docker Image
+```bash
+cd GenAIExamples/DocSum/ui
+docker build -t opea/docsum-gradio-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile.gradio .
+```
+
+#### React UI
Build the frontend Docker image via below command:
@@ -61,45 +97,62 @@ docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT
docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
```
-Then run the command `docker images`, you will have the following Docker Images:
-
-1. `opea/llm-docsum-tgi:latest`
-2. `opea/docsum:latest`
-3. `opea/docsum-ui:latest`
-4. `opea/docsum-react-ui:latest`
-
## 🚀 Start Microservices and MegaService
### Required Models
-We set default model as "Intel/neural-chat-7b-v3-3", change "LLM_MODEL_ID" in following Environment Variables setting if you want to use other models.
-If use gated models, you also need to provide [huggingface token](https://huggingface.co/docs/hub/security-tokens) to "HUGGINGFACEHUB_API_TOKEN" environment variable.
-
-### Setup Environment Variables
-
-Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
+The default model is "Intel/neural-chat-7b-v3-3". Change the "LLM_MODEL_ID" environment variable in the commands below if you want to use another model.
```bash
-export no_proxy=${your_no_proxy}
-export http_proxy=${your_http_proxy}
-export https_proxy=${your_http_proxy}
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
-export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
-export MEGA_SERVICE_HOST_IP=${host_ip}
-export LLM_SERVICE_HOST_IP=${host_ip}
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
```
-Note: Please replace with `host_ip` with your external IP address, do not use localhost.
+When using gated models, you also need to provide a [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) via the "HUGGINGFACEHUB_API_TOKEN" environment variable.
+
+### Setup Environment Variable
+
+To set up environment variables for deploying Document Summarization services, follow these steps:
+
+1. Set the required environment variables:
+
+ ```bash
+ # Example: host_ip="192.168.1.1"
+ export host_ip="External_Public_IP"
+ # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
+ export no_proxy="Your_No_Proxy"
+ export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
+ ```
+
+2. If you are in a proxy environment, also set the proxy-related environment variables:
+
+ ```bash
+ export http_proxy="Your_HTTP_Proxy"
+ export https_proxy="Your_HTTPs_Proxy"
+ ```
+
+3. Set up other environment variables:
+
+ ```bash
+ source GenAIExamples/DocSum/docker_compose/set_env.sh
+ ```
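+
+As a quick sanity check that the script ran, echo a few of the variables it derives (names taken from `set_env.sh`):
+
+```bash
+echo "$TGI_LLM_ENDPOINT" "$A2T_ENDPOINT" "$V2A_ENDPOINT" "$DATA_ENDPOINT"
+```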
### Start Microservice Docker Containers
```bash
cd GenAIExamples/DocSum/docker_compose/intel/cpu/xeon
-docker compose up -d
+docker compose -f compose.yaml up -d
```
+You will have the following Docker Images:
+
+1. `opea/docsum-ui:latest`
+2. `opea/docsum:latest`
+3. `opea/llm-docsum-tgi:latest`
+4. `opea/whisper:latest`
+5. `opea/dataprep-audio2text:latest`
+6. `opea/dataprep-multimedia2text:latest`
+7. `opea/dataprep-video2audio:latest`
+
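+To confirm the containers are up before validating them, list the running services (container names come from `compose.yaml`):
+
+```bash
+docker ps --format "table {{.Names}}\t{{.Status}}"
+```
+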
### Validate Microservices
1. TGI Service
@@ -120,31 +173,143 @@ docker compose up -d
-H 'Content-Type: application/json'
```
-3. MegaService
+3. Whisper Microservice
```bash
+ curl http://${host_ip}:7066/v1/asr \
+ -X POST \
+ -d '{"audio":"UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
+ -H 'Content-Type: application/json'
+ ```
+
+ Expected output:
+
+   ```json
+ {"asr_result":"you"}
+ ```
+
+4. Audio2Text Microservice
+
+ ```bash
+ curl http://${host_ip}:9099/v1/audio/transcriptions \
+ -X POST \
+ -d '{"byte_str":"UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
+ -H 'Content-Type: application/json'
+ ```
+
+ Expected output:
+
+   ```json
+ {"downstream_black_list":[],"id":"--> this will be different id number for each run <--","query":"you"}
+ ```
+
+5. Multimedia to text Microservice
+
+ ```bash
+ curl http://${host_ip}:7079/v1/multimedia2text \
+ -X POST \
+ -d '{"audio":"UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
+ -H 'Content-Type: application/json'
+ ```
+
+ Expected output:
+
+   ```json
+ {"downstream_black_list":[],"id":"--> this will be different id number for each run <--","query":"you"}
+ ```
+
+6. MegaService
+
+ Text:
+
+ ```bash
+ curl -X POST http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: application/json" \
+ -d '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+ # Use English mode (default).
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
+ -F "type=text" \
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \
-F "max_tokens=32" \
-F "language=en" \
- -F "stream=false"
+ -F "stream=true"
+
+ # Use Chinese mode.
+ curl http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: multipart/form-data" \
+ -F "type=text" \
+ -F "messages=2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。" \
+ -F "max_tokens=32" \
+ -F "language=zh" \
+ -F "stream=true"
+
+ # Upload file
+ curl http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: multipart/form-data" \
+ -F "type=text" \
+ -F "messages=" \
+ -F "files=@/path to your file (.txt, .docx, .pdf)" \
+ -F "max_tokens=32" \
+ -F "language=en" \
+ -F "stream=true"
```
-Following the validation of all aforementioned microservices, we are now prepared to construct a mega-service.
+   > Audio and video file uploads are not supported in DocSum via cURL request; please use the Gradio UI.
+
+ Audio:
+
+ ```bash
+ curl -X POST http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: application/json" \
+ -d '{"type": "audio", "messages": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}'
+
+ curl http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: multipart/form-data" \
+ -F "type=audio" \
+ -F "messages=UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA" \
+ -F "max_tokens=32" \
+ -F "language=en" \
+ -F "stream=true"
+ ```
+
+ Video:
+
+ ```bash
+ curl -X POST http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: application/json" \
+ -d '{"type": "video", "messages": "convert your video to base64 data type"}'
+
+ curl http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: multipart/form-data" \
+ -F "type=video" \
+ -F "messages=convert your video to base64 data type" \
+ -F "max_tokens=32" \
+ -F "language=en" \
+ -F "stream=true"
+ ```
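+
+   For the audio and video JSON requests above, the `messages` field is a base64 string. The repository ships small sample clips you can encode (a sketch, run from `GenAIExamples/DocSum`; `base64 -w 0` disables line wrapping):
+
+   ```bash
+   audio_b64=$(base64 -w 0 tests/data/test.wav)
+   curl -X POST http://${host_ip}:8888/v1/docsum \
+      -H "Content-Type: application/json" \
+      -d "{\"type\": \"audio\", \"messages\": \"$audio_b64\"}"
+   ```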
## 🚀 Launch the UI
-Open this URL `http://{host_ip}:5173` in your browser to access the svelte based frontend.
+Several UI options are provided. If you need to work with multimedia documents, .doc, or .pdf files, it is suggested to use the Gradio UI.
-Open this URL `http://{host_ip}:5174` in your browser to access the React based frontend.
+### Gradio UI
+
+Open this URL `http://{host_ip}:5173` in your browser to access the Gradio-based frontend.
+
+![project-screenshot](../../../../assets/img/docSum_ui_gradio_text.png)
### Svelte UI
+Open this URL `http://{host_ip}:5173` in your browser to access the Svelte-based frontend.
+
![project-screenshot](../../../../assets/img/docSum_ui_text.png)
### React UI (Optional)
+Open this URL `http://{host_ip}:5174` in your browser to access the React-based frontend.
+
To access the React-based frontend, modify the UI service in the `compose.yaml` file. Replace `docsum-xeon-ui-server` service with the `docsum-xeon-react-ui-server` service as per the config below:
```yaml
diff --git a/DocSum/docker_compose/intel/cpu/xeon/compose.yaml b/DocSum/docker_compose/intel/cpu/xeon/compose.yaml
index 35e673563..72332a901 100644
--- a/DocSum/docker_compose/intel/cpu/xeon/compose.yaml
+++ b/DocSum/docker_compose/intel/cpu/xeon/compose.yaml
@@ -17,7 +17,8 @@ services:
- "./data:/data"
shm_size: 1g
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
- llm:
+
+ llm-docsum-tgi:
image: ${REGISTRY:-opea}/llm-docsum-tgi:${TAG:-latest}
container_name: llm-docsum-server
depends_on:
@@ -32,12 +33,56 @@ services:
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
+
+ whisper:
+ image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
+ container_name: whisper-service
+ ports:
+ - "7066:7066"
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ restart: unless-stopped
+
+ dataprep-audio2text:
+ image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest}
+ container_name: dataprep-audio2text-service
+ ports:
+ - "9099:9099"
+ ipc: host
+ environment:
+ A2T_ENDPOINT: ${A2T_ENDPOINT}
+
+ dataprep-video2audio:
+ image: ${REGISTRY:-opea}/dataprep-video2audio:${TAG:-latest}
+ container_name: dataprep-video2audio-service
+ ports:
+ - "7078:7078"
+ ipc: host
+ environment:
+ V2A_ENDPOINT: ${V2A_ENDPOINT}
+
+ dataprep-multimedia2text:
+ image: ${REGISTRY:-opea}/dataprep-multimedia2text:${TAG:-latest}
+ container_name: dataprep-multimedia2text
+ ports:
+ - "7079:7079"
+ ipc: host
+ environment:
+ V2A_ENDPOINT: ${V2A_ENDPOINT}
+ A2T_ENDPOINT: ${A2T_ENDPOINT}
+
docsum-xeon-backend-server:
image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
container_name: docsum-xeon-backend-server
depends_on:
- tgi-service
- - llm
+ - llm-docsum-tgi
+ - dataprep-multimedia2text
+ - dataprep-video2audio
+ - dataprep-audio2text
ports:
- "8888:8888"
environment:
@@ -45,10 +90,12 @@ services:
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
+ - DATA_SERVICE_HOST_IP=${DATA_SERVICE_HOST_IP}
- LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
ipc: host
restart: always
- docsum-xeon-ui-server:
+
+ docsum-ui:
image: ${REGISTRY:-opea}/docsum-ui:${TAG:-latest}
container_name: docsum-xeon-ui-server
depends_on:
@@ -59,6 +106,7 @@ services:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
+ - BACKEND_SERVICE_ENDPOINT=${BACKEND_SERVICE_ENDPOINT}
- DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
ipc: host
restart: always
diff --git a/DocSum/docker_compose/intel/hpu/gaudi/README.md b/DocSum/docker_compose/intel/hpu/gaudi/README.md
index 6882f0eba..172f24d67 100644
--- a/DocSum/docker_compose/intel/hpu/gaudi/README.md
+++ b/DocSum/docker_compose/intel/hpu/gaudi/README.md
@@ -1,47 +1,75 @@
# Build MegaService of Document Summarization on Gaudi
-This document outlines the deployment process for a Document Summarization application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on Intel Gaudi server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as llm. We will publish the Docker images to Docker Hub, which will simplify the deployment process for this service.
+This document outlines the deployment process for a Document Summarization application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline on an Intel Gaudi server. The steps include Docker image creation, container deployment via Docker Compose, and service execution to integrate microservices such as `llm`. We will publish the Docker images to Docker Hub soon, which will simplify the deployment process for this service.
## 🚀 Build Docker Images
-First of all, you need to build Docker Images locally. This step can be ignored once the Docker images are published to Docker hub.
+### 1. Build MicroService Docker Image
-### 1. Pull TGI Gaudi Image
+First of all, you need to build the required Docker images locally. Clone the GenAIComps repository:
-As TGI Gaudi has been officially published as a Docker image, we simply need to pull it:
+```bash
+git clone https://github.com/opea-project/GenAIComps.git
+cd GenAIComps
+```
+
+#### Audio to Text Service
+
+The Audio to Text Service is an additional service for converting audio to text. Follow these steps to build and run the service:
```bash
-docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
+docker build -t opea/dataprep-audio2text:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/audio2text/Dockerfile .
```
-### 2. Build LLM Image
+#### Video to Audio Service
+
+The Video to Audio Service extracts audio from video files. Follow these steps to build and run the service:
```bash
-git clone https://github.com/opea-project/GenAIComps.git
-cd GenAIComps
-docker build -t opea/llm-docsum-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/summarization/tgi/langchain/Dockerfile .
+docker build -t opea/dataprep-video2audio:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/video2audio/Dockerfile .
```
-### 3. Build MegaService Docker Image
+#### Multimedia to Text Service
-To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image using the command below:
+The Multimedia to Text Service transforms multimedia data to text data. Follow these steps to build and run the service:
+
+```bash
+docker build -t opea/dataprep-multimedia2text:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/multimedia2text/Dockerfile .
+```
+
+### 2. Build MegaService Docker Image
+
+To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `docsum.py` Python script. Build the MegaService Docker image using the command below:
```bash
git clone https://github.com/opea-project/GenAIExamples
-cd GenAIExamples/DocSum
+cd GenAIExamples/DocSum/
docker build -t opea/docsum:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile .
```
-### 4. Build UI Docker Image
+### 3. Build UI Docker Image
+
+Several UI options are provided. If you need to work with multimedia documents, .doc, or .pdf files, it is suggested to use the Gradio UI.
-Construct the frontend Docker image using the command below:
+#### Svelte UI
+
+Build the frontend Docker image using the command below:
```bash
-cd GenAIExamples/DocSum/
-docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile .
+cd GenAIExamples/DocSum/ui
+docker build -t opea/docsum-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
```
-### 5. Build React UI Docker Image
+#### Gradio UI
+
+Build the Gradio UI frontend Docker image using the following command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+docker build -t opea/docsum-gradio-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile.gradio .
+```
+
+#### React UI
Build the frontend Docker image via below command:
@@ -49,48 +77,66 @@ Build the frontend Docker image via below command:
cd GenAIExamples/DocSum/ui
export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT -f ./docker/Dockerfile.react .
-```
-
-Then run the command `docker images`, you will have the following Docker Images:
-1. `ghcr.io/huggingface/tgi-gaudi:2.0.6`
-2. `opea/llm-docsum-tgi:latest`
-3. `opea/docsum:latest`
-4. `opea/docsum-ui:latest`
-5. `opea/docsum-react-ui:latest`
+docker build -t opea/docsum-react-ui:latest --build-arg BACKEND_SERVICE_ENDPOINT=$BACKEND_SERVICE_ENDPOINT --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react .
+```
## 🚀 Start Microservices and MegaService
### Required Models
-We set default model as "Intel/neural-chat-7b-v3-3", change "LLM_MODEL_ID" in following setting if you want to use other models.
-If use gated models, you also need to provide [huggingface token](https://huggingface.co/docs/hub/security-tokens) to "HUGGINGFACEHUB_API_TOKEN" environment variable.
-
-### Setup Environment Variables
-
-Since the `compose.yaml` will consume some environment variables, you need to setup them in advance as below.
+The default model is "Intel/neural-chat-7b-v3-3". Change the "LLM_MODEL_ID" environment variable in the commands below if you want to use another model.
```bash
-export no_proxy=${your_no_proxy}
-export http_proxy=${your_http_proxy}
-export https_proxy=${your_http_proxy}
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
-export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
-export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
-export MEGA_SERVICE_HOST_IP=${host_ip}
-export LLM_SERVICE_HOST_IP=${host_ip}
-export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
```
-Note: Please replace with `host_ip` with your external IP address, do not use localhost.
+When using gated models, you also need to provide a [HuggingFace token](https://huggingface.co/docs/hub/security-tokens) via the "HUGGINGFACEHUB_API_TOKEN" environment variable.
+
+### Setup Environment Variable
+
+To set up environment variables for deploying Document Summarization services, follow these steps:
+
+1. Set the required environment variables:
+
+ ```bash
+ # Example: host_ip="192.168.1.1"
+ export host_ip="External_Public_IP"
+ # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1"
+ export no_proxy="Your_No_Proxy"
+ export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
+ ```
+
+2. If you are in a proxy environment, also set the proxy-related environment variables:
+
+ ```bash
+ export http_proxy="Your_HTTP_Proxy"
+ export https_proxy="Your_HTTPs_Proxy"
+ ```
+
+3. Set up other environment variables:
+
+ ```bash
+ source GenAIExamples/DocSum/docker_compose/set_env.sh
+ ```
### Start Microservice Docker Containers
```bash
cd GenAIExamples/DocSum/docker_compose/intel/hpu/gaudi
-docker compose up -d
+docker compose -f compose.yaml up -d
```
+You will have the following Docker Images:
+
+1. `opea/docsum-ui:latest`
+2. `opea/docsum:latest`
+3. `opea/llm-docsum-tgi:latest`
+4. `opea/whisper:latest`
+5. `opea/dataprep-audio2text:latest`
+6. `opea/dataprep-multimedia2text:latest`
+7. `opea/dataprep-video2audio:latest`
+
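+Before validating, make sure the TGI server has finished loading the model. The test scripts wait for a "Connected" line in its log; you can run the same check manually:
+
+```bash
+docker logs tgi-gaudi-server 2>&1 | grep Connected
+```
+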
### Validate Microservices
1. TGI Service
@@ -98,7 +144,7 @@ docker compose up -d
```bash
curl http://${host_ip}:8008/generate \
-X POST \
- -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":64, "do_sample": true}}' \
+ -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' \
-H 'Content-Type: application/json'
```
@@ -111,20 +157,137 @@ docker compose up -d
-H 'Content-Type: application/json'
```
-3. MegaService
+3. Whisper Microservice
+
+ ```bash
+ curl http://${host_ip}:7066/v1/asr \
+ -X POST \
+ -d '{"audio":"UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
+ -H 'Content-Type: application/json'
+ ```
+
+ Expected output:
+
+   ```json
+ {"asr_result":"you"}
+ ```
+
+4. Audio2Text Microservice
```bash
+ curl http://${host_ip}:9199/v1/audio/transcriptions \
+ -X POST \
+ -d '{"byte_str":"UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
+ -H 'Content-Type: application/json'
+ ```
+
+ Expected output:
+
+   ```json
+ {"downstream_black_list":[],"id":"--> this will be different id number for each run <--","query":"you"}
+ ```
+
+5. Multimedia to text Microservice
+
+ ```bash
+ curl http://${host_ip}:7079/v1/multimedia2text \
+ -X POST \
+ -d '{"audio":"UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}' \
+ -H 'Content-Type: application/json'
+ ```
+
+ Expected output:
+
+   ```json
+ {"downstream_black_list":[],"id":"--> this will be different id number for each run <--","query":"you"}
+ ```
+
+6. MegaService
+
+ Text:
+
+ ```bash
+ curl -X POST http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: application/json" \
+ -d '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
+ # Use English mode (default).
curl http://${host_ip}:8888/v1/docsum \
-H "Content-Type: multipart/form-data" \
+ -F "type=text" \
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." \
-F "max_tokens=32" \
-F "language=en" \
- -F "stream=false"
+ -F "stream=true"
+
+ # Use Chinese mode.
+ curl http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: multipart/form-data" \
+ -F "type=text" \
+ -F "messages=2024年9月26日,北京——今日,英特尔正式发布英特尔® 至强® 6性能核处理器(代号Granite Rapids),为AI、数据分析、科学计算等计算密集型业务提供卓越性能。" \
+ -F "max_tokens=32" \
+ -F "language=zh" \
+ -F "stream=true"
+
+ # Upload file
+ curl http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: multipart/form-data" \
+ -F "type=text" \
+ -F "messages=" \
+ -F "files=@/path to your file (.txt, .docx, .pdf)" \
+ -F "max_tokens=32" \
+ -F "language=en" \
+ -F "stream=true"
+ ```
+
+   > Audio and video file uploads are not supported in DocSum via cURL request; please use the Gradio UI.
+
+ Audio:
+
+ ```bash
+ curl -X POST http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: application/json" \
+ -d '{"type": "audio", "messages": "UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA"}'
+
+ curl http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: multipart/form-data" \
+ -F "type=audio" \
+ -F "messages=UklGRigAAABXQVZFZm10IBIAAAABAAEARKwAAIhYAQACABAAAABkYXRhAgAAAAEA" \
+ -F "max_tokens=32" \
+ -F "language=en" \
+ -F "stream=true"
+ ```
+
+ Video:
+
+ ```bash
+ curl -X POST http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: application/json" \
+ -d '{"type": "video", "messages": "convert your video to base64 data type"}'
+
+ curl http://${host_ip}:8888/v1/docsum \
+ -H "Content-Type: multipart/form-data" \
+ -F "type=video" \
+ -F "messages=convert your video to base64 data type" \
+ -F "max_tokens=32" \
+ -F "language=en" \
+ -F "stream=true"
```
+> More detailed tests can be found in the `GenAIExamples/DocSum/tests` directory.
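+
+For example, the end-to-end Gaudi test can be run directly; it rebuilds the images, starts the stack, and validates each microservice and the megaservice (it assumes `HUGGINGFACEHUB_API_TOKEN` is exported):
+
+```bash
+cd GenAIExamples/DocSum/tests
+bash test_compose_on_gaudi.sh
+```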
+
+## 🚀 Launch the UI
+
+Several UI options are provided. If you need to work with multimedia documents, .doc, or .pdf files, it is suggested to use the Gradio UI.
+
+### Gradio UI
+
+Open this URL `http://{host_ip}:5173` in your browser to access the Gradio-based frontend.
+![project-screenshot](../../../../assets/img/docSum_ui_gradio_text.png)
+
## 🚀 Launch the Svelte UI
-Open this URL `http://{host_ip}:5173` in your browser to access the frontend.
+Open this URL `http://{host_ip}:5173` in your browser to access the Svelte-based frontend.
![project-screenshot](https://github.com/intel-ai-tce/GenAIExamples/assets/21761437/93b1ed4b-4b76-4875-927e-cc7818b4825b)
diff --git a/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml b/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml
index 71c52b40a..39bb3d477 100644
--- a/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/DocSum/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -24,7 +24,8 @@ services:
- SYS_NICE
ipc: host
command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048
- llm:
+
+ llm-docsum-tgi:
image: ${REGISTRY:-opea}/llm-docsum-tgi:${TAG:-latest}
container_name: llm-docsum-gaudi-server
depends_on:
@@ -39,12 +40,61 @@ services:
TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
+
+ whisper:
+ image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
+ container_name: whisper-service
+ ports:
+ - "7066:7066"
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HABANA_VISIBLE_DEVICES: all
+ OMPI_MCA_btl_vader_single_copy_mechanism: none
+ runtime: habana
+ cap_add:
+ - SYS_NICE
+ restart: unless-stopped
+
+ dataprep-audio2text:
+ image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest}
+ container_name: dataprep-audio2text-service
+ ports:
+ - "9199:9099"
+ ipc: host
+ environment:
+ A2T_ENDPOINT: ${A2T_ENDPOINT}
+
+ dataprep-video2audio:
+ image: ${REGISTRY:-opea}/dataprep-video2audio:${TAG:-latest}
+ container_name: dataprep-video2audio-service
+ ports:
+ - "7078:7078"
+ ipc: host
+ environment:
+ V2A_ENDPOINT: ${V2A_ENDPOINT}
+
+ dataprep-multimedia2text:
+ image: ${REGISTRY:-opea}/dataprep-multimedia2text:${TAG:-latest}
+ container_name: dataprep-multimedia2text
+ ports:
+ - "7079:7079"
+ ipc: host
+ environment:
+ V2A_ENDPOINT: ${V2A_ENDPOINT}
+ A2T_ENDPOINT: ${A2T_ENDPOINT}
+
docsum-gaudi-backend-server:
image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
container_name: docsum-gaudi-backend-server
depends_on:
- tgi-service
- - llm
+ - llm-docsum-tgi
+ - dataprep-multimedia2text
+ - dataprep-video2audio
+ - dataprep-audio2text
ports:
- "8888:8888"
environment:
@@ -52,10 +102,13 @@ services:
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
- MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP}
+ - DATA_SERVICE_HOST_IP=${DATA_SERVICE_HOST_IP}
- LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP}
+
ipc: host
restart: always
- docsum-gaudi-ui-server:
+
+ docsum-ui:
image: ${REGISTRY:-opea}/docsum-ui:${TAG:-latest}
container_name: docsum-gaudi-ui-server
depends_on:
@@ -66,6 +119,7 @@ services:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
+ - BACKEND_SERVICE_ENDPOINT=${BACKEND_SERVICE_ENDPOINT}
- DOC_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
ipc: host
restart: always
diff --git a/DocSum/docker_compose/set_env.sh b/DocSum/docker_compose/set_env.sh
index e9ac5b4f7..658a57ef8 100644
--- a/DocSum/docker_compose/set_env.sh
+++ b/DocSum/docker_compose/set_env.sh
@@ -6,9 +6,19 @@ pushd "../../" > /dev/null
source .set_env.sh
popd > /dev/null
-
-export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export no_proxy="${no_proxy},${host_ip}"
export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
export MEGA_SERVICE_HOST_IP=${host_ip}
export LLM_SERVICE_HOST_IP=${host_ip}
export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
+
+export V2A_SERVICE_HOST_IP=${host_ip}
+export V2A_ENDPOINT=http://$host_ip:7078
+
+export A2T_ENDPOINT=http://$host_ip:7066
+export A2T_SERVICE_HOST_IP=${host_ip}
+export A2T_SERVICE_PORT=9099
+
+export DATA_ENDPOINT=http://$host_ip:7079
+export DATA_SERVICE_HOST_IP=${host_ip}
+export DATA_SERVICE_PORT=7079
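+
+# Note: the endpoints above are derived from ${host_ip}, so make sure host_ip is
+# exported before sourcing this file, e.g.:
+#   export host_ip=$(hostname -I | awk '{print $1}')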
diff --git a/DocSum/docker_image_build/build.yaml b/DocSum/docker_image_build/build.yaml
index 13e0af59d..da777ebb7 100644
--- a/DocSum/docker_image_build/build.yaml
+++ b/DocSum/docker_image_build/build.yaml
@@ -11,6 +11,15 @@ services:
context: ../
dockerfile: ./Dockerfile
image: ${REGISTRY:-opea}/docsum:${TAG:-latest}
+ docsum-gradio-ui:
+ build:
+ args:
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ context: ../ui
+ dockerfile: ./docker/Dockerfile.gradio
+ extends: docsum
+ image: ${REGISTRY:-opea}/docsum-gradio-ui:${TAG:-latest}
docsum-ui:
build:
context: ../ui
@@ -23,6 +32,42 @@ services:
dockerfile: ./docker/Dockerfile.react
extends: docsum
image: ${REGISTRY:-opea}/docsum-react-ui:${TAG:-latest}
+ whisper:
+ build:
+ args:
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ context: GenAIComps
+ dockerfile: comps/asr/whisper/dependency/Dockerfile
+ extends: docsum
+ image: ${REGISTRY:-opea}/whisper:${TAG:-latest}
+ dataprep-multimedia2text:
+ build:
+ args:
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ context: GenAIComps
+ dockerfile: comps/dataprep/multimedia2text/Dockerfile
+ extends: docsum
+ image: ${REGISTRY:-opea}/dataprep-multimedia2text:${TAG:-latest}
+ dataprep-audio2text:
+ build:
+ args:
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ context: GenAIComps
+ dockerfile: comps/dataprep/multimedia2text/audio2text/Dockerfile
+ extends: docsum
+ image: ${REGISTRY:-opea}/dataprep-audio2text:${TAG:-latest}
+ dataprep-video2audio:
+ build:
+ args:
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ context: GenAIComps
+ dockerfile: comps/dataprep/multimedia2text/video2audio/Dockerfile
+ extends: docsum
+ image: ${REGISTRY:-opea}/dataprep-video2audio:${TAG:-latest}
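+  # Note: the four services above use the GenAIComps checkout as the build context;
+  # the test scripts clone it into this folder and then build with, for example:
+  #   docker compose -f build.yaml build whisper dataprep-multimedia2text --no-cache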
llm-docsum-tgi:
build:
context: GenAIComps
diff --git a/DocSum/docsum.py b/DocSum/docsum.py
index fe6d3229c..f6094191a 100644
--- a/DocSum/docsum.py
+++ b/DocSum/docsum.py
@@ -8,6 +8,10 @@
MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "0.0.0.0")
MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 8888))
+
+DATA_SERVICE_HOST_IP = os.getenv("DATA_SERVICE_HOST_IP", "0.0.0.0")
+DATA_SERVICE_PORT = int(os.getenv("DATA_SERVICE_PORT", 7079))
+
LLM_SERVICE_HOST_IP = os.getenv("LLM_SERVICE_HOST_IP", "0.0.0.0")
LLM_SERVICE_PORT = int(os.getenv("LLM_SERVICE_PORT", 9000))
@@ -19,6 +23,16 @@ def __init__(self, host="0.0.0.0", port=8000):
self.megaservice = ServiceOrchestrator()
def add_remote_service(self):
+
+ data = MicroService(
+ name="multimedia2text",
+ host=DATA_SERVICE_HOST_IP,
+ port=DATA_SERVICE_PORT,
+ endpoint="/v1/multimedia2text",
+ use_remote_service=True,
+ service_type=ServiceType.DATAPREP,
+ )
+
llm = MicroService(
name="llm",
host=LLM_SERVICE_HOST_IP,
@@ -27,7 +41,9 @@ def add_remote_service(self):
use_remote_service=True,
service_type=ServiceType.LLM,
)
- self.megaservice.add(llm)
+
+ self.megaservice.add(data).add(llm)
+ self.megaservice.flow_to(data, llm)
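+        # The multimedia2text microservice normalizes text/audio/video input into
+        # plain text, which then flows to the LLM microservice for summarization.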
self.gateway = DocSumGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port)
diff --git a/DocSum/docsum.yaml b/DocSum/docsum.yaml
deleted file mode 100644
index 9e9936ff4..000000000
--- a/DocSum/docsum.yaml
+++ /dev/null
@@ -1,48 +0,0 @@
-
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-opea_micro_services:
- tgi-service:
- host: ${TGI_SERVICE_IP}
- ports: ${TGI_SERVICE_PORT}
- image: ghcr.io/huggingface/tgi-gaudi:2.0.6
- volumes:
- - "./data:/data"
- runtime: habana
- cap_add:
- - SYS_NICE
- ipc: host
- environment:
- HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
- HABANA_VISIBLE_DEVICES: all
- OMPI_MCA_btl_vader_single_copy_mechanism: none
- HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
- ENABLE_HPU_GRAPH: true
- LIMIT_HPU_GRAPH: true
- USE_FLASH_ATTENTION: true
- FLASH_ATTENTION_RECOMPUTE: true
- model-id: ${LLM_MODEL_ID}
- llm:
- host: ${LLM_SERVICE_HOST_IP}
- ports: ${LLM_SERVICE_PORT}
- image: opea/llm-tgi:latest
- endpoint: /v1/chat/completions
- environment:
- TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT}
- HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
- ui:
- host: ${UI_SERVICE_HOST_IP}
- ports:
- - "5173:5173"
- image: opea/docsum-ui:latest
- environment:
- - CHAT_BASE_URL=${BACKEND_SERVICE_ENDPOINT}
-
-opea_mega_service:
- host: ${MEGA_SERVICE_HOST_IP}
- ports: ${MEGA_SERVICE_PORT}
- image: opea/docsum:latest
- endpoint: /v1/docsum
- mega_flow:
- - llm
diff --git a/DocSum/tests/data/test.mp4 b/DocSum/tests/data/test.mp4
new file mode 100644
index 000000000..6b72f4122
Binary files /dev/null and b/DocSum/tests/data/test.mp4 differ
diff --git a/DocSum/tests/data/test.wav b/DocSum/tests/data/test.wav
new file mode 100644
index 000000000..21657414d
Binary files /dev/null and b/DocSum/tests/data/test.wav differ
diff --git a/DocSum/tests/test_compose_on_gaudi.sh b/DocSum/tests/test_compose_on_gaudi.sh
index 12a6a8861..91499197d 100644
--- a/DocSum/tests/test_compose_on_gaudi.sh
+++ b/DocSum/tests/test_compose_on_gaudi.sh
@@ -3,23 +3,50 @@
# SPDX-License-Identifier: Apache-2.0
set -e
+
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
+export http_proxy=$http_proxy
+export https_proxy=$https_proxy
+export host_ip=$(hostname -I | awk '{print $1}')
+
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
+export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+export MEGA_SERVICE_HOST_IP=${host_ip}
+export LLM_SERVICE_HOST_IP=${host_ip}
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
+export no_proxy="${no_proxy},${host_ip}"
+
+export V2A_SERVICE_HOST_IP=${host_ip}
+export V2A_ENDPOINT=http://$host_ip:7078
+
+export A2T_ENDPOINT=http://$host_ip:7066
+export A2T_SERVICE_HOST_IP=${host_ip}
+export A2T_SERVICE_PORT=9099
+
+export DATA_ENDPOINT=http://$host_ip:7079
+export DATA_SERVICE_HOST_IP=${host_ip}
+export DATA_SERVICE_PORT=7079
+
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
-ip_address=$(hostname -I | awk '{print $1}')
+
+
+# Get the root folder of the current script
+ROOT_FOLDER=$(dirname "$(readlink -f "$0")")
function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="docsum docsum-ui llm-docsum-tgi"
+ service_list="docsum docsum-ui whisper dataprep-multimedia2text dataprep-audio2text dataprep-video2audio llm-docsum-tgi"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.6
@@ -29,19 +56,9 @@ function build_docker_images() {
function start_services() {
cd $WORKPATH/docker_compose/intel/hpu/gaudi
- export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
- export TGI_LLM_ENDPOINT="http://${ip_address}:8008"
- export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
- export MEGA_SERVICE_HOST_IP=${ip_address}
- export LLM_SERVICE_HOST_IP=${ip_address}
- export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/docsum"
-
- sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
+ docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
+ sleep 3m
- # Start Docker Containers
- docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
-
- n=0
until [[ "$n" -ge 100 ]]; do
docker logs tgi-gaudi-server > ${LOG_PATH}/tgi_service_start.log
if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then
@@ -60,6 +77,9 @@ function validate_services() {
local INPUT_DATA="$5"
local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+
+ echo "==========================================="
+
if [ "$HTTP_STATUS" -eq 200 ]; then
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
@@ -80,12 +100,86 @@ function validate_services() {
sleep 1s
}
+get_base64_str() {
+ local file_name=$1
+ base64 -w 0 "$file_name"
+}
+
+# Function to generate input data for testing based on the document type
+input_data_for_test() {
+ local document_type=$1
+ case $document_type in
+ ("text")
+ echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
+ ;;
+ ("audio")
+ get_base64_str "$ROOT_FOLDER/data/test.wav"
+ ;;
+ ("video")
+ get_base64_str "$ROOT_FOLDER/data/test.mp4"
+ ;;
+ (*)
+ echo "Invalid document type" >&2
+ exit 1
+ ;;
+ esac
+}
+
function validate_microservices() {
# Check if the microservices are running correctly.
+ # whisper microservice
+ ulimit -s 65536
+ validate_services \
+ "${host_ip}:7066/v1/asr" \
+ '{"asr_result":"well"}' \
+ "whisper" \
+ "whisper-service" \
+ "{\"audio\": \"$(input_data_for_test "audio")\"}"
+
+ # Audio2Text service
+ validate_services \
+ "${host_ip}:9199/v1/audio/transcriptions" \
+ '"query":"well"' \
+ "dataprep-audio2text" \
+ "dataprep-audio2text-service" \
+ "{\"byte_str\": \"$(input_data_for_test "audio")\"}"
+
+ # Video2Audio service
+ validate_services \
+ "${host_ip}:7078/v1/video2audio" \
+ "SUQzBAAAAAAAI1RTU0UAAAAPAAADTGF2ZjU4LjI5LjEwMAAAAAAAAAAAAAAA//tQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAASW5mbwAAAA8AAAAIAAAN3wAtLS0tLS0tLS0tLS1LS0tLS0tLS0tLS0tpaWlpaWlpaWlpaWlph4eHh4eHh4eHh4eHpaWlpaWlpaWlpaWlpcPDw8PDw8PDw8PDw+Hh4eHh4eHh4eHh4eH///////////////8AAAAATGF2YzU4LjU0AAAAAAAAAAAAAAAAJAYwAAAAAAAADd9L18KaAAAAAAAAAAAAAAAAAAAAAP/7kGQAAAMhClSVMEACMOAabaCMAREA" \
+ "dataprep-video2audio" \
+ "dataprep-video2audio-service" \
+ "{\"byte_str\": \"$(input_data_for_test "video")\"}"
+
+ # Docsum Data service - video
+ validate_services \
+ "${host_ip}:7079/v1/multimedia2text" \
+ '"query":"well"' \
+ "dataprep-multimedia2text" \
+ "dataprep-multimedia2text" \
+ "{\"video\": \"$(input_data_for_test "video")\"}"
+
+ # Docsum Data service - audio
+ validate_services \
+ "${host_ip}:7079/v1/multimedia2text" \
+ '"query":"well"' \
+ "dataprep-multimedia2text" \
+ "dataprep-multimedia2text" \
+ "{\"audio\": \"$(input_data_for_test "audio")\"}"
+
+ # Docsum Data service - text
+ validate_services \
+ "${host_ip}:7079/v1/multimedia2text" \
+ "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco" \
+ "dataprep-multimedia2text" \
+ "dataprep-multimedia2text" \
+ "{\"text\": \"$(input_data_for_test "text")\"}"
+
# tgi for llm service
validate_services \
- "${ip_address}:8008/generate" \
+ "${host_ip}:8008/generate" \
"generated_text" \
"tgi-gaudi" \
"tgi-gaudi-server" \
@@ -93,24 +187,28 @@ function validate_microservices() {
# llm microservice
validate_services \
- "${ip_address}:9000/v1/chat/docsum" \
+ "${host_ip}:9000/v1/chat/docsum" \
"data: " \
- "llm" \
+ "llm-docsum-tgi" \
"llm-docsum-gaudi-server" \
'{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
}
function validate_megaservice() {
- local SERVICE_NAME="mega-docsum"
+ local SERVICE_NAME="docsum-gaudi-backend-server"
local DOCKER_NAME="docsum-gaudi-backend-server"
- local EXPECTED_RESULT="embedding"
+ local EXPECTED_RESULT="[DONE]"
local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
- local URL="${ip_address}:8888/v1/docsum"
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
+ local URL="${host_ip}:8888/v1/docsum"
+ local DATA_TYPE="type=text"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
+
if [ "$HTTP_STATUS" -eq 200 ]; then
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
- local CONTENT=$(curl -s -X POST -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+ local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
echo "[ $SERVICE_NAME ] Content is as expected."
@@ -127,32 +225,32 @@ function validate_megaservice() {
sleep 1s
}
-function validate_frontend() {
- cd $WORKPATH/ui/svelte
- local conda_env_name="OPEA_e2e"
- export PATH=${HOME}/miniforge3/bin/:$PATH
- if conda info --envs | grep -q "$conda_env_name"; then
- echo "$conda_env_name exist!"
- else
- conda create -n ${conda_env_name} python=3.12 -y
- fi
- source activate ${conda_env_name}
-
- sed -i "s/localhost/$ip_address/g" playwright.config.ts
+function validate_megaservice_json() {
+ # Curl the Mega Service
+ echo ">>> Checking text data with Content-Type: application/json"
+ validate_services \
+ "${host_ip}:8888/v1/docsum" \
+ "[DONE]" \
+ "docsum-gaudi-backend-server" \
+ "docsum-gaudi-backend-server" \
+ '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
- conda install -c conda-forge nodejs -y
- npm install && npm ci && npx playwright install --with-deps
- node -v && npm -v && pip list
+ echo ">>> Checking audio data"
+ validate_services \
+ "${host_ip}:8888/v1/docsum" \
+ "[DONE]" \
+ "docsum-gaudi-backend-server" \
+ "docsum-gaudi-backend-server" \
+ "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
- exit_status=0
- npx playwright test || exit_status=$?
+ echo ">>> Checking video data"
+ validate_services \
+ "${host_ip}:8888/v1/docsum" \
+ "[DONE]" \
+ "docsum-gaudi-backend-server" \
+ "docsum-gaudi-backend-server" \
+ "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
- if [ $exit_status -ne 0 ]; then
- echo "[TEST INFO]: ---------frontend test failed---------"
- exit $exit_status
- else
- echo "[TEST INFO]: ---------frontend test passed---------"
- fi
}
function stop_docker() {
@@ -161,19 +259,39 @@ function stop_docker() {
}
function main() {
-
+ echo "==========================================="
+ echo ">>>> Stopping any running Docker containers..."
stop_docker
- if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
+ echo "==========================================="
+ if [[ "$IMAGE_REPO" == "opea" ]]; then
+ echo ">>>> Building Docker images..."
+ build_docker_images
+ fi
+
+ echo "==========================================="
+ echo ">>>> Starting Docker services..."
start_services
+ echo "==========================================="
+ echo ">>>> Validating microservices..."
validate_microservices
+
+ echo "==========================================="
+ echo ">>>> Validating megaservice..."
validate_megaservice
- validate_frontend
+ echo ">>>> Validating validate_megaservice_json..."
+ validate_megaservice_json
+ echo "==========================================="
+ echo ">>>> Stopping Docker containers..."
stop_docker
- echo y | docker system prune
+ echo "==========================================="
+ echo ">>>> Pruning Docker system..."
+ echo y | docker system prune
+ echo ">>>> Docker system pruned successfully."
+ echo "==========================================="
}
main
diff --git a/DocSum/tests/test_compose_on_xeon.sh b/DocSum/tests/test_compose_on_xeon.sh
index 7f0b2f8c5..555633cfc 100644
--- a/DocSum/tests/test_compose_on_xeon.sh
+++ b/DocSum/tests/test_compose_on_xeon.sh
@@ -3,23 +3,49 @@
# SPDX-License-Identifier: Apache-2.0
set -xe
+
IMAGE_REPO=${IMAGE_REPO:-"opea"}
IMAGE_TAG=${IMAGE_TAG:-"latest"}
+export http_proxy=$http_proxy
+export https_proxy=$https_proxy
+export host_ip=$(hostname -I | awk '{print $1}')
+
echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
echo "TAG=IMAGE_TAG=${IMAGE_TAG}"
export REGISTRY=${IMAGE_REPO}
export TAG=${IMAGE_TAG}
+export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
+export TGI_LLM_ENDPOINT="http://${host_ip}:8008"
+export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+export MEGA_SERVICE_HOST_IP=${host_ip}
+export LLM_SERVICE_HOST_IP=${host_ip}
+export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/docsum"
+export no_proxy="${no_proxy},${host_ip}"
+
+export V2A_SERVICE_HOST_IP=${host_ip}
+export V2A_ENDPOINT=http://$host_ip:7078
+
+export A2T_ENDPOINT=http://$host_ip:7066
+export A2T_SERVICE_HOST_IP=${host_ip}
+export A2T_SERVICE_PORT=9099
+
+export DATA_ENDPOINT=http://$host_ip:7079
+export DATA_SERVICE_HOST_IP=${host_ip}
+export DATA_SERVICE_PORT=7079
+
WORKPATH=$(dirname "$PWD")
LOG_PATH="$WORKPATH/tests"
-ip_address=$(hostname -I | awk '{print $1}')
+
+# Get the root folder of the current script
+ROOT_FOLDER=$(dirname "$(readlink -f "$0")")
function build_docker_images() {
cd $WORKPATH/docker_image_build
git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../
echo "Build all the images with --no-cache, check docker_image_build.log for details..."
- service_list="docsum docsum-ui llm-docsum-tgi"
+ service_list="docsum docsum-ui whisper dataprep-multimedia2text dataprep-audio2text dataprep-video2audio llm-docsum-tgi"
docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
docker pull ghcr.io/huggingface/text-generation-inference:1.4
@@ -29,17 +55,8 @@ function build_docker_images() {
function start_services() {
cd $WORKPATH/docker_compose/intel/cpu/xeon/
- export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"
- export TGI_LLM_ENDPOINT="http://${ip_address}:8008"
- export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
- export MEGA_SERVICE_HOST_IP=${ip_address}
- export LLM_SERVICE_HOST_IP=${ip_address}
- export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/docsum"
-
- sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env
-
- # Start Docker Containers
- docker compose up -d > ${LOG_PATH}/start_services_with_compose.log
+ docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log
+ sleep 3m
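+    # give the containers time to come up and the model to load before polling the TGI logs below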
until [[ "$n" -ge 100 ]]; do
docker logs tgi-service > ${LOG_PATH}/tgi_service_start.log
@@ -59,6 +76,9 @@ function validate_services() {
local INPUT_DATA="$5"
local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL")
+
+ echo "==========================================="
+
if [ "$HTTP_STATUS" -eq 200 ]; then
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
@@ -67,9 +87,12 @@ function validate_services() {
if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
echo "[ $SERVICE_NAME ] Content is as expected."
else
+ echo "EXPECTED_RESULT==> $EXPECTED_RESULT"
+ echo "CONTENT==> $CONTENT"
echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT"
docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log
exit 1
+
fi
else
echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS"
@@ -79,12 +102,86 @@ function validate_services() {
sleep 1s
}
+get_base64_str() {
+ local file_name=$1
+ base64 -w 0 "$file_name"
+}
+
+# Function to generate input data for testing based on the document type
+input_data_for_test() {
+ local document_type=$1
+ case $document_type in
+ ("text")
+ echo "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco of education week reports it takes just 10 minutes to cross through gillette wyoming this small city sits in the northeast corner of the state surrounded by 100s of miles of prairie but schools here in campbell county are on the edge of something big the next generation science standards you are going to build a strand of dna and you are going to decode it and figure out what that dna actually says for christy mathis at sage valley junior high school the new standards are about learning to think like a scientist there is a lot of really good stuff in them every standard is a performance task it is not you know the child needs to memorize these things it is the student needs to be able to do some pretty intense stuff we are analyzing we are critiquing we are."
+ ;;
+ ("audio")
+ get_base64_str "$ROOT_FOLDER/data/test.wav"
+ ;;
+ ("video")
+ get_base64_str "$ROOT_FOLDER/data/test.mp4"
+ ;;
+ (*)
+ echo "Invalid document type" >&2
+ exit 1
+ ;;
+ esac
+}
+
function validate_microservices() {
# Check if the microservices are running correctly.
+ # whisper microservice
+ ulimit -s 65536
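+    # note: on Linux the maximum combined size of command-line arguments scales with the stack size,
+    # so the larger stack leaves room for the long base64 payloads passed to curl below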
+ validate_services \
+ "${host_ip}:7066/v1/asr" \
+ '{"asr_result":"well"}' \
+ "whisper-service" \
+ "whisper-service" \
+ "{\"audio\": \"$(input_data_for_test "audio")\"}"
+
+ # Audio2Text service
+ validate_services \
+ "${host_ip}:9099/v1/audio/transcriptions" \
+ '"query":"well"' \
+ "dataprep-audio2text" \
+ "dataprep-audio2text-service" \
+ "{\"byte_str\": \"$(input_data_for_test "audio")\"}"
+
+ # Video2Audio service
+ validate_services \
+ "${host_ip}:7078/v1/video2audio" \
+ "SUQzBAAAAAAAI1RTU0UAAAAPAAADTGF2ZjU4LjI5LjEwMAAAAAAAAAAAAAAA//tQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAASW5mbwAAAA8AAAAIAAAN3wAtLS0tLS0tLS0tLS1LS0tLS0tLS0tLS0tpaWlpaWlpaWlpaWlph4eHh4eHh4eHh4eHpaWlpaWlpaWlpaWlpcPDw8PDw8PDw8PDw+Hh4eHh4eHh4eHh4eH///////////////8AAAAATGF2YzU4LjU0AAAAAAAAAAAAAAAAJAYwAAAAAAAADd9L18KaAAAAAAAAAAAAAAAAAAAAAP/7kGQAAAMhClSVMEACMOAabaCMAREA" \
+ "dataprep-video2audio" \
+ "dataprep-video2audio-service" \
+ "{\"byte_str\": \"$(input_data_for_test "video")\"}"
+
+ # Docsum Data service - video
+ validate_services \
+ "${host_ip}:7079/v1/multimedia2text" \
+ '"query":"well"' \
+ "dataprep-multimedia2text-service" \
+ "dataprep-multimedia2text" \
+ "{\"video\": \"$(input_data_for_test "video")\"}"
+
+ # Docsum Data service - audio
+ validate_services \
+ "${host_ip}:7079/v1/multimedia2text" \
+ '"query":"well"' \
+ "dataprep-multimedia2text-service" \
+ "dataprep-multimedia2text" \
+ "{\"audio\": \"$(input_data_for_test "audio")\"}"
+
+ # Docsum Data service - text
+ validate_services \
+ "${host_ip}:7079/v1/multimedia2text" \
+ "THIS IS A TEST >>>> and a number of states are starting to adopt them voluntarily special correspondent john delenco" \
+ "dataprep-multimedia2text-service" \
+ "dataprep-multimedia2text" \
+ "{\"text\": \"$(input_data_for_test "text")\"}"
+
# tgi for llm service
validate_services \
- "${ip_address}:8008/generate" \
+ "${host_ip}:8008/generate" \
"generated_text" \
"tgi-llm" \
"tgi-service" \
@@ -92,24 +189,28 @@ function validate_microservices() {
# llm microservice
validate_services \
- "${ip_address}:9000/v1/chat/docsum" \
+ "${host_ip}:9000/v1/chat/docsum" \
"data: " \
"llm" \
"llm-docsum-server" \
'{"query":"Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
+
}
function validate_megaservice() {
- local SERVICE_NAME="mega-docsum"
- local DOCKER_NAME="docsum-gaudi-backend-server"
- local EXPECTED_RESULT="embedding"
+ local SERVICE_NAME="docsum-xeon-backend-server"
+ local DOCKER_NAME="docsum-xeon-backend-server"
+ local EXPECTED_RESULT="[DONE]"
local INPUT_DATA="messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."
- local URL="${ip_address}:8888/v1/docsum"
- local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
+ local URL="${host_ip}:8888/v1/docsum"
+ local DATA_TYPE="type=text"
+
+ local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL")
+
if [ "$HTTP_STATUS" -eq 200 ]; then
echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..."
- local CONTENT=$(curl -s -X POST -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
+ local CONTENT=$(curl -s -X POST -F "$DATA_TYPE" -F "$INPUT_DATA" -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log)
if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then
echo "[ $SERVICE_NAME ] Content is as expected."
@@ -126,32 +227,33 @@ function validate_megaservice() {
sleep 1s
}
-function validate_frontend() {
- cd $WORKPATH/ui/svelte
- local conda_env_name="OPEA_e2e"
- export PATH=${HOME}/miniforge3/bin/:$PATH
- if conda info --envs | grep -q "$conda_env_name"; then
- echo "$conda_env_name exist!"
- else
- conda create -n ${conda_env_name} python=3.12 -y
- fi
- source activate ${conda_env_name}
-
- sed -i "s/localhost/$ip_address/g" playwright.config.ts
+function validate_megaservice_json() {
+ # Curl the Mega Service
+ echo ""
+ echo ">>> Checking text data with Content-Type: application/json"
+ validate_services \
+ "${host_ip}:8888/v1/docsum" \
+ "[DONE]" \
+ "docsum-xeon-backend-server" \
+ "docsum-xeon-backend-server" \
+ '{"type": "text", "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5."}'
- conda install -c conda-forge nodejs -y
- npm install && npm ci && npx playwright install --with-deps
- node -v && npm -v && pip list
+ echo ">>> Checking audio data"
+ validate_services \
+ "${host_ip}:8888/v1/docsum" \
+ "[DONE]" \
+ "docsum-xeon-backend-server" \
+ "docsum-xeon-backend-server" \
+ "{\"type\": \"audio\", \"messages\": \"$(input_data_for_test "audio")\"}"
- exit_status=0
- npx playwright test || exit_status=$?
+ echo ">>> Checking video data"
+ validate_services \
+ "${host_ip}:8888/v1/docsum" \
+ "[DONE]" \
+ "docsum-xeon-backend-server" \
+ "docsum-xeon-backend-server" \
+ "{\"type\": \"video\", \"messages\": \"$(input_data_for_test "video")\"}"
- if [ $exit_status -ne 0 ]; then
- echo "[TEST INFO]: ---------frontend test failed---------"
- exit $exit_status
- else
- echo "[TEST INFO]: ---------frontend test passed---------"
- fi
}
function stop_docker() {
@@ -160,19 +262,39 @@ function stop_docker() {
}
function main() {
-
+ echo "==========================================="
+ echo ">>>> Stopping any running Docker containers..."
stop_docker
- if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi
+ echo "==========================================="
+ if [[ "$IMAGE_REPO" == "opea" ]]; then
+ echo ">>>> Building Docker images..."
+ build_docker_images
+ fi
+
+ echo "==========================================="
+ echo ">>>> Starting Docker services..."
start_services
+ echo "==========================================="
+ echo ">>>> Validating microservices..."
validate_microservices
+
+ echo "==========================================="
+ echo ">>>> Validating megaservice..."
validate_megaservice
- validate_frontend
+    echo ">>>> Validating megaservice with JSON input..."
+ validate_megaservice_json
+ echo "==========================================="
+ echo ">>>> Stopping Docker containers..."
stop_docker
- echo y | docker system prune
+ echo "==========================================="
+ echo ">>>> Pruning Docker system..."
+ echo y | docker system prune
+ echo ">>>> Docker system pruned successfully."
+ echo "==========================================="
}
main
diff --git a/DocSum/tests/test_manifest_on_gaudi.sh b/DocSum/tests/test_manifest_on_gaudi.sh
index 48c03b427..785d9a899 100755
--- a/DocSum/tests/test_manifest_on_gaudi.sh
+++ b/DocSum/tests/test_manifest_on_gaudi.sh
@@ -36,6 +36,7 @@ function validate_docsum() {
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/docsum \
-H 'Content-Type: multipart/form-data' \
+ -F 'type=text' \
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
@@ -46,7 +47,7 @@ function validate_docsum() {
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "embedding" $LOGFILE) != 0 ]]; then
+        [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
status=true
fi
diff --git a/DocSum/tests/test_manifest_on_xeon.sh b/DocSum/tests/test_manifest_on_xeon.sh
index b066a33ea..b98cc9187 100755
--- a/DocSum/tests/test_manifest_on_xeon.sh
+++ b/DocSum/tests/test_manifest_on_xeon.sh
@@ -36,6 +36,7 @@ function validate_docsum() {
# Curl the Mega Service
curl http://${ip_address}:${port}/v1/docsum \
-H 'Content-Type: multipart/form-data' \
+ -F 'type=text' \
-F "messages=Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E5." > $LOGFILE
exit_code=$?
if [ $exit_code -ne 0 ]; then
@@ -46,7 +47,7 @@ function validate_docsum() {
echo "Checking response results, make sure the output is reasonable. "
local status=false
if [[ -f $LOGFILE ]] && \
- [[ $(grep -c "embedding" $LOGFILE) != 0 ]]; then
+        [[ $(grep -c "\[DONE\]" $LOGFILE) != 0 ]]; then
status=true
fi
diff --git a/DocSum/ui/docker/Dockerfile.gradio b/DocSum/ui/docker/Dockerfile.gradio
new file mode 100644
index 000000000..a4154dfe8
--- /dev/null
+++ b/DocSum/ui/docker/Dockerfile.gradio
@@ -0,0 +1,34 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Use the official Python 3.11 slim image as the base image
+FROM python:3.11-slim
+
+# Set the default language environment variable
+ENV LANG=C.UTF-8
+
+# Define a build argument for architecture (default is "cpu")
+ARG ARCH="cpu"
+
+# Update the package list and install necessary packages
+RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing build-essential
+
+# Create a directory for the application
+RUN mkdir -p /home/user
+
+# Copy the application code and requirements file to the container
+COPY ./gradio/docsum_ui_gradio.py /home/user/docsum_ui_gradio.py
+COPY ./gradio/requirements.txt /home/user/requirements.txt
+
+# Install Python dependencies
+RUN pip install --no-cache-dir --upgrade pip setuptools && \
+ pip install --no-cache-dir -r /home/user/requirements.txt
+
+# Set the working directory
+WORKDIR /home/user/
+
+# Expose the port that the application will run on
+EXPOSE 5173
+
+# Define the command to run the application
+CMD ["python", "docsum_ui_gradio.py"]
diff --git a/DocSum/ui/gradio/README.md b/DocSum/ui/gradio/README.md
new file mode 100644
index 000000000..6d7ce64cb
--- /dev/null
+++ b/DocSum/ui/gradio/README.md
@@ -0,0 +1,76 @@
+# Document Summary
+
+This project provides a user interface for summarizing documents and text using a Dockerized frontend application. Users can upload files or paste text to generate summaries.
+
+## Docker
+
+### Build UI Docker Image
+
+To build the frontend Docker image, navigate to the `GenAIExamples/DocSum/ui` directory and run the following command:
+
+```bash
+cd GenAIExamples/DocSum/ui
+docker build -t opea/docsum-gradio-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile.gradio .
+```
+
+This command builds the Docker image and tags it as `opea/docsum-gradio-ui:latest`. It also passes the proxy settings as build arguments so the build can reach the internet from behind a corporate firewall.
+
+### Run UI Docker Image
+
+To run the frontend Docker image, navigate to the `GenAIExamples/DocSum/ui/docker` directory and execute the following commands:
+
+```bash
+cd GenAIExamples/DocSum/ui/docker
+
+ip_address=$(hostname -I | awk '{print $1}')
+docker run -d -p 5173:5173 --ipc=host \
+ -e http_proxy=$http_proxy \
+ -e https_proxy=$https_proxy \
+ -e no_proxy=$no_proxy \
+ -e BACKEND_SERVICE_ENDPOINT=http://$ip_address:8888/v1/docsum \
+ opea/docsum-gradio-ui:latest
+```
+
+This command runs the Docker container in detached mode, mapping port 5173 of the host to port 5173 of the container. It also sets several environment variables, including the backend service endpoint, which the frontend needs in order to communicate with the backend service.
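+
+To confirm the container is up and the UI is reachable (assuming the default port mapping above), you can, for example:
+
+```bash
+docker ps --filter "ancestor=opea/docsum-gradio-ui:latest"
+curl -s -o /dev/null -w "%{http_code}\n" http://localhost:5173
+```
+
+A `200` response indicates the Gradio UI is serving.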
+
+## Python
+
+To run the frontend application directly using Python, navigate to the `GenAIExamples/DocSum/ui/gradio` directory and run the following command:
+
+```bash
+cd GenAIExamples/DocSum/ui/gradio
+python docsum_ui_gradio.py
+```
+
+This command starts the frontend application using Python.
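+
+Note that the application reads the `BACKEND_SERVICE_ENDPOINT` environment variable and falls back to `http://localhost:8888/v1/docsum` when it is unset, so point it at your backend first if the backend runs on another host:
+
+```bash
+ip_address=$(hostname -I | awk '{print $1}')
+export BACKEND_SERVICE_ENDPOINT=http://$ip_address:8888/v1/docsum
+python docsum_ui_gradio.py
+```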
+
+## 📸 Project Screenshots
+
+![project-screenshot](../../assets/img/docSum_ui_gradio_text.png)
+
+### 🧐 Features
+
+Here are some of the project's features:
+
+- Summarizing Uploaded Files: Users can upload files from their local device. Once a file is uploaded, the summarization of the document will start automatically. The summary will be displayed in the 'Summary' box.
+- Summarizing Pasted Text: Users can paste the text to be summarized into the text box. Clicking the 'Generate Summary' button produces a condensed summary, displayed in the 'Summary' box on the right.
+
+## Additional Information
+
+### Prerequisites
+
+Ensure you have Docker installed and running on your system. Also, make sure you have the necessary proxy settings configured if you are behind a corporate firewall.
+
+### Environment Variables
+
+- `http_proxy`: Proxy setting for HTTP connections.
+- `https_proxy`: Proxy setting for HTTPS connections.
+- `no_proxy`: Comma-separated list of hosts that should be excluded from proxying.
+- `BACKEND_SERVICE_ENDPOINT`: The endpoint of the backend service that the frontend will communicate with.
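+
+For example, behind a corporate proxy these might look like the following (values are illustrative):
+
+```bash
+export http_proxy=http://proxy.example.com:8080
+export https_proxy=http://proxy.example.com:8080
+export no_proxy=localhost,127.0.0.1
+```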
+
+### Troubleshooting
+
+- Docker Build Issues: If you encounter issues while building the Docker image, ensure that your proxy settings are correctly configured and that you have internet access.
+- Docker Run Issues: If the Docker container fails to start, check the environment variables and ensure that the backend service is running and accessible.
+
diff --git a/DocSum/ui/gradio/docsum_ui_gradio.py b/DocSum/ui/gradio/docsum_ui_gradio.py
new file mode 100644
index 000000000..e9b9b5099
--- /dev/null
+++ b/DocSum/ui/gradio/docsum_ui_gradio.py
@@ -0,0 +1,257 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import ast
+import base64
+import json
+import logging
+import os
+
+import gradio as gr
+import requests
+import uvicorn
+from fastapi import FastAPI
+from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class DocSumUI:
+ def __init__(self):
+ """Initialize the DocSumUI class with accepted file types, headers, and backend service endpoint."""
+ self.ACCEPTED_FILE_TYPES = ["pdf", "doc", "docx"]
+ self.HEADERS = {"Content-Type": "application/json"}
+ self.BACKEND_SERVICE_ENDPOINT = os.getenv("BACKEND_SERVICE_ENDPOINT", "http://localhost:8888/v1/docsum")
+
+ def encode_file_to_base64(self, file_path):
+ """Encode the content of a file to a base64 string.
+
+ Args:
+ file_path (str): The path to the file to be encoded.
+
+ Returns:
+ str: The base64 encoded string of the file content.
+ """
+ logger.info(">>> Encoding file to base64: %s", file_path)
+ with open(file_path, "rb") as f:
+ base64_str = base64.b64encode(f.read()).decode("utf-8")
+ return base64_str
+
+ def read_file(self, file):
+ """Read and process the content of a file.
+
+ Args:
+ file (file-like object): The file to be read.
+
+ Returns:
+ str: The content of the file or an error message if the file type is unsupported.
+ """
+ self.page_content = ""
+ self.pages = []
+
+ if file.name.endswith(".pdf"):
+ loader = PyPDFLoader(file)
+ elif file.name.endswith((".doc", ".docx")):
+ loader = Docx2txtLoader(file)
+ else:
+ msg = f"Unsupported file type '{file.name}'. Choose from {self.ACCEPTED_FILE_TYPES}"
+ logger.error(msg)
+ return msg
+
+ for page in loader.lazy_load():
+ self.page_content += page.page_content
+
+ return self.page_content
+
+ def read_audio_file(self, file):
+ """Read and process the content of an audio file.
+
+ Args:
+ file (file-like object): The audio file to be read.
+
+ Returns:
+ str: The base64 encoded content of the audio file.
+ """
+ logger.info(">>> Reading audio file: %s", file.name)
+ base64_str = self.encode_file_to_base64(file)
+        return base64_str
+
+ def read_video_file(self, file):
+ """Read and process the content of a video file.
+
+ Args:
+ file (file-like object): The video file to be read.
+
+ Returns:
+ str: The base64 encoded content of the video file.
+ """
+ logger.info(">>> Reading video file: %s", file.name)
+ base64_str = self.encode_file_to_base64(file)
+        return base64_str
+
+ def generate_summary(self, doc_content, document_type="text"):
+ """Generate a summary for the given document content.
+
+ Args:
+ doc_content (str): The content of the document.
+ document_type (str): The type of the document (default is "text").
+
+ Returns:
+ str: The generated summary or an error message.
+ """
+
+ logger.info(">>> BACKEND_SERVICE_ENDPOINT - %s", self.BACKEND_SERVICE_ENDPOINT)
+
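+        # Payload for the DocSum gateway: "type" selects the input modality ("text", "audio", or "video")
+        # and "messages" carries the raw text or base64-encoded media content.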
+ data = {"max_tokens": 256, "type": document_type, "messages": doc_content}
+
+ try:
+ response = requests.post(
+ url=self.BACKEND_SERVICE_ENDPOINT,
+ headers=self.HEADERS,
+ data=json.dumps(data),
+                # requests expects scheme keys ("http"/"https") in the proxies mapping,
+                # and .get() avoids a KeyError when the proxy variables are unset
+                proxies={"http": os.environ.get("http_proxy"), "https": os.environ.get("https_proxy")},
+ )
+
+ if response.status_code == 200:
+ try:
+ # Check if the specific log path is in the response text
+ if "/logs/LLMChain/final_output" in response.text:
+ # Extract the relevant part of the response
+ temp = ast.literal_eval(
+ [
+ i.split("data: ")[1]
+ for i in response.text.split("\n\n")
+ if "/logs/LLMChain/final_output" in i
+ ][0]
+ )["ops"]
+
+ # Find the final output value
+ final_output = [i["value"] for i in temp if i["path"] == "/logs/LLMChain/final_output"][0]
+ return final_output["text"]
+ else:
+ # Perform string replacements to clean the response text
+ cleaned_text = response.text
+ replacements = [
+ ("'\n\ndata: b'", ""),
+ ("data: b' ", ""),
+ ("'\n\ndata: [DONE]\n\n", ""),
+ ("\n\ndata: b", ""),
+ ("'\n\n", ""),
+ ("'\n", ""),
+ ('''\'"''', ""),
+ ]
+ for old, new in replacements:
+ cleaned_text = cleaned_text.replace(old, new)
+ return cleaned_text
+ except (IndexError, KeyError, ValueError) as e:
+ # Handle potential errors during parsing
+ logger.error("Error parsing response: %s", e)
+ return response.text
+
+ except requests.exceptions.RequestException as e:
+ logger.error("Request exception: %s", e)
+ return str(e)
+
+ return str(response.status_code)
+
+    def create_upload_ui(self, label, file_types, process_function, document_type="text"):
+        """Create a Gradio UI for file uploads.
+
+        Args:
+            label (str): The label for the upload button.
+            file_types (list): The list of accepted file types.
+            process_function (function): The function that turns the uploaded file into summarizable content.
+            document_type (str): The document type forwarded to generate_summary (default is "text").
+
+        Returns:
+            gr.Blocks: The Gradio Blocks object representing the upload UI.
+        """
+ logger.info(">>> Creating upload UI for label: %s", label)
+ with gr.Blocks() as upload_ui:
+ with gr.Row():
+ with gr.Column():
+ upload_btn = gr.UploadButton(label=label, file_count="single", file_types=file_types)
+ with gr.Column():
+ generated_text = gr.TextArea(
+ label="Text Summary", placeholder="Summarized text will be displayed here"
+ )
+            upload_btn.upload(
+                lambda file: self.generate_summary(process_function(file), document_type=document_type),
+                upload_btn,
+                generated_text,
+            )
+ return upload_ui
+
+ def render(self):
+ """Render the Gradio UI for the DocSum application.
+
+ Returns:
+ gr.Blocks: The Gradio Blocks object representing the UI.
+ """
+ logger.info(">>> Rendering Gradio UI")
+ # Plain text UI
+ with gr.Blocks() as text_ui:
+ with gr.Row():
+ with gr.Column():
+ input_text = gr.TextArea(
+ label="Please paste content for summarization",
+ placeholder="Paste the text information you need to summarize",
+ )
+ submit_btn = gr.Button("Generate Summary")
+ with gr.Column():
+ generated_text = gr.TextArea(
+ label="Text Summary", placeholder="Summarized text will be displayed here"
+ )
+ submit_btn.click(fn=self.generate_summary, inputs=[input_text], outputs=[generated_text])
+
+ # File Upload UI
+ file_ui = self.create_upload_ui(
+ label="Please upload a document (.pdf, .doc, .docx)",
+ file_types=[".pdf", ".doc", ".docx"],
+ process_function=self.read_file,
+ )
+
+ # Audio Upload UI
+ audio_ui = self.create_upload_ui(
+ label="Please upload audio file (.wav, .mp3)",
+ file_types=[".wav", ".mp3"],
+            process_function=self.read_audio_file,
+            document_type="audio",
+ )
+
+ # Video Upload UI
+ video_ui = self.create_upload_ui(
+            label="Please upload Video file (.mp4)", file_types=[".mp4"], process_function=self.read_video_file, document_type="video"
+ )
+
+ # Render all the UI in separate tabs
+ with gr.Blocks() as self.demo:
+ gr.Markdown("# Doc Summary")
+ with gr.Tabs():
+ with gr.TabItem("Paste Text"):
+ text_ui.render()
+ with gr.TabItem("Upload file"):
+ file_ui.render()
+ with gr.TabItem("Upload Audio"):
+ audio_ui.render()
+ with gr.TabItem("Upload Video"):
+ video_ui.render()
+
+ return self.demo
+
+
+app = FastAPI()
+
+demo = DocSumUI().render()
+
+demo.queue()
+
+app = gr.mount_gradio_app(app, demo, path="/")
+
+if __name__ == "__main__":
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--host", type=str, default="0.0.0.0")
+ parser.add_argument("--port", type=int, default=5173)
+
+ args = parser.parse_args()
+ logger.info(">>> Starting server at %s:%d", args.host, args.port)
+
+ uvicorn.run(app, host=args.host, port=args.port)
diff --git a/DocSum/ui/gradio/requirements.txt b/DocSum/ui/gradio/requirements.txt
new file mode 100644
index 000000000..b2ed7b43f
--- /dev/null
+++ b/DocSum/ui/gradio/requirements.txt
@@ -0,0 +1,8 @@
+docx2txt
+gradio==4.44.0
+langchain_community
+moviepy==1.0.3
+numpy==1.26.4
+opencv-python==4.10.0.82
+Pillow==10.3.0
+pypdf
diff --git a/DocSum/ui/svelte/src/lib/shared/Network.ts b/DocSum/ui/svelte/src/lib/shared/Network.ts
index 705019c89..02e78f288 100644
--- a/DocSum/ui/svelte/src/lib/shared/Network.ts
+++ b/DocSum/ui/svelte/src/lib/shared/Network.ts
@@ -28,9 +28,11 @@ export async function fetchTextStream(query: string | Blob, params: string, file
if (params === "doc_id") {
formData.append("files", file, fileName);
formData.append("messages", query);
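+      // send an explicit input type so the DocSum backend treats the upload as plain text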
+ formData.append("type", "text");
} else if (params === "text") {
formData.append("files", file, fileName);
formData.append("messages", query);
+ formData.append("type", "text");
}
// Initiate the POST request to upload the file