diff --git a/comps/dataprep/README.md b/comps/dataprep/README.md
index 36017eb6bd..46e5e3c0f7 100644
--- a/comps/dataprep/README.md
+++ b/comps/dataprep/README.md
@@ -2,6 +2,13 @@
 
 The Dataprep Microservice aims to preprocess the data from various sources (either structured or unstructured data) to text data, and convert the text data to embedding vectors then store them in the database.
 
+## Install Requirements
+
+```bash
+apt-get update
+apt-get install libreoffice
+```
+
 ## Use LVM (Large Vision Model) for Summarizing Image Data
 
 Occasionally unstructured data will contain image data, to convert the image data to the text data, LVM can be used to summarize the image. To leverage LVM, please refer to this [readme](../lvms/README.md) to start the LVM microservice first and then set the below environment variable, before starting any dataprep microservice.
diff --git a/comps/dataprep/milvus/requirements.txt b/comps/dataprep/milvus/requirements.txt
index fbfe59080d..9f71a69d41 100644
--- a/comps/dataprep/milvus/requirements.txt
+++ b/comps/dataprep/milvus/requirements.txt
@@ -1,5 +1,7 @@
 beautifulsoup4
+cairosvg
 docarray[full]
+docx2txt
 easyocr
 fastapi
 frontend==0.0.3
@@ -8,6 +10,7 @@ langchain
 langchain-community
 langchain-text-splitters
 langchain_milvus
+markdown
 numpy
 opentelemetry-api
 opentelemetry-exporter-otlp
@@ -19,6 +22,7 @@ pydantic==2.7.3
 pymilvus==2.4.3
 pymupdf==1.24.5
 python-docx==0.8.11
+python-pptx
 sentence_transformers
 shortuuid
 unstructured[all-docs]==0.11.5
diff --git a/comps/dataprep/qdrant/README.md b/comps/dataprep/qdrant/README.md
index bfa5c8f464..24f58fc09d 100644
--- a/comps/dataprep/qdrant/README.md
+++ b/comps/dataprep/qdrant/README.md
@@ -13,7 +13,7 @@ apt-get install poppler-utils -y
 
 ## Start Qdrant Server
 
-Please refer to this [readme](../../../vectorstores/langchain/qdrant/README.md).
+Please refer to this [readme](../../vectorstores/langchain/qdrant/README.md).
 
 ## Setup Environment Variables
 
@@ -24,6 +24,7 @@ export https_proxy=${your_http_proxy}
 export QDRANT=${host_ip}
 export QDRANT_PORT=6333
 export COLLECTION_NAME=${your_collection_name}
+export PYTHONPATH=${path_to_comps}
 ```
 
 ## Start Document Preparation Microservice for Qdrant with Python Script
diff --git a/comps/dataprep/qdrant/requirements.txt b/comps/dataprep/qdrant/requirements.txt
index bcba8a1560..2c9df40f57 100644
--- a/comps/dataprep/qdrant/requirements.txt
+++ b/comps/dataprep/qdrant/requirements.txt
@@ -1,11 +1,14 @@
 beautifulsoup4
+cairosvg
 docarray[full]
+docx2txt
 easyocr
 fastapi
 huggingface_hub
 langchain
 langchain-community
 langchain-text-splitters
+markdown
 numpy
 opentelemetry-api
 opentelemetry-exporter-otlp
@@ -15,6 +18,7 @@ Pillow
 prometheus-fastapi-instrumentator
 pymupdf
 python-docx
+python-pptx
 qdrant-client
 sentence_transformers
 shortuuid
diff --git a/comps/dataprep/redis/README.md b/comps/dataprep/redis/README.md
index 8c58fc85e3..6ef6853897 100644
--- a/comps/dataprep/redis/README.md
+++ b/comps/dataprep/redis/README.md
@@ -31,7 +31,7 @@ cd langchain_ray; pip install -r requirements_ray.txt
 
 ## 1.2 Start Redis Stack Server
 
-Please refer to this [readme](../../../vectorstores/langchain/redis/README.md).
+Please refer to this [readme](../../vectorstores/langchain/redis/README.md).
 
 ## 1.3 Setup Environment Variables
 
@@ -41,6 +41,7 @@ export INDEX_NAME=${your_index_name}
 export LANGCHAIN_TRACING_V2=true
 export LANGCHAIN_API_KEY=${your_langchain_api_key}
 export LANGCHAIN_PROJECT="opea/gen-ai-comps:dataprep"
+export PYTHONPATH=${path_to_comps}
 ```
 
 ## 1.4 Start Document Preparation Microservice for Redis with Python Script
diff --git a/comps/dataprep/redis/langchain/requirements.txt b/comps/dataprep/redis/langchain/requirements.txt
index 3c542bed6b..f7a95a5370 100644
--- a/comps/dataprep/redis/langchain/requirements.txt
+++ b/comps/dataprep/redis/langchain/requirements.txt
@@ -1,5 +1,7 @@
 beautifulsoup4
+cairosvg
 docarray[full]
+docx2txt
 easyocr
 fastapi
 huggingface_hub
@@ -18,6 +20,7 @@ prometheus-fastapi-instrumentator
 pymupdf
 pyspark
 python-docx
+python-pptx
 redis
 sentence_transformers
 shortuuid
diff --git a/comps/dataprep/utils.py b/comps/dataprep/utils.py
index 76f5e578e6..786d5c8dcf 100644
--- a/comps/dataprep/utils.py
+++ b/comps/dataprep/utils.py
@@ -9,15 +9,20 @@
 import multiprocessing
 import os
 import re
+import shutil
 import signal
 import timeit
 import unicodedata
 from urllib.parse import urlparse, urlunparse
 
+import cairosvg
+import docx
+import docx2txt
 import easyocr
 import fitz
 import numpy as np
 import pandas as pd
+import pptx
 import requests
 import yaml
 from bs4 import BeautifulSoup
@@ -27,7 +32,6 @@
     UnstructuredHTMLLoader,
     UnstructuredImageLoader,
     UnstructuredMarkdownLoader,
-    UnstructuredPowerPointLoader,
     UnstructuredXMLLoader,
 )
 from langchain_community.llms import HuggingFaceEndpoint
@@ -131,32 +135,81 @@ def load_txt(txt_path):
 
 def load_doc(doc_path):
     """Load doc file."""
-    txt_path = doc_path.replace(".doc", ".txt")
-    try:
-        os.system(f'antiword "{doc_path}" > "{txt_path}"')
-    except:
-        raise AssertionError(
-            "antiword failed or not installed, if not installed,"
-            + 'use "apt-get update && apt-get install -y antiword" to install it.'
-        )
-    text = load_txt(txt_path)
-    os.remove(txt_path)
+    print("Converting doc file to docx file...")
+    docx_path = doc_path + "x"
+    os.system(f"libreoffice --headless --invisible --convert-to docx --outdir {os.path.dirname(docx_path)} {doc_path}")
+    print("Converted doc file to docx file.")
+    text = load_docx(docx_path)
+    os.remove(docx_path)
     return text
 
 
 def load_docx(docx_path):
     """Load docx file."""
-    doc = DDocument(docx_path)
+    doc = docx.Document(docx_path)
     text = ""
+    # Save all 'rId:filenames' relationships in a dictionary and save the images if any.
+    rid2img = {}
+    for r in doc.part.rels.values():
+        if isinstance(r._target, docx.parts.image.ImagePart):
+            rid2img[r.rId] = os.path.basename(r._target.partname)
+    if rid2img:
+        save_path = "./imgs/"
+        os.makedirs(save_path, exist_ok=True)
+        docx2txt.process(docx_path, save_path)
     for paragraph in doc.paragraphs:
-        text += paragraph.text
+        if hasattr(paragraph, "text"):
+            text += paragraph.text + "\n"
+        if "graphicData" in paragraph._p.xml:
+            for rid in rid2img:
+                if rid in paragraph._p.xml:
+                    img_path = os.path.join(save_path, rid2img[rid])
+                    img_text = load_image(img_path)
+                    if img_text:
+                        text += img_text + "\n"
+    if rid2img:
+        shutil.rmtree(save_path)
+    return text
+
+
+def load_ppt(ppt_path):
+    """Load ppt file."""
+    print("Converting ppt file to pptx file...")
+    pptx_path = ppt_path + "x"
+    os.system(f"libreoffice --headless --invisible --convert-to pptx --outdir {os.path.dirname(pptx_path)} {ppt_path}")
+    print("Converted ppt file to pptx file.")
+    text = load_pptx(pptx_path)
+    os.remove(pptx_path)
     return text
 
 
 def load_pptx(pptx_path):
     """Load pptx file."""
-    loader = UnstructuredPowerPointLoader(pptx_path)
-    text = loader.load()[0].page_content
+    text = ""
+    prs = pptx.Presentation(pptx_path)
+    for slide in prs.slides:
+        for shape in sorted(slide.shapes, key=lambda shape: (shape.top, shape.left)):
+            if shape.has_text_frame:
+                if shape.text:
+                    text += shape.text + "\n"
+            if shape.has_table:
+                table_contents = "\n".join(
+                    [
+                        "\t".join([(cell.text if hasattr(cell, "text") else "") for cell in row.cells])
+                        for row in shape.table.rows
+                        if hasattr(row, "cells")
+                    ]
+                )
+                if table_contents:
+                    text += table_contents + "\n"
+            if hasattr(shape, "image") and hasattr(shape.image, "blob"):
+                img_path = f"./{shape.image.filename}"
+                with open(img_path, "wb") as f:
+                    f.write(shape.image.blob)
+                img_text = load_image(img_path)
+                if img_text:
+                    text += img_text + "\n"
+                os.remove(img_path)
     return text
 
 
@@ -214,13 +267,11 @@ def load_image(image_path):
         return response.json()["text"].strip()
     loader = UnstructuredImageLoader(image_path)
     text = loader.load()[0].page_content
-    return text
+    return text.strip()
 
 
 def load_svg(svg_path):
     """Load the svg file."""
-    import cairosvg
-
     png_path = svg_path.replace(".svg", ".png")
     cairosvg.svg2png(url=svg_path, write_to=png_path)
     text = load_image(png_path)
@@ -239,7 +290,9 @@ def document_loader(doc_path):
         return load_doc(doc_path)
     elif doc_path.endswith(".docx"):
         return load_docx(doc_path)
-    elif doc_path.endswith(".pptx") or doc_path.endswith(".ppt"):
+    elif doc_path.endswith(".ppt"):
+        return load_ppt(doc_path)
+    elif doc_path.endswith(".pptx"):
         return load_pptx(doc_path)
     elif doc_path.endswith(".md"):
         return load_md(doc_path)
@@ -261,7 +314,7 @@ def document_loader(doc_path):
     ):
         return load_image(doc_path)
     elif doc_path.endswith(".svg"):
-        return load_image(doc_path)
+        return load_svg(doc_path)
     else:
         raise NotImplementedError(
             "Current only support pdf, html, txt, doc, docx, pptx, ppt, md, xml"
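
A note on the reworked `load_docx`: it first builds an `rId -> filename` map from the document part's relationships, because the relationship ID of an embedded picture is what reappears in the XML of the paragraph that displays it. A minimal standalone sketch of that lookup (the `sample.docx` path is hypothetical; python-docx is assumed to be installed):

```python
# Sketch of the rId -> image-filename lookup that load_docx builds up front.
# "sample.docx" is a hypothetical input file; python-docx must be installed.
import os

import docx

doc = docx.Document("sample.docx")

# Embedded pictures are ImageParts reachable through the document part's
# relationships; each rId also shows up in the XML of the paragraph that
# renders the picture, which is how load_docx interleaves image text with prose.
rid2img = {
    rel.rId: os.path.basename(rel._target.partname)
    for rel in doc.part.rels.values()
    if isinstance(rel._target, docx.parts.image.ImagePart)
}
print(rid2img)  # e.g. {'rId7': 'image1.png'}
```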
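
With these changes, the `.doc`/`.ppt` branches shell out to LibreOffice for conversion and `.svg` no longer falls through to `load_image`, which cannot parse SVG directly. A minimal end-to-end sketch, assuming LibreOffice is on PATH, the newly added requirements (cairosvg, docx2txt, markdown, python-pptx) are installed, `PYTHONPATH` is set as the READMEs now describe, and the sample paths below are hypothetical:

```python
# Hedged usage sketch of the new document_loader routing; the sample file
# paths are hypothetical and LibreOffice must be installed for .ppt/.doc.
from comps.dataprep.utils import document_loader

# .ppt -> LibreOffice .pptx conversion -> python-pptx; .pptx parsed directly;
# .svg -> cairosvg PNG rasterization -> load_image.
for path in ["samples/deck.ppt", "samples/deck.pptx", "samples/logo.svg"]:
    text = document_loader(path)
    print(path, "->", text[:120])
```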