From e2374544db483fd2f8e05f8b6fb31236d93357cf Mon Sep 17 00:00:00 2001 From: XinyuYe-Intel Date: Thu, 27 Jun 2024 00:18:30 +0800 Subject: [PATCH] Added support for extracting info from image in the docs (#120) Signed-off-by: Ye, Xinyu --- comps/dataprep/README.md | 7 ++ comps/dataprep/milvus/requirements.txt | 4 + comps/dataprep/qdrant/README.md | 3 +- comps/dataprep/qdrant/requirements.txt | 4 + comps/dataprep/redis/README.md | 3 +- .../dataprep/redis/langchain/requirements.txt | 3 + comps/dataprep/utils.py | 93 +++++++++++++++---- 7 files changed, 95 insertions(+), 22 deletions(-) diff --git a/comps/dataprep/README.md b/comps/dataprep/README.md index 36017eb6bd..46e5e3c0f7 100644 --- a/comps/dataprep/README.md +++ b/comps/dataprep/README.md @@ -2,6 +2,13 @@ The Dataprep Microservice aims to preprocess the data from various sources (either structured or unstructured data) to text data, and convert the text data to embedding vectors then store them in the database. +## Install Requirements + +```bash +apt-get update +apt-get install libreoffice +``` + ## Use LVM (Large Vision Model) for Summarizing Image Data Occasionally unstructured data will contain image data, to convert the image data to the text data, LVM can be used to summarize the image. To leverage LVM, please refer to this [readme](../lvms/README.md) to start the LVM microservice first and then set the below environment variable, before starting any dataprep microservice. 
diff --git a/comps/dataprep/milvus/requirements.txt b/comps/dataprep/milvus/requirements.txt index fbfe59080d..9f71a69d41 100644 --- a/comps/dataprep/milvus/requirements.txt +++ b/comps/dataprep/milvus/requirements.txt @@ -1,5 +1,7 @@ beautifulsoup4 +cairosvg docarray[full] +docx2txt easyocr fastapi frontend==0.0.3 @@ -8,6 +10,7 @@ langchain langchain-community langchain-text-splitters langchain_milvus +markdown numpy opentelemetry-api opentelemetry-exporter-otlp @@ -19,6 +22,7 @@ pydantic==2.7.3 pymilvus==2.4.3 pymupdf==1.24.5 python-docx==0.8.11 +python-pptx sentence_transformers shortuuid unstructured[all-docs]==0.11.5 diff --git a/comps/dataprep/qdrant/README.md b/comps/dataprep/qdrant/README.md index bfa5c8f464..24f58fc09d 100644 --- a/comps/dataprep/qdrant/README.md +++ b/comps/dataprep/qdrant/README.md @@ -13,7 +13,7 @@ apt-get install poppler-utils -y ## Start Qdrant Server -Please refer to this [readme](../../../vectorstores/langchain/qdrant/README.md). +Please refer to this [readme](../../vectorstores/langchain/qdrant/README.md). 
## Setup Environment Variables @@ -24,6 +24,7 @@ export https_proxy=${your_http_proxy} export QDRANT=${host_ip} export QDRANT_PORT=6333 export COLLECTION_NAME=${your_collection_name} +export PYTHONPATH=${path_to_comps} ``` ## Start Document Preparation Microservice for Qdrant with Python Script diff --git a/comps/dataprep/qdrant/requirements.txt b/comps/dataprep/qdrant/requirements.txt index bcba8a1560..2c9df40f57 100644 --- a/comps/dataprep/qdrant/requirements.txt +++ b/comps/dataprep/qdrant/requirements.txt @@ -1,11 +1,14 @@ beautifulsoup4 +cairosvg docarray[full] +docx2txt easyocr fastapi huggingface_hub langchain langchain-community langchain-text-splitters +markdown numpy opentelemetry-api opentelemetry-exporter-otlp @@ -15,6 +18,7 @@ Pillow prometheus-fastapi-instrumentator pymupdf python-docx +python-pptx qdrant-client sentence_transformers shortuuid diff --git a/comps/dataprep/redis/README.md b/comps/dataprep/redis/README.md index 8c58fc85e3..6ef6853897 100644 --- a/comps/dataprep/redis/README.md +++ b/comps/dataprep/redis/README.md @@ -31,7 +31,7 @@ cd langchain_ray; pip install -r requirements_ray.txt ## 1.2 Start Redis Stack Server -Please refer to this [readme](../../../vectorstores/langchain/redis/README.md). +Please refer to this [readme](../../vectorstores/langchain/redis/README.md). 
## 1.3 Setup Environment Variables @@ -41,6 +41,7 @@ export INDEX_NAME=${your_index_name} export LANGCHAIN_TRACING_V2=true export LANGCHAIN_API_KEY=${your_langchain_api_key} export LANGCHAIN_PROJECT="opea/gen-ai-comps:dataprep" +export PYTHONPATH=${path_to_comps} ``` ## 1.4 Start Document Preparation Microservice for Redis with Python Script diff --git a/comps/dataprep/redis/langchain/requirements.txt b/comps/dataprep/redis/langchain/requirements.txt index 3c542bed6b..f7a95a5370 100644 --- a/comps/dataprep/redis/langchain/requirements.txt +++ b/comps/dataprep/redis/langchain/requirements.txt @@ -1,5 +1,7 @@ beautifulsoup4 +cairosvg docarray[full] +docx2txt easyocr fastapi huggingface_hub @@ -18,6 +20,7 @@ prometheus-fastapi-instrumentator pymupdf pyspark python-docx +python-pptx redis sentence_transformers shortuuid diff --git a/comps/dataprep/utils.py b/comps/dataprep/utils.py index 76f5e578e6..786d5c8dcf 100644 --- a/comps/dataprep/utils.py +++ b/comps/dataprep/utils.py @@ -9,15 +9,20 @@ import multiprocessing import os import re +import shutil import signal import timeit import unicodedata from urllib.parse import urlparse, urlunparse +import cairosvg +import docx +import docx2txt import easyocr import fitz import numpy as np import pandas as pd +import pptx import requests import yaml from bs4 import BeautifulSoup @@ -27,7 +32,6 @@ UnstructuredHTMLLoader, UnstructuredImageLoader, UnstructuredMarkdownLoader, - UnstructuredPowerPointLoader, UnstructuredXMLLoader, ) from langchain_community.llms import HuggingFaceEndpoint @@ -131,32 +135,81 @@ def load_txt(txt_path): def load_doc(doc_path): """Load doc file.""" - txt_path = doc_path.replace(".doc", ".txt") - try: - os.system(f'antiword "{doc_path}" > "{txt_path}"') - except: - raise AssertionError( - "antiword failed or not installed, if not installed," - + 'use "apt-get update && apt-get install -y antiword" to install it.' 
- ) - text = load_txt(txt_path) - os.remove(txt_path) + print("Converting doc file to docx file...") + docx_path = doc_path + "x" + os.system(f"libreoffice --headless --invisible --convert-to docx --outdir {os.path.dirname(docx_path)} {doc_path}") + print("Converted doc file to docx file.") + text = load_docx(docx_path) + os.remove(docx_path) return text def load_docx(docx_path): """Load docx file.""" - doc = DDocument(docx_path) + doc = docx.Document(docx_path) text = "" + # Save all 'rId:filenames' relationships in a dictionary and save the images if any. + rid2img = {} + for r in doc.part.rels.values(): + if isinstance(r._target, docx.parts.image.ImagePart): + rid2img[r.rId] = os.path.basename(r._target.partname) + if rid2img: + save_path = "./imgs/" + os.makedirs(save_path, exist_ok=True) + docx2txt.process(docx_path, save_path) for paragraph in doc.paragraphs: - text += paragraph.text + if hasattr(paragraph, "text"): + text += paragraph.text + "\n" + if "graphicData" in paragraph._p.xml: + for rid in rid2img: + if rid in paragraph._p.xml: + img_path = os.path.join(save_path, rid2img[rid]) + img_text = load_image(img_path) + if img_text: + text += img_text + "\n" + if rid2img: + shutil.rmtree(save_path) + return text + + +def load_ppt(ppt_path): + """Load ppt file.""" + print("Converting ppt file to pptx file...") + pptx_path = ppt_path + "x" + os.system(f"libreoffice --headless --invisible --convert-to pptx --outdir {os.path.dirname(pptx_path)} {ppt_path}") + print("Converted ppt file to pptx file.") + text = load_pptx(pptx_path) + os.remove(pptx_path) return text def load_pptx(pptx_path): """Load pptx file.""" - loader = UnstructuredPowerPointLoader(pptx_path) - text = loader.load()[0].page_content + text = "" + prs = pptx.Presentation(pptx_path) + for slide in prs.slides: + for shape in sorted(slide.shapes, key=lambda shape: (shape.top, shape.left)): + if shape.has_text_frame: + if shape.text: + text += shape.text + "\n" + if shape.has_table: + 
table_contents = "\n".join( + [ + "\t".join([(cell.text if hasattr(cell, "text") else "") for cell in row.cells]) + for row in shape.table.rows + if hasattr(row, "cells") + ] + ) + if table_contents: + text += table_contents + "\n" + if hasattr(shape, "image") and hasattr(shape.image, "blob"): + img_path = f"./{shape.image.filename}" + with open(img_path, "wb") as f: + f.write(shape.image.blob) + img_text = load_image(img_path) + if img_text: + text += img_text + "\n" + os.remove(img_path) return text @@ -214,13 +267,11 @@ def load_image(image_path): return response.json()["text"].strip() loader = UnstructuredImageLoader(image_path) text = loader.load()[0].page_content - return text + return text.strip() def load_svg(svg_path): """Load the svg file.""" - import cairosvg - png_path = svg_path.replace(".svg", ".png") cairosvg.svg2png(url=svg_path, write_to=png_path) text = load_image(png_path) @@ -239,7 +290,9 @@ def document_loader(doc_path): return load_doc(doc_path) elif doc_path.endswith(".docx"): return load_docx(doc_path) - elif doc_path.endswith(".pptx") or doc_path.endswith(".ppt"): + elif doc_path.endswith(".ppt"): + return load_ppt(doc_path) + elif doc_path.endswith(".pptx"): return load_pptx(doc_path) elif doc_path.endswith(".md"): return load_md(doc_path) @@ -261,7 +314,7 @@ def document_loader(doc_path): ): return load_image(doc_path) elif doc_path.endswith(".svg"): - return load_image(doc_path) + return load_svg(doc_path) else: raise NotImplementedError( "Current only support pdf, html, txt, doc, docx, pptx, ppt, md, xml"