DataPrep extract info from table in the docs (#146)

* Add microservice for table extraction Signed-off-by: Liangyx2 <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Signed-off-by: Liangyx2 <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update license copyright Signed-off-by: Liangyx2 <[email protected]> * DataPrep extract info from table in the docs Signed-off-by: Liangyx2 <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * refine Signed-off-by: Liangyx2 <[email protected]> * refine Signed-off-by: Liangyx2 <[email protected]> * Update prepare_doc_redis.py * Update prepare_doc_qdrant.py * Update prepare_doc_milvus.py --------- Signed-off-by: Liangyx2 <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: chen, suyue <[email protected]> Co-authored-by: XuhuiRen <[email protected]>
opea-project · Jun 26, 2024 · 953e784 · 953e784
1 parent 54e9b20
commit 953e784
Show file tree

Hide file tree

Showing 11 changed files with 170 additions and 8 deletions.
diff --git a/comps/cores/proto/docarray.py b/comps/cores/proto/docarray.py
@@ -22,6 +22,8 @@ class DocPath(BaseDoc):
     path: str
     chunk_size: int = 1500
     chunk_overlap: int = 100
+    process_table: bool = False
+    table_strategy: str = "fast"
 
 
 class EmbedDoc768(BaseDoc):

diff --git a/comps/dataprep/milvus/README.md b/comps/dataprep/milvus/README.md
@@ -6,6 +6,9 @@
 
 ```bash
 pip install -r requirements.txt
+apt-get install tesseract-ocr -y
+apt-get install libtesseract-dev -y
+apt-get install poppler-utils -y
 ```
 
 ## Start Milvus Server
@@ -60,3 +63,11 @@ You can specify chunk_size and chunk_size by the following commands.
 ```bash
 curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name","chunk_size":1500,"chunk_overlap":100}' http://localhost:6010/v1/dataprep
 ```
+
+We support table extraction from PDF documents. You can specify `process_table` and `table_strategy` with the following command. `table_strategy` selects the strategy used to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm", the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast".
+
+Note: If you specify "table_strategy=llm", you should first start the TGI Service (please refer to sections 1.2.1 and 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md), and then run `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`.
+
+```bash
+curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name","process_table":true,"table_strategy":"hq"}' http://localhost:6010/v1/dataprep
+```
diff --git a/comps/dataprep/milvus/prepare_doc_milvus.py b/comps/dataprep/milvus/prepare_doc_milvus.py
@@ -13,7 +13,7 @@
 from comps.cores.mega.micro_service import opea_microservices, register_microservice
 from comps.cores.proto.docarray import DocPath
 from comps.cores.telemetry.opea_telemetry import opea_telemetry
-from comps.dataprep.utils import document_loader
+from comps.dataprep.utils import document_loader, get_tables_result
 
 # current_script_path = os.path.dirname(os.path.abspath(__file__))
 # parent_dir = os.path.dirname(current_script_path)
@@ -49,7 +49,9 @@ def ingest_documents(doc_path: DocPath):
 
     content = document_loader(path)
     chunks = text_splitter.split_text(content)
-
+    if doc_path.process_table and path.endswith(".pdf"):
+        table_chunks = get_tables_result(path, doc_path.table_strategy)
+        chunks = chunks + table_chunks
     print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
     # Create vectorstore
     if EMBEDDING_ENDPOINT:

diff --git a/comps/dataprep/milvus/requirements.txt b/comps/dataprep/milvus/requirements.txt
@@ -21,3 +21,4 @@ pymupdf==1.24.5
 python-docx==0.8.11
 sentence_transformers
 shortuuid
+unstructured[all-docs]==0.11.5
diff --git a/comps/dataprep/qdrant/README.md b/comps/dataprep/qdrant/README.md
@@ -6,6 +6,9 @@
 
 ```bash
 pip install -r requirements.txt
+apt-get install tesseract-ocr -y
+apt-get install libtesseract-dev -y
+apt-get install poppler-utils -y
 ```
 
 ## Start Qdrant Server
@@ -49,7 +52,6 @@ docker run -d --name="dataprep-qdrant-server" -p 6000:6000 --ipc=host -e http_pr
 ## Setup Environment Variables
 
 ```bash
-export no_proxy=${your_no_proxy}
 export http_proxy=${your_http_proxy}
 export https_proxy=${your_http_proxy}
 export QDRANT=${host_ip}
@@ -77,3 +79,11 @@ You can specify chunk_size and chunk_size by the following commands.
 ```bash
 curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document","chunk_size":1500,"chunk_overlap":100}' http://localhost:6000/v1/dataprep
 ```
+
+We support table extraction from PDF documents. You can specify `process_table` and `table_strategy` with the following command. `table_strategy` selects the strategy used to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm", the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast".
+
+Note: If you specify "table_strategy=llm", you should first start the TGI Service (please refer to sections 1.2.1 and 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md), and then run `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`.
+
+```bash
+curl -X POST -H "Content-Type: application/json" -d '{"path":"/path/to/document","process_table":true,"table_strategy":"hq"}' http://localhost:6000/v1/dataprep
+```
diff --git a/comps/dataprep/qdrant/prepare_doc_qdrant.py b/comps/dataprep/qdrant/prepare_doc_qdrant.py
@@ -10,7 +10,7 @@
 from langchain_text_splitters import HTMLHeaderTextSplitter
 
 from comps import DocPath, opea_microservices, opea_telemetry, register_microservice
-from comps.dataprep.utils import document_loader
+from comps.dataprep.utils import document_loader, get_tables_result
 
 tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")
 
@@ -43,7 +43,9 @@ def ingest_documents(doc_path: DocPath):
 
     content = document_loader(path)
     chunks = text_splitter.split_text(content)
-
+    if doc_path.process_table and path.endswith(".pdf"):
+        table_chunks = get_tables_result(path, doc_path.table_strategy)
+        chunks = chunks + table_chunks
     print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
     # Create vectorstore
     if tei_embedding_endpoint:

diff --git a/comps/dataprep/qdrant/requirements.txt b/comps/dataprep/qdrant/requirements.txt
@@ -18,3 +18,4 @@ python-docx
 qdrant-client
 sentence_transformers
 shortuuid
+unstructured[all-docs]==0.11.5
diff --git a/comps/dataprep/redis/README.md b/comps/dataprep/redis/README.md
@@ -13,6 +13,9 @@ We organized these two folders in the same way, so you can use either framework
 ```bash
 apt update
 apt install default-jre
+apt-get install tesseract-ocr -y
+apt-get install libtesseract-dev -y
+apt-get install poppler-utils -y
 # for langchain
 cd langchain
 # for llama_index
@@ -147,12 +150,25 @@ You can specify chunk_size and chunk_size by the following commands.
 ```bash
 curl -X POST \
     -H "Content-Type: multipart/form-data" \
-    -F "files=@/home/sdp/yuxiang/opea_intent/GenAIComps4/comps/table_extraction/LLAMA2_page6.pdf" \
+    -F "files=@./file1.txt" \
     -F "chunk_size=1500" \
     -F "chunk_overlap=100" \
     http://localhost:6007/v1/dataprep
 ```
 
+We support table extraction from PDF documents. You can specify `process_table` and `table_strategy` with the following command. `table_strategy` selects the strategy used to understand tables for table retrieval. As the setting progresses from "fast" to "hq" to "llm", the focus shifts towards deeper table understanding at the expense of processing speed. The default strategy is "fast".
+
+Note: If you specify "table_strategy=llm", you should first start the TGI Service (please refer to sections 1.2.1 and 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md), and then run `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`.
+
+```bash
+curl -X POST \
+    -H "Content-Type: multipart/form-data" \
+    -F "files=@./your_file.pdf" \
+    -F "process_table=true" \
+    -F "table_strategy=hq" \
+    http://localhost:6007/v1/dataprep
+```
+
 - Multiple file upload
 
 ```bash

diff --git a/comps/dataprep/redis/langchain/prepare_doc_redis.py b/comps/dataprep/redis/langchain/prepare_doc_redis.py
@@ -17,7 +17,7 @@
 from pyspark import SparkConf, SparkContext
 
 from comps import DocPath, opea_microservices, register_microservice
-from comps.dataprep.utils import document_loader, parse_html
+from comps.dataprep.utils import document_loader, get_tables_result, parse_html
 
 tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")
 
@@ -53,6 +53,9 @@ def ingest_data_to_redis(doc_path: DocPath):
     content = document_loader(path)
 
     chunks = text_splitter.split_text(content)
+    if doc_path.process_table and path.endswith(".pdf"):
+        table_chunks = get_tables_result(path, doc_path.table_strategy)
+        chunks = chunks + table_chunks
     print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
 
     # Create vectorstore
@@ -117,6 +120,8 @@ async def ingest_documents(
     link_list: Optional[str] = Form(None),
     chunk_size: int = Form(1500),
     chunk_overlap: int = Form(100),
+    process_table: bool = Form(False),
+    table_strategy: str = Form("fast"),
 ):
     print(f"files:{files}")
     print(f"link_list:{link_list}")
@@ -133,6 +138,15 @@ async def ingest_documents(
         for file in files:
             save_path = upload_folder + file.filename
             await save_file_to_local_disk(save_path, file)
+            ingest_data_to_redis(
+                DocPath(
+                    path=save_path,
+                    chunk_size=chunk_size,
+                    chunk_overlap=chunk_overlap,
+                    process_table=process_table,
+                    table_strategy=table_strategy,
+                )
+            )
             uploaded_files.append(save_path)
             print(f"Successfully saved file {save_path}")
 

diff --git a/comps/dataprep/redis/langchain/requirements.txt b/comps/dataprep/redis/langchain/requirements.txt
@@ -21,4 +21,4 @@ python-docx
 redis
 sentence_transformers
 shortuuid
-unstructured
+unstructured[all-docs]==0.11.5
diff --git a/comps/dataprep/utils.py b/comps/dataprep/utils.py
@@ -22,13 +22,15 @@
 import yaml
 from bs4 import BeautifulSoup
 from docx import Document as DDocument
+from langchain import LLMChain, PromptTemplate
 from langchain_community.document_loaders import (
     UnstructuredHTMLLoader,
     UnstructuredImageLoader,
     UnstructuredMarkdownLoader,
     UnstructuredPowerPointLoader,
     UnstructuredXMLLoader,
 )
+from langchain_community.llms import HuggingFaceEndpoint
 from PIL import Image
 
 
@@ -457,3 +459,104 @@ def parse_html(input):
             print("The given link/str {} cannot be parsed.".format(link))
 
     return chucks
+
+
+def get_tables_result(pdf_path, table_strategy):
+    """Extract the tables of a PDF file as a list of text chunks.
+
+    Args:
+        pdf_path: Path of the PDF file, passed to ``partition_pdf``.
+        table_strategy: How each table is summarized.
+            ``"fast"`` skips table extraction and yields no chunks.
+            ``"hq"`` uses the text of the closest related caption element
+            (see ``get_relation``); when none is related, it falls back to
+            the text of the table's parent element.
+            ``"llm"`` summarizes the table HTML with ``llm_generate``.
+            Any other value emits the tables without a summary.
+
+    Returns:
+        A list with one string per detected table. The list is empty for
+        the ``"fast"`` strategy, so callers can always concatenate the
+        result with their other chunks.
+    """
+    if table_strategy == "fast":
+        # Callers do ``chunks + table_chunks``; returning None here would
+        # raise TypeError, so hand back an empty list instead.
+        return []
+
+    from unstructured.documents.elements import FigureCaption
+    from unstructured.partition.pdf import partition_pdf
+
+    raw_pdf_elements = partition_pdf(
+        filename=pdf_path,
+        infer_table_structure=True,
+    )
+    tables = [el for el in raw_pdf_elements if el.category == "Table"]
+
+    # The set of caption candidates does not depend on the table, so it is
+    # computed once instead of being re-filtered for every table.
+    caption_candidates = []
+    if table_strategy == "hq":
+        caption_candidates = [
+            el for el in raw_pdf_elements if isinstance(el, FigureCaption) or el.text.startswith("Tab")
+        ]
+
+    tables_result = []
+    for table in tables:
+        content = table.metadata.text_as_html
+        table_summary = None
+        if table_strategy == "hq":
+            table_coords = table.metadata.coordinates.points
+            table_page_number = table.metadata.page_number
+            min_distance = float("inf")
+            for element in caption_candidates:
+                related, y_distance = get_relation(
+                    table_coords,
+                    element.metadata.coordinates.points,
+                    table_page_number,
+                    element.metadata.page_number,
+                )
+                # Strict "<" keeps the first candidate on ties.
+                if related and y_distance < min_distance:
+                    min_distance = y_distance
+                    table_summary = element.text
+            if table_summary is None:
+                # No related caption: fall back to the parent element's text.
+                parent_id = table.metadata.parent_id
+                table_summary = next(
+                    (el.text for el in raw_pdf_elements if el.id == parent_id),
+                    None,
+                )
+        elif table_strategy == "llm":
+            table_summary = llm_generate(content).lstrip("\n ")
+
+        if table_summary is None:
+            text = f"[Table: {content}]"
+        else:
+            text = f"|Table: [Summary: {table_summary}], [Content: {content}]|"
+        tables_result.append(text)
+    return tables_result
+
+
+def llm_generate(content):
+    """Ask the LLM service to summarize a table.
+
+    The endpoint URL is read from the ``TGI_LLM_ENDPOINT`` environment
+    variable and defaults to ``http://localhost:8080``.
+
+    Args:
+        content: Table content substituted into the ``{table_content}``
+            slot of the prompt (the prompt states it is in HTML format).
+
+    Returns:
+        The ``"text"`` entry of the chain's response.
+    """
+    summary_prompt = PromptTemplate(
+        template="""
+    Task: Your task is to give a concise summary of the table. \
+    The summary should cover the overall table structure and all detailed information of the table. \
+    The table will be given in html format. Summarize the table below.
+    ---
+    ### Table:
+    {table_content}
+    ---
+    ### Generated Summary:
+    """,
+        input_variables=["table_content"],
+    )
+
+    endpoint = HuggingFaceEndpoint(
+        endpoint_url=os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080"),
+        max_new_tokens=1000,
+        top_k=40,
+        top_p=0.9,
+        temperature=0.8,
+        streaming=False,
+        num_beams=2,
+        num_return_sequences=2,
+        use_cache=True,
+        timeout=600,
+    )
+
+    summary_chain = LLMChain(prompt=summary_prompt, llm=endpoint)
+    summary = summary_chain.invoke(content)["text"]
+    print("response", summary)
+    return summary
+
+
+def get_relation(table_coords, caption_coords, table_page_number, caption_page_number, threshold=100):
+    """Decide whether a caption belongs to a table and measure their vertical gap.
+
+    Args:
+        table_coords: Corner points of the table; read as ``[0][0]``,
+            ``[0][1]``, ``[1][1]`` and ``[2][0]``.
+        caption_coords: Corner points of the caption, read the same way.
+        table_page_number: Page number of the table.
+        caption_page_number: Page number of the caption.
+        threshold: The vertical gap must be strictly below this value for
+            the pair to count as related.
+
+    Returns:
+        A ``(related, y_distance)`` tuple. ``related`` is true only when both
+        elements share a page, their horizontal extents overlap and the
+        vertical gap is below ``threshold``. ``y_distance`` is that gap, or
+        0 when the two boxes overlap vertically.
+    """
+    # NOTE(review): the names assume points are ordered top-left, bottom-left,
+    # bottom-right with y growing downwards -- confirm against the producer.
+    table_left, table_top = table_coords[0][0], table_coords[0][1]
+    table_bottom, table_right = table_coords[1][1], table_coords[2][0]
+    caption_left, caption_top = caption_coords[0][0], caption_coords[0][1]
+    caption_bottom, caption_right = caption_coords[1][1], caption_coords[2][0]
+
+    overlap_width = min(table_right, caption_right) - max(table_left, caption_left)
+
+    gap_caption_first = table_top - caption_bottom
+    gap_table_first = caption_top - table_bottom
+    if gap_caption_first >= 0:
+        y_distance = gap_caption_first
+    elif gap_table_first >= 0:
+        y_distance = gap_table_first
+    else:
+        y_distance = 0
+
+    related = table_page_number == caption_page_number and overlap_width > 0 and y_distance < threshold
+    return related, y_distance