Fix Dataprep Upload Link issue (opea-project#913)
* fix html content loading problem

Signed-off-by: letonghan <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add empty list check (opea-project#914)

* Add outputs.

Signed-off-by: ZePan110 <[email protected]>

* Add empty list check

Signed-off-by: ZePan110 <[email protected]>

* test CI.

Signed-off-by: ZePan110 <[email protected]>

* Remove test files

Signed-off-by: ZePan110 <[email protected]>

* remove debug code

Signed-off-by: chensuyue <[email protected]>

---------

Signed-off-by: ZePan110 <[email protected]>
Signed-off-by: chensuyue <[email protected]>
Co-authored-by: chensuyue <[email protected]>

* Fix hardware tag retrieval issue (opea-project#916)

Signed-off-by: ZePan110 <[email protected]>

* fix html content loading problem

Signed-off-by: letonghan <[email protected]>

* fix milvus connection issue

Signed-off-by: letonghan <[email protected]>

* update parse_html function for all dbs

Signed-off-by: letonghan <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: letonghan <[email protected]>
Signed-off-by: ZePan110 <[email protected]>
Signed-off-by: chensuyue <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: ZePan110 <[email protected]>
Co-authored-by: chensuyue <[email protected]>
4 people authored and cameronmorin committed Nov 28, 2024
1 parent b1d8d8c commit bd0c166
Showing 17 changed files with 53 additions and 24 deletions.
comps/dataprep/milvus/langchain/prepare_doc_milvus.py (7 additions, 8 deletions)

@@ -30,7 +30,7 @@
     encode_filename,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     remove_folder_with_ignore,
     save_content_to_local_disk,
 )
@@ -39,17 +39,16 @@
 logflag = os.getenv("LOGFLAG", False)
 
 # workaround notes: cp comps/dataprep/utils.py ./milvus/utils.py
-# from utils import document_loader, get_tables_result, parse_html
 index_params = {"index_type": "FLAT", "metric_type": "IP", "params": {}}
 partition_field_name = "filename"
 upload_folder = "./uploaded_files/"
+milvus_uri = f"http://{MILVUS_HOST}:{MILVUS_PORT}"
 
 
 class MosecEmbeddings(OpenAIEmbeddings):
     def _get_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        _chunk_size = chunk_size or self.chunk_size
         batched_embeddings: List[List[float]] = []
         response = self.client.create(input=texts, **self._invocation_params)
         if not isinstance(response, dict):
@@ -93,7 +92,7 @@ def ingest_chunks_to_milvus(file_name: str, chunks: List):
             batch_docs,
             embeddings,
             collection_name=COLLECTION_NAME,
-            connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
+            connection_args={"uri": milvus_uri},
             partition_key_field=partition_field_name,
         )
     except Exception as e:
@@ -211,7 +210,7 @@ async def ingest_documents(
     my_milvus = Milvus(
         embedding_function=embeddings,
         collection_name=COLLECTION_NAME,
-        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
+        connection_args={"uri": milvus_uri},
         index_params=index_params,
         auto_id=True,
     )
@@ -318,7 +317,7 @@ async def ingest_documents(
             )
 
             save_path = upload_folder + encoded_link + ".txt"
-            content = parse_html([link])[0][0]
+            content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
             await save_content_to_local_disk(save_path, content)
             ingest_data_to_milvus(
                 DocPath(
@@ -347,7 +346,7 @@ async def rag_get_file_structure():
     my_milvus = Milvus(
         embedding_function=embeddings,
         collection_name=COLLECTION_NAME,
-        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
+        connection_args={"uri": milvus_uri},
         index_params=index_params,
         auto_id=True,
     )
@@ -405,7 +404,7 @@ async def delete_single_file(file_path: str = Body(..., embed=True)):
     my_milvus = Milvus(
         embedding_function=embeddings,
         collection_name=COLLECTION_NAME,
-        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
+        connection_args={"uri": milvus_uri},
         index_params=index_params,
         auto_id=True,
     )
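The connection_args change, repeated at every Milvus constructor call in this file, is the substance of the "fix milvus connection issue" commit: a single uri string replaces the separate host/port pair. A minimal sketch of the new pattern, assuming local default values and a placeholder embedding model and collection name (none of these specifics come from the diff):

# Minimal sketch of the uri-based connection adopted above.
# Host, port, model, and collection name are illustrative assumptions.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus

MILVUS_HOST = "localhost"
MILVUS_PORT = 19530
milvus_uri = f"http://{MILVUS_HOST}:{MILVUS_PORT}"

embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
my_milvus = Milvus(
    embedding_function=embeddings,
    collection_name="rag_milvus",
    connection_args={"uri": milvus_uri},  # single uri replaces the host/port pair
    index_params={"index_type": "FLAT", "metric_type": "IP", "params": {}},
    auto_id=True,
)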
comps/dataprep/milvus/langchain/requirements.txt (1 addition, 0 deletions)

@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
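The html2text line recurs in every backend's requirements.txt below: it is the package behind the Html2TextTransformer that the new parse_html_new helper in comps/dataprep/utils.py uses to turn fetched HTML into plain text.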
comps/dataprep/neo4j/llama_index/extract_graph_neo4j.py (2 additions, 2 deletions)

@@ -48,7 +48,7 @@
     encode_filename,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     save_content_to_local_disk,
 )
 
@@ -654,7 +654,7 @@ async def ingest_documents(
         for link in link_list:
             encoded_link = encode_filename(link)
             save_path = upload_folder + encoded_link + ".txt"
-            content = parse_html([link])[0][0]
+            content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
             try:
                 await save_content_to_local_disk(save_path, content)
                 index = ingest_data_to_neo4j(
comps/dataprep/neo4j/llama_index/requirements.txt (1 addition, 0 deletions)

@@ -6,6 +6,7 @@ easyocr
 fastapi
 future
 graspologic
+html2text
 huggingface_hub
 ipython
 langchain
comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py (2 additions, 2 deletions)

@@ -21,7 +21,7 @@
     encode_filename,
     get_file_structure,
     get_separators,
-    parse_html,
+    parse_html_new,
     remove_folder_with_ignore,
     save_content_to_local_disk,
 )
@@ -158,7 +158,7 @@ async def ingest_link_to_pgvector(link_list: List[str]):
 
     for link in link_list:
         texts = []
-        content = parse_html([link])[0][0]
+        content = parse_html_new([link], chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
         if logflag:
             logger.info(f"[ ingest link ] link: {link} content: {content}")
         encoded_link = encode_filename(link)
comps/dataprep/pgvector/langchain/requirements.txt (1 addition, 0 deletions)

@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
comps/dataprep/pinecone/langchain/prepare_doc_pinecone.py (4 additions, 4 deletions)

@@ -24,7 +24,7 @@
     get_file_structure,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     remove_folder_with_ignore,
     save_content_to_local_disk,
 )
@@ -158,7 +158,7 @@ def ingest_data_to_pinecone(doc_path: DocPath):
 pc = Pinecone(api_key=PINECONE_API_KEY)
 
 
-async def ingest_link_to_pinecone(link_list: List[str]):
+async def ingest_link_to_pinecone(link_list: List[str], chunk_size, chunk_overlap):
     # Create embedding obj
     if tei_embedding_endpoint:
         # create embeddings using TEI endpoint service
@@ -178,7 +178,7 @@ async def ingest_link_to_pinecone(link_list: List[str]):
 
     # save link contents and doc_ids one by one
     for link in link_list:
-        content = parse_html([link])[0][0]
+        content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
         if logflag:
             logger.info(f"[ ingest link ] link: {link} content: {content}")
         encoded_link = encode_filename(link)
@@ -239,7 +239,7 @@ async def ingest_documents(
         link_list = json.loads(link_list)  # Parse JSON string to list
         if not isinstance(link_list, list):
             raise HTTPException(status_code=400, detail="link_list should be a list.")
-        await ingest_link_to_pinecone(link_list)
+        await ingest_link_to_pinecone(link_list, chunk_size, chunk_overlap)
         result = {"status": 200, "message": "Data preparation succeeded"}
         if logflag:
             logger.info(f"Successfully saved link list {link_list}")
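Pinecone is the only backend where the fix also changes a signature: ingest_link_to_pinecone now takes chunk_size and chunk_overlap from the upload request instead of relying on parse_html's fixed chunking, and the call site in ingest_documents forwards both values. A hedged sketch of the resulting call path; the import path mirrors the file location, and the URL and chunk values are illustrative:

import asyncio

# Assumes a configured PINECONE_API_KEY, which the module reads at import time.
from comps.dataprep.pinecone.langchain.prepare_doc_pinecone import ingest_link_to_pinecone

async def main():
    # chunk_size and chunk_overlap normally arrive as form fields on the
    # dataprep upload request; 1500/100 are illustrative values.
    await ingest_link_to_pinecone(["https://example.com"], 1500, 100)

asyncio.run(main())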
comps/dataprep/pinecone/langchain/requirements.txt (1 addition, 0 deletions)

@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
comps/dataprep/qdrant/langchain/prepare_doc_qdrant.py (2 additions, 2 deletions)

@@ -19,7 +19,7 @@
     encode_filename,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     save_content_to_local_disk,
 )
 
@@ -149,7 +149,7 @@ async def ingest_documents(
         for link in link_list:
             encoded_link = encode_filename(link)
             save_path = upload_folder + encoded_link + ".txt"
-            content = parse_html([link])[0][0]
+            content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
             try:
                 await save_content_to_local_disk(save_path, content)
                 ingest_data_to_qdrant(
comps/dataprep/qdrant/langchain/requirements.txt (1 addition, 0 deletions)

@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
comps/dataprep/redis/langchain/prepare_doc_redis.py (2 additions, 2 deletions)

@@ -26,7 +26,7 @@
     format_search_results,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     remove_folder_with_ignore,
     save_content_to_local_disk,
 )
@@ -320,7 +320,7 @@ async def ingest_documents(
             )
 
             save_path = upload_folder + encoded_link + ".txt"
-            content = parse_html([link])[0][0]
+            content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
             await save_content_to_local_disk(save_path, content)
             ingest_data_to_redis(
                 DocPath(
comps/dataprep/redis/langchain/requirements.txt (1 addition, 0 deletions)

@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py (2 additions, 2 deletions)

@@ -48,7 +48,7 @@
     encode_filename,
     get_file_structure,
     get_separators,
-    parse_html,
+    parse_html_new,
     remove_folder_with_ignore,
     save_content_to_local_disk,
     timeout,
@@ -255,7 +255,7 @@ def ingest_link_to_redis(link_list: List[str], enable_ray=False, num_cpus=20):
     link_list = [str(f) for f in link_list]
 
     def _parse_html(link):
-        data = parse_html([link])
+        data = parse_html_new([link], chunk_size=1500, chunk_overlap=100)
         return data[0][0]
 
     if enable_ray:
comps/dataprep/redis/langchain_ray/requirements.txt (1 addition, 0 deletions)

@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
comps/dataprep/utils.py (22 additions, 0 deletions)

@@ -620,6 +620,28 @@ def parse_html(input):
     return chucks
 
 
+def load_html_content(links, chunk_size=1500, chunk_overlap=50):
+    from langchain.text_splitter import RecursiveCharacterTextSplitter
+    from langchain_community.document_loaders import AsyncHtmlLoader
+    from langchain_community.document_transformers import Html2TextTransformer
+
+    loader = AsyncHtmlLoader(links, ignore_load_errors=True, trust_env=True)
+    docs = loader.load()
+    html2text = Html2TextTransformer()
+    docs = list(html2text.transform_documents(docs))
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    docs = text_splitter.split_documents(docs)
+    return docs
+
+
+def parse_html_new(input, chunk_size, chunk_overlap):
+    docs = load_html_content(input, chunk_size, chunk_overlap)
+    html_content = ""
+    for doc in docs:
+        html_content += doc.page_content + "\n"
+    return html_content
+
+
 def get_tables_result(pdf_path, table_strategy):
     """Extract tables information from pdf file."""
     if table_strategy == "fast":
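These two new helpers are the core of the "fix html content loading problem" change: AsyncHtmlLoader fetches the pages (ignore_load_errors and trust_env make it tolerant of bad links and proxy setups), Html2TextTransformer strips the markup (this is what pulls html2text into every requirements.txt), and RecursiveCharacterTextSplitter chunks the text. Note the return-type change driving the call-site edits above: parse_html returned a nested list, hence the old parse_html([link])[0][0] indexing, while parse_html_new returns one newline-joined string that callers use directly. A short usage sketch, assuming network access and this repository's import layout; the URL is a placeholder:

from comps.dataprep.utils import parse_html_new

# Returns a single newline-joined string of chunked page text, so callers
# no longer index into a nested result as they did with parse_html.
content = parse_html_new(["https://example.com"], chunk_size=1500, chunk_overlap=100)
print(content[:200])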
comps/dataprep/vdms/langchain/prepare_doc_vdms.py (2 additions, 2 deletions)

@@ -19,7 +19,7 @@
     encode_filename,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     save_content_to_local_disk,
 )
 
@@ -143,7 +143,7 @@ async def ingest_documents(
         # check whether the link file already exists
 
         save_path = upload_folder + encoded_link + ".txt"
-        content = parse_html([link])[0][0]
+        content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
         await save_content_to_local_disk(save_path, content)
         ingest_data_to_vdms(
             DocPath(
comps/dataprep/vdms/langchain/requirements.txt (1 addition, 0 deletions)

@@ -6,6 +6,7 @@ docx2txt
 easyocr
 einops
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
