diff --git a/comps/dataprep/milvus/langchain/prepare_doc_milvus.py b/comps/dataprep/milvus/langchain/prepare_doc_milvus.py
index 3def86f81..a6014b621 100644
--- a/comps/dataprep/milvus/langchain/prepare_doc_milvus.py
+++ b/comps/dataprep/milvus/langchain/prepare_doc_milvus.py
@@ -30,7 +30,7 @@
     encode_filename,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     remove_folder_with_ignore,
     save_content_to_local_disk,
 )
@@ -39,17 +39,16 @@
 logflag = os.getenv("LOGFLAG", False)
 
 # workaround notes: cp comps/dataprep/utils.py ./milvus/utils.py
-# from utils import document_loader, get_tables_result, parse_html
 index_params = {"index_type": "FLAT", "metric_type": "IP", "params": {}}
 partition_field_name = "filename"
 upload_folder = "./uploaded_files/"
+milvus_uri = f"http://{MILVUS_HOST}:{MILVUS_PORT}"
 
 
 class MosecEmbeddings(OpenAIEmbeddings):
     def _get_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        _chunk_size = chunk_size or self.chunk_size
         batched_embeddings: List[List[float]] = []
         response = self.client.create(input=texts, **self._invocation_params)
         if not isinstance(response, dict):
@@ -93,7 +92,7 @@ def ingest_chunks_to_milvus(file_name: str, chunks: List):
             batch_docs,
             embeddings,
             collection_name=COLLECTION_NAME,
-            connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
+            connection_args={"uri": milvus_uri},
             partition_key_field=partition_field_name,
         )
     except Exception as e:
@@ -211,7 +210,7 @@ async def ingest_documents(
     my_milvus = Milvus(
         embedding_function=embeddings,
         collection_name=COLLECTION_NAME,
-        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
+        connection_args={"uri": milvus_uri},
         index_params=index_params,
         auto_id=True,
     )
@@ -318,7 +317,7 @@ async def ingest_documents(
             )
 
         save_path = upload_folder + encoded_link + ".txt"
-        content = parse_html([link])[0][0]
+        content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
         await save_content_to_local_disk(save_path, content)
         ingest_data_to_milvus(
             DocPath(
@@ -347,7 +346,7 @@ async def rag_get_file_structure():
     my_milvus = Milvus(
         embedding_function=embeddings,
         collection_name=COLLECTION_NAME,
-        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
+        connection_args={"uri": milvus_uri},
         index_params=index_params,
         auto_id=True,
     )
@@ -405,7 +404,7 @@ async def delete_single_file(file_path: str = Body(..., embed=True)):
     my_milvus = Milvus(
         embedding_function=embeddings,
         collection_name=COLLECTION_NAME,
-        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
+        connection_args={"uri": milvus_uri},
         index_params=index_params,
         auto_id=True,
     )
diff --git a/comps/dataprep/milvus/langchain/requirements.txt b/comps/dataprep/milvus/langchain/requirements.txt
index 85ba3e972..611c95a15 100644
--- a/comps/dataprep/milvus/langchain/requirements.txt
+++ b/comps/dataprep/milvus/langchain/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
diff --git a/comps/dataprep/neo4j/llama_index/extract_graph_neo4j.py b/comps/dataprep/neo4j/llama_index/extract_graph_neo4j.py
index 198b61048..a7ece023f 100644
--- a/comps/dataprep/neo4j/llama_index/extract_graph_neo4j.py
+++ b/comps/dataprep/neo4j/llama_index/extract_graph_neo4j.py
@@ -48,7 +48,7 @@
     encode_filename,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     save_content_to_local_disk,
 )
 
@@ -654,7 +654,7 @@ async def ingest_documents(
         for link in link_list:
             encoded_link = encode_filename(link)
             save_path = upload_folder + encoded_link + ".txt"
-            content = parse_html([link])[0][0]
+            content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
             try:
                 await save_content_to_local_disk(save_path, content)
                 index = ingest_data_to_neo4j(
diff --git a/comps/dataprep/neo4j/llama_index/requirements.txt b/comps/dataprep/neo4j/llama_index/requirements.txt
index fc5f7b8d6..c183ecf3d 100644
--- a/comps/dataprep/neo4j/llama_index/requirements.txt
+++ b/comps/dataprep/neo4j/llama_index/requirements.txt
@@ -6,6 +6,7 @@ easyocr
 fastapi
 future
 graspologic
+html2text
 huggingface_hub
 ipython
 langchain
diff --git a/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py b/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
index 1331f3772..78f9e3eea 100644
--- a/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
+++ b/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
@@ -21,7 +21,7 @@
     encode_filename,
     get_file_structure,
     get_separators,
-    parse_html,
+    parse_html_new,
     remove_folder_with_ignore,
     save_content_to_local_disk,
 )
@@ -158,7 +158,7 @@ async def ingest_link_to_pgvector(link_list: List[str]):
     for link in link_list:
         texts = []
 
-        content = parse_html([link])[0][0]
+        content = parse_html_new([link], chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
         if logflag:
             logger.info(f"[ ingest link ] link: {link} content: {content}")
         encoded_link = encode_filename(link)
diff --git a/comps/dataprep/pgvector/langchain/requirements.txt b/comps/dataprep/pgvector/langchain/requirements.txt
index 5235cd5ff..ab3d19db4 100644
--- a/comps/dataprep/pgvector/langchain/requirements.txt
+++ b/comps/dataprep/pgvector/langchain/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
diff --git a/comps/dataprep/pinecone/langchain/prepare_doc_pinecone.py b/comps/dataprep/pinecone/langchain/prepare_doc_pinecone.py
index 9bb5c35ff..aa24e44b1 100644
--- a/comps/dataprep/pinecone/langchain/prepare_doc_pinecone.py
+++ b/comps/dataprep/pinecone/langchain/prepare_doc_pinecone.py
@@ -24,7 +24,7 @@
     get_file_structure,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     remove_folder_with_ignore,
     save_content_to_local_disk,
 )
@@ -158,7 +158,7 @@ def ingest_data_to_pinecone(doc_path: DocPath):
     pc = Pinecone(api_key=PINECONE_API_KEY)
 
 
-async def ingest_link_to_pinecone(link_list: List[str]):
+async def ingest_link_to_pinecone(link_list: List[str], chunk_size, chunk_overlap):
     # Create embedding obj
     if tei_embedding_endpoint:
         # create embeddings using TEI endpoint service
@@ -178,7 +178,7 @@ async def ingest_link_to_pinecone(link_list: List[str]):
 
     # save link contents and doc_ids one by one
     for link in link_list:
-        content = parse_html([link])[0][0]
+        content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
         if logflag:
             logger.info(f"[ ingest link ] link: {link} content: {content}")
         encoded_link = encode_filename(link)
@@ -239,7 +239,7 @@ async def ingest_documents(
             link_list = json.loads(link_list)  # Parse JSON string to list
             if not isinstance(link_list, list):
                 raise HTTPException(status_code=400, detail="link_list should be a list.")
-            await ingest_link_to_pinecone(link_list)
+            await ingest_link_to_pinecone(link_list, chunk_size, chunk_overlap)
             result = {"status": 200, "message": "Data preparation succeeded"}
             if logflag:
                 logger.info(f"Successfully saved link list {link_list}")
diff --git a/comps/dataprep/pinecone/langchain/requirements.txt b/comps/dataprep/pinecone/langchain/requirements.txt
index 80f81bd5e..27bbac44b 100644
--- a/comps/dataprep/pinecone/langchain/requirements.txt
+++ b/comps/dataprep/pinecone/langchain/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
diff --git a/comps/dataprep/qdrant/langchain/prepare_doc_qdrant.py b/comps/dataprep/qdrant/langchain/prepare_doc_qdrant.py
index a97987817..80678e98e 100644
--- a/comps/dataprep/qdrant/langchain/prepare_doc_qdrant.py
+++ b/comps/dataprep/qdrant/langchain/prepare_doc_qdrant.py
@@ -19,7 +19,7 @@
     encode_filename,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     save_content_to_local_disk,
 )
 
@@ -149,7 +149,7 @@ async def ingest_documents(
         for link in link_list:
             encoded_link = encode_filename(link)
             save_path = upload_folder + encoded_link + ".txt"
-            content = parse_html([link])[0][0]
+            content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
             try:
                 await save_content_to_local_disk(save_path, content)
                 ingest_data_to_qdrant(
diff --git a/comps/dataprep/qdrant/langchain/requirements.txt b/comps/dataprep/qdrant/langchain/requirements.txt
index f505af163..8f92c8ca8 100644
--- a/comps/dataprep/qdrant/langchain/requirements.txt
+++ b/comps/dataprep/qdrant/langchain/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
diff --git a/comps/dataprep/redis/langchain/prepare_doc_redis.py b/comps/dataprep/redis/langchain/prepare_doc_redis.py
index 6902117dc..ae69a28fc 100644
--- a/comps/dataprep/redis/langchain/prepare_doc_redis.py
+++ b/comps/dataprep/redis/langchain/prepare_doc_redis.py
@@ -26,7 +26,7 @@
     format_search_results,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     remove_folder_with_ignore,
     save_content_to_local_disk,
 )
@@ -320,7 +320,7 @@ async def ingest_documents(
             )
 
         save_path = upload_folder + encoded_link + ".txt"
-        content = parse_html([link])[0][0]
+        content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
         await save_content_to_local_disk(save_path, content)
         ingest_data_to_redis(
             DocPath(
diff --git a/comps/dataprep/redis/langchain/requirements.txt b/comps/dataprep/redis/langchain/requirements.txt
index 8c3b116fa..43ff2f93b 100644
--- a/comps/dataprep/redis/langchain/requirements.txt
+++ b/comps/dataprep/redis/langchain/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
diff --git a/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py b/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py
index d5ec731ba..2af834cac 100644
--- a/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py
+++ b/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py
@@ -48,7 +48,7 @@
     encode_filename,
     get_file_structure,
     get_separators,
-    parse_html,
+    parse_html_new,
     remove_folder_with_ignore,
     save_content_to_local_disk,
     timeout,
@@ -255,7 +255,7 @@ def ingest_link_to_redis(link_list: List[str], enable_ray=False, num_cpus=20):
     link_list = [str(f) for f in link_list]
 
     def _parse_html(link):
-        data = parse_html([link])
-        return data[0][0]
+        data = parse_html_new([link], chunk_size=1500, chunk_overlap=100)
+        return data  # parse_html_new already returns the concatenated text
 
     if enable_ray:
diff --git a/comps/dataprep/redis/langchain_ray/requirements.txt b/comps/dataprep/redis/langchain_ray/requirements.txt
index 0237109e7..853304542 100644
--- a/comps/dataprep/redis/langchain_ray/requirements.txt
+++ b/comps/dataprep/redis/langchain_ray/requirements.txt
@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
diff --git a/comps/dataprep/utils.py b/comps/dataprep/utils.py
index 910bca343..cf104017f 100644
--- a/comps/dataprep/utils.py
+++ b/comps/dataprep/utils.py
@@ -620,6 +620,28 @@ def parse_html(input):
     return chucks
 
 
+def load_html_content(links, chunk_size=1500, chunk_overlap=50):
+    from langchain.text_splitter import RecursiveCharacterTextSplitter
+    from langchain_community.document_loaders import AsyncHtmlLoader
+    from langchain_community.document_transformers import Html2TextTransformer
+
+    loader = AsyncHtmlLoader(links, ignore_load_errors=True, trust_env=True)
+    docs = loader.load()
+    html2text = Html2TextTransformer()
+    docs = list(html2text.transform_documents(docs))
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    docs = text_splitter.split_documents(docs)
+    return docs
+
+
+def parse_html_new(input, chunk_size, chunk_overlap):
+    docs = load_html_content(input, chunk_size, chunk_overlap)
+    html_content = ""
+    for doc in docs:
+        html_content += doc.page_content + "\n"
+    return html_content
+
+
 def get_tables_result(pdf_path, table_strategy):
     """Extract tables information from pdf file."""
     if table_strategy == "fast":
diff --git a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py
index d45373f00..a50a95853 100644
--- a/comps/dataprep/vdms/langchain/prepare_doc_vdms.py
+++ b/comps/dataprep/vdms/langchain/prepare_doc_vdms.py
@@ -19,7 +19,7 @@
    encode_filename,
    get_separators,
    get_tables_result,
-    parse_html,
+    parse_html_new,
    save_content_to_local_disk,
 )
 
@@ -143,7 +143,7 @@ async def ingest_documents(
            # check whether the link file already exists
 
        save_path = upload_folder + encoded_link + ".txt"
-        content = parse_html([link])[0][0]
+        content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        await save_content_to_local_disk(save_path, content)
        ingest_data_to_vdms(
            DocPath(
diff --git a/comps/dataprep/vdms/langchain/requirements.txt b/comps/dataprep/vdms/langchain/requirements.txt
index 88b2c033a..8f7be8d56 100644
--- a/comps/dataprep/vdms/langchain/requirements.txt
+++ b/comps/dataprep/vdms/langchain/requirements.txt
@@ -6,6 +6,7 @@ docx2txt
 easyocr
 einops
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
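For reference, below is a minimal standalone sketch of the new HTML ingestion path that this patch introduces in comps/dataprep/utils.py (AsyncHtmlLoader fetch, html2text conversion, recursive chunking, then concatenation). It is illustrative only: it mirrors load_html_content()/parse_html_new() rather than being part of the patch, the URL is a placeholder, and it assumes langchain, langchain-community, and html2text are installed (the requirements.txt additions above).

# sketch_parse_html_new.py -- illustrative sketch of the patched pipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer


def parse_html_new(links, chunk_size=1500, chunk_overlap=50):
    # Fetch all pages concurrently, skipping links that fail to load.
    docs = AsyncHtmlLoader(links, ignore_load_errors=True, trust_env=True).load()
    # Strip markup, converting raw HTML into plain text.
    docs = list(Html2TextTransformer().transform_documents(docs))
    # Re-split the text into overlapping chunks of the requested size.
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(docs)
    # Join the chunk texts back into one string, as the dataprep callers expect.
    return "\n".join(chunk.page_content for chunk in chunks)


if __name__ == "__main__":
    # Placeholder URL; replace with a real page to try it out.
    content = parse_html_new(["https://example.com"], chunk_size=500, chunk_overlap=50)
    print(content[:500])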