diff --git a/comps/dataprep/milvus/prepare_doc_milvus.py b/comps/dataprep/milvus/prepare_doc_milvus.py
index c14ba0f5c..46a81e3f3 100644
--- a/comps/dataprep/milvus/prepare_doc_milvus.py
+++ b/comps/dataprep/milvus/prepare_doc_milvus.py
@@ -25,7 +25,7 @@
 from pyspark import SparkConf, SparkContext
 
 from comps import DocPath, opea_microservices, register_microservice
-from comps.dataprep.utils import document_loader, get_tables_result, parse_html
+from comps.dataprep.utils import document_loader, get_separators, get_tables_result, parse_html
 
 # workaround notes: cp comps/dataprep/utils.py ./milvus/utils.py
 # from utils import document_loader, get_tables_result, parse_html
@@ -82,7 +82,7 @@ def ingest_data_to_milvus(doc_path: DocPath):
         text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
     else:
         text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True
+            chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True, separators=get_separators()
         )
 
     content = document_loader(path)
diff --git a/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py b/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
index 02f034b3f..9c38cbe6a 100644
--- a/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
+++ b/comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
@@ -15,7 +15,7 @@
 from langsmith import traceable
 
 from comps import DocPath, ServiceType, opea_microservices, register_microservice, register_statistics
-from comps.dataprep.utils import document_loader, parse_html
+from comps.dataprep.utils import document_loader, get_separators, parse_html
 
 tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")
 
@@ -36,7 +36,9 @@ def ingest_doc_to_pgvector(doc_path: DocPath):
     doc_path = doc_path.path
     print(f"Parsing document {doc_path}.")
 
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1500, chunk_overlap=100, add_start_index=True, separators=get_separators()
+    )
     content = document_loader(doc_path)
     chunks = text_splitter.split_text(content)
     print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
diff --git a/comps/dataprep/pinecone/prepare_doc_pinecone.py b/comps/dataprep/pinecone/prepare_doc_pinecone.py
index 1c82e2599..1a001a1fd 100644
--- a/comps/dataprep/pinecone/prepare_doc_pinecone.py
+++ b/comps/dataprep/pinecone/prepare_doc_pinecone.py
@@ -9,7 +9,7 @@
 from langchain_community.vectorstores import Pinecone
 
 from comps import DocPath, opea_microservices, opea_telemetry, register_microservice
-from comps.dataprep.utils import document_loader
+from comps.dataprep.utils import document_loader, get_separators
 
 tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")
 
@@ -28,7 +28,9 @@ def ingest_documents(doc_path: DocPath):
     doc_path = doc_path.path
     print(f"Parsing document {doc_path}.")
 
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1500, chunk_overlap=100, add_start_index=True, separators=get_separators()
+    )
 
     content = document_loader(doc_path)
     chunks = text_splitter.split_text(content)
diff --git a/comps/dataprep/qdrant/prepare_doc_qdrant.py b/comps/dataprep/qdrant/prepare_doc_qdrant.py
index 1dc554eff..422854eec 100644
--- a/comps/dataprep/qdrant/prepare_doc_qdrant.py
+++ b/comps/dataprep/qdrant/prepare_doc_qdrant.py
@@ -10,7 +10,7 @@
 from langchain_text_splitters import HTMLHeaderTextSplitter
 
 from comps import DocPath, opea_microservices, opea_telemetry, register_microservice
-from comps.dataprep.utils import document_loader, get_tables_result
+from comps.dataprep.utils import document_loader, get_separators, get_tables_result
 
 tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")
 
@@ -38,7 +38,7 @@ def ingest_documents(doc_path: DocPath):
         text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
     else:
         text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True
+            chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True, separators=get_separators()
         )
 
     content = document_loader(path)
diff --git a/comps/dataprep/redis/langchain/prepare_doc_redis.py b/comps/dataprep/redis/langchain/prepare_doc_redis.py
index a7734c768..78537b0d4 100644
--- a/comps/dataprep/redis/langchain/prepare_doc_redis.py
+++ b/comps/dataprep/redis/langchain/prepare_doc_redis.py
@@ -23,6 +23,7 @@
     document_loader,
     encode_filename,
     get_file_structure,
+    get_separators,
     get_tables_result,
     parse_html,
     remove_folder_with_ignore,
@@ -47,7 +48,7 @@ def ingest_data_to_redis(doc_path: DocPath):
         text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
     else:
         text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True
+            chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True, separators=get_separators()
         )
 
     content = document_loader(path)
diff --git a/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py b/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py
index 6bd906477..eca41f649 100644
--- a/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py
+++ b/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py
@@ -48,6 +48,7 @@
     document_loader,
     encode_filename,
     get_file_structure,
+    get_separators,
     parse_html,
     remove_folder_with_ignore,
     save_content_to_local_disk,
@@ -170,7 +171,9 @@ def data_to_redis_ray(data):
 
 
 def data_to_redis(data):
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1500, chunk_overlap=100, add_start_index=True, separators=get_separators(), is_separator_regex=False
+    )
     chunks = text_splitter.split_text(data)
 
     # Create vectorstore
diff --git a/comps/dataprep/utils.py b/comps/dataprep/utils.py
index 416d92fe3..786366a12 100644
--- a/comps/dataprep/utils.py
+++ b/comps/dataprep/utils.py
@@ -88,6 +88,23 @@ def __exit__(self, *a, **kw):
         print(f'{" " * Timer.level}{self.name} took {timeit.default_timer() - self.start} sec')
 
 
+def get_separators():
+    separators = [
+        "\n\n",
+        "\n",
+        " ",
+        ".",
+        ",",
+        "\u200b",  # Zero-width space
+        "\uff0c",  # Fullwidth comma
+        "\u3001",  # Ideographic comma
+        "\uff0e",  # Fullwidth full stop
+        "\u3002",  # Ideographic full stop
+        "",
+    ]
+    return separators
+
+
 def load_pdf(pdf_path):
     """Load the pdf file."""
     doc = fitz.open(pdf_path)