Skip to content

Commit

Permalink
Enhance the dataprep microservice by adding separators. (#312)
Browse files Browse the repository at this point in the history
* Add separators for chunking raw text data so that chunks preserve semantic completeness.
Signed-off-by: zhlsunshine <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
zhlsunshine and pre-commit-ci[bot] authored Jul 18, 2024
1 parent 5e232a9 commit ef97c24
Show file tree
Hide file tree
Showing 7 changed files with 35 additions and 10 deletions.
4 changes: 2 additions & 2 deletions comps/dataprep/milvus/prepare_doc_milvus.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from pyspark import SparkConf, SparkContext

from comps import DocPath, opea_microservices, register_microservice
from comps.dataprep.utils import document_loader, get_tables_result, parse_html
from comps.dataprep.utils import document_loader, get_separators, get_tables_result, parse_html

# workaround notes: cp comps/dataprep/utils.py ./milvus/utils.py
# from utils import document_loader, get_tables_result, parse_html
Expand Down Expand Up @@ -82,7 +82,7 @@ def ingest_data_to_milvus(doc_path: DocPath):
text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
else:
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True
chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True, separators=get_separators()
)

content = document_loader(path)
Expand Down
6 changes: 4 additions & 2 deletions comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from langsmith import traceable

from comps import DocPath, ServiceType, opea_microservices, register_microservice, register_statistics
from comps.dataprep.utils import document_loader, parse_html
from comps.dataprep.utils import document_loader, get_separators, parse_html

tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")

Expand All @@ -36,7 +36,9 @@ def ingest_doc_to_pgvector(doc_path: DocPath):
doc_path = doc_path.path
print(f"Parsing document {doc_path}.")

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1500, chunk_overlap=100, add_start_index=True, separators=get_separators()
)
content = document_loader(doc_path)
chunks = text_splitter.split_text(content)
print("Done preprocessing. Created ", len(chunks), " chunks of the original pdf")
Expand Down
6 changes: 4 additions & 2 deletions comps/dataprep/pinecone/prepare_doc_pinecone.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from langchain_community.vectorstores import Pinecone

from comps import DocPath, opea_microservices, opea_telemetry, register_microservice
from comps.dataprep.utils import document_loader
from comps.dataprep.utils import document_loader, get_separators

tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")

Expand All @@ -28,7 +28,9 @@ def ingest_documents(doc_path: DocPath):
doc_path = doc_path.path
print(f"Parsing document {doc_path}.")

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1500, chunk_overlap=100, add_start_index=True, separators=get_separators()
)
content = document_loader(doc_path)
chunks = text_splitter.split_text(content)

Expand Down
4 changes: 2 additions & 2 deletions comps/dataprep/qdrant/prepare_doc_qdrant.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from langchain_text_splitters import HTMLHeaderTextSplitter

from comps import DocPath, opea_microservices, opea_telemetry, register_microservice
from comps.dataprep.utils import document_loader, get_tables_result
from comps.dataprep.utils import document_loader, get_separators, get_tables_result

tei_embedding_endpoint = os.getenv("TEI_ENDPOINT")

Expand Down Expand Up @@ -38,7 +38,7 @@ def ingest_documents(doc_path: DocPath):
text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
else:
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True
chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True, separators=get_separators()
)

content = document_loader(path)
Expand Down
3 changes: 2 additions & 1 deletion comps/dataprep/redis/langchain/prepare_doc_redis.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
document_loader,
encode_filename,
get_file_structure,
get_separators,
get_tables_result,
parse_html,
remove_folder_with_ignore,
Expand All @@ -47,7 +48,7 @@ def ingest_data_to_redis(doc_path: DocPath):
text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
else:
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True
chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True, separators=get_separators()
)

content = document_loader(path)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
document_loader,
encode_filename,
get_file_structure,
get_separators,
parse_html,
remove_folder_with_ignore,
save_content_to_local_disk,
Expand Down Expand Up @@ -170,7 +171,9 @@ def data_to_redis_ray(data):


def data_to_redis(data):
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100, add_start_index=True)
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1500, chunk_overlap=100, add_start_index=True, separators=get_separators(), is_separator_regex=False
)
chunks = text_splitter.split_text(data)

# Create vectorstore
Expand Down
17 changes: 17 additions & 0 deletions comps/dataprep/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,23 @@ def __exit__(self, *a, **kw):
print(f'{" " * Timer.level}{self.name} took {timeit.default_timer() - self.start} sec')


def get_separators():
    """Return the ordered separators used when splitting raw text into chunks.

    The list is handed to the text splitter via its ``separators=`` argument.
    Entries are ordered from the coarsest boundary (blank line) down to the
    finest; the empty string is kept as the final entry.

    Returns:
        list[str]: A new list on every call, so callers may mutate it freely.
    """
    # Paragraph, line and word boundaries.
    whitespace_breaks = ["\n\n", "\n", " "]
    # ASCII sentence / clause punctuation.
    ascii_punctuation = [".", ","]
    # Boundaries for text that does not use ASCII spacing or punctuation.
    wide_punctuation = [
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
    ]
    # The empty string must stay last: it is the final fallback separator.
    return [*whitespace_breaks, *ascii_punctuation, *wide_punctuation, ""]


def load_pdf(pdf_path):
"""Load the pdf file."""
doc = fitz.open(pdf_path)
Expand Down

0 comments on commit ef97c24

Please sign in to comment.