Fix Dataprep Upload Link issue (opea-project#913)
* fix html content loading problem

Signed-off-by: letonghan <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add empty list check (opea-project#914)

* Add outputs.

Signed-off-by: ZePan110 <[email protected]>

* Add empty list check

Signed-off-by: ZePan110 <[email protected]>

* test CI.

Signed-off-by: ZePan110 <[email protected]>

* Remove test files

Signed-off-by: ZePan110 <[email protected]>

* remove debug code

Signed-off-by: chensuyue <[email protected]>

---------

Signed-off-by: ZePan110 <[email protected]>
Signed-off-by: chensuyue <[email protected]>
Co-authored-by: chensuyue <[email protected]>

* Fix hardware tag retrieval issue (opea-project#916)

Signed-off-by: ZePan110 <[email protected]>

* fix html content loading problem

Signed-off-by: letonghan <[email protected]>

* fix milvus connection issue

Signed-off-by: letonghan <[email protected]>

* update parse_html function for all dbs

Signed-off-by: letonghan <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: letonghan <[email protected]>
Signed-off-by: ZePan110 <[email protected]>
Signed-off-by: chensuyue <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: ZePan110 <[email protected]>
Co-authored-by: chensuyue <[email protected]>
4 people authored and cameronmorin committed Nov 28, 2024
1 parent b1d8d8c commit bd0c166
Showing 17 changed files with 53 additions and 24 deletions.
comps/dataprep/milvus/langchain/prepare_doc_milvus.py (7 additions, 8 deletions)

@@ -30,7 +30,7 @@
     encode_filename,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     remove_folder_with_ignore,
     save_content_to_local_disk,
 )
@@ -39,17 +39,16 @@
 logflag = os.getenv("LOGFLAG", False)
 
 # workaround notes: cp comps/dataprep/utils.py ./milvus/utils.py
-# from utils import document_loader, get_tables_result, parse_html
 index_params = {"index_type": "FLAT", "metric_type": "IP", "params": {}}
 partition_field_name = "filename"
 upload_folder = "./uploaded_files/"
+milvus_uri = f"http://{MILVUS_HOST}:{MILVUS_PORT}"
 
 
 class MosecEmbeddings(OpenAIEmbeddings):
     def _get_len_safe_embeddings(
         self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
     ) -> List[List[float]]:
-        _chunk_size = chunk_size or self.chunk_size
         batched_embeddings: List[List[float]] = []
         response = self.client.create(input=texts, **self._invocation_params)
         if not isinstance(response, dict):
@@ -93,7 +92,7 @@ def ingest_chunks_to_milvus(file_name: str, chunks: List):
             batch_docs,
             embeddings,
             collection_name=COLLECTION_NAME,
-            connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
+            connection_args={"uri": milvus_uri},
             partition_key_field=partition_field_name,
         )
     except Exception as e:
@@ -211,7 +210,7 @@ async def ingest_documents(
     my_milvus = Milvus(
         embedding_function=embeddings,
         collection_name=COLLECTION_NAME,
-        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
+        connection_args={"uri": milvus_uri},
         index_params=index_params,
         auto_id=True,
     )
@@ -318,7 +317,7 @@ async def ingest_documents(
             )
 
             save_path = upload_folder + encoded_link + ".txt"
-            content = parse_html([link])[0][0]
+            content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
             await save_content_to_local_disk(save_path, content)
             ingest_data_to_milvus(
                 DocPath(
@@ -347,7 +346,7 @@ async def rag_get_file_structure():
     my_milvus = Milvus(
         embedding_function=embeddings,
         collection_name=COLLECTION_NAME,
-        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
+        connection_args={"uri": milvus_uri},
         index_params=index_params,
         auto_id=True,
     )
@@ -405,7 +404,7 @@ async def delete_single_file(file_path: str = Body(..., embed=True)):
     my_milvus = Milvus(
         embedding_function=embeddings,
         collection_name=COLLECTION_NAME,
-        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
+        connection_args={"uri": milvus_uri},
         index_params=index_params,
         auto_id=True,
     )
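The connection_args change, repeated at every Milvus constructor call in this file, is the substance of the "fix milvus connection issue" commit: a single uri string replaces the separate host/port pair. A minimal sketch of the new pattern, assuming local default values and a placeholder embedding model and collection name (none of these specifics come from the diff):

# Minimal sketch of the uri-based connection adopted above.
# Host, port, model, and collection name are illustrative assumptions.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus

MILVUS_HOST = "localhost"
MILVUS_PORT = 19530
milvus_uri = f"http://{MILVUS_HOST}:{MILVUS_PORT}"

embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
my_milvus = Milvus(
    embedding_function=embeddings,
    collection_name="rag_milvus",
    connection_args={"uri": milvus_uri},  # single uri replaces the host/port pair
    index_params={"index_type": "FLAT", "metric_type": "IP", "params": {}},
    auto_id=True,
)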
comps/dataprep/milvus/langchain/requirements.txt (1 addition, 0 deletions)

@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
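The html2text line recurs in every backend's requirements.txt below: it is the package behind the Html2TextTransformer that the new parse_html_new helper in comps/dataprep/utils.py uses to turn fetched HTML into plain text.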
comps/dataprep/neo4j/llama_index/extract_graph_neo4j.py (2 additions, 2 deletions)

@@ -48,7 +48,7 @@
     encode_filename,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     save_content_to_local_disk,
 )
 
@@ -654,7 +654,7 @@ async def ingest_documents(
         for link in link_list:
             encoded_link = encode_filename(link)
             save_path = upload_folder + encoded_link + ".txt"
-            content = parse_html([link])[0][0]
+            content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
             try:
                 await save_content_to_local_disk(save_path, content)
                 index = ingest_data_to_neo4j(
comps/dataprep/neo4j/llama_index/requirements.txt (1 addition, 0 deletions)

@@ -6,6 +6,7 @@ easyocr
 fastapi
 future
 graspologic
+html2text
 huggingface_hub
 ipython
 langchain
comps/dataprep/pgvector/langchain/prepare_doc_pgvector.py (2 additions, 2 deletions)

@@ -21,7 +21,7 @@
     encode_filename,
     get_file_structure,
     get_separators,
-    parse_html,
+    parse_html_new,
     remove_folder_with_ignore,
     save_content_to_local_disk,
 )
@@ -158,7 +158,7 @@ async def ingest_link_to_pgvector(link_list: List[str]):
 
     for link in link_list:
         texts = []
-        content = parse_html([link])[0][0]
+        content = parse_html_new([link], chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
         if logflag:
             logger.info(f"[ ingest link ] link: {link} content: {content}")
         encoded_link = encode_filename(link)
comps/dataprep/pgvector/langchain/requirements.txt (1 addition, 0 deletions)

@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
comps/dataprep/pinecone/langchain/prepare_doc_pinecone.py (4 additions, 4 deletions)

@@ -24,7 +24,7 @@
     get_file_structure,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     remove_folder_with_ignore,
     save_content_to_local_disk,
 )
@@ -158,7 +158,7 @@ def ingest_data_to_pinecone(doc_path: DocPath):
 pc = Pinecone(api_key=PINECONE_API_KEY)
 
 
-async def ingest_link_to_pinecone(link_list: List[str]):
+async def ingest_link_to_pinecone(link_list: List[str], chunk_size, chunk_overlap):
     # Create embedding obj
     if tei_embedding_endpoint:
         # create embeddings using TEI endpoint service
@@ -178,7 +178,7 @@ async def ingest_link_to_pinecone(link_list: List[str]):
 
     # save link contents and doc_ids one by one
     for link in link_list:
-        content = parse_html([link])[0][0]
+        content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
         if logflag:
             logger.info(f"[ ingest link ] link: {link} content: {content}")
         encoded_link = encode_filename(link)
@@ -239,7 +239,7 @@ async def ingest_documents(
         link_list = json.loads(link_list)  # Parse JSON string to list
         if not isinstance(link_list, list):
             raise HTTPException(status_code=400, detail="link_list should be a list.")
-        await ingest_link_to_pinecone(link_list)
+        await ingest_link_to_pinecone(link_list, chunk_size, chunk_overlap)
         result = {"status": 200, "message": "Data preparation succeeded"}
         if logflag:
             logger.info(f"Successfully saved link list {link_list}")
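Pinecone is the only backend where the fix also changes a signature: ingest_link_to_pinecone now takes chunk_size and chunk_overlap from the upload request instead of relying on parse_html's fixed chunking, and the call site in ingest_documents forwards both values. A hedged sketch of the resulting call path; the import path mirrors the file location, and the URL and chunk values are illustrative:

import asyncio

# Assumes a configured PINECONE_API_KEY, which the module reads at import time.
from comps.dataprep.pinecone.langchain.prepare_doc_pinecone import ingest_link_to_pinecone

async def main():
    # chunk_size and chunk_overlap normally arrive as form fields on the
    # dataprep upload request; 1500/100 are illustrative values.
    await ingest_link_to_pinecone(["https://example.com"], 1500, 100)

asyncio.run(main())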
comps/dataprep/pinecone/langchain/requirements.txt (1 addition, 0 deletions)

@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
comps/dataprep/qdrant/langchain/prepare_doc_qdrant.py (2 additions, 2 deletions)

@@ -19,7 +19,7 @@
     encode_filename,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     save_content_to_local_disk,
 )
 
@@ -149,7 +149,7 @@ async def ingest_documents(
         for link in link_list:
             encoded_link = encode_filename(link)
             save_path = upload_folder + encoded_link + ".txt"
-            content = parse_html([link])[0][0]
+            content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
             try:
                 await save_content_to_local_disk(save_path, content)
                 ingest_data_to_qdrant(
comps/dataprep/qdrant/langchain/requirements.txt (1 addition, 0 deletions)

@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
comps/dataprep/redis/langchain/prepare_doc_redis.py (2 additions, 2 deletions)

@@ -26,7 +26,7 @@
     format_search_results,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     remove_folder_with_ignore,
     save_content_to_local_disk,
 )
@@ -320,7 +320,7 @@ async def ingest_documents(
             )
 
             save_path = upload_folder + encoded_link + ".txt"
-            content = parse_html([link])[0][0]
+            content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
             await save_content_to_local_disk(save_path, content)
             ingest_data_to_redis(
                 DocPath(
comps/dataprep/redis/langchain/requirements.txt (1 addition, 0 deletions)

@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py (2 additions, 2 deletions)

@@ -48,7 +48,7 @@
     encode_filename,
     get_file_structure,
     get_separators,
-    parse_html,
+    parse_html_new,
     remove_folder_with_ignore,
     save_content_to_local_disk,
     timeout,
@@ -255,7 +255,7 @@ def ingest_link_to_redis(link_list: List[str], enable_ray=False, num_cpus=20):
     link_list = [str(f) for f in link_list]
 
     def _parse_html(link):
-        data = parse_html([link])
+        data = parse_html_new([link], chunk_size=1500, chunk_overlap=100)
         return data[0][0]
 
     if enable_ray:
comps/dataprep/redis/langchain_ray/requirements.txt (1 addition, 0 deletions)

@@ -4,6 +4,7 @@ docarray[full]
 docx2txt
 easyocr
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
comps/dataprep/utils.py (22 additions, 0 deletions)

@@ -620,6 +620,28 @@ def parse_html(input):
     return chucks
 
 
+def load_html_content(links, chunk_size=1500, chunk_overlap=50):
+    from langchain.text_splitter import RecursiveCharacterTextSplitter
+    from langchain_community.document_loaders import AsyncHtmlLoader
+    from langchain_community.document_transformers import Html2TextTransformer
+
+    loader = AsyncHtmlLoader(links, ignore_load_errors=True, trust_env=True)
+    docs = loader.load()
+    html2text = Html2TextTransformer()
+    docs = list(html2text.transform_documents(docs))
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    docs = text_splitter.split_documents(docs)
+    return docs
+
+
+def parse_html_new(input, chunk_size, chunk_overlap):
+    docs = load_html_content(input, chunk_size, chunk_overlap)
+    html_content = ""
+    for doc in docs:
+        html_content += doc.page_content + "\n"
+    return html_content
+
+
 def get_tables_result(pdf_path, table_strategy):
     """Extract tables information from pdf file."""
     if table_strategy == "fast":
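These two new helpers are the core of the "fix html content loading problem" change: AsyncHtmlLoader fetches the pages (ignore_load_errors and trust_env make it tolerant of bad links and proxy setups), Html2TextTransformer strips the markup (this is what pulls html2text into every requirements.txt), and RecursiveCharacterTextSplitter chunks the text. Note the return-type change driving the call-site edits above: parse_html returned a nested list, hence the old parse_html([link])[0][0] indexing, while parse_html_new returns one newline-joined string that callers use directly. A short usage sketch, assuming network access and this repository's import layout; the URL is a placeholder:

from comps.dataprep.utils import parse_html_new

# Returns a single newline-joined string of chunked page text, so callers
# no longer index into a nested result as they did with parse_html.
content = parse_html_new(["https://example.com"], chunk_size=1500, chunk_overlap=100)
print(content[:200])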
comps/dataprep/vdms/langchain/prepare_doc_vdms.py (2 additions, 2 deletions)

@@ -19,7 +19,7 @@
     encode_filename,
     get_separators,
     get_tables_result,
-    parse_html,
+    parse_html_new,
     save_content_to_local_disk,
 )
 
@@ -143,7 +143,7 @@ async def ingest_documents(
         # check whether the link file already exists
 
         save_path = upload_folder + encoded_link + ".txt"
-        content = parse_html([link])[0][0]
+        content = parse_html_new([link], chunk_size=chunk_size, chunk_overlap=chunk_overlap)
         await save_content_to_local_disk(save_path, content)
         ingest_data_to_vdms(
             DocPath(
comps/dataprep/vdms/langchain/requirements.txt (1 addition, 0 deletions)

@@ -6,6 +6,7 @@ docx2txt
 easyocr
 einops
 fastapi
+html2text
 huggingface_hub
 langchain
 langchain-community
