diff --git a/comps/dataprep/redis/README.md b/comps/dataprep/redis/README.md index 6ef685389..58fe3b34d 100644 --- a/comps/dataprep/redis/README.md +++ b/comps/dataprep/redis/README.md @@ -106,13 +106,13 @@ docker build -t opea/dataprep-on-ray-redis:latest --build-arg https_proxy=$https - option 1: Start single-process version (for 1-10 files processing) ```bash -docker run -d --name="dataprep-redis-server" -p 6007:6007 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME -e TEI_ENDPOINT=$TEI_ENDPOINT opea/dataprep-redis:latest +docker run -d --name="dataprep-redis-server" -p 6007:6007 -p 6008:6008 -p 6009:6009 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME -e TEI_ENDPOINT=$TEI_ENDPOINT opea/dataprep-redis:latest ``` - option 2: Start multi-process version (for >10 files processing) ```bash -docker run -d --name="dataprep-redis-server" -p 6007:6007 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME -e TEI_ENDPOINT=$TEI_ENDPOINT -e TIMEOUT_SECONDS=600 opea/dataprep-on-ray-redis:latest +docker run -d --name="dataprep-redis-server" -p 6007:6007 -p 6008:6008 -p 6009:6009 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME -e TEI_ENDPOINT=$TEI_ENDPOINT -e TIMEOUT_SECONDS=600 opea/dataprep-on-ray-redis:latest ``` ## 2.5 Run with Docker Compose (Option B - deprecated, will move to genAIExample in future) @@ -133,6 +133,8 @@ docker container logs -f dataprep-redis-server # 🚀4. Consume Microservice +## 4.1 Consume Upload API + Once document preparation microservice for Redis is started, user can use below command to invoke the microservice to convert the document to embedding and save to the database. Make sure the file path after `files=@` is correct. @@ -210,3 +212,58 @@ try: except requests.exceptions.RequestException as e: print("An error occurred:", e) ``` + +## 4.2 Consume get_file API + +To get uploaded file structures, use the following command: + +```bash +curl -X POST \ + -H "Content-Type: application/json" \ + http://localhost:6008/v1/dataprep/get_file +``` + +Then you will get the response JSON like this: + +```json +[ + { + "name": "uploaded_file_1.txt", + "id": "uploaded_file_1.txt", + "type": "File", + "parent": "" + }, + { + "name": "uploaded_file_2.txt", + "id": "uploaded_file_2.txt", + "type": "File", + "parent": "" + } +] +``` + +## 4.3 Consume delete_file API + +To delete uploaded file/link, use the following command. + +The `file_path` here should be the `id` get from `/v1/dataprep/get_file` API. + +```bash +# delete link +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"file_path": "https://www.ces.tech/.txt"}' \ + http://10.165.57.68:6009/v1/dataprep/delete_file + +# delete file +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"file_path": "uploaded_file_1.txt"}' \ + http://10.165.57.68:6009/v1/dataprep/delete_file + +# delete all files and links +curl -X POST \ + -H "Content-Type: application/json" \ + -d '{"file_path": "all"}' \ + http://10.165.57.68:6009/v1/dataprep/delete_file +``` diff --git a/comps/dataprep/redis/langchain/docker/docker-compose-dataprep-redis.yaml b/comps/dataprep/redis/langchain/docker/docker-compose-dataprep-redis.yaml index cd668397b..e2775972d 100644 --- a/comps/dataprep/redis/langchain/docker/docker-compose-dataprep-redis.yaml +++ b/comps/dataprep/redis/langchain/docker/docker-compose-dataprep-redis.yaml @@ -14,6 +14,8 @@ services: container_name: dataprep-redis-server ports: - "6007:6007" + - "6008:6008" + - "6009:6009" ipc: host environment: no_proxy: ${no_proxy} diff --git a/comps/dataprep/redis/langchain/prepare_doc_redis.py b/comps/dataprep/redis/langchain/prepare_doc_redis.py index 36577f8d8..a7734c768 100644 --- a/comps/dataprep/redis/langchain/prepare_doc_redis.py +++ b/comps/dataprep/redis/langchain/prepare_doc_redis.py @@ -3,12 +3,13 @@ import json import os +import shutil import uuid from pathlib import Path from typing import List, Optional, Union from config import EMBED_MODEL, INDEX_NAME, INDEX_SCHEMA, REDIS_URL -from fastapi import File, Form, HTTPException, UploadFile +from fastapi import Body, File, Form, HTTPException, UploadFile from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings from langchain_community.vectorstores import Redis @@ -17,20 +18,19 @@ from pyspark import SparkConf, SparkContext from comps import DocPath, opea_microservices, register_microservice -from comps.dataprep.utils import document_loader, get_tables_result, parse_html +from comps.dataprep.utils import ( + create_upload_folder, + document_loader, + encode_filename, + get_file_structure, + get_tables_result, + parse_html, + remove_folder_with_ignore, + save_content_to_local_disk, +) tei_embedding_endpoint = os.getenv("TEI_ENDPOINT") - - -async def save_file_to_local_disk(save_path: str, file): - save_path = Path(save_path) - with save_path.open("wb") as fout: - try: - content = await file.read() - fout.write(content) - except Exception as e: - print(f"Write file failed. Exception: {e}") - raise HTTPException(status_code=500, detail=f"Write file {save_path} failed. Exception: {e}") +upload_folder = "./uploaded_files/" def ingest_data_to_redis(doc_path: DocPath): @@ -84,9 +84,16 @@ def ingest_data_to_redis(doc_path: DocPath): return True -def ingest_link_to_redis(link_list: List[str]): +async def ingest_link_to_redis(link_list: List[str]): data_collection = parse_html(link_list) + for content, link in data_collection: + print(f"[ ingest link ] link: {link} content: {content}") + encoded_link = encode_filename(link) + save_path = upload_folder + encoded_link + ".txt" + print(f"[ ingest link ] save_path: {save_path}") + await save_content_to_local_disk(save_path, content) + texts = [] metadatas = [] for data, meta in data_collection: @@ -125,19 +132,15 @@ async def ingest_documents( ): print(f"files:{files}") print(f"link_list:{link_list}") - if files and link_list: - raise HTTPException(status_code=400, detail="Provide either a file or a string list, not both.") if files: if not isinstance(files, list): files = [files] - upload_folder = "./uploaded_files/" - if not os.path.exists(upload_folder): - Path(upload_folder).mkdir(parents=True, exist_ok=True) uploaded_files = [] for file in files: - save_path = upload_folder + file.filename - await save_file_to_local_disk(save_path, file) + encode_file = encode_filename(file.filename) + save_path = upload_folder + encode_file + await save_content_to_local_disk(save_path, file) ingest_data_to_redis( DocPath( path=save_path, @@ -178,7 +181,7 @@ def process_files_wrapper(files): link_list = json.loads(link_list) # Parse JSON string to list if not isinstance(link_list, list): raise HTTPException(status_code=400, detail="link_list should be a list.") - ingest_link_to_redis(link_list) + await ingest_link_to_redis(link_list) print(f"Successfully saved link list {link_list}") return {"status": 200, "message": "Data preparation succeeded"} except json.JSONDecodeError: @@ -187,5 +190,67 @@ def process_files_wrapper(files): raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") +@register_microservice( + name="opea_service@prepare_doc_redis_file", endpoint="/v1/dataprep/get_file", host="0.0.0.0", port=6008 +) +@traceable(run_type="tool") +async def rag_get_file_structure(): + print("[ get_file_structure] ") + + if not Path(upload_folder).exists(): + print("No file uploaded, return empty list.") + return [] + + file_content = get_file_structure(upload_folder) + return file_content + + +@register_microservice( + name="opea_service@prepare_doc_redis_del", endpoint="/v1/dataprep/delete_file", host="0.0.0.0", port=6009 +) +@traceable(run_type="tool") +async def delete_single_file(file_path: str = Body(..., embed=True)): + """Delete file according to `file_path`. + + `file_path`: + - specific file path (e.g. /path/to/file.txt) + - folder path (e.g. /path/to/folder) + - "all": delete all files uploaded + """ + # delete all uploaded files + if file_path == "all": + print("[dataprep - del] delete all files") + remove_folder_with_ignore(upload_folder) + print("[dataprep - del] successfully delete all files.") + create_upload_folder(upload_folder) + return {"status": True} + + delete_path = Path(upload_folder + "/" + encode_filename(file_path)) + print(f"[dataprep - del] delete_path: {delete_path}") + + # partially delete files/folders + if delete_path.exists(): + # delete file + if delete_path.is_file(): + try: + delete_path.unlink() + except Exception as e: + print(f"[dataprep - del] fail to delete file {delete_path}: {e}") + return {"status": False} + # delete folder + else: + try: + shutil.rmtree(delete_path) + except Exception as e: + print(f"[dataprep - del] fail to delete folder {delete_path}: {e}") + return {"status": False} + return {"status": True} + else: + raise HTTPException(status_code=404, detail="File/folder not found. Please check del_path.") + + if __name__ == "__main__": + create_upload_folder(upload_folder) opea_microservices["opea_service@prepare_doc_redis"].start() + opea_microservices["opea_service@prepare_doc_redis_file"].start() + opea_microservices["opea_service@prepare_doc_redis_del"].start() diff --git a/comps/dataprep/redis/langchain_ray/docker/Dockerfile b/comps/dataprep/redis/langchain_ray/docker/Dockerfile index 4c4dd083c..4d1c44cc2 100644 --- a/comps/dataprep/redis/langchain_ray/docker/Dockerfile +++ b/comps/dataprep/redis/langchain_ray/docker/Dockerfile @@ -12,7 +12,8 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missin build-essential \ libgl1-mesa-glx \ libjemalloc-dev \ - vim + vim \ + libcairo2 RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py b/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py index be8240f03..6bd906477 100644 --- a/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py +++ b/comps/dataprep/redis/langchain_ray/prepare_doc_redis_on_ray.py @@ -15,13 +15,14 @@ import json import os import pathlib +import shutil import sys from pathlib import Path from typing import Callable, List, Optional, Union import pandas as pd from config import EMBED_MODEL, INDEX_NAME, INDEX_SCHEMA, REDIS_URL, TIMEOUT_SECONDS -from fastapi import File, Form, HTTPException, UploadFile +from fastapi import Body, File, Form, HTTPException, UploadFile from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings from langchain_community.vectorstores import Redis @@ -32,7 +33,7 @@ sys.path.append(comps_path) import hashlib import timeit -from typing import TYPE_CHECKING, Any, Dict, Iterator +from typing import Any, Dict, Iterator import pyarrow import ray @@ -41,10 +42,21 @@ from tqdm import tqdm from comps import DocPath, opea_microservices, register_microservice -from comps.dataprep.utils import Timer, document_loader, parse_html, timeout +from comps.dataprep.utils import ( + Timer, + create_upload_folder, + document_loader, + encode_filename, + get_file_structure, + parse_html, + remove_folder_with_ignore, + save_content_to_local_disk, + timeout, +) tei_embedding_endpoint = os.getenv("TEI_ENDPOINT") debug = False +upload_folder = "./uploaded_files/" def prepare_env(enable_ray=False, pip_requirements=None): @@ -127,17 +139,6 @@ def ray_execute(ds, log_name): return ret -async def save_file_to_local_disk(save_path: str, file): - save_path = Path(save_path) - with save_path.open("wb") as fout: - try: - content = await file.read() - fout.write(content) - except Exception as e: - print(f"Write file failed. Exception: {e}") - raise HTTPException(status_code=500, detail=f"Write file {save_path} failed. Exception: {e}") - - @timeout(seconds=TIMEOUT_SECONDS) def data_to_redis_ray(data): content = data["data"] @@ -275,14 +276,13 @@ async def ingest_documents(files: List[UploadFile] = File(None), link_list: str try: if not isinstance(files, list): files = [files] - upload_folder = "./uploaded_files/" if not os.path.exists(upload_folder): Path(upload_folder).mkdir(parents=True, exist_ok=True) # TODO: use ray to parallelize the file saving for file in files: save_path = upload_folder + file.filename - await save_file_to_local_disk(save_path, file) + await save_content_to_local_disk(save_path, file) saved_path_list.append(DocPath(path=save_path)) if len(saved_path_list) <= 10: @@ -317,6 +317,66 @@ async def ingest_documents(files: List[UploadFile] = File(None), link_list: str raise HTTPException(status_code=400, detail=f"An error occurred: {e}") +@register_microservice( + name="opea_service@prepare_doc_redis_file", endpoint="/v1/dataprep/get_file", host="0.0.0.0", port=6008 +) +@traceable(run_type="tool") +async def rag_get_file_structure(): + print("[ get_file_structure] ") + + if not Path(upload_folder).exists(): + print("No file uploaded, return empty list.") + return [] + + file_content = get_file_structure(upload_folder) + return file_content + + +@register_microservice( + name="opea_service@prepare_doc_redis_del", endpoint="/v1/dataprep/delete_file", host="0.0.0.0", port=6009 +) +@traceable(run_type="tool") +async def delete_single_file(file_path: str = Body(..., embed=True)): + """Delete file according to `file_path`. + + `file_path`: + - specific file path (e.g. /path/to/file.txt) + - folder path (e.g. /path/to/folder) + - "all": delete all files uploaded + """ + # delete all uploaded files + if file_path == "all": + print("[dataprep - del] delete all files") + remove_folder_with_ignore(upload_folder) + print("[dataprep - del] successfully delete all files.") + create_upload_folder(upload_folder) + return {"status": True} + + delete_path = Path(upload_folder + "/" + encode_filename(file_path)) + print(f"[dataprep - del] delete_path: {delete_path}") + + # partially delete files/folders + if delete_path.exists(): + # delete file + if delete_path.is_file(): + try: + delete_path.unlink() + except Exception as e: + print(f"[dataprep - del] fail to delete file {delete_path}: {e}") + return {"status": False} + # delete folder + else: + try: + shutil.rmtree(delete_path) + except Exception as e: + print(f"[dataprep - del] fail to delete folder {delete_path}: {e}") + return {"status": False} + return {"status": True} + else: + raise HTTPException(status_code=404, detail="File/folder not found. Please check del_path.") + + if __name__ == "__main__": opea_microservices["opea_service@prepare_doc_redis"].start() + opea_microservices["opea_service@prepare_doc_redis_file"].start() diff --git a/comps/dataprep/redis/langchain_ray/requirements.txt b/comps/dataprep/redis/langchain_ray/requirements.txt index 6e8f3ad9d..829b1e1c3 100644 --- a/comps/dataprep/redis/langchain_ray/requirements.txt +++ b/comps/dataprep/redis/langchain_ray/requirements.txt @@ -1,5 +1,7 @@ beautifulsoup4 +cairosvg docarray[full] +docx2txt easyocr fastapi huggingface_hub @@ -16,6 +18,7 @@ prometheus-fastapi-instrumentator pyarrow pymupdf python-docx +python-pptx ray redis sentence_transformers diff --git a/comps/dataprep/redis/llama_index/docker/docker-compose-dataprep-redis.yaml b/comps/dataprep/redis/llama_index/docker/docker-compose-dataprep-redis.yaml index ecb1bf4bd..7a52c1cef 100644 --- a/comps/dataprep/redis/llama_index/docker/docker-compose-dataprep-redis.yaml +++ b/comps/dataprep/redis/llama_index/docker/docker-compose-dataprep-redis.yaml @@ -14,6 +14,8 @@ services: container_name: dataprep-redis-server ports: - "6007:6007" + - "6008:6008" + - "6009:6009" ipc: host environment: no_proxy: ${no_proxy} diff --git a/comps/dataprep/redis/llama_index/prepare_doc_redis.py b/comps/dataprep/redis/llama_index/prepare_doc_redis.py index ec0ddf0fa..3192419eb 100644 --- a/comps/dataprep/redis/llama_index/prepare_doc_redis.py +++ b/comps/dataprep/redis/llama_index/prepare_doc_redis.py @@ -1,13 +1,13 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import json import os +import shutil from pathlib import Path from typing import List, Optional, Union from config import EMBED_MODEL, INDEX_NAME, REDIS_URL -from fastapi import File, Form, HTTPException, UploadFile +from fastapi import Body, File, HTTPException, UploadFile from langsmith import traceable from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex from llama_index.core.settings import Settings @@ -17,17 +17,15 @@ from redisvl.schema import IndexSchema from comps import DocPath, opea_microservices, register_microservice +from comps.dataprep.utils import ( + create_upload_folder, + encode_filename, + get_file_structure, + remove_folder_with_ignore, + save_content_to_local_disk, +) - -async def save_file_to_local_disk(save_path: str, file): - save_path = Path(save_path) - with save_path.open("wb") as fout: - try: - content = await file.read() - fout.write(content) - except Exception as e: - print(f"Write file failed. Exception: {e}") - raise HTTPException(status_code=500, detail=f"Write file {save_path} failed. Exception: {e}") +upload_folder = "./uploaded_files/" async def ingest_data_to_redis(doc_path: DocPath): @@ -72,13 +70,12 @@ async def ingest_documents(files: Optional[Union[UploadFile, List[UploadFile]]] if not isinstance(files, list): files = [files] - upload_folder = "./uploaded_files/" if not os.path.exists(upload_folder): Path(upload_folder).mkdir(parents=True, exist_ok=True) try: for file in files: save_path = upload_folder + file.filename - await save_file_to_local_disk(save_path, file) + await save_content_to_local_disk(save_path, file) await ingest_data_to_redis(DocPath(path=save_path)) print(f"Successfully saved file {save_path}") return {"status": 200, "message": "Data preparation succeeded"} @@ -87,5 +84,66 @@ async def ingest_documents(files: Optional[Union[UploadFile, List[UploadFile]]] raise HTTPException(status_code=500, detail=f"Data preparation failed. Exception: {e}") +@register_microservice( + name="opea_service@prepare_doc_redis_file", endpoint="/v1/dataprep/get_file", host="0.0.0.0", port=6008 +) +@traceable(run_type="tool") +async def rag_get_file_structure(): + print("[ get_file_structure] ") + + if not Path(upload_folder).exists(): + print("No file uploaded, return empty list.") + return [] + + file_content = get_file_structure(upload_folder) + return file_content + + +@register_microservice( + name="opea_service@prepare_doc_redis_del", endpoint="/v1/dataprep/delete_file", host="0.0.0.0", port=6009 +) +@traceable(run_type="tool") +async def delete_single_file(file_path: str = Body(..., embed=True)): + """Delete file according to `file_path`. + + `file_path`: + - specific file path (e.g. /path/to/file.txt) + - folder path (e.g. /path/to/folder) + - "all": delete all files uploaded + """ + # delete all uploaded files + if file_path == "all": + print("[dataprep - del] delete all files") + remove_folder_with_ignore(upload_folder) + print("[dataprep - del] successfully delete all files.") + create_upload_folder(upload_folder) + return {"status": True} + + delete_path = Path(upload_folder + "/" + encode_filename(file_path)) + print(f"[dataprep - del] delete_path: {delete_path}") + + # partially delete files/folders + if delete_path.exists(): + # delete file + if delete_path.is_file(): + try: + delete_path.unlink() + except Exception as e: + print(f"[dataprep - del] fail to delete file {delete_path}: {e}") + return {"status": False} + # delete folder + else: + try: + shutil.rmtree(delete_path) + except Exception as e: + print(f"[dataprep - del] fail to delete folder {delete_path}: {e}") + return {"status": False} + return {"status": True} + else: + raise HTTPException(status_code=404, detail="File/folder not found. Please check del_path.") + + if __name__ == "__main__": opea_microservices["opea_service@prepare_doc_redis"].start() + opea_microservices["opea_service@prepare_doc_redis_file"].start() + opea_microservices["opea_service@prepare_doc_redis_del"].start() diff --git a/comps/dataprep/utils.py b/comps/dataprep/utils.py index 786d5c8dc..416d92fe3 100644 --- a/comps/dataprep/utils.py +++ b/comps/dataprep/utils.py @@ -13,6 +13,9 @@ import signal import timeit import unicodedata +import urllib.parse +from pathlib import Path +from typing import Dict, List, Union from urllib.parse import urlparse, urlunparse import cairosvg @@ -613,3 +616,85 @@ def get_relation(table_coords, caption_coords, table_page_number, caption_page_n y_distance = 0 y_close = y_distance < threshold return same_page and x_overlap and y_close, y_distance + + +def create_upload_folder(upload_path): + if not os.path.exists(upload_path): + Path(upload_path).mkdir(parents=True, exist_ok=True) + + +def encode_filename(filename): + return urllib.parse.quote(filename, safe="") + + +def decode_filename(encoded_filename): + return urllib.parse.unquote(encoded_filename) + + +async def save_content_to_local_disk(save_path: str, content): + save_path = Path(save_path) + try: + if isinstance(content, str): + with open(save_path, "w", encoding="utf-8") as file: + file.write(content) + else: + with save_path.open("wb") as fout: + content = await content.read() + fout.write(content) + except Exception as e: + print(f"Write file failed. Exception: {e}") + raise Exception(status_code=500, detail=f"Write file {save_path} failed. Exception: {e}") + + +def get_file_structure(root_path: str, parent_path: str = "") -> List[Dict[str, Union[str, List]]]: + result = [] + for path in os.listdir(root_path): + complete_path = parent_path + "/" + path if parent_path else path + file_path = root_path + "/" + path + p = Path(file_path) + # append file into result + if p.is_file(): + file_dict = { + "name": decode_filename(path), + "id": decode_filename(complete_path), + "type": "File", + "parent": "", + } + result.append(file_dict) + else: + # append folder and inner files/folders into result using recursive function + folder_dict = { + "name": decode_filename(path), + "id": decode_filename(complete_path), + "type": "Directory", + "children": get_file_structure(file_path, complete_path), + "parent": "", + } + result.append(folder_dict) + + return result + + +def remove_folder_with_ignore(folder_path: str, except_patterns: List = []): + """Remove the specific folder, and ignore some files/folders. + + :param folder_path: file path to delete + :param except_patterns: files/folder name to ignore + """ + print(f"except patterns: {except_patterns}") + for root, dirs, files in os.walk(folder_path, topdown=False): + for name in files: + # delete files except ones that match patterns + file_path = os.path.join(root, name) + if except_patterns != [] and any(pattern in file_path for pattern in except_patterns): + continue + os.remove(file_path) + + # delete empty folder + for name in dirs: + dir_path = os.path.join(root, name) + # delete folders except ones that match patterns + if except_patterns != [] and any(pattern in dir_path for pattern in except_patterns): + continue + if not os.listdir(dir_path): + os.rmdir(dir_path) diff --git a/tests/test_dataprep_redis_langchain.sh b/tests/test_dataprep_redis_langchain.sh index 8461c8227..a8c95c8c0 100644 --- a/tests/test_dataprep_redis_langchain.sh +++ b/tests/test_dataprep_redis_langchain.sh @@ -15,15 +15,19 @@ function build_docker_images() { } function start_service() { - docker run -d --name="test-comps-dataprep-redis" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 6379:6379 -p 8001:8001 --ipc=host redis/redis-stack:7.2.0-v9 + docker run -d --name="test-comps-dataprep-redis-langchain" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 6380:6379 -p 8002:8001 --ipc=host redis/redis-stack:7.2.0-v9 dataprep_service_port=5013 - REDIS_URL="redis://${ip_address}:6379" - docker run -d --name="test-comps-dataprep-redis-server" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -p ${dataprep_service_port}:6007 --ipc=host opea/dataprep-redis:comps + dataprep_file_service_port=5016 + dataprep_del_service_port=5020 + REDIS_URL="redis://${ip_address}:6380" + docker run -d --name="test-comps-dataprep-redis-langchain-server" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -p ${dataprep_service_port}:6007 -p ${dataprep_file_service_port}:6008 -p ${dataprep_del_service_port}:6009 --ipc=host opea/dataprep-redis:comps sleep 1m } function validate_microservice() { cd $LOG_PATH + + # test /v1/dataprep dataprep_service_port=5013 URL="http://${ip_address}:$dataprep_service_port/v1/dataprep" echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt @@ -32,23 +36,56 @@ function validate_microservice() { echo "[ dataprep ] HTTP status is 200. Checking content..." local CONTENT=$(curl -s -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/dataprep.log) - if echo 'Data preparation succeeded' | grep -q "$EXPECTED_RESULT"; then + if echo "$CONTENT" | grep -q "Data preparation succeeded"; then echo "[ dataprep ] Content is as expected." else echo "[ dataprep ] Content does not match the expected result: $CONTENT" - docker logs test-comps-dataprep-redis-server >> ${LOG_PATH}/dataprep.log + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep.log exit 1 fi else echo "[ dataprep ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs test-comps-dataprep-redis-server >> ${LOG_PATH}/dataprep.log + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep.log + exit 1 + fi + + # test /v1/dataprep/get_file + dataprep_file_service_port=5016 + URL="http://${ip_address}:$dataprep_file_service_port/v1/dataprep/get_file" + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -H 'Content-Type: application/json' "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ dataprep - file ] HTTP status is 200. Checking content..." + local CONTENT=$(curl -s -X POST -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/dataprep_file.log) + + if echo "$CONTENT" | grep -q '{"name":'; then + echo "[ dataprep - file ] Content is as expected." + else + echo "[ dataprep - file ] Content does not match the expected result: $CONTENT" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_file.log + exit 1 + fi + else + echo "[ dataprep - file ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_file.log + exit 1 + fi + + # test /v1/dataprep/delete_file + dataprep_file_service_port=5016 + URL="http://${ip_address}:$dataprep_del_service_port/v1/dataprep/delete_file" + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d '{"file_path": "dataprep_file.txt"}' -H 'Content-Type: application/json' "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ dataprep - del ] HTTP status is 200." + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_del.log + else + echo "[ dataprep - del ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-redis-langchain-server >> ${LOG_PATH}/dataprep_del.log exit 1 fi - rm -rf $LOG_PATH/dataprep_file.txt } function stop_docker() { - cid=$(docker ps -aq --filter "name=test-comps-dataprep-redis*") + cid=$(docker ps -aq --filter "name=test-comps-dataprep-redis-langchain*") if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi } diff --git a/tests/test_dataprep_redis_langchain_ray.sh b/tests/test_dataprep_redis_langchain_ray.sh index 667c9d942..8a46a9c40 100644 --- a/tests/test_dataprep_redis_langchain_ray.sh +++ b/tests/test_dataprep_redis_langchain_ray.sh @@ -5,6 +5,7 @@ set -xe WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { @@ -17,51 +18,73 @@ function build_docker_images() { function start_service() { echo "Starting redis microservice" # redis endpoint - docker run -d --name="test-dataprep-redis-server" --runtime=runc -p 6379:6379 -p 8001:8001 redis/redis-stack:7.2.0-v9 + docker run -d --name="test-comps-dataprep-redis-ray" --runtime=runc -p 6382:6379 -p 8004:8001 redis/redis-stack:7.2.0-v9 # dataprep-redis-server endpoint - export REDIS_URL="redis://${ip_address}:6379" + export REDIS_URL="redis://${ip_address}:6382" export INDEX_NAME="rag-redis" echo "Starting dataprep-redis-server" - docker run -d --name="test-dataprep-redis-endpoint" --runtime=runc -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME -e TEI_ENDPOINT=$TEI_ENDPOINT -e TIMEOUT_SECONDS=600 opea/dataprep-on-ray-redis:latest + docker run -d --name="test-comps-dataprep-redis-ray-server" --runtime=runc -p 6009:6007 -p 6010:6008 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME -e TEI_ENDPOINT=$TEI_ENDPOINT -e TIMEOUT_SECONDS=600 opea/dataprep-on-ray-redis:latest - sleep 5 + sleep 10 echo "Service started successfully" } function validate_microservice() { + cd $LOG_PATH + + dataprep_service_port=6009 + export URL="http://${ip_address}:$dataprep_service_port/v1/dataprep" + echo "Starting validating the microservice" export PATH="${HOME}/miniforge3/bin:$PATH" source activate + echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > dataprep_file.txt + EXIT_CODE=0 python -c "$(cat << 'EOF' import requests import json import os proxies = {'http':""} -url = 'http://localhost:6007/v1/dataprep' +url = os.environ['URL'] print("test single file ingestion") -file_list = ["test_data.pdf"] -files = [('files', (f, open(os.path.join("comps/dataprep/redis/", f), 'rb'), 'application/pdf')) for f in file_list] +file_list = ["dataprep_file.txt"] +files = [('files', (f, open(f, 'rb'))) for f in file_list] resp = requests.request('POST', url=url, headers={}, files=files, proxies=proxies) print(resp.text) resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes print("Request successful!") print("test 20 files ingestion") -file_list = ["test_data.pdf"] * 20 -files = [('files', (f, open(os.path.join("comps/dataprep/redis/", f), 'rb'), 'application/pdf')) for f in file_list] +file_list = ["dataprep_file.txt"] * 20 +files = [('files', (f, open(f, 'rb'))) for f in file_list] resp = requests.request('POST', url=url, headers={}, files=files, proxies=proxies) print(resp.text) resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes print("Request successful!") + +print("test get file structure") +url = 'http://localhost:6010/v1/dataprep/get_file' +resp = requests.request('POST', url=url, headers={}, proxies=proxies) +print(resp.text) +assert "name" in resp.text, "Response does not meet expectation." +print("Request successful!") EOF -)" - echo "Validation successful" +)" || EXIT_CODE=$? + rm -rf dataprep_file.txt + if [ $EXIT_CODE -ne 0 ]; then + echo "[ dataprep ] Validation failed. Entire log as below doc " + docker container logs test-comps-dataprep-redis-ray-server | tee -a ${LOG_PATH}/dataprep.log + exit 1 + else + echo "[ dataprep ] Validation succeed. " + fi } + function stop_docker() { - cid=$(docker ps -aq --filter "name=test-dataprep-redis*") + cid=$(docker ps -aq --filter "name=test-comps-dataprep-redis-ray*") echo "Stopping the docker containers "${cid} if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi echo "Docker containers stopped successfully" diff --git a/tests/test_dataprep_redis_llama_index.sh b/tests/test_dataprep_redis_llama_index.sh index 517f66247..c32f9fa64 100644 --- a/tests/test_dataprep_redis_llama_index.sh +++ b/tests/test_dataprep_redis_llama_index.sh @@ -15,38 +15,64 @@ function build_docker_images() { } function start_service() { - docker run -d --name="test-comps-dataprep-redis" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 6379:6379 -p 8001:8001 --ipc=host redis/redis-stack:7.2.0-v9 + docker run -d --name="test-comps-dataprep-redis-llama-index" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 6381:6379 -p 8003:8001 --ipc=host redis/redis-stack:7.2.0-v9 dataprep_service_port=5012 - REDIS_URL="redis://${ip_address}:6379" - docker run -d --name="test-comps-dataprep-redis-server" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -p ${dataprep_service_port}:6007 --ipc=host opea/dataprep-redis:comps - sleep 1m + dataprep_file_service_port=5017 + REDIS_URL="redis://${ip_address}:6381" + docker run -d --name="test-comps-dataprep-redis-llama-index-server" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -p ${dataprep_service_port}:6007 -p ${dataprep_file_service_port}:6008 --ipc=host opea/dataprep-redis:comps + sleep 2m } function validate_microservice() { + cd $LOG_PATH + + # test /v1/dataprep dataprep_service_port=5012 URL="http://${ip_address}:$dataprep_service_port/v1/dataprep" - echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > ./dataprep_file.txt + echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL") if [ "$HTTP_STATUS" -eq 200 ]; then echo "[ dataprep ] HTTP status is 200. Checking content..." local CONTENT=$(curl -s -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL" | tee ${LOG_PATH}/dataprep.log) - if echo 'Data preparation succeeded' | grep -q "$EXPECTED_RESULT"; then + if echo "$CONTENT" | grep -q "Data preparation succeeded"; then echo "[ dataprep ] Content is as expected." else echo "[ dataprep ] Content does not match the expected result: $CONTENT" - docker logs test-comps-dataprep-redis-server >> ${LOG_PATH}/dataprep.log + docker logs test-comps-dataprep-redis-llama-index-server >> ${LOG_PATH}/dataprep.log + exit 1 + fi + else + echo "[ dataprep ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs test-comps-dataprep-redis-llama-index-server >> ${LOG_PATH}/dataprep.log + exit 1 + fi + rm -rf $LOG_PATH/dataprep_file.txt + + # test /v1/dataprep/get_file + dataprep_file_service_port=5017 + URL="http://${ip_address}:$dataprep_file_service_port/v1/dataprep/get_file" + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -H 'Content-Type: application/json' "$URL") + if [ "$HTTP_STATUS" -eq 200 ]; then + echo "[ dataprep - file ] HTTP status is 200. Checking content..." + local CONTENT=$(curl -s -X POST -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/dataprep_file.log) + + if echo "$CONTENT" | grep -q '{"name":'; then + echo "[ dataprep - file ] Content is as expected." + else + echo "[ dataprep - file ] Content does not match the expected result: $CONTENT" + docker logs test-comps-dataprep-redis-llama-index-server >> ${LOG_PATH}/dataprep_file.log exit 1 fi else echo "[ dataprep ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs test-comps-dataprep-redis-server >> ${LOG_PATH}/dataprep.log + docker logs test-comps-dataprep-redis-llama-index-server >> ${LOG_PATH}/dataprep_file.log exit 1 fi } function stop_docker() { - cid=$(docker ps -aq --filter "name=test-comps-dataprep-redis*") + cid=$(docker ps -aq --filter "name=test-comps-dataprep-redis-llama*") if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi } @@ -54,7 +80,7 @@ function main() { stop_docker - build_docker_images + # build_docker_images start_service validate_microservice