From fd7edd3e10d4d25a9fd8483a08566652d2d33640 Mon Sep 17 00:00:00 2001 From: Letong Han <106566639+letonghan@users.noreply.github.com> Date: Wed, 31 Jul 2024 09:42:42 +0800 Subject: [PATCH] Update Dataprep with Parameter Settings (#351) * update dataprep with parameter settings Signed-off-by: letonghan * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update port Signed-off-by: letonghan --------- Signed-off-by: letonghan Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Signed-off-by: sharanshirodkar7 --- comps/dataprep/redis/README.md | 30 +++++++++++++++++++ .../docker/docker-compose-dataprep-redis.yaml | 15 ++++++++++ .../redis/langchain/prepare_doc_redis.py | 5 +++- 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/comps/dataprep/redis/README.md b/comps/dataprep/redis/README.md index 1861bbb4bd..8d1d29a976 100644 --- a/comps/dataprep/redis/README.md +++ b/comps/dataprep/redis/README.md @@ -44,6 +44,32 @@ export LANGCHAIN_PROJECT="opea/gen-ai-comps:dataprep" export PYTHONPATH=${path_to_comps} ``` +## 1.4 Start Embedding Service + +First, you need to start a TEI service. + +```bash +your_port=6006 +model="BAAI/bge-large-en-v1.5" +revision="refs/pr/5" +docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model --revision $revision +``` + +Then you need to test your TEI service using the following commands: + +```bash +curl localhost:$your_port/embed \ + -X POST \ + -d '{"inputs":"What is Deep Learning?"}' \ + -H 'Content-Type: application/json' +``` + +After checking that it works, set up environment variables. + +```bash +export TEI_ENDPOINT="http://localhost:$your_port" +``` + ## 1.4 Start Document Preparation Microservice for Redis with Python Script Start document preparation microservice for Redis with below command. @@ -69,6 +95,10 @@ Please refer to this [readme](../../vectorstores/langchain/redis/README.md). ## 2.2 Setup Environment Variables ```bash +export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" +export TEI_ENDPOINT="http://${your_ip}:6006" +export REDIS_HOST=${your_ip} +export REDIS_PORT=6379 export REDIS_URL="redis://${your_ip}:6379" export INDEX_NAME=${your_index_name} export LANGCHAIN_TRACING_V2=true diff --git a/comps/dataprep/redis/langchain/docker/docker-compose-dataprep-redis.yaml b/comps/dataprep/redis/langchain/docker/docker-compose-dataprep-redis.yaml index e2775972de..74e2bb78ff 100644 --- a/comps/dataprep/redis/langchain/docker/docker-compose-dataprep-redis.yaml +++ b/comps/dataprep/redis/langchain/docker/docker-compose-dataprep-redis.yaml @@ -9,6 +9,19 @@ services: ports: - "6379:6379" - "8001:8001" + tei-embedding-service: + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + container_name: tei-embedding-server + ports: + - "6006:80" + volumes: + - "./data:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate dataprep-redis: image: opea/dataprep-redis:latest container_name: dataprep-redis-server @@ -21,6 +34,8 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} + REDIS_HOST: ${REDIS_HOST} + REDIS_PORT: ${REDIS_PORT} REDIS_URL: ${REDIS_URL} INDEX_NAME: ${INDEX_NAME} TEI_ENDPOINT: ${TEI_ENDPOINT} diff --git a/comps/dataprep/redis/langchain/prepare_doc_redis.py b/comps/dataprep/redis/langchain/prepare_doc_redis.py index 0cfda72c10..786a219ff3 100644 --- a/comps/dataprep/redis/langchain/prepare_doc_redis.py +++ b/comps/dataprep/redis/langchain/prepare_doc_redis.py @@ -118,7 +118,10 @@ def ingest_data_to_redis(doc_path: DocPath): text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) else: text_splitter = RecursiveCharacterTextSplitter( - chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True, separators=get_separators() + chunk_size=doc_path.chunk_size, + chunk_overlap=doc_path.chunk_overlap, + add_start_index=True, + separators=get_separators(), ) content = document_loader(path)