From 10748ecd75df39c6239e62cd42c0da5f158ef968 Mon Sep 17 00:00:00 2001 From: letonghan Date: Fri, 26 Jul 2024 09:57:46 +0800 Subject: [PATCH 1/3] update dataprep with parameter settings Signed-off-by: letonghan --- comps/dataprep/redis/README.md | 30 +++++++++++++++++++ .../docker/docker-compose-dataprep-redis.yaml | 15 ++++++++++ .../redis/langchain/prepare_doc_redis.py | 2 +- 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/comps/dataprep/redis/README.md b/comps/dataprep/redis/README.md index 58fe3b34d..a50d51903 100644 --- a/comps/dataprep/redis/README.md +++ b/comps/dataprep/redis/README.md @@ -44,6 +44,32 @@ export LANGCHAIN_PROJECT="opea/gen-ai-comps:dataprep" export PYTHONPATH=${path_to_comps} ``` +## 1.4 Start Embedding Service + +First, you need to start a TEI service. + +```bash +your_port=8090 +model="BAAI/bge-large-en-v1.5" +revision="refs/pr/5" +docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model --revision $revision +``` + +Then you need to test your TEI service using the following command: + +```bash +curl localhost:$your_port/embed \ -X POST \ -d '{"inputs":"What is Deep Learning?"}' \ -H 'Content-Type: application/json' +``` + +After checking that it works, set up environment variables. + +```bash +export TEI_ENDPOINT="http://localhost:$your_port" +``` + ## 1.4 Start Document Preparation Microservice for Redis with Python Script Start document preparation microservice for Redis with below command. @@ -69,6 +95,10 @@ Please refer to this [readme](../../../vectorstores/langchain/redis/README.md). 
## 2.2 Setup Environment Variables ```bash +export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" +export TEI_ENDPOINT="http://${your_ip}:6006" +export REDIS_HOST=${your_ip} +export REDIS_PORT=6379 export REDIS_URL="redis://${your_ip}:6379" export INDEX_NAME=${your_index_name} export LANGCHAIN_TRACING_V2=true diff --git a/comps/dataprep/redis/langchain/docker/docker-compose-dataprep-redis.yaml b/comps/dataprep/redis/langchain/docker/docker-compose-dataprep-redis.yaml index e2775972d..74e2bb78f 100644 --- a/comps/dataprep/redis/langchain/docker/docker-compose-dataprep-redis.yaml +++ b/comps/dataprep/redis/langchain/docker/docker-compose-dataprep-redis.yaml @@ -9,6 +9,19 @@ services: ports: - "6379:6379" - "8001:8001" + tei-embedding-service: + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + container_name: tei-embedding-server + ports: + - "6006:80" + volumes: + - "./data:/data" + shm_size: 1g + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate dataprep-redis: image: opea/dataprep-redis:latest container_name: dataprep-redis-server @@ -21,6 +34,8 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} + REDIS_HOST: ${REDIS_HOST} + REDIS_PORT: ${REDIS_PORT} REDIS_URL: ${REDIS_URL} INDEX_NAME: ${INDEX_NAME} TEI_ENDPOINT: ${TEI_ENDPOINT} diff --git a/comps/dataprep/redis/langchain/prepare_doc_redis.py b/comps/dataprep/redis/langchain/prepare_doc_redis.py index 2a99a11ee..3a516aabd 100644 --- a/comps/dataprep/redis/langchain/prepare_doc_redis.py +++ b/comps/dataprep/redis/langchain/prepare_doc_redis.py @@ -118,7 +118,7 @@ def ingest_data_to_redis(doc_path: DocPath): text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) else: text_splitter = RecursiveCharacterTextSplitter( - chunk_size=doc_path.chunk_size, chunk_overlap=100, add_start_index=True, separators=get_separators() + 
chunk_size=doc_path.chunk_size, chunk_overlap=doc_path.chunk_overlap, add_start_index=True, separators=get_separators() ) content = document_loader(path) From c03da8655dea8302b8a284b4e8a06fd4ca00d44b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 26 Jul 2024 02:02:33 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/dataprep/redis/langchain/prepare_doc_redis.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/comps/dataprep/redis/langchain/prepare_doc_redis.py b/comps/dataprep/redis/langchain/prepare_doc_redis.py index 3a516aabd..73d5f6ba4 100644 --- a/comps/dataprep/redis/langchain/prepare_doc_redis.py +++ b/comps/dataprep/redis/langchain/prepare_doc_redis.py @@ -118,7 +118,10 @@ def ingest_data_to_redis(doc_path: DocPath): text_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) else: text_splitter = RecursiveCharacterTextSplitter( - chunk_size=doc_path.chunk_size, chunk_overlap=doc_path.chunk_overlap, add_start_index=True, separators=get_separators() + chunk_size=doc_path.chunk_size, + chunk_overlap=doc_path.chunk_overlap, + add_start_index=True, + separators=get_separators(), ) content = document_loader(path) From c46121eff7f0556ee2ae26a46cc50ea090db0123 Mon Sep 17 00:00:00 2001 From: letonghan Date: Fri, 26 Jul 2024 12:04:50 +0800 Subject: [PATCH 3/3] update port Signed-off-by: letonghan --- comps/dataprep/redis/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/dataprep/redis/README.md b/comps/dataprep/redis/README.md index a50d51903..58bb42897 100644 --- a/comps/dataprep/redis/README.md +++ b/comps/dataprep/redis/README.md @@ -49,7 +49,7 @@ export PYTHONPATH=${path_to_comps} First, you need to start a TEI service. 
```bash -your_port=8090 +your_port=6006 model="BAAI/bge-large-en-v1.5" revision="refs/pr/5" docker run -p $your_port:80 -v ./data:/data --name tei_server -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model --revision $revision