From 535f33978b162361ec45a2f79d8fbd7ea28948ff Mon Sep 17 00:00:00 2001
From: lvliang-intel
Date: Tue, 4 Jun 2024 08:28:08 +0800
Subject: [PATCH 1/8] Fix RAG performance issues

Signed-off-by: lvliang-intel
---
 comps/embeddings/langchain/local_embedding.py |  2 +-
 .../langchain/guardrails_tgi_gaudi.py         |  4 +--
 comps/llms/summarization/tgi/llm.py           | 29 +++++++++----------
 comps/llms/text-generation/tgi/llm.py         | 27 +++++++++--------
 comps/retrievers/langchain/retriever_redis.py | 18 ++++++------
 5 files changed, 39 insertions(+), 41 deletions(-)

diff --git a/comps/embeddings/langchain/local_embedding.py b/comps/embeddings/langchain/local_embedding.py
index 0462a792c..5740eda07 100644
--- a/comps/embeddings/langchain/local_embedding.py
+++ b/comps/embeddings/langchain/local_embedding.py
@@ -17,11 +17,11 @@
 )
 @opea_telemetry
 def embedding(input: TextDoc) -> EmbedDoc1024:
-    embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en-v1.5")
     embed_vector = embeddings.embed_query(input.text)
     res = EmbedDoc1024(text=input.text, embedding=embed_vector)
     return res


 if __name__ == "__main__":
+    embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-large-en-v1.5")
     opea_microservices["opea_service@local_embedding"].start()

diff --git a/comps/guardrails/langchain/guardrails_tgi_gaudi.py b/comps/guardrails/langchain/guardrails_tgi_gaudi.py
index f42c3e60a..002cbc6ce 100644
--- a/comps/guardrails/langchain/guardrails_tgi_gaudi.py
+++ b/comps/guardrails/langchain/guardrails_tgi_gaudi.py
@@ -49,8 +49,6 @@ def get_unsafe_dict(model_id="meta-llama/LlamaGuard-7b"):
 )
 @traceable(run_type="llm")
 def safety_guard(input: TextDoc) -> TextDoc:
-    # chat engine for server-side prompt templating
-    llm_engine_hf = ChatHuggingFace(llm=llm_guard)
     response_input_guard = llm_engine_hf.invoke([{"role": "user", "content": input.text}]).content
     if "unsafe" in response_input_guard:
         unsafe_dict = get_unsafe_dict(llm_engine_hf.model_id)
@@ -75,5 +73,7 @@ def safety_guard(input: TextDoc) -> TextDoc:
         temperature=0.01,
         repetition_penalty=1.03,
     )
+    # chat engine for server-side prompt templating
+    llm_engine_hf = ChatHuggingFace(llm=llm_guard)
     print("guardrails - router] LLM initialized.")
     opea_microservices["opea_service@guardrails_tgi_gaudi"].start()

diff --git a/comps/llms/summarization/tgi/llm.py b/comps/llms/summarization/tgi/llm.py
index e1cce35a5..f688d7bd1 100644
--- a/comps/llms/summarization/tgi/llm.py
+++ b/comps/llms/summarization/tgi/llm.py
@@ -34,22 +34,7 @@ def post_process_text(text: str):
 )
 @traceable(run_type="llm")
 def llm_generate(input: LLMParamsDoc):
-    llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
-    llm = HuggingFaceEndpoint(
-        endpoint_url=llm_endpoint,
-        max_new_tokens=input.max_new_tokens,
-        top_k=input.top_k,
-        top_p=input.top_p,
-        typical_p=input.typical_p,
-        temperature=input.temperature,
-        repetition_penalty=input.repetition_penalty,
-        streaming=input.streaming,
-    )
-    llm_chain = load_summarize_chain(llm=llm, chain_type="map_reduce")
-
     if input.streaming:
-        # Split text
-        text_splitter = CharacterTextSplitter()
         texts = text_splitter.split_text(input.query)
         # Create multiple documents
         docs = [Document(page_content=t) for t in texts]
@@ -72,4 +57,18 @@ async def stream_generator():


 if __name__ == "__main__":
+    llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
+    llm = HuggingFaceEndpoint(
+        endpoint_url=llm_endpoint,
+        max_new_tokens=input.max_new_tokens,
+        top_k=input.top_k,
+        top_p=input.top_p,
+        typical_p=input.typical_p,
+        temperature=input.temperature,
+        repetition_penalty=input.repetition_penalty,
+        streaming=input.streaming,
+    )
+    llm_chain = load_summarize_chain(llm=llm, chain_type="map_reduce")
+    # Split text
+    text_splitter = CharacterTextSplitter()
     opea_microservices["opea_service@llm_docsum"].start()

diff --git a/comps/llms/text-generation/tgi/llm.py b/comps/llms/text-generation/tgi/llm.py
index ff1b2bb92..231aabbf8 100644
--- a/comps/llms/text-generation/tgi/llm.py
+++ b/comps/llms/text-generation/tgi/llm.py
@@ -19,21 +19,7 @@
 )
 @traceable(run_type="llm")
 def llm_generate(input: LLMParamsDoc):
-    llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
-    llm = HuggingFaceEndpoint(
-        endpoint_url=llm_endpoint,
-        max_new_tokens=input.max_new_tokens,
-        top_k=input.top_k,
-        top_p=input.top_p,
-        typical_p=input.typical_p,
-        temperature=input.temperature,
-        repetition_penalty=input.repetition_penalty,
-        streaming=input.streaming,
-        timeout=600,
-    )
-
     if input.streaming:
-
         async def stream_generator():
             chat_response = ""
             async for text in llm.astream(input.query):
@@ -51,4 +37,17 @@ async def stream_generator():


 if __name__ == "__main__":
+    llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
+    llm = HuggingFaceEndpoint(
+        endpoint_url=llm_endpoint,
+        max_new_tokens=input.max_new_tokens,
+        top_k=input.top_k,
+        top_p=input.top_p,
+        typical_p=input.typical_p,
+        temperature=input.temperature,
+        repetition_penalty=input.repetition_penalty,
+        streaming=input.streaming,
+        timeout=600,
+    )
+
     opea_microservices["opea_service@llm_tgi"].start()

diff --git a/comps/retrievers/langchain/retriever_redis.py b/comps/retrievers/langchain/retriever_redis.py
index c4f5655c0..15bf65add 100644
--- a/comps/retrievers/langchain/retriever_redis.py
+++ b/comps/retrievers/langchain/retriever_redis.py
@@ -22,6 +22,15 @@
 )
 @traceable(run_type="retriever")
 def retrieve(input: EmbedDoc768) -> SearchedDoc:
+    search_res = vector_db.similarity_search_by_vector(embedding=input.embedding)
+    searched_docs = []
+    for r in search_res:
+        searched_docs.append(TextDoc(text=r.page_content))
+    result = SearchedDoc(retrieved_docs=searched_docs, initial_query=input.text)
+    return result
+
+
+if __name__ == "__main__":
     # Create vectorstore
     if tei_embedding_endpoint:
         # create embeddings using TEI endpoint service
         embeddings = HuggingFaceHubEmbeddings(model=tei_embedding_endpoint)
     else:
         # create embeddings using local embedding model
         embeddings = HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
     vector_db = Redis.from_existing_index(
         embeddings,
         index_name=INDEX_NAME,
         redis_url=REDIS_URL,
         schema=INDEX_SCHEMA,
     )
@@ -36,13 +45,4 @@ def retrieve(input: EmbedDoc768) -> SearchedDoc:
         redis_url=REDIS_URL,
         schema=INDEX_SCHEMA,
     )
-    search_res = vector_db.similarity_search_by_vector(embedding=input.embedding)
-    searched_docs = []
-    for r in search_res:
-        searched_docs.append(TextDoc(text=r.page_content))
-    result = SearchedDoc(retrieved_docs=searched_docs, initial_query=input.text)
-    return result
-
-
-if __name__ == "__main__":
     opea_microservices["opea_service@retriever_redis"].start()

From 9c4e8b3c02c29fefbc0472544c1dcba84f18a8e6 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 4 Jun 2024 00:31:25 +0000
Subject: [PATCH 2/8] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 comps/llms/text-generation/tgi/llm.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/comps/llms/text-generation/tgi/llm.py b/comps/llms/text-generation/tgi/llm.py
index 231aabbf8..ec3d95b83 100644
--- a/comps/llms/text-generation/tgi/llm.py
+++ b/comps/llms/text-generation/tgi/llm.py
@@ -20,6 +20,7 @@
 @traceable(run_type="llm")
 def llm_generate(input: LLMParamsDoc):
     if input.streaming:
+
         async def stream_generator():
             chat_response = ""
             async for text in llm.astream(input.query):

From f2cf241e37b176bf066b3dc74f7bc149430f6690 Mon Sep 17 00:00:00 2001
From: lvliang-intel
Date: Tue, 4 Jun 2024 08:57:30 +0800
Subject: [PATCH 3/8] add trace for debugging

Signed-off-by: lvliang-intel
---
 tests/test_llms.sh    | 8 +++++---
 tests/test_reranks.sh | 4 +++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/test_llms.sh b/tests/test_llms.sh
index ffbcdc657..8ef6e25a6 100644
--- a/tests/test_llms.sh
+++ b/tests/test_llms.sh
@@ -2,7 +2,7 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-set -xe
+set -x
 
 WORKPATH=$(dirname "$PWD")
 ip_address=$(hostname -I | awk '{print $1}')
@@ -27,9 +27,9 @@ function start_service() {
     # check whether tgi is fully ready
     n=0
     until [[ "$n" -ge 100 ]] || [[ $ready == true ]]; do
-        docker logs test-comps-llm-tgi-endpoint > test-comps-llm-tgi-endpoint.log
+        docker logs test-comps-llm-tgi-endpoint > ${WORKPATH}/tests/test-comps-llm-tgi-endpoint.log
         n=$((n+1))
-        if grep -q Connected test-comps-llm-tgi-endpoint.log; then
+        if grep -q Connected ${WORKPATH}/tests/test-comps-llm-tgi-endpoint.log; then
             break
         fi
         sleep 5s
@@ -44,6 +44,8 @@ function validate_microservice() {
         -X POST \
         -d '{"query":"What is Deep Learning?"}' \
         -H 'Content-Type: application/json'
+    docker logs test-comps-llm-tgi-endpoint
+    docker logs test-comps-llm-tgi-server
 }
 
 function stop_docker() {
diff --git a/tests/test_reranks.sh b/tests/test_reranks.sh
index 9fdf58725..ce2b11d0f 100644
--- a/tests/test_reranks.sh
+++ b/tests/test_reranks.sh
@@ -2,7 +2,7 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-set -xe
+set -x
 
 WORKPATH=$(dirname "$PWD")
 ip_address=$(hostname -I | awk '{print $1}')
@@ -33,6 +33,8 @@ function validate_microservice() {
         -X POST \
         -d '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' \
         -H 'Content-Type: application/json'
+    docker logs test-comps-reranking-tei-server
+    docker logs test-comps-reranking-tei-endpoint
 }
 
 function stop_docker() {

From 365ea34392d3cdd9f18f9ec9ba461f1da8e50925 Mon Sep 17 00:00:00 2001
From: lvliang-intel
Date: Tue, 4 Jun 2024 16:13:50 +0800
Subject: [PATCH 4/8] add trace log

Signed-off-by: lvliang-intel
---
 tests/test_retrievers.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_retrievers.sh b/tests/test_retrievers.sh
index 57c611545..c0277a64d 100644
--- a/tests/test_retrievers.sh
+++ b/tests/test_retrievers.sh
@@ -2,7 +2,7 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-set -xe
+set -x
 
 WORKPATH=$(dirname "$PWD")
 ip_address=$(hostname -I | awk '{print $1}')
@@ -38,6 +38,8 @@ function validate_microservice() {
         -X POST \
         -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" \
         -H 'Content-Type: application/json'
+    docker logs test-comps-retriever-redis-server
+    docker logs test-comps-retriever-tei-endpoint
 }
 
 function stop_docker() {

From 557eb17bf3d826032f2005f06938e644653b3a64 Mon Sep 17 00:00:00 2001
From: lvliang-intel
Date: Tue, 4 Jun 2024 17:12:05 +0800
Subject: [PATCH 5/8] fix issues

Signed-off-by: lvliang-intel
---
 comps/llms/text-generation/tgi/llm.py | 26 +++++++++++++-------------
 tests/test_llms.sh                    |  2 +-
 tests/test_reranks.sh                 |  2 +-
 tests/test_retrievers.sh              |  4 ++--
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/comps/llms/text-generation/tgi/llm.py b/comps/llms/text-generation/tgi/llm.py
index ec3d95b83..ff1b2bb92 100644
--- a/comps/llms/text-generation/tgi/llm.py
+++ b/comps/llms/text-generation/tgi/llm.py
@@ -19,6 +19,19 @@
 )
 @traceable(run_type="llm")
 def llm_generate(input: LLMParamsDoc):
+    llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
+    llm = HuggingFaceEndpoint(
+        endpoint_url=llm_endpoint,
+        max_new_tokens=input.max_new_tokens,
+        top_k=input.top_k,
+        top_p=input.top_p,
+        typical_p=input.typical_p,
+        temperature=input.temperature,
+        repetition_penalty=input.repetition_penalty,
+        streaming=input.streaming,
+        timeout=600,
+    )
+
     if input.streaming:
 
         async def stream_generator():
@@ -38,17 +51,4 @@ async def stream_generator():


 if __name__ == "__main__":
-    llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
-    llm = HuggingFaceEndpoint(
-        endpoint_url=llm_endpoint,
-        max_new_tokens=input.max_new_tokens,
-        top_k=input.top_k,
-        top_p=input.top_p,
-        typical_p=input.typical_p,
-        temperature=input.temperature,
-        repetition_penalty=input.repetition_penalty,
-        streaming=input.streaming,
-        timeout=600,
-    )
-
     opea_microservices["opea_service@llm_tgi"].start()

diff --git a/tests/test_llms.sh b/tests/test_llms.sh
index 8ef6e25a6..9d474ee28 100644
--- a/tests/test_llms.sh
+++ b/tests/test_llms.sh
@@ -2,7 +2,7 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-set -x
+set -xe
 
 WORKPATH=$(dirname "$PWD")
 ip_address=$(hostname -I | awk '{print $1}')
diff --git a/tests/test_reranks.sh b/tests/test_reranks.sh
index ce2b11d0f..38db4d3fc 100644
--- a/tests/test_reranks.sh
+++ b/tests/test_reranks.sh
@@ -2,7 +2,7 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-set -x
+set -xe
 
 WORKPATH=$(dirname "$PWD")
 ip_address=$(hostname -I | awk '{print $1}')
diff --git a/tests/test_retrievers.sh b/tests/test_retrievers.sh
index c0277a64d..10889e4d3 100644
--- a/tests/test_retrievers.sh
+++ b/tests/test_retrievers.sh
@@ -2,7 +2,7 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-set -x
+set -xe
 
 WORKPATH=$(dirname "$PWD")
 ip_address=$(hostname -I | awk '{print $1}')
@@ -26,7 +26,7 @@ function start_service() {
     unset http_proxy
     docker run -d --name="test-comps-retriever-redis-server" -p ${retriever_port}:7000 --ipc=host -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME opea/retriever-redis:comps
-    sleep 3m
+    sleep 4m
 }
 
 function validate_microservice() {

From e23125a954564a1322747a2ee07c9ba25dc112d9 Mon Sep 17 00:00:00 2001
From: lvliang-intel
Date: Tue, 4 Jun 2024 17:29:06 +0800
Subject: [PATCH 6/8] add missing redis service

Signed-off-by: lvliang-intel
---
 comps/llms/summarization/tgi/llm.py | 29 +++++++++++++++--------------
 tests/test_retrievers.sh            |  7 +++++--
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/comps/llms/summarization/tgi/llm.py b/comps/llms/summarization/tgi/llm.py
index f688d7bd1..e1cce35a5 100644
--- a/comps/llms/summarization/tgi/llm.py
+++ b/comps/llms/summarization/tgi/llm.py
@@ -34,7 +34,22 @@ def post_process_text(text: str):
 )
 @traceable(run_type="llm")
 def llm_generate(input: LLMParamsDoc):
+    llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
+    llm = HuggingFaceEndpoint(
+        endpoint_url=llm_endpoint,
+        max_new_tokens=input.max_new_tokens,
+        top_k=input.top_k,
+        top_p=input.top_p,
+        typical_p=input.typical_p,
+        temperature=input.temperature,
+        repetition_penalty=input.repetition_penalty,
+        streaming=input.streaming,
+    )
+    llm_chain = load_summarize_chain(llm=llm, chain_type="map_reduce")
+
     if input.streaming:
+        # Split text
+        text_splitter = CharacterTextSplitter()
         texts = text_splitter.split_text(input.query)
         # Create multiple documents
         docs = [Document(page_content=t) for t in texts]
@@ -57,18 +72,4 @@ async def stream_generator():


 if __name__ == "__main__":
-    llm_endpoint = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
-    llm = HuggingFaceEndpoint(
-        endpoint_url=llm_endpoint,
-        max_new_tokens=input.max_new_tokens,
-        top_k=input.top_k,
-        top_p=input.top_p,
-        typical_p=input.typical_p,
-        temperature=input.temperature,
-        repetition_penalty=input.repetition_penalty,
-        streaming=input.streaming,
-    )
-    llm_chain = load_summarize_chain(llm=llm, chain_type="map_reduce")
-    # Split text
-    text_splitter = CharacterTextSplitter()
     opea_microservices["opea_service@llm_docsum"].start()

diff --git a/tests/test_retrievers.sh b/tests/test_retrievers.sh
index 10889e4d3..881db0aa3 100644
--- a/tests/test_retrievers.sh
+++ b/tests/test_retrievers.sh
@@ -12,15 +12,18 @@ function build_docker_images() {
 }
 
 function start_service() {
+    # redis
+    docker run -d --name test-redis-vector-db -p 5010:6379 -p 5011:8001 redis/redis-stack:7.2.0-v9
+
     # tei endpoint
     tei_endpoint=5008
-    model="BAAI/bge-large-en-v1.5"
+    model="BAAI/bge-base-en-v1.5"
     revision="refs/pr/5"
     docker run -d --name="test-comps-retriever-tei-endpoint" -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model --revision $revision
     export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}"
 
     # redis retriever
-    export REDIS_URL="redis://${ip_address}:6379"
+    export REDIS_URL="redis://${ip_address}:5010"
     export INDEX_NAME="rag-redis"
     retriever_port=5009
     unset http_proxy

From 9a9881ecfc22290c4b4b4dfef5f88dacf7b3cce7 Mon Sep 17 00:00:00 2001
From: lvliang-intel
Date: Tue, 4 Jun 2024 17:47:11 +0800
Subject: [PATCH 7/8] add more deplay

Signed-off-by: lvliang-intel
---
 tests/test_retrievers.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/test_retrievers.sh b/tests/test_retrievers.sh
index 881db0aa3..212a68255 100644
--- a/tests/test_retrievers.sh
+++ b/tests/test_retrievers.sh
@@ -2,7 +2,7 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-set -xe
+set -x
 
 WORKPATH=$(dirname "$PWD")
 ip_address=$(hostname -I | awk '{print $1}')
@@ -14,12 +14,14 @@ function build_docker_images() {
 function start_service() {
     # redis
     docker run -d --name test-redis-vector-db -p 5010:6379 -p 5011:8001 redis/redis-stack:7.2.0-v9
+    sleep 10s
 
     # tei endpoint
     tei_endpoint=5008
     model="BAAI/bge-base-en-v1.5"
     revision="refs/pr/5"
     docker run -d --name="test-comps-retriever-tei-endpoint" -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model --revision $revision
+    sleep 30s
     export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}"

From 42f92bcb20b3b47117d54f0671dda8fec2a6d495 Mon Sep 17 00:00:00 2001
From: lvliang-intel
Date: Sat, 8 Jun 2024 11:48:32 +0800
Subject: [PATCH 8/8] fix retriver microservice test issues

Signed-off-by: lvliang-intel
---
 tests/test_retrievers_langchain.sh | 22 ++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/tests/test_retrievers_langchain.sh b/tests/test_retrievers_langchain.sh
index 212a68255..2a9fd5463 100644
--- a/tests/test_retrievers_langchain.sh
+++ b/tests/test_retrievers_langchain.sh
@@ -2,25 +2,24 @@
 # Copyright (C) 2024 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
-set -x
+set -xe
 
 WORKPATH=$(dirname "$PWD")
 ip_address=$(hostname -I | awk '{print $1}')
 
 function build_docker_images() {
     cd $WORKPATH
-    docker build --no-cache -t opea/retriever-redis:comps -f comps/retrievers/langchain/docker/Dockerfile .
+    docker build --no-cache -t opea/retriever-redis:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/langchain/docker/Dockerfile .
 }
 
 function start_service() {
     # redis
-    docker run -d --name test-redis-vector-db -p 5010:6379 -p 5011:8001 redis/redis-stack:7.2.0-v9
+    docker run -d --name test-redis-vector-db -p 5010:6379 -p 5011:8001 -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy redis/redis-stack:7.2.0-v9
     sleep 10s
 
     # tei endpoint
     tei_endpoint=5008
     model="BAAI/bge-base-en-v1.5"
-    revision="refs/pr/5"
-    docker run -d --name="test-comps-retriever-tei-endpoint" -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model --revision $revision
+    docker run -d --name="test-comps-retriever-tei-endpoint" -p $tei_endpoint:80 -v ./data:/data --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.2 --model-id $model
     sleep 30s
     export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${tei_endpoint}"
@@ -31,7 +30,7 @@ function start_service() {
     unset http_proxy
     docker run -d --name="test-comps-retriever-redis-server" -p ${retriever_port}:7000 --ipc=host -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME opea/retriever-redis:comps
-    sleep 4m
+    sleep 3m
 }
 
 function validate_microservice() {
@@ -48,8 +47,15 @@ function validate_microservice() {
 }
 
 function stop_docker() {
-    cid=$(docker ps -aq --filter "name=test-comps-retrievers*")
-    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
+    cid_retrievers=$(docker ps -aq --filter "name=test-comps-retrievers*")
+    if [[ ! -z "$cid_retrievers" ]]; then
+        docker stop $cid_retrievers && docker rm $cid_retrievers && sleep 1s
+    fi
+
+    cid_redis=$(docker ps -aq --filter "name=test-redis-vector-db")
+    if [[ ! -z "$cid_redis" ]]; then
+        docker stop $cid_redis && docker rm $cid_redis && sleep 1s
+    fi
 }
 
 function main() {