diff --git a/.github/workflows/_comps-workflow.yml b/.github/workflows/_comps-workflow.yml index 6452209ee..12db04eb7 100644 --- a/.github/workflows/_comps-workflow.yml +++ b/.github/workflows/_comps-workflow.yml @@ -63,7 +63,7 @@ jobs: git clone https://github.com/vllm-project/vllm.git vllm-openvino cd ./vllm-openvino && git checkout v0.6.1 && git rev-parse HEAD && cd ../ fi - if [[ $(grep -c "vllm-hpu:" ${docker_compose_yml}) != 0 ]]; then + if [[ $(grep -c "vllm-gaudi:" ${docker_compose_yml}) != 0 ]]; then git clone https://github.com/HabanaAI/vllm-fork.git vllm-fork cd vllm-fork && git checkout 3c39626 && cd ../ fi diff --git a/.github/workflows/docker/compose/llms-compose.yaml b/.github/workflows/docker/compose/llms-compose.yaml index 91fbb46d4..984d59e9d 100644 --- a/.github/workflows/docker/compose/llms-compose.yaml +++ b/.github/workflows/docker/compose/llms-compose.yaml @@ -36,12 +36,12 @@ services: context: vllm-openvino dockerfile: Dockerfile.openvino image: ${REGISTRY:-opea}/vllm-openvino:${TAG:-latest} - vllm-hpu: + vllm-gaudi: build: context: vllm-fork dockerfile: Dockerfile.hpu shm_size: '128g' - image: ${REGISTRY:-opea}/vllm-hpu:${TAG:-latest} + image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest} vllm-arc: build: dockerfile: comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_gpu diff --git a/.github/workflows/docker/compose/lvms-compose.yaml b/.github/workflows/docker/compose/lvms-compose.yaml index e8ed56a2c..6e89db0e3 100644 --- a/.github/workflows/docker/compose/lvms-compose.yaml +++ b/.github/workflows/docker/compose/lvms-compose.yaml @@ -23,10 +23,10 @@ services: build: dockerfile: comps/lvms/llava/Dockerfile image: ${REGISTRY:-opea}/lvm-llava-svc:${TAG:-latest} - llava-hpu: + llava-gaudi: build: dockerfile: comps/lvms/llava/dependency/Dockerfile.intel_hpu - image: ${REGISTRY:-opea}/llava-hpu:${TAG:-latest} + image: ${REGISTRY:-opea}/llava-gaudi:${TAG:-latest} lvm-predictionguard: build: dockerfile: comps/lvms/predictionguard/Dockerfile diff --git a/comps/agent/langchain/README.md b/comps/agent/langchain/README.md index 2ff934f6c..585ff5d96 100644 --- a/comps/agent/langchain/README.md +++ b/comps/agent/langchain/README.md @@ -93,10 +93,10 @@ export vllm_volume=${YOUR_LOCAL_DIR_FOR_MODELS} # build vLLM image git clone https://github.com/HabanaAI/vllm-fork.git cd ./vllm-fork -docker build -f Dockerfile.hpu -t opea/vllm-hpu:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy +docker build -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy # TGI serving -docker run -d --runtime=habana --rm --name "comps-vllm-gaudi-service" -p 8080:80 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm-hpu:latest --model ${model} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 --enable-auto-tool-choice --tool-call-parser mistral +docker run -d --runtime=habana --rm --name "comps-vllm-gaudi-service" -p 8080:80 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm-gaudi:latest --model ${model} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 --enable-auto-tool-choice --tool-call-parser mistral # check status docker logs comps-vllm-gaudi-service diff --git a/comps/llms/faq-generation/vllm/langchain/README.md b/comps/llms/faq-generation/vllm/langchain/README.md index b04cfc9d0..132521c4e 100644 --- a/comps/llms/faq-generation/vllm/langchain/README.md +++ b/comps/llms/faq-generation/vllm/langchain/README.md @@ -35,7 +35,7 @@ You can choose one as needed. ### 1.3 Run Docker with CLI (Option A) ```bash -docker run -d -p 8008:80 -v ./data:/data --name vllm-service --shm-size 1g opea/vllm:hpu --model-id ${LLM_MODEL_ID} +docker run -d -p 8008:80 -v ./data:/data --name vllm-service --shm-size 1g opea/vllm-gaudi:latest --model-id ${LLM_MODEL_ID} ``` ```bash diff --git a/comps/llms/faq-generation/vllm/langchain/docker_compose_llm.yaml b/comps/llms/faq-generation/vllm/langchain/docker_compose_llm.yaml index d0a00af5b..8ed64dd97 100644 --- a/comps/llms/faq-generation/vllm/langchain/docker_compose_llm.yaml +++ b/comps/llms/faq-generation/vllm/langchain/docker_compose_llm.yaml @@ -5,7 +5,7 @@ version: "3.8" services: vllm-service: - image: opea/vllm:hpu + image: opea/vllm-gaudi:latest container_name: vllm-gaudi-server ports: - "8008:80" diff --git a/comps/llms/summarization/vllm/langchain/README.md b/comps/llms/summarization/vllm/langchain/README.md index bdb8f9beb..dafc6e109 100644 --- a/comps/llms/summarization/vllm/langchain/README.md +++ b/comps/llms/summarization/vllm/langchain/README.md @@ -18,7 +18,7 @@ pip install -r requirements.txt ```bash export HF_TOKEN=${your_hf_api_token} export LLM_MODEL_ID=${your_hf_llm_model} -docker run -p 8008:80 -v ./data:/data --name llm-docsum-vllm --shm-size 1g opea/vllm:hpu --model-id ${LLM_MODEL_ID} +docker run -p 8008:80 -v ./data:/data --name llm-docsum-vllm --shm-size 1g opea/vllm-gaudi:latest --model-id ${LLM_MODEL_ID} ``` ### 1.3 Verify the vLLM Service diff --git a/comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml b/comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml index 8cc13e318..b93fd8030 100644 --- a/comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml +++ b/comps/llms/summarization/vllm/langchain/docker_compose_llm.yaml @@ -5,7 +5,7 @@ version: "3.8" services: vllm-service: - image: opea/vllm:hpu + image: opea/vllm-gaudi:latest container_name: vllm-gaudi-server ports: - "8008:80" diff --git a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh index aa189df0c..bcbf20c4a 100644 --- a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh +++ b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh @@ -38,7 +38,7 @@ if [ "$hw_mode" = "hpu" ]; then git clone https://github.com/HabanaAI/vllm-fork.git cd ./vllm-fork/ git checkout 3c39626 - docker build -f Dockerfile.hpu -t opea/vllm-hpu:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy + docker build -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy cd .. rm -rf vllm-fork else diff --git a/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh b/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh index 421112b68..6f6a7d211 100644 --- a/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh +++ b/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh @@ -38,7 +38,7 @@ volume=$PWD/data # Build the Docker run command based on hardware mode if [ "$hw_mode" = "hpu" ]; then - docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm-hpu:latest --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture + docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm-gaudi:latest --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture else docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm-cpu:latest --model $model_name --host 0.0.0.0 --port 80 fi diff --git a/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml b/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml index cb0dc2216..e817c9f35 100644 --- a/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml +++ b/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml @@ -5,7 +5,7 @@ version: "3.8" services: vllm-service: - image: opea/vllm-hpu:latest + image: opea/vllm-gaudi:latest container_name: vllm-gaudi-server ports: - "8008:80" diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh b/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh index 7bd162954..c94dd7237 100644 --- a/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh +++ b/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh @@ -33,7 +33,7 @@ if [ "$hw_mode" = "hpu" ]; then git clone https://github.com/HabanaAI/vllm-fork.git cd ./vllm-fork/ git checkout 3c39626 - docker build -f Dockerfile.hpu -t opea/vllm-hpu:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy + docker build -f Dockerfile.hpu -t opea/vllm-gaudi:latest --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy cd .. rm -rf vllm-fork else diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh b/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh index 300d8a551..d3363aa40 100644 --- a/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh +++ b/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh @@ -38,7 +38,7 @@ volume=$PWD/data # Build the Docker run command based on hardware mode if [ "$hw_mode" = "hpu" ]; then - docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/vllm-hpu:latest --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture + docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/vllm-gaudi:latest --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture else docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm-cpu:latest --model $model_name --host 0.0.0.0 --port 80 fi diff --git a/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml b/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml index e6ccae55f..eeed7d19a 100644 --- a/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml +++ b/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml @@ -5,7 +5,7 @@ version: "3.8" services: vllm-service: - image: opea/vllm-hpu:latest + image: opea/vllm-gaudi:latest container_name: vllm-gaudi-server ports: - "8008:80" diff --git a/tests/agent/test_agent_langchain_on_intel_hpu.sh b/tests/agent/test_agent_langchain_on_intel_hpu.sh index 14eb874ae..9ba25228a 100644 --- a/tests/agent/test_agent_langchain_on_intel_hpu.sh +++ b/tests/agent/test_agent_langchain_on_intel_hpu.sh @@ -51,12 +51,12 @@ function build_vllm_docker_images() { git clone https://github.com/HabanaAI/vllm-fork.git fi cd ./vllm-fork - docker build -f Dockerfile.hpu -t opea/vllm-hpu:comps --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy + docker build -f Dockerfile.hpu -t opea/vllm-gaudi:comps --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy if [ $? -ne 0 ]; then - echo "opea/vllm-hpu:comps failed" + echo "opea/vllm-gaudi:comps failed" exit 1 else - echo "opea/vllm-hpu:comps successful" + echo "opea/vllm-gaudi:comps successful" fi } @@ -88,7 +88,7 @@ function start_vllm_service() { #single card echo "start vllm gaudi service" echo "**************model is $model**************" - docker run -d --runtime=habana --rm --name "test-comps-vllm-gaudi-service" -e HABANA_VISIBLE_DEVICES=all -p $vllm_port:80 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm-hpu:comps --model ${model} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 + docker run -d --runtime=habana --rm --name "test-comps-vllm-gaudi-service" -e HABANA_VISIBLE_DEVICES=all -p $vllm_port:80 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm-gaudi:comps --model ${model} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 sleep 5s echo "Waiting vllm gaudi ready" n=0 @@ -115,7 +115,7 @@ function start_vllm_auto_tool_choice_service() { #single card echo "start vllm gaudi service" echo "**************auto_tool model is $model**************" - docker run -d --runtime=habana --rm --name "test-comps-vllm-gaudi-service" -e HABANA_VISIBLE_DEVICES=all -p $vllm_port:80 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm-hpu:comps --model ${model} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 --enable-auto-tool-choice --tool-call-parser ${model_parser} + docker run -d --runtime=habana --rm --name "test-comps-vllm-gaudi-service" -e HABANA_VISIBLE_DEVICES=all -p $vllm_port:80 -v $vllm_volume:/data -e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data -e OMPI_MCA_btl_vader_single_copy_mechanism=none -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e VLLM_SKIP_WARMUP=true --cap-add=sys_nice --ipc=host opea/vllm-gaudi:comps --model ${model} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 --enable-auto-tool-choice --tool-call-parser ${model_parser} sleep 5s echo "Waiting vllm gaudi ready" n=0 diff --git a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh index 073ee5736..b1fd41e9a 100644 --- a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh @@ -13,12 +13,12 @@ function build_docker_images() { git clone https://github.com/HabanaAI/vllm-fork.git cd vllm-fork/ git checkout 3c39626 - docker build --no-cache -f Dockerfile.hpu -t opea/vllm-hpu:comps --shm-size=128g . + docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:comps --shm-size=128g . if [ $? -ne 0 ]; then - echo "opea/vllm-hpu built fail" + echo "opea/vllm-gaudi built fail" exit 1 else - echo "opea/vllm-hpu built successful" + echo "opea/vllm-gaudi built successful" fi ## Build OPEA microservice docker @@ -47,7 +47,7 @@ function start_service() { --cap-add=sys_nice \ --ipc=host \ -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \ - opea/vllm-hpu:comps \ + opea/vllm-gaudi:comps \ --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 export vLLM_ENDPOINT="http://${ip_address}:${port_number}" diff --git a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh index 62626508a..3019d6c08 100644 --- a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh +++ b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh @@ -13,12 +13,12 @@ function build_docker_images() { git clone https://github.com/HabanaAI/vllm-fork.git cd vllm-fork/ git checkout 3c39626 - docker build --no-cache -f Dockerfile.hpu -t opea/vllm-hpu:comps --shm-size=128g . + docker build --no-cache -f Dockerfile.hpu -t opea/vllm-gaudi:comps --shm-size=128g . if [ $? -ne 0 ]; then - echo "opea/vllm-hpu built fail" + echo "opea/vllm-gaudi built fail" exit 1 else - echo "opea/vllm-hpu built successful" + echo "opea/vllm-gaudi built successful" fi ## Build OPEA microservice docker @@ -47,7 +47,7 @@ function start_service() { --cap-add=sys_nice \ --ipc=host \ -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \ - opea/vllm-hpu:comps \ + opea/vllm-gaudi:comps \ --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048 export vLLM_ENDPOINT="http://${ip_address}:${port_number}"