diff --git a/.github/workflows/docker/compose/llms-compose-cd.yaml b/.github/workflows/docker/compose/llms-compose-cd.yaml
index c33bc0f3f..7dff6d5c6 100644
--- a/.github/workflows/docker/compose/llms-compose-cd.yaml
+++ b/.github/workflows/docker/compose/llms-compose-cd.yaml
@@ -23,10 +23,6 @@ services:
     build:
       dockerfile: comps/llms/text-generation/vllm/llama_index/Dockerfile
     image: ${REGISTRY:-opea}/llm-vllm-llamaindex:${TAG:-latest}
-  llm-vllm-llamaindex-hpu:
-    build:
-      dockerfile: comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu
-    image: ${REGISTRY:-opea}/llm-vllm-llamaindex-hpu:${TAG:-latest}
   llm-predictionguard:
     build:
       dockerfile: comps/llms/text-generation/predictionguard/Dockerfile
diff --git a/.github/workflows/docker/compose/llms-compose.yaml b/.github/workflows/docker/compose/llms-compose.yaml
index 904f7e1e7..c7ea529cd 100644
--- a/.github/workflows/docker/compose/llms-compose.yaml
+++ b/.github/workflows/docker/compose/llms-compose.yaml
@@ -24,10 +24,6 @@ services:
     build:
       dockerfile: comps/llms/text-generation/vllm/langchain/Dockerfile
     image: ${REGISTRY:-opea}/llm-vllm:${TAG:-latest}
-  llm-vllm-hpu:
-    build:
-      dockerfile: comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu
-    image: ${REGISTRY:-opea}/llm-vllm-hpu:${TAG:-latest}
   llm-vllm-ray:
     build:
       dockerfile: comps/llms/text-generation/vllm/ray/Dockerfile
diff --git a/comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu b/comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu
deleted file mode 100644
index f3703e4e7..000000000
--- a/comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 AS hpu
-
-RUN git clone https://github.com/HabanaAI/vllm-fork.git /workspace/vllm
-
-# COPY ./ /workspace/vllm
-
-WORKDIR /workspace/vllm
-
-RUN pip install --no-cache-dir -v -r requirements-hpu.txt
-
-ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
-
-RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
-
-WORKDIR /workspace/
-
-RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
-
-CMD ["/bin/bash"]
diff --git a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh
index a47bd23bf..da7ee3aaa 100644
--- a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh
+++ b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm.sh
@@ -30,7 +30,11 @@ fi

 # Build the docker image for vLLM based on the hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker build -f Dockerfile.intel_hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+    git clone https://github.com/HabanaAI/vllm-fork.git
+    cd ./vllm-fork/
+    docker build -f Dockerfile.hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+    cd ..
+    rm -rf vllm-fork
 else
     git clone https://github.com/vllm-project/vllm.git
     cd ./vllm/
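
With the local Dockerfile.intel_hpu gone, the HPU image for the LangChain path is built from a fresh checkout of HabanaAI/vllm-fork using the Dockerfile.hpu that ships with the fork. A minimal sketch of the equivalent manual build, assuming Docker is available and the proxy build args are only needed behind a corporate proxy:

    # Mirror of the new HPU branch in build_docker_vllm.sh: clone the fork,
    # build its Dockerfile.hpu, then clean up the checkout.
    git clone https://github.com/HabanaAI/vllm-fork.git
    cd vllm-fork
    docker build -f Dockerfile.hpu -t opea/vllm:hpu --shm-size=128g \
        --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy .
    cd .. && rm -rf vllm-fork
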
diff --git a/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh b/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh
index 0d97eeb47..a5b2ceb3b 100644
--- a/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh
+++ b/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service.sh
@@ -38,7 +38,7 @@ volume=$PWD/data

 # Build the Docker run command based on hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture "
+    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} opea/vllm:hpu --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture
 else
     docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HF_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm:cpu --model $model_name --host 0.0.0.0 --port 80
 fi
diff --git a/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml b/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml
index cd1e3cf54..acb620d16 100644
--- a/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml
+++ b/comps/llms/text-generation/vllm/langchain/docker_compose_llm.yaml
@@ -23,7 +23,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80"
+    command: --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80
   llm:
     image: opea/llm-vllm:latest
     container_name: llm-vllm-gaudi-server
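
The image built from the fork's Dockerfile.hpu starts vLLM's OpenAI-compatible API server as its entrypoint, so the launch script and compose file no longer wrap the call in /bin/bash -c; everything after the image name is passed to the server as flags. A readability-oriented sketch of that launch, assuming the script's own variables ($port_number, $model_name, $parallel_number, $block_size, $max_num_seqs, $max_seq_len_to_capture) are set as before:

    # Launch the Gaudi vLLM service; the trailing arguments go straight to the
    # API server baked into opea/vllm:hpu.
    docker run -d --rm --runtime=habana --name="vllm-service" \
        -p $port_number:80 \
        -e HABANA_VISIBLE_DEVICES=all \
        -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
        --cap-add=sys_nice --ipc=host \
        -e HF_TOKEN=${HF_TOKEN} \
        opea/vllm:hpu \
        --enforce-eager --model $model_name --tensor-parallel-size $parallel_number \
        --host 0.0.0.0 --port 80 --block-size $block_size \
        --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture
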
diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu b/comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu
deleted file mode 100644
index 8166f471e..000000000
--- a/comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: Apache-2.0
-
-FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0 AS hpu
-RUN useradd -m -s /bin/bash user && \
-    mkdir -p /home/user && \
-    chown -R user /home/user/
-ENV LANG=en_US.UTF-8
-RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
-    service ssh restart
-USER user
-WORKDIR /root
-
-RUN pip install --no-cache-dir --upgrade-strategy eager optimum[habana]
-
-RUN pip install --no-cache-dir -v git+https://github.com/HabanaAI/vllm-fork.git@cf6952d
-
-RUN pip install --no-cache-dir setuptools
-
-ENV PT_HPU_LAZY_ACC_PAR_MODE=0
-
-ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
-
-CMD ["/bin/bash"]
diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh b/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh
index b4a13d5fb..8b37fe048 100644
--- a/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh
+++ b/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh
@@ -30,7 +30,11 @@ fi

 # Build the docker image for vLLM based on the hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker build -f docker/Dockerfile.intel_hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+    git clone https://github.com/HabanaAI/vllm-fork.git
+    cd ./vllm-fork/
+    docker build -f Dockerfile.hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+    cd ..
+    rm -rf vllm-fork
 else
     git clone https://github.com/vllm-project/vllm.git
     cd ./vllm/
diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh b/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh
index 0c7ed90de..bdf46889f 100644
--- a/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh
+++ b/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh
@@ -38,7 +38,7 @@ volume=$PWD/data

 # Build the Docker run command based on hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture "
+    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/vllm:hpu --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture
 else
     docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm:cpu --model $model_name --host 0.0.0.0 --port 80
 fi
diff --git a/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml b/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml
index f754a13d5..94358acc6 100644
--- a/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml
+++ b/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml
@@ -23,7 +23,7 @@ services:
     cap_add:
       - SYS_NICE
     ipc: host
-    command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80"
+    command: --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80
   llm:
     image: opea/llm-vllm-llamaindex:latest
     container_name: llm-vllm-gaudi-server
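
The LlamaIndex variant gets the same treatment: the service's command is now just the vLLM flags. Once the vllm-service container is up, the OpenAI-compatible completions endpoint can be smoke-tested; the host port 8008 below is an assumption (use whatever the compose file maps to the container's port 80), and $LLM_MODEL must match the served model:

    # Hypothetical smoke test against vLLM's OpenAI-compatible REST API.
    curl http://localhost:8008/v1/completions \
        -H "Content-Type: application/json" \
        -d '{"model": "'"${LLM_MODEL}"'", "prompt": "What is deep learning?", "max_tokens": 32}'
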
diff --git a/comps/llms/text-generation/vllm/llama_index/llm.py b/comps/llms/text-generation/vllm/llama_index/llm.py
index 55bcec7dc..76afa24a9 100644
--- a/comps/llms/text-generation/vllm/llama_index/llm.py
+++ b/comps/llms/text-generation/vllm/llama_index/llm.py
@@ -66,7 +66,8 @@ async def stream_generator():

         return StreamingResponse(stream_generator(), media_type="text/event-stream")
     else:
-        response = await llm.acomplete(input.query).text
+        response = await llm.acomplete(input.query)
+        response = response.text
         if logflag:
             logger.info(response)
         return GeneratedDoc(text=response, prompt=input.query)
diff --git a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
index 6ecf5d2d6..5024b0c93 100644
--- a/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
+++ b/tests/llms/test_llms_text-generation_vllm_langchain_on_intel_hpu.sh
@@ -8,12 +8,11 @@
 WORKPATH=$(dirname "$PWD")
 ip_address=$(hostname -I | awk '{print $1}')

 function build_docker_images() {
-    ## Build VLLM Ray docker
-    cd $WORKPATH/comps/llms/text-generation/vllm/langchain/dependency
-    docker build \
-        -f Dockerfile.intel_hpu \
-        --no-cache -t opea/vllm-hpu:comps \
-        --shm-size=128g .
+    ## Build VLLM docker
+    cd $WORKPATH
+    git clone https://github.com/HabanaAI/vllm-fork.git
+    cd vllm-fork/
+    docker build --no-cache -f Dockerfile.hpu -t opea/vllm-hpu:comps --shm-size=128g .
     if [ $? -ne 0 ]; then
         echo "opea/vllm-hpu built fail"
         exit 1
@@ -48,7 +47,7 @@ function start_service() {
         --ipc=host \
         -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
         opea/vllm-hpu:comps \
-        /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"
+        --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048

     export vLLM_ENDPOINT="http://${ip_address}:${port_number}"
     docker run -d --rm \
@@ -65,7 +64,7 @@
     until [[ "$n" -ge 120 ]] || [[ $ready == true ]]; do
         docker logs test-comps-vllm-service > ${WORKPATH}/tests/test-comps-vllm-service.log
         n=$((n+1))
-        if grep -q Connected ${WORKPATH}/tests/test-comps-vllm-service.log; then
+        if grep -q throughput ${WORKPATH}/tests/test-comps-vllm-service.log; then
             break
         fi
         sleep 5s
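
Two behavioral fixes ride along with the test updates: llm.py now awaits llm.acomplete() before reading .text (previously .text was read off the un-awaited coroutine, which fails at runtime), and the readiness check greps for vLLM's periodic throughput metrics line instead of a "Connected" message that the rebuilt image never prints. A minimal sketch of that wait loop, assuming the container name test-comps-vllm-service used by the test:

    # Poll the container logs until vLLM starts reporting metrics such as
    # "Avg prompt throughput: ...", i.e. until the server is actually serving.
    n=0
    until [[ "$n" -ge 120 ]]; do
        docker logs test-comps-vllm-service > vllm-service.log 2>&1
        n=$((n+1))
        if grep -q throughput vllm-service.log; then
            break
        fi
        sleep 5s
    done
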
diff --git a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh
index ca67a00f4..724e523e7 100644
--- a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh
+++ b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh
@@ -8,12 +8,11 @@
 WORKPATH=$(dirname "$PWD")
 ip_address=$(hostname -I | awk '{print $1}')

 function build_docker_images() {
-    ## Build VLLM Ray docker
-    cd $WORKPATH/comps/llms/text-generation/vllm/llama_index/dependency
-    docker build \
-        -f Dockerfile.intel_hpu \
-        --no-cache -t opea/vllm-hpu:comps \
-        --shm-size=128g .
+    ## Build VLLM docker
+    cd $WORKPATH
+    git clone https://github.com/HabanaAI/vllm-fork.git
+    cd vllm-fork/
+    docker build --no-cache -f Dockerfile.hpu -t opea/vllm-hpu:comps --shm-size=128g .
     if [ $? -ne 0 ]; then
         echo "opea/vllm-hpu built fail"
         exit 1
@@ -48,7 +47,7 @@ function start_service() {
         --ipc=host \
         -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
         opea/vllm-hpu:comps \
-        /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"
+        --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048

     export vLLM_ENDPOINT="http://${ip_address}:${port_number}"
     docker run -d --rm \
@@ -65,7 +64,7 @@
     until [[ "$n" -ge 120 ]] || [[ $ready == true ]]; do
         docker logs test-comps-vllm-service > ${WORKPATH}/tests/test-comps-vllm-service.log
         n=$((n+1))
-        if grep -q Connected ${WORKPATH}/tests/test-comps-vllm-service.log; then
+        if grep -q throughput ${WORKPATH}/tests/test-comps-vllm-service.log; then
             break
         fi
         sleep 5s
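
Both Gaudi test scripts now share the same build and readiness logic, so either can be run end to end to validate the change. A hedged sketch, assuming the scripts are invoked directly from tests/llms and that HUGGINGFACEHUB_API_TOKEN grants access to the configured model:

    # Run the LlamaIndex variant; the LangChain variant is invoked the same way.
    export HUGGINGFACEHUB_API_TOKEN="<your HF token>"
    cd tests/llms
    bash test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh
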