From 83f764154e6e041b3402b0c355f3ad9aa6d39d0b Mon Sep 17 00:00:00 2001 From: Li Gang Date: Mon, 9 Sep 2024 18:03:56 +0800 Subject: [PATCH 1/8] Add vllm Arc Dockerfile support Support vllm inference on Intel ARC GPU Signed-off-by: Li Gang Co-authored-by: Chen, Hu1 --- .../text-generation/vllm/docker/Dockerfile.arc | 10 ++++++++++ comps/llms/text-generation/vllm/vllm_arc.sh | 18 ++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 comps/llms/text-generation/vllm/docker/Dockerfile.arc create mode 100755 comps/llms/text-generation/vllm/vllm_arc.sh diff --git a/comps/llms/text-generation/vllm/docker/Dockerfile.arc b/comps/llms/text-generation/vllm/docker/Dockerfile.arc new file mode 100644 index 000000000..4d8d921e9 --- /dev/null +++ b/comps/llms/text-generation/vllm/docker/Dockerfile.arc @@ -0,0 +1,10 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM intelanalytics/ipex-llm-serving-vllm-xpu-experiment:2.1.0b2 + +COPY comps/llms/text-generation/vllm/vllm_arc.sh /llm + +RUN chmod +x /llm/vllm_arc.sh + +ENTRYPOINT ["/llm/vllm_arc.sh"] diff --git a/comps/llms/text-generation/vllm/vllm_arc.sh b/comps/llms/text-generation/vllm/vllm_arc.sh new file mode 100755 index 000000000..cb0518431 --- /dev/null +++ b/comps/llms/text-generation/vllm/vllm_arc.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +LLM_MODEL_ID="${LLM_MODEL_ID:=Intel/neural-chat-7b-v3-3}" + +source /opt/intel/oneapi/setvars.sh +source /opt/intel/1ccl-wks/setvars.sh + +python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \ + --port 9009 \ + --model ${LLM_MODEL_ID} \ + --trust-remote-code \ + --gpu-memory-utilization 0.9 \ + --device xpu \ + --enforce-eager \ + $@ From c1c6431a28228e1a069f455748679b03bed00c75 Mon Sep 17 00:00:00 2001 From: Li Gang Date: Sun, 3 Nov 2024 19:58:58 +0800 Subject: [PATCH 2/8] Add vLLM ARC support With vLLM official repo: https://github.com/vllm-project/vllm/ based on openvino backend Dockerfile is based on Dockerfile.openvino https://github.com/vllm-project/vllm/blob/main/Dockerfile.openvino And add ARC support packages Default mode: meta-llama/Llama-3.2-3B-Instruct to fit ARC A770 VRAM Signed-off-by: Li Gang --- .../vllm/docker/Dockerfile.arc | 10 ----- .../dependency/Dockerfile.openvino_arc | 43 +++++++++++++++++++ .../dependency/build_docker_vllm_openvino.sh | 29 ++++++++++--- .../launch_vllm_service_openvino.sh | 27 +++++++++--- .../text-generation/vllm/langchain/query.sh | 3 +- comps/llms/text-generation/vllm/vllm_arc.sh | 18 -------- 6 files changed, 90 insertions(+), 40 deletions(-) delete mode 100644 comps/llms/text-generation/vllm/docker/Dockerfile.arc create mode 100644 comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.openvino_arc delete mode 100755 comps/llms/text-generation/vllm/vllm_arc.sh diff --git a/comps/llms/text-generation/vllm/docker/Dockerfile.arc b/comps/llms/text-generation/vllm/docker/Dockerfile.arc deleted file mode 100644 index 4d8d921e9..000000000 --- a/comps/llms/text-generation/vllm/docker/Dockerfile.arc +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -FROM intelanalytics/ipex-llm-serving-vllm-xpu-experiment:2.1.0b2 - -COPY comps/llms/text-generation/vllm/vllm_arc.sh /llm - -RUN chmod +x /llm/vllm_arc.sh - -ENTRYPOINT ["/llm/vllm_arc.sh"] diff --git a/comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.openvino_arc 
b/comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.openvino_arc new file mode 100644 index 000000000..dfb94d2df --- /dev/null +++ b/comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.openvino_arc @@ -0,0 +1,43 @@ +# The vLLM Dockerfile is used to construct vLLM image that can be directly used +# to run the OpenAI compatible server. +# Based on https://github.com/vllm-project/vllm/blob/main/Dockerfile.openvino +# add Intel ARC support package + +FROM ubuntu:22.04 AS dev + +RUN apt-get update -y && \ + apt-get install -y \ + git python3-pip \ + ffmpeg libsm6 libxext6 libgl1 \ + gpg-agent wget + +RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg && \ + echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" | \ + tee /etc/apt/sources.list.d/intel-gpu-jammy.list &&\ + apt update -y &&\ + apt install -y \ + intel-opencl-icd intel-level-zero-gpu level-zero \ + intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \ + libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \ + libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \ + mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo + +WORKDIR /workspace + +RUN git clone -b v0.6.3.post1 https://github.com/vllm-project/vllm.git + +#ARG GIT_REPO_CHECK=0 +#RUN --mount=type=bind,source=.git,target=.git \ +# if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + +# install build requirements +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt +# build vLLM with OpenVINO backend +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/ + +#COPY examples/ /workspace/vllm/examples +#COPY benchmarks/ /workspace/vllm/benchmarks + + +CMD ["/bin/bash"] + diff --git a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm_openvino.sh b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm_openvino.sh index 7384ac8f2..6c62d7653 100644 --- a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm_openvino.sh +++ b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm_openvino.sh @@ -3,8 +3,27 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -BASEDIR="$( cd "$( dirname "$0" )" && pwd )" -git clone https://github.com/vllm-project/vllm.git vllm -cd ./vllm/ && git checkout v0.6.1 -docker build -t vllm:openvino -f Dockerfile.openvino . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -cd $BASEDIR && rm -rf vllm +# Set default values +default_hw_mode="cpu" + +# Assign arguments to variable +hw_mode=${1:-$default_hw_mode} + +# Check if all required arguments are provided +if [ "$#" -lt 0 ] || [ "$#" -gt 1 ]; then + echo "Usage: $0 [hw_mode]" + echo "Please customize the arguments you want to use. + - hw_mode: The hardware mode for the vLLM endpoint, with the default being 'cpu', and the optional selection can be 'cpu' and 'gpu'." + exit 1 +fi + +# Build the docker image for vLLM based on the hardware mode +if [ "$hw_mode" = "gpu" ]; then + docker build -f Dockerfile.openvino_arc -t opea/vllm:arc . 
--build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy +else + BASEDIR="$( cd "$( dirname "$0" )" && pwd )" + git clone https://github.com/vllm-project/vllm.git vllm + cd ./vllm/ && git checkout v0.6.1 + docker build -t vllm:openvino -f Dockerfile.openvino . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy + cd $BASEDIR && rm -rf vllm +fi \ No newline at end of file diff --git a/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service_openvino.sh b/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service_openvino.sh index d54970877..55a4c01c6 100644 --- a/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service_openvino.sh +++ b/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service_openvino.sh @@ -9,16 +9,20 @@ default_port=8008 default_model="meta-llama/Llama-2-7b-hf" +default_device="cpu" swap_space=50 +image="vllm:openvino" -while getopts ":hm:p:" opt; do +while getopts ":hm:p:d:" opt; do case $opt in h) - echo "Usage: $0 [-h] [-m model] [-p port]" + echo "Usage: $0 [-h] [-m model] [-p port] [-d device]" echo "Options:" echo " -h Display this help message" - echo " -m model Model (default: meta-llama/Llama-2-7b-hf)" + echo " -m model Model (default: meta-llama/Llama-2-7b-hf for cpu" + echo " meta-llama/Llama-3.2-3B-Instruct for gpu)" echo " -p port Port (default: 8000)" + echo " -d device Target Device (Default: cpu, optional selection can be 'cpu' and 'gpu')" exit 0 ;; m) @@ -27,6 +31,9 @@ while getopts ":hm:p:" opt; do p) port=$OPTARG ;; + d) + device=$OPTARG + ;; \?) echo "Invalid option: -$OPTARG" >&2 exit 1 @@ -37,25 +44,33 @@ done # Assign arguments to variables model_name=${model:-$default_model} port_number=${port:-$default_port} +device=${device:-$default_device} # Set the Huggingface cache directory variable HF_CACHE_DIR=$HOME/.cache/huggingface - +if [ "$device" = "gpu" ]; then + docker_args="-e VLLM_OPENVINO_DEVICE=GPU --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path" + vllm_args="--max_model_len=1024" + model_name="meta-llama/Llama-3.2-3B-Instruct" + image="opea/vllm:arc" +fi # Start the model server using Openvino as the backend inference engine. # Provide the container name that is unique and meaningful, typically one that includes the model name. 
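# Note: when the -d gpu option is selected above, $docker_args expands to
# "-e VLLM_OPENVINO_DEVICE=GPU --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path",
# $vllm_args adds "--max_model_len=1024", and the image/model switch to opea/vllm:arc
# with meta-llama/Llama-3.2-3B-Instruct, so the single docker run below covers both
# the CPU and the Intel ARC GPU case.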
docker run -d --rm --name="vllm-openvino-server" \ -p $port_number:80 \ --ipc=host \ + $docker_args \ -e HTTPS_PROXY=$https_proxy \ -e HTTP_PROXY=$https_proxy \ -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \ - -v $HOME/.cache/huggingface:/home/user/.cache/huggingface \ - vllm:openvino /bin/bash -c "\ + -v $HOME/.cache/huggingface:/root/.cache/huggingface \ + $image /bin/bash -c "\ cd / && \ export VLLM_CPU_KVCACHE_SPACE=50 && \ python3 -m vllm.entrypoints.openai.api_server \ --model \"$model_name\" \ + $vllm_args \ --host 0.0.0.0 \ --port 80" diff --git a/comps/llms/text-generation/vllm/langchain/query.sh b/comps/llms/text-generation/vllm/langchain/query.sh index 13b63511b..31fa18750 100644 --- a/comps/llms/text-generation/vllm/langchain/query.sh +++ b/comps/llms/text-generation/vllm/langchain/query.sh @@ -2,11 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 your_ip="0.0.0.0" +model=$(curl http://localhost:8008/v1/models -s|jq -r '.data[].id') curl http://${your_ip}:8008/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "model": "'$model'", "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0 diff --git a/comps/llms/text-generation/vllm/vllm_arc.sh b/comps/llms/text-generation/vllm/vllm_arc.sh deleted file mode 100755 index cb0518431..000000000 --- a/comps/llms/text-generation/vllm/vllm_arc.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -LLM_MODEL_ID="${LLM_MODEL_ID:=Intel/neural-chat-7b-v3-3}" - -source /opt/intel/oneapi/setvars.sh -source /opt/intel/1ccl-wks/setvars.sh - -python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \ - --port 9009 \ - --model ${LLM_MODEL_ID} \ - --trust-remote-code \ - --gpu-memory-utilization 0.9 \ - --device xpu \ - --enforce-eager \ - $@ From e2f60d8ec680bf5a92077ca0cc945edeefaa6028 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 3 Nov 2024 12:23:13 +0000 Subject: [PATCH 3/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../vllm/langchain/dependency/build_docker_vllm_openvino.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm_openvino.sh b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm_openvino.sh index 6c62d7653..b7c55f1ac 100644 --- a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm_openvino.sh +++ b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm_openvino.sh @@ -26,4 +26,4 @@ else cd ./vllm/ && git checkout v0.6.1 docker build -t vllm:openvino -f Dockerfile.openvino . 
--build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy cd $BASEDIR && rm -rf vllm -fi \ No newline at end of file +fi From 8ace5b443d3f02c7a15dd52ab302fad122c327e1 Mon Sep 17 00:00:00 2001 From: Li Gang Date: Tue, 5 Nov 2024 08:37:11 +0800 Subject: [PATCH 4/8] Add README and .github workflow for vLLM ARC support Signed-off-by: Li Gang --- .../docker/compose/llms-compose-cd.yaml | 5 ++ .../text-generation/vllm/langchain/README.md | 46 +++++++++++++++++-- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docker/compose/llms-compose-cd.yaml b/.github/workflows/docker/compose/llms-compose-cd.yaml index c33bc0f3f..55e662a6d 100644 --- a/.github/workflows/docker/compose/llms-compose-cd.yaml +++ b/.github/workflows/docker/compose/llms-compose-cd.yaml @@ -15,6 +15,11 @@ services: context: vllm-openvino dockerfile: Dockerfile.openvino image: ${REGISTRY:-opea}/vllm-openvino:${TAG:-latest} + vllm-openvino-arc: + build: + context: vllm-openvino-arc + dockerfile: Dockerfile.openvino_arc + image: ${REGISTRY:-opea}/vllm-arc:${TAG:-latest} llm-eval: build: dockerfile: comps/llms/utils/lm-eval/Dockerfile diff --git a/comps/llms/text-generation/vllm/langchain/README.md b/comps/llms/text-generation/vllm/langchain/README.md index 6f41b9fe0..ab84d4674 100644 --- a/comps/llms/text-generation/vllm/langchain/README.md +++ b/comps/llms/text-generation/vllm/langchain/README.md @@ -98,16 +98,16 @@ For example, if we run `meta-llama/Meta-Llama-3-70b` with 8 cards, we can use fo bash ./launch_vllm_service.sh 8008 meta-llama/Meta-Llama-3-70b hpu 8 ``` -### 2.3 vLLM with OpenVINO +### 2.3 vLLM with OpenVINO (on Intel GPU and CPU) -vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](https://github.com/vllm-project/vllm/blob/main/docs/source/models/supported_models.rst) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support. OpenVINO vLLM backend supports the following advanced vLLM features: +vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](https://github.com/vllm-project/vllm/blob/main/docs/source/models/supported_models.rst) and can perform optimal model serving on Intel GPU and all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete IntelĀ® GPUs (starting from IntelĀ® UHD Graphics generation). OpenVINO vLLM backend supports the following advanced vLLM features: - Prefix caching (`--enable-prefix-caching`) - Chunked prefill (`--enable-chunked-prefill`) #### Build Docker Image -To build the docker image, run the command +To build the docker image for Intel CPU, run the command ```bash bash ./build_docker_vllm_openvino.sh @@ -115,6 +115,14 @@ bash ./build_docker_vllm_openvino.sh Once it successfully builds, you will have the `vllm:openvino` image. It can be used to spawn a serving container with OpenAI API endpoint or you can work with it interactively via bash shell. +To build the docker image for Intel GPU, run the command + +```bash +bash ./build_docker_vllm_openvino.sh gpu +``` + +Once it successfully builds, you will have the `vllm:arc` image. It can be used to spawn a serving container with OpenAI API endpoint or you can work with it interactively via bash shell. + #### Launch vLLM service For gated models, such as `LLAMA-2`, you will have to pass -e HUGGING_FACE_HUB_TOKEN=\ to the docker run command above with a valid Hugging Face Hub read token. 
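As a concrete illustration (not one of the scripts in this patch set), a manual CPU launch for a gated model might look like the sketch below; it mirrors `launch_vllm_service_openvino.sh`, which forwards the token as `HF_TOKEN` and publishes the service on host port 8008. The container name and model are only examples:

```bash
# Illustrative manual launch of the CPU image (vllm:openvino) for a gated model.
export HUGGINGFACEHUB_API_TOKEN=<your-hf-read-token>

docker run -d --rm --name="vllm-openvino-server" \
  -p 8008:80 \
  --ipc=host \
  -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
  -v $HOME/.cache/huggingface:/root/.cache/huggingface \
  vllm:openvino /bin/bash -c "\
    export VLLM_CPU_KVCACHE_SPACE=50 && \
    python3 -m vllm.entrypoints.openai.api_server \
      --model meta-llama/Llama-2-7b-hf \
      --host 0.0.0.0 \
      --port 80"
```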
@@ -125,13 +133,31 @@ Please follow this link [huggingface token](https://huggingface.co/docs/hub/secu export HUGGINGFACEHUB_API_TOKEN= ``` -To start the model server: +To start the model server for Intel CPU: ```bash bash launch_vllm_service_openvino.sh ``` +To start the model server for Intel GPU: + +```bash +bash launch_vllm_service_openvino.sh -d gpu +``` + #### Performance tips +---------------- + +vLLM OpenVINO backend environment variables + + +- `VLLM_OPENVINO_DEVICE` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, ``VLLM_OPENVINO_DEVICE=GPU.1``). If the value is not specified, CPU device is used by default. + +- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass exported folder as `` + + +##### CPU performance tips + vLLM OpenVINO backend uses the following environment variables to control behavior: @@ -148,6 +174,18 @@ OpenVINO best known configuration is: $ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256 +##### GPU performance tips + +GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account ``gpu_memory_utilization`` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using ``VLLM_OPENVINO_KVCACHE_SPACE`` environment variable (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=8`` means 8 GB space for KV cache). + +Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`. 
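Translated into the serving setup from this patch set, a hedged sketch of an ARC launch with these variables set explicitly is shown below; the KV-cache size and compression flag are illustrative overrides rather than defaults, and the image tag follows the build script at this point in the series (it is renamed to `opea/vllm-arc:latest` later on):

```bash
# Sketch: serve on an Intel ARC GPU with an explicit 8 GB KV cache and U8 weight compression.
docker run -d --rm --name="vllm-openvino-arc-server" \
  -p 8008:80 \
  --ipc=host \
  --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path \
  -e VLLM_OPENVINO_DEVICE=GPU \
  -e VLLM_OPENVINO_KVCACHE_SPACE=8 \
  -e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
  -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
  -v $HOME/.cache/huggingface:/root/.cache/huggingface \
  opea/vllm:arc /bin/bash -c "\
    python3 -m vllm.entrypoints.openai.api_server \
      --model meta-llama/Llama-3.2-3B-Instruct \
      --max_model_len=1024 \
      --host 0.0.0.0 \
      --port 80"
```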
+ +OpenVINO best known configuration for GPU is: + + $ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ + python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json + + ### 2.4 Query the service And then you can make requests like below to check the service status: From 3590ccccd9fd04047fc727732c899c27a5374c7d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 5 Nov 2024 00:45:43 +0000 Subject: [PATCH 5/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/llms/text-generation/vllm/langchain/README.md | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/comps/llms/text-generation/vllm/langchain/README.md b/comps/llms/text-generation/vllm/langchain/README.md index ab84d4674..5e049061d 100644 --- a/comps/llms/text-generation/vllm/langchain/README.md +++ b/comps/llms/text-generation/vllm/langchain/README.md @@ -146,19 +146,17 @@ bash launch_vllm_service_openvino.sh -d gpu ``` #### Performance tips ----------------- -vLLM OpenVINO backend environment variables +--- +vLLM OpenVINO backend environment variables -- `VLLM_OPENVINO_DEVICE` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, ``VLLM_OPENVINO_DEVICE=GPU.1``). If the value is not specified, CPU device is used by default. +- `VLLM_OPENVINO_DEVICE` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, `VLLM_OPENVINO_DEVICE=GPU.1`). If the value is not specified, CPU device is used by default. - `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass exported folder as `` - ##### CPU performance tips - vLLM OpenVINO backend uses the following environment variables to control behavior: - `VLLM_OPENVINO_KVCACHE_SPACE` to specify the KV Cache size (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. @@ -176,7 +174,7 @@ OpenVINO best known configuration is: ##### GPU performance tips -GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account ``gpu_memory_utilization`` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using ``VLLM_OPENVINO_KVCACHE_SPACE`` environment variable (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=8`` means 8 GB space for KV cache). +GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account `gpu_memory_utilization` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using `VLLM_OPENVINO_KVCACHE_SPACE` environment variable (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=8` means 8 GB space for KV cache). 
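For the `optimum-cli` export path mentioned above, a possible workflow is sketched below; the model, weight format, and output directory are illustrative, and the exported folder is then passed to the server in place of the Hub model ID:

```bash
# Sketch: pre-compress weights with optimum-intel, then serve the exported folder.
pip install "optimum[openvino]"

optimum-cli export openvino \
  --model meta-llama/Llama-2-7b-chat-hf \
  --weight-format int8 \
  ./llama-2-7b-chat-ov-int8

python3 -m vllm.entrypoints.openai.api_server \
  --model ./llama-2-7b-chat-ov-int8 \
  --host 0.0.0.0 --port 8008
```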
Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`. @@ -185,7 +183,6 @@ OpenVINO best known configuration for GPU is: $ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json - ### 2.4 Query the service And then you can make requests like below to check the service status: From 4d9e3eddac0a9da9fcb1e351cec33aaad639ec72 Mon Sep 17 00:00:00 2001 From: Li Gang Date: Fri, 8 Nov 2024 10:01:28 +0800 Subject: [PATCH 6/8] Update comps/llms/text-generation/vllm/langchain/README.md Co-authored-by: Eero Tamminen --- comps/llms/text-generation/vllm/langchain/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comps/llms/text-generation/vllm/langchain/README.md b/comps/llms/text-generation/vllm/langchain/README.md index 5e049061d..5efc42610 100644 --- a/comps/llms/text-generation/vllm/langchain/README.md +++ b/comps/llms/text-generation/vllm/langchain/README.md @@ -153,7 +153,7 @@ vLLM OpenVINO backend environment variables - `VLLM_OPENVINO_DEVICE` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, `VLLM_OPENVINO_DEVICE=GPU.1`). If the value is not specified, CPU device is used by default. -- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass exported folder as `` +- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` enables U8 weights compression during model loading stage. By default, compression is turned off. 
You can also export model with different compression techniques using `optimum-cli` and pass exported folder as `` ##### CPU performance tips From 5b404bb181aa7784b8c72935fd763883de0394b2 Mon Sep 17 00:00:00 2001 From: Li Gang Date: Fri, 8 Nov 2024 10:11:11 +0800 Subject: [PATCH 7/8] Rename Dockerfile to meet Contribution Guidelines Signed-off-by: Li Gang --- .github/workflows/docker/compose/llms-compose-cd.yaml | 5 ++--- .../{Dockerfile.openvino_arc => Dockerfile.intel_gpu} | 0 .../vllm/langchain/dependency/build_docker_vllm_openvino.sh | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) rename comps/llms/text-generation/vllm/langchain/dependency/{Dockerfile.openvino_arc => Dockerfile.intel_gpu} (100%) diff --git a/.github/workflows/docker/compose/llms-compose-cd.yaml b/.github/workflows/docker/compose/llms-compose-cd.yaml index 55e662a6d..0a6fb6b15 100644 --- a/.github/workflows/docker/compose/llms-compose-cd.yaml +++ b/.github/workflows/docker/compose/llms-compose-cd.yaml @@ -15,10 +15,9 @@ services: context: vllm-openvino dockerfile: Dockerfile.openvino image: ${REGISTRY:-opea}/vllm-openvino:${TAG:-latest} - vllm-openvino-arc: + vllm-openvino-intel-gpu: build: - context: vllm-openvino-arc - dockerfile: Dockerfile.openvino_arc + dockerfile: comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_gpu image: ${REGISTRY:-opea}/vllm-arc:${TAG:-latest} llm-eval: build: diff --git a/comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.openvino_arc b/comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_gpu similarity index 100% rename from comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.openvino_arc rename to comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_gpu diff --git a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm_openvino.sh b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm_openvino.sh index b7c55f1ac..cca4c22cc 100644 --- a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm_openvino.sh +++ b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm_openvino.sh @@ -19,7 +19,7 @@ fi # Build the docker image for vLLM based on the hardware mode if [ "$hw_mode" = "gpu" ]; then - docker build -f Dockerfile.openvino_arc -t opea/vllm:arc . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy + docker build -f Dockerfile.intel_gpu -t opea/vllm:arc . 
--build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy else BASEDIR="$( cd "$( dirname "$0" )" && pwd )" git clone https://github.com/vllm-project/vllm.git vllm From 261530da45824e3133237e5639fa3d4ec9932dcf Mon Sep 17 00:00:00 2001 From: Li Gang Date: Fri, 8 Nov 2024 11:12:33 +0800 Subject: [PATCH 8/8] Align image names as opea/vllm-arc:latest Signed-off-by: Li Gang --- .github/workflows/docker/compose/llms-compose-cd.yaml | 2 +- comps/llms/text-generation/vllm/langchain/README.md | 2 +- .../vllm/langchain/dependency/build_docker_vllm_openvino.sh | 2 +- .../vllm/langchain/dependency/launch_vllm_service_openvino.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docker/compose/llms-compose-cd.yaml b/.github/workflows/docker/compose/llms-compose-cd.yaml index 0a6fb6b15..6af17cf9e 100644 --- a/.github/workflows/docker/compose/llms-compose-cd.yaml +++ b/.github/workflows/docker/compose/llms-compose-cd.yaml @@ -15,7 +15,7 @@ services: context: vllm-openvino dockerfile: Dockerfile.openvino image: ${REGISTRY:-opea}/vllm-openvino:${TAG:-latest} - vllm-openvino-intel-gpu: + vllm-arc: build: dockerfile: comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_gpu image: ${REGISTRY:-opea}/vllm-arc:${TAG:-latest} diff --git a/comps/llms/text-generation/vllm/langchain/README.md b/comps/llms/text-generation/vllm/langchain/README.md index 5efc42610..89159356f 100644 --- a/comps/llms/text-generation/vllm/langchain/README.md +++ b/comps/llms/text-generation/vllm/langchain/README.md @@ -121,7 +121,7 @@ To build the docker image for Intel GPU, run the command bash ./build_docker_vllm_openvino.sh gpu ``` -Once it successfully builds, you will have the `vllm:arc` image. It can be used to spawn a serving container with OpenAI API endpoint or you can work with it interactively via bash shell. +Once it successfully builds, you will have the `opea/vllm-arc:latest` image. It can be used to spawn a serving container with OpenAI API endpoint or you can work with it interactively via bash shell. #### Launch vLLM service diff --git a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm_openvino.sh b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm_openvino.sh index cca4c22cc..2640cf460 100644 --- a/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm_openvino.sh +++ b/comps/llms/text-generation/vllm/langchain/dependency/build_docker_vllm_openvino.sh @@ -19,7 +19,7 @@ fi # Build the docker image for vLLM based on the hardware mode if [ "$hw_mode" = "gpu" ]; then - docker build -f Dockerfile.intel_gpu -t opea/vllm:arc . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy + docker build -f Dockerfile.intel_gpu -t opea/vllm-arc:latest . 
--build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy else BASEDIR="$( cd "$( dirname "$0" )" && pwd )" git clone https://github.com/vllm-project/vllm.git vllm diff --git a/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service_openvino.sh b/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service_openvino.sh index 55a4c01c6..140df6a0f 100644 --- a/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service_openvino.sh +++ b/comps/llms/text-generation/vllm/langchain/dependency/launch_vllm_service_openvino.sh @@ -53,7 +53,7 @@ if [ "$device" = "gpu" ]; then docker_args="-e VLLM_OPENVINO_DEVICE=GPU --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path" vllm_args="--max_model_len=1024" model_name="meta-llama/Llama-3.2-3B-Instruct" - image="opea/vllm:arc" + image="opea/vllm-arc:latest" fi # Start the model server using Openvino as the backend inference engine. # Provide the container name that is unique and meaningful, typically one that includes the model name.
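With either image serving on the default host port 8008, the OpenAI-compatible endpoint can be smoke-tested in the same way as the updated `query.sh`, discovering the served model ID instead of hard-coding it:

```bash
# Query the served model list, then request a completion (mirrors query.sh).
model=$(curl -s http://localhost:8008/v1/models | jq -r '.data[].id')

curl http://localhost:8008/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "'$model'",
    "prompt": "What is Deep Learning?",
    "max_tokens": 32,
    "temperature": 0
  }'
```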