From 7dbad0706d820f3c6ff8e8b4dd0ee40b7c389ff4 Mon Sep 17 00:00:00 2001
From: Zahidul Haque
Date: Fri, 7 Jun 2024 10:39:29 +0530
Subject: [PATCH] openvino support in vllm (#65)

Signed-off-by: Zahidul Haque
---
 .../text-generation/vllm-openvino/README.md   | 73 +++++++++++++++++++
 .../vllm-openvino/build_vllm_openvino.sh      |  9 +++
 .../vllm-openvino/launch_model_server.sh      | 46 ++++++++++++
 3 files changed, 128 insertions(+)
 create mode 100644 comps/llms/text-generation/vllm-openvino/README.md
 create mode 100755 comps/llms/text-generation/vllm-openvino/build_vllm_openvino.sh
 create mode 100755 comps/llms/text-generation/vllm-openvino/launch_model_server.sh

diff --git a/comps/llms/text-generation/vllm-openvino/README.md b/comps/llms/text-generation/vllm-openvino/README.md
new file mode 100644
index 000000000..48f8f3305
--- /dev/null
+++ b/comps/llms/text-generation/vllm-openvino/README.md
@@ -0,0 +1,73 @@
+# Use vLLM with OpenVINO
+
+## Build Docker Image
+
+To build the docker image, run the following command:
+
+```bash
+bash ./build_vllm_openvino.sh
+```
+
+Once it builds successfully, you will have the `vllm:openvino` image. It can be used to spawn a serving container with an OpenAI-compatible API endpoint, or you can work with it interactively via a bash shell.
+
+## Use vLLM serving with OpenAI API
+
+### Start The Server:
+
+For gated models such as `LLAMA-2`, you will have to pass `-e HUGGING_FACE_HUB_TOKEN=<token>` with a valid Hugging Face Hub read token to the `docker run` command that starts the server.
+
+Please follow this link [huggingface token](https://huggingface.co/docs/hub/security-tokens) to get an access token, and export the `HUGGINGFACEHUB_API_TOKEN` environment variable with the token.
+
+```bash
+export HUGGINGFACEHUB_API_TOKEN=<token>
+```
+
+To start the model server:
+
+```bash
+bash launch_model_server.sh
+```
+
+### Request Completion With Curl:
+
+```bash
+curl http://localhost:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+  "model": "meta-llama/Llama-2-7b-hf",
+  "prompt": "What is the key advantage of Openvino framework?",
+  "max_tokens": 300,
+  "temperature": 0.7
+  }'
+```
+
+#### Customize vLLM-OpenVINO Service
+
+The `launch_model_server.sh` script accepts two parameters:
+
+- port: The port number assigned to the vLLM CPU endpoint, with the default being 8000.
+- model: The model name used for the LLM, with the default set to "meta-llama/Llama-2-7b-hf".
+
+You can customize these two parameters according to your needs. Below is a sample invocation that specifies a different model and port number:
+
+```bash
+bash launch_model_server.sh -m meta-llama/Llama-2-7b-chat-hf -p 8123
+```
+
+Additionally, you can set the vLLM CPU endpoint by exporting the environment variable `vLLM_LLM_ENDPOINT`:
+
+```bash
+export vLLM_LLM_ENDPOINT="http://xxx.xxx.xxx.xxx:8000"
+export LLM_MODEL=<model_name> # example: export LLM_MODEL="meta-llama/Llama-2-7b-hf"
+```
+
+## Use Int-8 Weights Compression
+
+Int-8 weights compression is disabled by default. For better performance and lower memory consumption, it can be enabled by setting the environment variable `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1`.
+To pass the variable in docker, use `-e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1` as an additional argument to the `docker run` command that starts the server (see the combined sketch below).
+
+The variable enables the weights compression logic described in [optimum-intel 8-bit weights quantization](https://huggingface.co/docs/optimum/intel/optimization_ov#8-bit).
+Hence, even if the variable is enabled, compression is applied only to models above a certain size; very small models are not compressed, because compression would cause a significant accuracy drop.
+
+## Use UInt-8 KV cache Compression
+
+KV cache uint-8 compression is disabled by default. For better performance and lower memory consumption, it can be enabled by setting the environment variable `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`.
+To pass the variable in docker, use `-e VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` as an additional argument to the `docker run` command that starts the server (see the combined sketch below).
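+
+For reference, here is a minimal sketch of how the `docker run` command from `launch_model_server.sh` could be extended with both compression variables, assuming the default model and port:
+
+```bash
+docker run --rm --name="vllm-openvino-server" \
+  -p 8000:8000 \
+  -v $HOME/.cache/huggingface:/root/.cache/huggingface \
+  -e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1 \
+  -e VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 \
+  vllm:openvino --model meta-llama/Llama-2-7b-hf --port 8000 --disable-log-requests --swap-space 50
+```
+
+Either variable can also be passed on its own.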
diff --git a/comps/llms/text-generation/vllm-openvino/build_vllm_openvino.sh b/comps/llms/text-generation/vllm-openvino/build_vllm_openvino.sh
new file mode 100755
index 000000000..1b3e159fc
--- /dev/null
+++ b/comps/llms/text-generation/vllm-openvino/build_vllm_openvino.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+git clone --branch openvino-model-executor https://github.com/ilya-lavrenov/vllm.git
+cd ./vllm/
+docker build -t vllm:openvino -f Dockerfile.openvino . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
diff --git a/comps/llms/text-generation/vllm-openvino/launch_model_server.sh b/comps/llms/text-generation/vllm-openvino/launch_model_server.sh
new file mode 100755
index 000000000..887c31629
--- /dev/null
+++ b/comps/llms/text-generation/vllm-openvino/launch_model_server.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+# Set default values
+
+
+default_port=8000
+default_model="meta-llama/Llama-2-7b-hf"
+swap_space=50
+
+while getopts ":hm:p:" opt; do
+  case $opt in
+    h)
+      echo "Usage: $0 [-h] [-m model] [-p port]"
+      echo "Options:"
+      echo "  -h         Display this help message"
+      echo "  -m model   Model (default: meta-llama/Llama-2-7b-hf)"
+      echo "  -p port    Port (default: 8000)"
+      exit 0
+      ;;
+    m)
+      model=$OPTARG
+      ;;
+    p)
+      port=$OPTARG
+      ;;
+    \?)
+      echo "Invalid option: -$OPTARG" >&2
+      exit 1
+      ;;
+  esac
+done
+
+# Assign arguments to variables
+model_name=${model:-$default_model}
+port_number=${port:-$default_port}
+
+
+# Set the Hugging Face cache directory variable
+HF_CACHE_DIR=$HOME/.cache/huggingface
+
+# Start the model server using OpenVINO as the backend inference engine. Provide a container name that is unique and meaningful, typically one that includes the model name.
+docker run --rm --name="vllm-openvino-server" -p $port_number:$port_number -v $HF_CACHE_DIR:/root/.cache/huggingface vllm:openvino --model $model_name --port $port_number --disable-log-requests --swap-space $swap_space
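+
+# Notes (see the README above; these lines are comments only and do not change behavior):
+#   * Example invocation with a custom model and port:
+#       bash launch_model_server.sh -m meta-llama/Llama-2-7b-chat-hf -p 8123
+#   * For gated models such as the default meta-llama/Llama-2-7b-hf, a valid Hugging Face
+#     read token has to be supplied to the container, e.g. by adding
+#     -e HUGGING_FACE_HUB_TOKEN=<token> to the docker run command above.
+#   * The optional OpenVINO settings -e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1 and
+#     -e VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 can be appended the same way.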