diff --git a/.github/workflows/docker/compose/llms-compose-cd.yaml b/.github/workflows/docker/compose/llms-compose-cd.yaml index 1d99d076d..604dbc85f 100644 --- a/.github/workflows/docker/compose/llms-compose-cd.yaml +++ b/.github/workflows/docker/compose/llms-compose-cd.yaml @@ -11,3 +11,11 @@ services: context: vllm-openvino dockerfile: Dockerfile.openvino image: ${REGISTRY:-opea}/vllm-openvino:${TAG:-latest} + llm-vllm-llamaindex: + build: + dockerfile: comps/llms/text-generation/vllm/llama_index/Dockerfile + image: ${REGISTRY:-opea}/llm-vllm-llamaindex:${TAG:-latest} + llm-vllm-llamaindex-hpu: + build: + dockerfile: comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu + image: ${REGISTRY:-opea}/llm-vllm-llamaindex-hpu:${TAG:-latest} diff --git a/comps/llms/text-generation/vllm/langchain/build_docker_microservice.sh b/comps/llms/text-generation/vllm/langchain/build_docker_microservice.sh index 1bff66ef2..4058d4544 100644 --- a/comps/llms/text-generation/vllm/langchain/build_docker_microservice.sh +++ b/comps/llms/text-generation/vllm/langchain/build_docker_microservice.sh @@ -6,4 +6,4 @@ docker build \ -t opea/llm-vllm:latest \ --build-arg https_proxy=$https_proxy \ --build-arg http_proxy=$http_proxy \ - -f comps/llms/text-generation/vllm/docker/Dockerfile . + -f comps/llms/text-generation/vllm/langchain/Dockerfile . 
diff --git a/comps/llms/text-generation/vllm/llama_index/Dockerfile b/comps/llms/text-generation/vllm/llama_index/Dockerfile new file mode 100644 index 000000000..48429eb7e --- /dev/null +++ b/comps/llms/text-generation/vllm/llama_index/Dockerfile @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +FROM ubuntu:22.04 + +RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + libgl1-mesa-glx \ + libjemalloc-dev \ + python3 \ + python3-pip + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +USER user + +COPY comps /home/user/comps + +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r /home/user/comps/llms/text-generation/vllm/llama_index/requirements.txt + + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/llms/text-generation/vllm/llama_index + +ENTRYPOINT ["bash", "entrypoint.sh"] diff --git a/comps/llms/text-generation/vllm/llama_index/README.md b/comps/llms/text-generation/vllm/llama_index/README.md new file mode 100644 index 000000000..4bd51c812 --- /dev/null +++ b/comps/llms/text-generation/vllm/llama_index/README.md @@ -0,0 +1,189 @@ +# vLLM Endpoint Service + +[vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference and serving. It delivers state-of-the-art serving throughput with a set of advanced features such as PagedAttention and continuous batching. Besides GPUs, vLLM also supports [Intel CPUs](https://www.intel.com/content/www/us/en/products/overview.html) and [Gaudi accelerators](https://habana.ai/products). This guide provides an example of how to launch a vLLM serving endpoint on CPU and Gaudi accelerators. + +## 🚀1. 
Set up Environment Variables + +```bash +export HUGGINGFACEHUB_API_TOKEN= +export vLLM_ENDPOINT="http://${your_ip}:8008" +export LLM_MODEL="meta-llama/Meta-Llama-3-8B-Instruct" +``` + +For gated models such as `LLAMA-2`, you must set the `HUGGINGFACEHUB_API_TOKEN` environment variable. Please follow this link [huggingface token](https://huggingface.co/docs/hub/security-tokens) to get an access token and export it as `HUGGINGFACEHUB_API_TOKEN`. + +## 🚀2. Set up vLLM Service + +First of all, go to the server folder for vLLM. + +```bash +cd dependency +``` + +### 2.1 vLLM on CPU + +First, let's enable vLLM on CPU. + +#### Build docker + +```bash +bash ./build_docker_vllm.sh +``` + +The `build_docker_vllm.sh` script accepts one parameter, `hw_mode`, to specify the hardware mode of the service; the default is `cpu`, and the alternative is `hpu`. + +#### Launch vLLM service + +```bash +bash ./launch_vllm_service.sh +``` + +To customize the port or model name, run: + +```bash +bash ./launch_vllm_service.sh ${port_number} ${model_name} +``` + +### 2.2 vLLM on Gaudi + +Then we show how to enable vLLM on Gaudi. + +#### Build docker + +```bash +bash ./build_docker_vllm.sh hpu +``` + +Set `hw_mode` to `hpu`. + +Note: If you want to enable tensor parallelism, please pin `setuptools==69.5.1` in `Dockerfile.intel_hpu` before building the docker image, using the following command. + +``` +sed -i "s/RUN pip install --no-cache-dir setuptools/RUN pip install --no-cache-dir setuptools==69.5.1/g" Dockerfile.intel_hpu +``` + +#### Launch vLLM service on single node + +For a small model, a single node is enough. + +```bash +bash ./launch_vllm_service.sh ${port_number} ${model_name} hpu 1 +``` + +Set `hw_mode` to `hpu` and `parallel_number` to 1. + +The `launch_vllm_service.sh` script accepts 7 parameters: + +- port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8008. 
+- model_name: The model name utilized for LLM, with the default set to 'meta-llama/Meta-Llama-3-8B-Instruct'. +- hw_mode: The hardware mode utilized for LLM, with the default set to "cpu", and the optional selection can be "hpu". +- parallel_number: parallel nodes number for 'hpu' mode +- block_size: default set to 128 for better performance on HPU +- max_num_seqs: default set to 256 for better performance on HPU +- max_seq_len_to_capture: default set to 2048 for better performance on HPU + +For more performance tuning tips, refer to [Performance tuning](https://github.com/HabanaAI/vllm-fork/blob/habana_main/README_GAUDI.md#performance-tips). + +#### Launch vLLM service on multiple nodes + +For a large model such as `meta-llama/Meta-Llama-3-70b`, we need to launch on multiple nodes. + +```bash +bash ./launch_vllm_service.sh ${port_number} ${model_name} hpu ${parallel_number} +``` + +For example, to run `meta-llama/Meta-Llama-3-70b` with 8 cards, use the following command. + +```bash +bash ./launch_vllm_service.sh 8008 meta-llama/Meta-Llama-3-70b hpu 8 +``` + +### 2.3 vLLM with OpenVINO + +vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](https://github.com/vllm-project/vllm/blob/main/docs/source/models/supported_models.rst) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support. OpenVINO vLLM backend supports the following advanced vLLM features: + +- Prefix caching (`--enable-prefix-caching`) +- Chunked prefill (`--enable-chunked-prefill`) + +#### Build Docker Image + +To build the docker image, run the command: + +```bash +bash ./build_docker_vllm_openvino.sh +``` + +Once it successfully builds, you will have the `vllm:openvino` image. It can be used to spawn a serving container with an OpenAI API endpoint, or you can work with it interactively via a bash shell. 
+ +#### Launch vLLM service + +For gated models, such as `LLAMA-2`, a valid Hugging Face Hub read token is required. The launch script below forwards `HUGGINGFACEHUB_API_TOKEN` to the container as `HF_TOKEN`, so there is no need to pass `-e HUGGING_FACE_HUB_TOKEN=<token>` to `docker run` by hand. + +Please follow this link [huggingface token](https://huggingface.co/docs/hub/security-tokens) to get an access token and export it as the `HUGGINGFACEHUB_API_TOKEN` environment variable. + +```bash +export HUGGINGFACEHUB_API_TOKEN= +``` + +To start the model server: + +```bash +bash launch_vllm_service_openvino.sh +``` + +#### Performance tips + +vLLM OpenVINO backend uses the following environment variables to control behavior: + +- `VLLM_OPENVINO_KVCACHE_SPACE` to specify the KV Cache size (e.g., `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB space for KV cache); a larger setting allows vLLM to run more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. + +- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` to control KV cache precision. By default, FP16 / BF16 is used depending on platform. + +- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weights compression during model loading stage. By default, compression is turned off. + +To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). 
Based on the experiments, the recommended batch size is `256` (`--max-num-batched-tokens`). + +OpenVINO best known configuration is: + + $ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ + python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256 + +### 2.4 Query the service + +And then you can make requests like below to check the service status: + +```bash +curl http://${your_ip}:8008/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "prompt": "What is Deep Learning?", + "max_tokens": 32, + "temperature": 0 + }' +``` + +## 🚀3. Set up LLM microservice + +Then we wrap the vLLM service into an LLM microservice. + +### Build docker + +```bash +bash build_docker_microservice.sh +``` + +### Launch the microservice + +```bash +bash launch_microservice.sh +``` + +### Query the microservice + +```bash +curl http://${your_ip}:9000/v1/chat/completions \ + -X POST \ + -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \ + -H 'Content-Type: application/json' +``` diff --git a/comps/llms/text-generation/vllm/llama_index/build_docker_microservice.sh b/comps/llms/text-generation/vllm/llama_index/build_docker_microservice.sh new file mode 100644 index 000000000..46e1edba3 --- /dev/null +++ b/comps/llms/text-generation/vllm/llama_index/build_docker_microservice.sh @@ -0,0 +1,9 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +cd ../../../../../ +docker build \ + -t opea/llm-vllm-llamaindex:latest \ + --build-arg https_proxy=$https_proxy \ + --build-arg http_proxy=$http_proxy \ + -f comps/llms/text-generation/vllm/llama_index/Dockerfile . 
diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu b/comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu new file mode 100644 index 000000000..916c9a64c --- /dev/null +++ b/comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu @@ -0,0 +1,28 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# FROM vault.habana.ai/gaudi-docker/1.16.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest as hpu +FROM opea/habanalabs:1.16.1-pytorch-installer-2.2.2 as hpu + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ +ENV LANG=en_US.UTF-8 +RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \ + service ssh restart +USER user +WORKDIR /root + +RUN pip install --no-cache-dir --upgrade-strategy eager optimum[habana] + +RUN pip install --no-cache-dir -v git+https://github.com/HabanaAI/vllm-fork.git@cf6952d + +RUN pip install --no-cache-dir setuptools + +ENV no_proxy=localhost,127.0.0.1 + +ENV PT_HPU_LAZY_ACC_PAR_MODE=0 + +ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true + +CMD ["/bin/bash"] diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh b/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh new file mode 100644 index 000000000..b4a13d5fb --- /dev/null +++ b/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Set default values +default_hw_mode="cpu" + +# Assign arguments to variable +hw_mode=${1:-$default_hw_mode} + +# Reject calls with more than the single optional argument +if [ "$#" -gt 1 ]; then + echo "Usage: $0 [hw_mode]" + echo "Please customize the arguments you want to use. + - hw_mode: The hardware mode for the vLLM endpoint, with the default being 'cpu', and the optional selection can be 'cpu' and 'hpu'." + exit 1 +fi + +# Build the docker image for vLLM based on the hardware mode (Dockerfile.intel_hpu sits next to this script) +if [ "$hw_mode" = "hpu" ]; then + docker build -f Dockerfile.intel_hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy="$https_proxy" --build-arg http_proxy="$http_proxy" +else + git clone https://github.com/vllm-project/vllm.git + cd ./vllm/ || exit 1 + docker build -f Dockerfile.cpu -t opea/vllm:cpu --shm-size=128g . --build-arg https_proxy="$https_proxy" --build-arg http_proxy="$http_proxy" +fi diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm_openvino.sh b/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm_openvino.sh new file mode 100644 index 000000000..4566263bc --- /dev/null +++ b/comps/llms/text-generation/vllm/llama_index/dependency/build_docker_vllm_openvino.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +BASEDIR="$( cd "$( dirname "$0" )" && pwd )" +git clone https://github.com/vllm-project/vllm.git vllm +cd ./vllm/ +docker build -t vllm:openvino -f Dockerfile.openvino . 
--build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy +cd $BASEDIR && rm -rf vllm diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh b/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh new file mode 100644 index 000000000..0c7ed90de --- /dev/null +++ b/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Set default values (the model falls back to the documented default when LLM_MODEL is unset) +default_port=8008 +default_model=${LLM_MODEL:-meta-llama/Meta-Llama-3-8B-Instruct} +default_hw_mode="cpu" +default_parallel_number=1 +default_block_size=128 +default_max_num_seqs=256 +default_max_seq_len_to_capture=2048 + +# Assign arguments to variables +port_number=${1:-$default_port} +model_name=${2:-$default_model} +hw_mode=${3:-$default_hw_mode} +parallel_number=${4:-$default_parallel_number} +block_size=${5:-$default_block_size} +max_num_seqs=${6:-$default_max_num_seqs} +max_seq_len_to_capture=${7:-$default_max_seq_len_to_capture} + +# Reject calls with more than the 7 supported positional arguments +if [ "$#" -gt 7 ]; then + echo "Usage: $0 [port_number] [model_name] [hw_mode] [parallel_number] [block_size] [max_num_seqs] [max_seq_len_to_capture]" + echo "port_number: The port number assigned to the vLLM CPU endpoint, with the default being 8008." + echo "model_name: The model name utilized for LLM, with the default taken from LLM_MODEL or, if unset, 'meta-llama/Meta-Llama-3-8B-Instruct'." 
+ echo "hw_mode: The hardware mode utilized for LLM, with the default set to 'cpu', and the optional selection can be 'hpu'" + echo "parallel_number: parallel nodes number for 'hpu' mode" + echo "block_size: default set to 128 for better performance on HPU" + echo "max_num_seqs: default set to 256 for better performance on HPU" + echo "max_seq_len_to_capture: default set to 2048 for better performance on HPU" + exit 1 +fi + +# Set the volume variable +volume=$PWD/data + +# Run the container for the selected hardware mode; container port 80 is published on $port_number (no host networking, which would discard -p) +if [ "$hw_mode" = "hpu" ]; then + docker run -d --rm --runtime=habana --name="vllm-service" -p "$port_number":80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$http_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture " +else + docker run -d --rm --name="vllm-service" -p "$port_number":80 -v "$volume":/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$http_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm:cpu --model "$model_name" --host 0.0.0.0 --port 80 +fi diff --git a/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service_openvino.sh b/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service_openvino.sh new file mode 100644 index 000000000..d54970877 --- /dev/null +++ b/comps/llms/text-generation/vllm/llama_index/dependency/launch_vllm_service_openvino.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +# Set default values + + +default_port=8008 +default_model="meta-llama/Llama-2-7b-hf" 
+swap_space=50 + +while getopts ":hm:p:" opt; do + case $opt in + h) + echo "Usage: $0 [-h] [-m model] [-p port]" + echo "Options:" + echo " -h Display this help message" + echo " -m model Model (default: meta-llama/Llama-2-7b-hf)" + echo " -p port Port (default: 8008)" + exit 0 + ;; + m) + model=$OPTARG + ;; + p) + port=$OPTARG + ;; + \?) + echo "Invalid option: -$OPTARG" >&2 + exit 1 + ;; + esac +done + +# Assign arguments to variables +model_name=${model:-$default_model} +port_number=${port:-$default_port} + + +# Host-side Huggingface cache directory, mounted into the container below +HF_CACHE_DIR=$HOME/.cache/huggingface + +# Start the model server using Openvino as the backend inference engine. +# Provide the container name that is unique and meaningful, typically one that includes the model name. + +docker run -d --rm --name="vllm-openvino-server" \ + -p "$port_number":80 \ + --ipc=host \ + -e HTTPS_PROXY=$https_proxy \ + -e HTTP_PROXY=$http_proxy \ + -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \ + -v "$HF_CACHE_DIR":/home/user/.cache/huggingface \ + vllm:openvino /bin/bash -c "\ + cd / && \ + export VLLM_OPENVINO_KVCACHE_SPACE=50 && \ + python3 -m vllm.entrypoints.openai.api_server \ + --model \"$model_name\" \ + --host 0.0.0.0 \ + --port 80" diff --git a/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml b/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml new file mode 100644 index 000000000..f754a13d5 --- /dev/null +++ b/comps/llms/text-generation/vllm/llama_index/docker_compose_llm.yaml @@ -0,0 +1,46 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +version: "3.8" + +services: + vllm-service: + image: opea/vllm:hpu + container_name: vllm-gaudi-server + ports: + - "8008:80" + volumes: + - "./data:/data" + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + HABANA_VISIBLE_DEVICES: all + OMPI_MCA_btl_vader_single_copy_mechanism: none 
+ LLM_MODEL: ${LLM_MODEL} + runtime: habana + cap_add: + - SYS_NICE + ipc: host + command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80" + llm: + image: opea/llm-vllm-llamaindex:latest + container_name: llm-vllm-gaudi-server + depends_on: + - vllm-service + ports: + - "9000:9000" + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + vLLM_ENDPOINT: ${vLLM_ENDPOINT} + HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} + LLM_MODEL: ${LLM_MODEL} + restart: unless-stopped + +networks: + default: + driver: bridge diff --git a/comps/llms/text-generation/vllm/llama_index/entrypoint.sh b/comps/llms/text-generation/vllm/llama_index/entrypoint.sh new file mode 100644 index 000000000..0e0bc8f04 --- /dev/null +++ b/comps/llms/text-generation/vllm/llama_index/entrypoint.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +pip --no-cache-dir install -r requirements-runtime.txt + +python3 llm.py diff --git a/comps/llms/text-generation/vllm/llama_index/launch_microservice.sh b/comps/llms/text-generation/vllm/llama_index/launch_microservice.sh new file mode 100644 index 000000000..ef8084f61 --- /dev/null +++ b/comps/llms/text-generation/vllm/llama_index/launch_microservice.sh @@ -0,0 +1,14 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +docker run -d --rm \ + --name="llm-vllm-server" \ + -p 9000:9000 \ + --ipc=host \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e vLLM_ENDPOINT=$vLLM_ENDPOINT \ + -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN \ + -e LLM_MODEL=$LLM_MODEL \ + -e LOGFLAG=$LOGFLAG \ + opea/llm-vllm-llamaindex:latest diff --git a/comps/llms/text-generation/vllm/llama_index/llm.py b/comps/llms/text-generation/vllm/llama_index/llm.py new file mode 
100644 index 000000000..db4652b96 --- /dev/null +++ b/comps/llms/text-generation/vllm/llama_index/llm.py @@ -0,0 +1,78 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os + +from fastapi.responses import StreamingResponse +from llama_index.llms.openai_like import OpenAILike + +from comps import ( + CustomLogger, + GeneratedDoc, + LLMParamsDoc, + ServiceType, + opea_microservices, + opea_telemetry, + register_microservice, +) + +logger = CustomLogger("llm_vllm_llamaindex") +logflag = os.getenv("LOGFLAG", False) + + +@opea_telemetry +def post_process_text(text: str): + if text == " ": + return "data: @#$\n\n" + if text == "\n": + return "data:
\n\n" + if text.isspace(): + return None + new_text = text.replace(" ", "@#$") + return f"data: {new_text}\n\n" + + +@register_microservice( + name="opea_service@llm_vllm_llama_index", + service_type=ServiceType.LLM, + endpoint="/v1/chat/completions", + host="0.0.0.0", + port=9000, +) +def llm_generate(input: LLMParamsDoc): + if logflag: + logger.info(input) + llm_endpoint = os.getenv("vLLM_ENDPOINT", "http://localhost:8008") + model_name = os.getenv("LLM_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct") + llm = OpenAILike( + api_key="fake", + api_base=llm_endpoint + "/v1", + max_tokens=input.max_new_tokens, + model=model_name, + top_p=input.top_p, + temperature=input.temperature, + streaming=input.streaming, + ) + + if input.streaming: + + def stream_generator(): + chat_response = "" + for text in llm.stream_complete(input.query): # yields response objects, not str; .delta holds the newly generated text + chat_response += text.delta or "" + chunk_repr = repr((text.delta or "").encode("utf-8")) + yield f"data: {chunk_repr}\n\n" + if logflag: + logger.info(f"[llm - chat_stream] stream response: {chat_response}") + yield "data: [DONE]\n\n" + + return StreamingResponse(stream_generator(), media_type="text/event-stream") + else: + response = llm.complete(input.query).text + if logflag: + logger.info(response) + return GeneratedDoc(text=response, prompt=input.query) + + +if __name__ == "__main__": + opea_microservices["opea_service@llm_vllm_llama_index"].start() diff --git a/comps/llms/text-generation/vllm/llama_index/query.sh b/comps/llms/text-generation/vllm/llama_index/query.sh new file mode 100644 index 000000000..5784b13a6 --- /dev/null +++ b/comps/llms/text-generation/vllm/llama_index/query.sh @@ -0,0 +1,19 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +your_ip="0.0.0.0" + +curl http://${your_ip}:8008/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "prompt": "What is Deep Learning?", + "max_tokens": 32, + "temperature": 0 + }' + +##query microservice +curl 
http://${your_ip}:9000/v1/chat/completions \ + -X POST \ + -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \ + -H 'Content-Type: application/json' diff --git a/comps/llms/text-generation/vllm/llama_index/requirements-runtime.txt b/comps/llms/text-generation/vllm/llama_index/requirements-runtime.txt new file mode 100644 index 000000000..225adde27 --- /dev/null +++ b/comps/llms/text-generation/vllm/llama_index/requirements-runtime.txt @@ -0,0 +1 @@ +langserve diff --git a/comps/llms/text-generation/vllm/llama_index/requirements.txt b/comps/llms/text-generation/vllm/llama_index/requirements.txt new file mode 100644 index 000000000..cddb1c21e --- /dev/null +++ b/comps/llms/text-generation/vllm/llama_index/requirements.txt @@ -0,0 +1,12 @@ +docarray[full] +fastapi +huggingface_hub +llama-index-llms-openai-like +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +prometheus-fastapi-instrumentator +shortuuid +transformers +uvicorn +vllm diff --git a/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh new file mode 100644 index 000000000..43fa4b8dc --- /dev/null +++ b/tests/llms/test_llms_text-generation_vllm_llamaindex_on_intel_hpu.sh @@ -0,0 +1,126 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +ip_address=$(hostname -I | awk '{print $1}') + +function build_docker_images() { + ## Build VLLM Ray docker + cd $WORKPATH/comps/llms/text-generation/vllm/llama_index/dependency + docker build \ + -f Dockerfile.intel_hpu \ + --no-cache -t opea/vllm-hpu:comps \ + --shm-size=128g . + if [ $? 
-ne 0 ]; then + echo "opea/vllm-hpu built fail" + exit 1 + else + echo "opea/vllm-hpu built successful" + fi + + ## Build OPEA microservice docker + cd $WORKPATH + docker build \ + --no-cache -t opea/llm-vllm-llamaindex:comps \ + -f comps/llms/text-generation/vllm/llama_index/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/llm-vllm-llamaindex built fail" + exit 1 + else + echo "opea/llm-vllm-llamaindex built successful" + fi +} + +function start_service() { + export LLM_MODEL="facebook/opt-125m" + port_number=5025 + docker run -d --rm \ + --runtime=habana \ + --name="test-comps-vllm-service" \ + -v $PWD/data:/data \ + -p $port_number:80 \ + -e HABANA_VISIBLE_DEVICES=all \ + -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ + --cap-add=sys_nice \ + --ipc=host \ + -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \ + opea/vllm-hpu:comps \ + /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048" + + export vLLM_ENDPOINT="http://${ip_address}:${port_number}" + docker run -d --rm \ + --name="test-comps-vllm-microservice" \ + -p 5030:9000 \ + --ipc=host \ + -e vLLM_ENDPOINT=$vLLM_ENDPOINT \ + -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN \ + -e LLM_MODEL=$LLM_MODEL \ + opea/llm-vllm-llamaindex:comps + + # poll the vllm service log (stdout and stderr) until it reports ready, for at most 120 attempts + n=0 + until [[ "$n" -ge 120 ]]; do + docker logs test-comps-vllm-service > ${WORKPATH}/tests/test-comps-vllm-service.log 2>&1 + n=$((n+1)) + if grep -q Connected ${WORKPATH}/tests/test-comps-vllm-service.log; then + break + fi + sleep 5s + done + sleep 5s +} + +function validate_microservice() { + result=$(http_proxy="" curl http://${ip_address}:5025/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "facebook/opt-125m", + "prompt": "What is Deep Learning?", + "max_tokens": 32, + "temperature": 0 
+ }') + if [[ $result == *"text"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-vllm-service + docker logs test-comps-vllm-microservice + exit 1 + fi + result=$(http_proxy="" curl http://${ip_address}:5030/v1/chat/completions \ + -X POST \ + -d '{"query":"What is Deep Learning?","max_new_tokens":17,"top_p":0.95,"temperature":0.01,"streaming":false}' \ + -H 'Content-Type: application/json') + if [[ $result == *"text"* ]]; then + echo "Result correct." + else + echo "Result wrong. Received was $result" + docker logs test-comps-vllm-service + docker logs test-comps-vllm-microservice + exit 1 + fi +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=test-comps-vllm*") + if [[ -n "$cid" ]]; then docker rm $cid -f && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main