Support Llama index for vLLM native (#692)
Signed-off-by: zhenwei-intel <[email protected]>
zhenwei-intel authored Sep 17, 2024
1 parent 391c4a5 commit 2e41dcf
Showing 17 changed files with 1,032 additions and 6 deletions.
6 changes: 5 additions & 1 deletion .github/workflows/docker/compose/llms-compose-cd.yaml
@@ -4,8 +4,12 @@
 services:
   llm-native:
     build:
-      dockerfile: comps/llms/text-generation/native/Dockerfile
+      dockerfile: comps/llms/text-generation/native/langchain/Dockerfile
     image: ${REGISTRY:-opea}/llm-native:${TAG:-latest}
+  llm-native-llamaindex:
+    build:
+      dockerfile: comps/llms/text-generation/native/llama_index/Dockerfile
+    image: ${REGISTRY:-opea}/llm-native-llamaindex:${TAG:-latest}
   vllm-openvino:
     build:
       context: vllm-openvino
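The new `llm-native-llamaindex` entry mirrors the existing `llm-native` image. As a rough sketch (assuming Docker Compose v2, the repository root as the working directory, and that the CD workflow supplies the build context), the image could be built on its own with:

```bash
# Build only the LlamaIndex-based native LLM image; REGISTRY/TAG feed the
# ${REGISTRY:-opea}/llm-native-llamaindex:${TAG:-latest} image name above.
REGISTRY=opea TAG=latest docker compose -f .github/workflows/docker/compose/llms-compose-cd.yaml build llm-native-llamaindex
```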
41 changes: 41 additions & 0 deletions comps/llms/text-generation/native/langchain/Dockerfile
@@ -0,0 +1,41 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# HABANA environment
# FROM vault.habana.ai/gaudi-docker/1.16.1/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest as hpu
FROM opea/habanalabs:1.16.1-pytorch-installer-2.2.2 as hpu

ENV LANG=en_US.UTF-8
ARG REPO=https://github.com/huggingface/optimum-habana.git
ARG REPO_VER=v1.12.1

RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
    git-lfs \
    libgl1-mesa-glx \
    libjemalloc-dev

RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

RUN git lfs install

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade-strategy eager optimum[habana] && \
    pip install --no-cache-dir git+https://github.com/HabanaAI/[email protected]

RUN git clone ${REPO} /home/user/optimum-habana && \
    cd /home/user/optimum-habana && git checkout ${REPO_VER} && \
    cd examples/text-generation && pip install --no-cache-dir -r requirements.txt && \
    cd /home/user/comps/llms/text-generation/native/langchain && \
    pip install --no-cache-dir -r requirements.txt && \
    pip install --no-cache-dir --upgrade --force-reinstall pydantic

ENV PYTHONPATH=/root:/home/user

WORKDIR /home/user/comps/llms/text-generation/native/langchain

ENTRYPOINT ["python", "llm.py"]
@@ -17,8 +17,9 @@ export LLM_NATIVE_MODEL="Qwen/Qwen2-7B-Instruct"
 ### 1.2 Build Docker Image
 
 ```bash
-cd ../../../../
-docker build -t opea/llm-native:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/native/Dockerfile .
+cd ../../../../../
+docker build -t opea/llm-native:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy \
+  -f comps/llms/text-generation/native/langchain/Dockerfile .
 ```
 
 To start a docker container, you have two options:
File renamed without changes.
File renamed without changes.
@@ -30,11 +30,11 @@ RUN pip install --no-cache-dir --upgrade-strategy eager optimum[habana] && \
 RUN git clone ${REPO} /home/user/optimum-habana && \
     cd /home/user/optimum-habana && git checkout ${REPO_VER} && \
     cd examples/text-generation && pip install --no-cache-dir -r requirements.txt && \
-    cd /home/user/comps/llms/text-generation/native && pip install --no-cache-dir -r requirements.txt && \
+    cd /home/user/comps/llms/text-generation/native/llama_index && pip install --no-cache-dir -r requirements.txt && \
     pip install --no-cache-dir --upgrade --force-reinstall pydantic
 
 ENV PYTHONPATH=/root:/home/user
 
-WORKDIR /home/user/comps/llms/text-generation/native
+WORKDIR /home/user/comps/llms/text-generation/native/llama_index
 
 ENTRYPOINT ["python", "llm.py"]
60 changes: 60 additions & 0 deletions comps/llms/text-generation/native/llama_index/README.md
@@ -0,0 +1,60 @@
# LLM Native Microservice

The LLM Native microservice uses [optimum-habana](https://github.com/huggingface/optimum-habana) for model initialization and warm-up, focusing solely on large language models (LLMs). It runs inference directly with PyTorch, without serving frameworks such as TGI or vLLM, and supports only non-streaming output. This streamlined approach optimizes performance on Habana hardware.

## 🚀1. Start Microservice

If you start the LLM microservice with Docker, the `docker_compose_llm.yaml` file will start the Native LLM service in a container automatically.

### 1.1 Setup Environment Variables

To start the Native LLM service, set up the following environment variable first.

```bash
export LLM_NATIVE_MODEL="Qwen/Qwen2-7B-Instruct"
```

### 1.2 Build Docker Image

```bash
cd ../../../../../
docker build -t opea/llm-native:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/native/llama_index/Dockerfile .
```

To start a docker container, you have two options:

- A. Run Docker with CLI
- B. Run Docker with Docker Compose

You can choose one as needed.

### 1.3 Run Docker with CLI (Option A)

```bash
docker run -d --runtime=habana --name="llm-native-server" -p 9000:9000 -e https_proxy=$https_proxy -e http_proxy=$http_proxy -e TOKENIZERS_PARALLELISM=false -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e LLM_NATIVE_MODEL=${LLM_NATIVE_MODEL} opea/llm-native:latest
```

### 1.4 Run Docker with Docker Compose (Option B)

```bash
docker compose -f docker_compose_llm.yaml up -d
```

## 🚀2. Consume LLM Service

### 2.1 Check Service Status

```bash
curl http://${your_ip}:9000/v1/health_check \
  -X GET \
  -H 'Content-Type: application/json'
```

### 2.2 Consume LLM Service

```bash
curl http://${your_ip}:9000/v1/chat/completions \
  -X POST \
  -d '{"query":"What is Deep Learning?"}' \
  -H 'Content-Type: application/json'
```
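Since the service is non-streaming, each request returns a single JSON document. Judging from the `GeneratedDoc(text=..., prompt=...)` returned by `llm.py` in this change, an illustrative exchange looks roughly like this (the field set is a sketch, not an exhaustive schema):

```bash
# Illustrative only: the response carries at least the generated text and the
# original prompt; additional GeneratedDoc fields may be present.
curl -s http://${your_ip}:9000/v1/chat/completions \
  -X POST \
  -d '{"query":"What is Deep Learning?"}' \
  -H 'Content-Type: application/json'
# => {"text":"Deep Learning is a branch of machine learning ...","prompt":"What is Deep Learning?"}
```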
@@ -0,0 +1,28 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

version: "3.8"

services:
  llm:
    image: opea/llm-native:latest
    container_name: llm-native-server
    ports:
      - "9000:9000"
    runtime: habana
    cap_add:
      - SYS_NICE
    ipc: host
    environment:
      no_proxy: ${no_proxy}
      http_proxy: ${http_proxy}
      https_proxy: ${https_proxy}
      LLM_NATIVE_MODEL: ${LLM_NATIVE_MODEL}
      HABANA_VISIBLE_DEVICES: all
      OMPI_MCA_btl_vader_single_copy_mechanism: none
      TOKENIZERS_PARALLELISM: false
    restart: unless-stopped

networks:
  default:
    driver: bridge
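After `docker compose -f docker_compose_llm.yaml up -d`, a quick sanity check is to confirm the container is running and follow its logs (a sketch, using the `container_name` defined above). Note that model loading and warm-up happen inside `initialize()` in `llm.py`, so the "Graph compilation..." and "model warm up finished." lines only appear after the first request arrives:

```bash
# Confirm the container started, then follow its logs while sending the first request.
docker ps --filter name=llm-native-server
docker logs -f llm-native-server
```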
171 changes: 171 additions & 0 deletions comps/llms/text-generation/native/llama_index/llm.py
@@ -0,0 +1,171 @@
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys

sys.path.append("/test/GenAIComps/")

import logging
import os
import threading
import time

import torch
from llama_index.core import PromptTemplate
from template import ChatTemplate, args_dict, input_sentences
from utils import initialize_model

from comps import (
    GeneratedDoc,
    LLMParamsDoc,
    ServiceType,
    opea_microservices,
    register_microservice,
    register_statistics,
)

logflag = os.getenv("LOGFLAG", False)

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


class Args:
    def __init__(self, **entries):
        self.__dict__.update(entries)


model = None
assistant_model = None
tokenizer = None
generation_config = None
args = Args(**args_dict)
initialization_lock = threading.Lock()
initialized = False


def generate(
    input_query: list,
    device="hpu",
    use_lazy_mode=True,
    use_hpu_graphs=True,
    profiling_steps=0,
    profiling_warmup_steps=0,
    ignore_eos=True,
    profiling_record_shapes=False,
):
    """Generates sequences from the input sentences and returns them."""
    logger.info(f"[llm - generate] starting to inference with prompt {input_query}")
    encode_t0 = time.perf_counter()

    # Tokenization
    input_tokens = tokenizer.batch_encode_plus(input_query, return_tensors="pt", padding=True)
    encode_duration = time.perf_counter() - encode_t0
    logger.info(f"[llm - generate] input tokenized: {input_tokens}")

    # Move inputs to target device(s)
    for t in input_tokens:
        logger.info(f"[llm - generate] t: {t}")
        if torch.is_tensor(input_tokens[t]):
            logger.info("[llm - generate] input[t] is tensor")
            logger.info(f"[llm - generate] device: {model.device}")
            input_tokens[t] = input_tokens[t].to(model.device)

    logger.info("[llm - generate] inputs transferred.")

    iteration_times = []
    outputs = model.generate(
        **input_tokens,
        generation_config=generation_config,
        assistant_model=assistant_model,
        lazy_mode=use_lazy_mode,
        hpu_graphs=use_hpu_graphs,
        profiling_steps=profiling_steps,
        profiling_warmup_steps=profiling_warmup_steps,
        ignore_eos=ignore_eos,
        iteration_times=iteration_times,
        profiling_record_shapes=profiling_record_shapes,
    ).cpu()
    logger.info("[llm - generate] result generated")
    first_token_time = iteration_times[0] + encode_duration
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    logger.info(f"[llm - generate] result: {result}")
    logger.info(f"[llm - generate] Time to first token = {first_token_time*1000}ms")
    return result


def initialize():
    global model, assistant_model, tokenizer, generation_config, initialized
    with initialization_lock:
        if not initialized:
            # initialize model and tokenizer
            import habana_frameworks.torch.hpu as torch_hpu
            from optimum.habana.utils import HabanaProfile

            model, assistant_model, tokenizer, generation_config = initialize_model(args, logger)
            logger.info("[llm] model and tokenizer initialized.")

            # compilation and model warmup
            HabanaProfile.disable()
            logger.info("[llm - native] Graph compilation...")
            for _ in range(args.warmup):
                generate(input_sentences)
            logger.info("[llm - native] model warm up finished.")
            torch_hpu.synchronize()
            HabanaProfile.enable()
            logger.info("[llm - native] Ready to inference")
            res = generate(["What is Deep Learning?"])
            logger.info(f"[llm - native] test result: {res}")
            initialized = True


@register_microservice(
    name="opea_service@llm_native_llamaindex",
    service_type=ServiceType.LLM,
    endpoint="/v1/chat/completions",
    host="0.0.0.0",
    port=9000,
)
@register_statistics(names=["opea_service@llm_native_llamaindex"])
def llm_generate(input: LLMParamsDoc):
    initialize()
    if logflag:
        logger.info(input)
    prompt = input.query
    prompt_template = None
    if input.chat_template:
        prompt_template = PromptTemplate(input.chat_template)
        input_variables = prompt_template.template_vars
    if prompt_template:
        if sorted(input_variables) == ["context", "question"]:
            prompt = prompt_template.format(question=input.query, context="\n".join(input.documents))
        elif input_variables == ["question"]:
            prompt = prompt_template.format(question=input.query)
        else:
            logger.info(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']")
    else:
        if input.documents:
            prompt = ChatTemplate.generate_rag_prompt(input.query, input.documents)
    res = generate([prompt])

    if logflag:
        logger.info(f"[llm - native] inference result: {res}")
    return GeneratedDoc(text=res[0], prompt=input.query)


if __name__ == "__main__":
    opea_microservices["opea_service@llm_native_llamaindex"].start()
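`llm_generate` also honors an optional `chat_template` (a llama_index `PromptTemplate` string) together with a `documents` list. A request exercising the `["context", "question"]` branch above might look like the following sketch; only the field names actually read by the handler (`query`, `documents`, `chat_template`) are assumed:

```bash
# Sketch: custom prompt template with {context} and {question} placeholders,
# matching the sorted(input_variables) == ["context", "question"] branch in llm_generate.
curl http://${your_ip}:9000/v1/chat/completions \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"query":"What is Deep Learning?",
       "documents":["Deep learning is a family of machine learning methods based on neural networks."],
       "chat_template":"Answer using only the context.\nContext: {context}\nQuestion: {question}"}'
```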
10 changes: 10 additions & 0 deletions comps/llms/text-generation/native/llama_index/requirements.txt
@@ -0,0 +1,10 @@
docarray
fastapi
httpx
llama_index
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
shortuuid
uvicorn