From 0094f524b41fa29c967ad11c548a0b3eb7d59bf6 Mon Sep 17 00:00:00 2001 From: dolpher Date: Thu, 5 Sep 2024 16:26:28 +0800 Subject: [PATCH] Add vLLM inference engine support (#398) Signed-off-by: Dolpher Du Co-authored-by: Gang G Li --- helm-charts/common/vllm/.helmignore | 23 ++++ helm-charts/common/vllm/Chart.yaml | 10 ++ helm-charts/common/vllm/README.md | 53 ++++++++ helm-charts/common/vllm/gaudi-values.yaml | 19 +++ .../common/vllm/templates/_helpers.tpl | 62 ++++++++++ .../common/vllm/templates/configmap.yaml | 25 ++++ .../common/vllm/templates/deployment.yaml | 116 ++++++++++++++++++ .../common/vllm/templates/service.yaml | 18 +++ .../common/vllm/templates/tests/test-pod.yaml | 29 +++++ helm-charts/common/vllm/values.yaml | 99 +++++++++++++++ 10 files changed, 454 insertions(+) create mode 100644 helm-charts/common/vllm/.helmignore create mode 100644 helm-charts/common/vllm/Chart.yaml create mode 100644 helm-charts/common/vllm/README.md create mode 100644 helm-charts/common/vllm/gaudi-values.yaml create mode 100644 helm-charts/common/vllm/templates/_helpers.tpl create mode 100644 helm-charts/common/vllm/templates/configmap.yaml create mode 100644 helm-charts/common/vllm/templates/deployment.yaml create mode 100644 helm-charts/common/vllm/templates/service.yaml create mode 100644 helm-charts/common/vllm/templates/tests/test-pod.yaml create mode 100644 helm-charts/common/vllm/values.yaml diff --git a/helm-charts/common/vllm/.helmignore b/helm-charts/common/vllm/.helmignore new file mode 100644 index 00000000..0e8a0eb3 --- /dev/null +++ b/helm-charts/common/vllm/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm-charts/common/vllm/Chart.yaml b/helm-charts/common/vllm/Chart.yaml new file mode 100644 index 00000000..2f6b4f37 --- /dev/null +++ b/helm-charts/common/vllm/Chart.yaml @@ -0,0 +1,10 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v2 +name: vllm +description: The Helm chart for vLLM Inference Server +type: application +version: 1.0.0 +# The vLLM version +appVersion: "0.5" diff --git a/helm-charts/common/vllm/README.md b/helm-charts/common/vllm/README.md new file mode 100644 index 00000000..28bff970 --- /dev/null +++ b/helm-charts/common/vllm/README.md @@ -0,0 +1,53 @@ +# vllm + +Helm chart for deploying vLLM Inference service. + +Refer to [Deploy with Helm Charts](../README.md) for global guides. + +## Installing the Chart + +To install the chart, run the following: + +Note that you cannot use vllm as the service release name due to [environment variables conflict](https://docs.vllm.ai/en/stable/serving/env_vars.html#environment-variables). 
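The conflict typically comes from Kubernetes service-link environment variables (an illustration, not spelled out in the chart itself; exact values depend on your cluster): with a Service named `vllm`, every pod in the namespace receives injected variables such as `VLLM_PORT`, which collide with vLLM's own settings of the same name.

```console
# Illustration only: with a Service named "vllm", Kubernetes injects service-link
# variables like the following (address is a placeholder):
#   VLLM_PORT=tcp://10.96.0.123:80
# vLLM expects VLLM_PORT to be a plain integer port, so the server fails to start.
```

With a non-conflicting release name such as `myvllm`, install the chart as follows: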
```console
cd GenAIInfra/helm-charts/common
export MODELDIR=/mnt/opea-models
export MODELNAME="Intel/neural-chat-7b-v3-3"
export HFTOKEN="insert-your-huggingface-token-here"
helm install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN}
# To deploy on a Gaudi-enabled Kubernetes cluster
# helm install myvllm vllm --set global.modelUseHostPath=${MODELDIR} --set LLM_MODEL_ID=${MODELNAME} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --values gaudi-values.yaml
```

By default, the vllm service will download the "Intel/neural-chat-7b-v3-3" model.

If you have already cached the model locally, you can pass the cached directory into the container, for example:

MODELDIR=/mnt/opea-models

MODELNAME="facebook/opt-125m"

## Verify

To verify the installation, run the command `kubectl get pod` to make sure all the pods are running.

Then run the command `kubectl port-forward svc/myvllm 2080:80` to expose the vllm service for access.

Open another terminal and run the following command to verify that the service is working:

```console
curl http://localhost:2080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0}'
```

## Values

| Key                             | Type   | Default                              | Description                                                                                                                                                                                                              |
| ------------------------------- | ------ | ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| LLM_MODEL_ID                    | string | `"Intel/neural-chat-7b-v3-3"`        | Model id from https://huggingface.co/, or a pre-downloaded model directory                                                                                                                                               |
| global.HUGGINGFACEHUB_API_TOKEN | string | `insert-your-huggingface-token-here` | Hugging Face API token                                                                                                                                                                                                   |
| global.modelUseHostPath         | string | `""`                                 | Cached models directory; vllm will not download the model if it is already cached here. The host path "modelUseHostPath" is mounted into the container as the /data directory. Setting this to null/empty forces a download. |
| image.repository                | string | `"opea/vllm"`                        |                                                                                                                                                                                                                          |
| image.tag                       | string | `"latest"`                           |                                                                                                                                                                                                                          |

diff --git a/helm-charts/common/vllm/gaudi-values.yaml b/helm-charts/common/vllm/gaudi-values.yaml
new file mode 100644
index 00000000..b1a34613
--- /dev/null
+++ b/helm-charts/common/vllm/gaudi-values.yaml

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Default values for vllm.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.
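# These overrides are meant to be layered on top of values.yaml
# (helm install myvllm vllm ... --values gaudi-values.yaml): they switch to the
# HPU image and request one habana.ai/gaudi device per pod.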
+ +image: + repository: opea/llm-vllm-hpu + tag: "latest" + +VLLM_CPU_KVCACHE_SPACE: "40" + +# Workaround for current HPU image with start command /bin/bash +# extraCmdArgs: ["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"] +extraCmdArgs: ["/bin/bash","-c","python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model Intel/neural-chat-7b-v3-3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"] +resources: + limits: + habana.ai/gaudi: 1 diff --git a/helm-charts/common/vllm/templates/_helpers.tpl b/helm-charts/common/vllm/templates/_helpers.tpl new file mode 100644 index 00000000..41b1f8d6 --- /dev/null +++ b/helm-charts/common/vllm/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "vllm.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "vllm.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "vllm.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "vllm.labels" -}} +helm.sh/chart: {{ include "vllm.chart" . }} +{{ include "vllm.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "vllm.selectorLabels" -}} +app.kubernetes.io/name: {{ include "vllm.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "vllm.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "vllm.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/helm-charts/common/vllm/templates/configmap.yaml b/helm-charts/common/vllm/templates/configmap.yaml new file mode 100644 index 00000000..80b9a97d --- /dev/null +++ b/helm-charts/common/vllm/templates/configmap.yaml @@ -0,0 +1,25 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "vllm.fullname" . }}-config + labels: + {{- include "vllm.labels" . 
| nindent 4 }} +data: + HF_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote}} + {{- if .Values.global.HF_ENDPOINT }} + HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote}} + {{- end }} + http_proxy: {{ .Values.global.http_proxy | quote }} + https_proxy: {{ .Values.global.https_proxy | quote }} + no_proxy: {{ .Values.global.no_proxy | quote }} + HABANA_LOGS: "/tmp/habana_logs" + NUMBA_CACHE_DIR: "/tmp" + HF_HOME: "/tmp/.cache/huggingface" + # https://github.com/outlines-dev/outlines/blob/main/outlines/caching.py#L14-L29 + OUTLINES_CACHE_DIR: "/tmp/.cache/outlines" + {{- if .Values.VLLM_CPU_KVCACHE_SPACE }} + VLLM_CPU_KVCACHE_SPACE: {{ .Values.VLLM_CPU_KVCACHE_SPACE | quote}} + {{- end }} diff --git a/helm-charts/common/vllm/templates/deployment.yaml b/helm-charts/common/vllm/templates/deployment.yaml new file mode 100644 index 00000000..133cc0df --- /dev/null +++ b/helm-charts/common/vllm/templates/deployment.yaml @@ -0,0 +1,116 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "vllm.fullname" . }} + labels: + {{- include "vllm.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "vllm.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "vllm.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + envFrom: + - configMapRef: + name: {{ include "vllm.fullname" . }}-config + {{- if .Values.global.extraEnvConfig }} + - configMapRef: + name: {{ .Values.global.extraEnvConfig }} + optional: true + {{- end }} + securityContext: + {{- if .Values.global.modelUseHostPath }} + {} + {{- else }} + {{- toYaml .Values.securityContext | nindent 12 }} + {{- end }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + args: + {{- if .Values.extraCmdArgs }} + {{- range .Values.extraCmdArgs }} + - {{ . 
| quote }} + {{- end }} + {{- end }} + - "--model" + - {{ .Values.LLM_MODEL_ID | quote }} + - "--host" + - "0.0.0.0" + - "--port" + - {{ .Values.port | quote }} + - "--download-dir" + - "/data" + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + - mountPath: /tmp + name: tmp + ports: + - name: http + containerPort: {{ .Values.port }} + protocol: TCP + {{- if .Values.livenessProbe }} + livenessProbe: + {{- toYaml .Values.livenessProbe | nindent 12 }} + {{- end }} + {{- if .Values.readinessProbe }} + readinessProbe: + {{- toYaml .Values.readinessProbe | nindent 12 }} + {{- end }} + {{- if .Values.startupProbe }} + startupProbe: + {{- toYaml .Values.startupProbe | nindent 12 }} + {{- end }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumes: + - name: model-volume + {{- if .Values.global.modelUsePVC }} + persistentVolumeClaim: + claimName: {{ .Values.global.modelUsePVC }} + {{- else if .Values.global.modelUseHostPath }} + hostPath: + path: {{ .Values.global.modelUseHostPath }} + type: Directory + {{- else }} + emptyDir: {} + {{- end }} + - name: shm + emptyDir: + medium: Memory + sizeLimit: {{ .Values.shmSize }} + - name: tmp + emptyDir: {} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/helm-charts/common/vllm/templates/service.yaml b/helm-charts/common/vllm/templates/service.yaml new file mode 100644 index 00000000..42e4fab7 --- /dev/null +++ b/helm-charts/common/vllm/templates/service.yaml @@ -0,0 +1,18 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Service +metadata: + name: {{ include "vllm.fullname" . }} + labels: + {{- include "vllm.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: 80 + targetPort: {{ .Values.port }} + protocol: TCP + name: vllm + selector: + {{- include "vllm.selectorLabels" . | nindent 4 }} diff --git a/helm-charts/common/vllm/templates/tests/test-pod.yaml b/helm-charts/common/vllm/templates/tests/test-pod.yaml new file mode 100644 index 00000000..9b786f21 --- /dev/null +++ b/helm-charts/common/vllm/templates/tests/test-pod.yaml @@ -0,0 +1,29 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "vllm.fullname" . }}-testpod" + labels: + {{- include "vllm.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test + #"helm.sh/hook-delete-policy": "hook-succeeded, hook-failure" +spec: + containers: + - name: curl + image: python:3.10.14 + command: ['bash', '-c'] + args: + - | + max_retry=20; + for ((i=1; i<=max_retry; i++)); do + curl http://{{ include "vllm.fullname" . }}/v1/completions \ + -H "Content-Type: application/json" \ + -d '{"model": {{ .Values.LLM_MODEL_ID | quote }},"prompt": "What is Deep Learning?","max_tokens": 32,"temperature": 0}' && break; + curlcode=$? 
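            # curl exit code 7 means "failed to connect": the vLLM server is likely
            # still starting or loading the model, so wait and retry; any other
            # failure aborts the test immediately.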
            if [[ $curlcode -eq 7 ]]; then sleep 10; else echo "curl failed with code $curlcode"; exit 1; fi;
          done;
          if [ $i -gt $max_retry ]; then echo "test failed with maximum retry"; exit 1; fi
  restartPolicy: Never
diff --git a/helm-charts/common/vllm/values.yaml b/helm-charts/common/vllm/values.yaml
new file mode 100644
index 00000000..3e98a21b
--- /dev/null
+++ b/helm-charts/common/vllm/values.yaml

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Default values for vllm.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

replicaCount: 1

port: 2080
shmSize: 1Gi
image:
  repository: opea/vllm
  pullPolicy: IfNotPresent
  # Overrides the image tag whose default is the chart appVersion.
  tag: "latest"

imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""

podAnnotations: {}

podSecurityContext: {}
  # fsGroup: 2000

securityContext:
  readOnlyRootFilesystem: true
  allowPrivilegeEscalation: false
  runAsNonRoot: true
  runAsUser: 1000
  capabilities:
    drop:
    - ALL
  seccompProfile:
    type: RuntimeDefault

service:
  type: ClusterIP

resources: {}
  # We usually recommend not to specify default resources and to leave this as a conscious
  # choice for the user. This also increases chances charts run on environments with little
  # resources, such as Minikube. If you do want to specify resources, uncomment the following
  # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
  # limits:
  #   cpu: 100m
  #   memory: 128Mi
  # requests:
  #   cpu: 100m
  #   memory: 128Mi

extraCmdArgs: ["--enforce-eager","--dtype","auto"]

livenessProbe:
  httpGet:
    path: /health
    port: http
  initialDelaySeconds: 5
  periodSeconds: 5
  failureThreshold: 24
readinessProbe:
  httpGet:
    path: /health
    port: http
  initialDelaySeconds: 5
  periodSeconds: 5
startupProbe:
  httpGet:
    path: /health
    port: http
  initialDelaySeconds: 5
  periodSeconds: 5
  failureThreshold: 120

nodeSelector: {}

tolerations: []

affinity: {}

LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
VLLM_CPU_KVCACHE_SPACE: ""

global:
  http_proxy: ""
  https_proxy: ""
  no_proxy: ""
  HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here"

  # Choose where to save your downloaded models.
  # Set modelUseHostPath to a local directory; this is good for a single-node test. Example:
  # modelUseHostPath: /mnt/opea-models
  # Set modelUsePVC to a PersistentVolumeClaim (PVC), which is suitable for multinode deployment. Example:
  # modelUsePVC: model-volume
  # Set only one of these variables; the behavior is undefined if both are set.
  # By default, both are empty and the model will be downloaded and saved to a tmp volume.
  modelUseHostPath: ""
  modelUsePVC: ""