#!/usr/bin/env bash
IMAGE_VERSION=latest
IMAGE_NAME=vllm/vllm-openai
CONTAINER_NAME=vllm72b-zj-online-embedding
MODEL_DIR=/data/xq/qwen2-5-72b-dpo-1101
# MODEL_DIR=/data/czh/SFR-Embedding-Mistral
CODE_DIR=/data/czh/vllm
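# The nested quotes below are required: docker's --gpus flag needs the
# comma-separated device list passed through as a single quoted value.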
DEVICES='"device=0,1,2,3,4,5,6,7"'
start() {
    # docker run command; the following flags are optional and currently disabled:
    # -e ENABLE_PREFIX_CACHING=1 \
    # -e ENABLE_CHUNKED_PREFILL=1 \
    # -e VLLM_ATTENTION_BACKEND="FLASHINFER" \
    # -e DISABLE_SLIDING_WINDOW=1 \
    # -e ENFORCE_EAGER=1 \
    # Note: FlashInfer does not appear to support prefix caching or chunked prefill.
    echo "Starting docker container..."
    docker run -d --name ${CONTAINER_NAME} \
        --log-opt max-size=30m \
        --log-opt max-file=3 \
        --gpus=${DEVICES} \
        --shm-size=16g \
        --network host \
        -e LANG="C.UTF-8" \
        -e LC_ALL="C.UTF-8" \
        -e MODEL_PATH=${MODEL_DIR} \
        -e MODEL_TYPE="Qwen2" \
        -e PORT=8809 \
        -e KV_CACHE_DTYPE="auto" \
        -e GPU_USAGE=0.9 \
        -e ENABLE_CHUNKED_PREFILL=1 \
        -v ${MODEL_DIR}:${MODEL_DIR} \
        -v ${CODE_DIR}:${CODE_DIR} \
        ${IMAGE_NAME}:${IMAGE_VERSION}
    # Alternative: offline embedding serving (currently disabled):
    # docker run -d --name ${CONTAINER_NAME} \
    #     --log-opt max-size=30m \
    #     --log-opt max-file=3 \
    #     --gpus=${DEVICES} \
    #     --shm-size=16g \
    #     --network host \
    #     -e MODEL_PATH=${MODEL_DIR} \
    #     -e EMBEDDING_OFFLINE=1 \
    #     -e PORT=18192 \
    #     -v ${MODEL_DIR}:${MODEL_DIR} \
    #     -v ${CODE_DIR}:${CODE_DIR} \
    #     ${IMAGE_NAME}:${IMAGE_VERSION}
}
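
# Minimal smoke-test helpers (illustrative additions, not part of the original
# script). They assume the image's entrypoint reads the MODEL_PATH/PORT env
# vars above and serves an OpenAI-compatible API on port 8809; this fork
# appears to use a custom entrypoint, since the stock vllm/vllm-openai image
# takes CLI arguments instead.
check() {
    # List served models; a JSON response means the server is up.
    curl -s "http://localhost:8809/v1/models"
}

logs() {
    # Follow container logs, e.g. to watch model loading on startup.
    docker logs -f --tail=100 ${CONTAINER_NAME}
}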
# Invoke the function named by the first argument, e.g.: bash vllm.sh start
$1