[Bugfix][Frontend] Fix Issues Under High Load With zeromq Frontend (vllm-project#7394)

Co-authored-by: Nick Hill <[email protected]>
Signed-off-by: Alvant <[email protected]>
2 people authored and Alvant committed Oct 26, 2024
1 parent 414f04e commit 8446f48
Showing 9 changed files with 322 additions and 141 deletions.
1 change: 1 addition & 0 deletions .buildkite/test-pipeline.yaml
@@ -86,6 +86,7 @@ steps:
  - vllm/
  commands:
  - pip install -e ./plugins/vllm_add_dummy_model
  - pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
  - pytest -v -s entrypoints/llm
  - pytest -v -s entrypoints/openai

55 changes: 55 additions & 0 deletions tests/entrypoints/openai/test_accuracy.py
@@ -0,0 +1,55 @@
"""
This file test accuracy of the vLLM server via LMEval.
It uses local-completions, which interacts with vLLM
through the OAI API with N concurrent connections.
This simulates real work usage of the API and makes
sure that the zmq frontend mp RPC message passing and
AsyncLLMEngine are working correctly.
"""

import lm_eval
import pytest

from ...utils import RemoteOpenAIServer

MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
EXPECTED_VALUE = 0.58


@pytest.fixture(scope="module")
def server():
    args = [
        "--max-model-len", "4096", "--enable-chunked-prefill",
        "--disable-log-requests", "--enforce-eager"
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest.fixture(scope="module")
def server_data(server):
    return {
        "url": f"{server.url_for('v1')}/completions",
    }


def test_lm_eval_accuracy(server_data):
    model_args = (f"model={MODEL_NAME},"
                  f"base_url={server_data['url']},"
                  f"num_concurrent={NUM_CONCURRENT},tokenized_requests=False")

    results = lm_eval.simple_evaluate(
        model="local-completions",
        model_args=model_args,
        tasks=TASK,
    )

    measured_value = results["results"][TASK][FILTER]
    assert (measured_value - RTOL < EXPECTED_VALUE
            and measured_value + RTOL > EXPECTED_VALUE
            ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
5 changes: 5 additions & 0 deletions vllm/engine/async_llm_engine.py
@@ -809,6 +809,11 @@ def is_stopped(self) -> bool:
    def errored(self) -> bool:
        return self._errored_with is not None

    @property
    def limit_concurrency(self) -> Optional[int]:
        """Maximum number of concurrently running requests."""
        return None

    def set_errored(self, exc: Exception) -> None:
        self._errored_with = exc

4 changes: 4 additions & 0 deletions vllm/engine/protocol.py
@@ -29,6 +29,10 @@ def is_stopped(self) -> bool:
    def errored(self) -> bool:
        ...

    @property
    def limit_concurrency(self) -> Optional[int]:
        """Maximum number of concurrently running requests."""

    def generate(
        self,
        inputs: PromptInputs,
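
For context, a sketch of how implementations of this protocol can diverge: the in-process engine reports no limit, while a zmq-backed RPC client must cap concurrency below the socket limit. The class names and headroom formula here are assumptions for illustration, not vLLM's code:

from typing import Optional

class InProcessEngine:
    @property
    def limit_concurrency(self) -> Optional[int]:
        # No per-request zmq socket, so nothing needs capping.
        return None

class ZmqRpcClient:
    def __init__(self, socket_limit: int = 65536) -> None:
        # Hypothetical headroom: a request can hold more than one socket,
        # and a few are reserved for health checks and aborts.
        self._limit = socket_limit // 2 - 2

    @property
    def limit_concurrency(self) -> Optional[int]:
        return self._limit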
9 changes: 9 additions & 0 deletions vllm/entrypoints/launcher.py
@@ -27,6 +27,15 @@ async def serve_http(app: FastAPI, engine: AsyncEngineClient,

logger.info("Route: %s, Methods: %s", path, ', '.join(methods))

# Set concurrency limits in uvicorn if running in multiprocessing mode
# since zmq has maximum socket limit of zmq.constants.SOCKET_LIMIT (65536).
if engine.limit_concurrency is not None:
logger.info(
"Launching Uvicorn with --limit_concurrency %s. To avoid this "
"limit at the expense of performance run with "
"--disable-frontend-multiprocessing", engine.limit_concurrency)
uvicorn_kwargs["limit_concurrency"] = engine.limit_concurrency

config = uvicorn.Config(app, **uvicorn_kwargs)
server = uvicorn.Server(config)
_add_shutdown_handlers(app, server, engine)
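
For reference, a standalone sketch of the uvicorn knob being set here; the app, port, and limit value are assumptions. uvicorn.Config accepts limit_concurrency, and connections beyond it are rejected with HTTP 503 instead of exhausting backend zmq sockets:

import uvicorn
from fastapi import FastAPI

app = FastAPI()

@app.get("/ping")
async def ping():
    return {"ok": True}

# Illustrative value; the commit derives the limit from the engine client.
config = uvicorn.Config(app, host="127.0.0.1", port=8000,
                        limit_concurrency=60000)
uvicorn.Server(config).run()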
11 changes: 6 additions & 5 deletions vllm/entrypoints/openai/api_server.py
@@ -135,6 +135,12 @@ async def build_async_engine_client(
logger.info("Multiprocessing frontend to use %s for RPC Path.",
rpc_path)

# Build RPCClient, which conforms to AsyncEngineClient Protocol.
# NOTE: Actually, this is not true yet. We still need to support
# embedding models via RPC (see TODO above)
rpc_client = AsyncEngineRPCClient(rpc_path)
async_engine_client = rpc_client # type: ignore

# Start RPCServer in separate process (holds the AsyncLLMEngine).
context = multiprocessing.get_context("spawn")
# the current process might have CUDA context,
@@ -145,11 +151,6 @@
        rpc_server_process.start()
        logger.info("Started engine process with PID %d",
                    rpc_server_process.pid)
        # Build RPCClient, which conforms to AsyncEngineClient Protocol.
        # NOTE: Actually, this is not true yet. We still need to support
        # embedding models via RPC (see TODO above)
        rpc_client = AsyncEngineRPCClient(rpc_path)
        async_engine_client = rpc_client  # type: ignore

        try:
            while True:
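
The spawn context used above matters because a forked child would inherit the parent's CUDA context, which CUDA does not support. A minimal sketch of the pattern, with a placeholder target function:

import multiprocessing

def run_rpc_server(rpc_path: str) -> None:
    # Placeholder for the process that would hold the AsyncLLMEngine.
    print(f"serving on {rpc_path}")

if __name__ == "__main__":
    # "spawn" starts a fresh interpreter, so the child does not inherit
    # state such as an initialized CUDA context (fork would copy it).
    context = multiprocessing.get_context("spawn")
    proc = context.Process(target=run_rpc_server, args=("ipc:///tmp/demo",))
    proc.start()
    proc.join()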
14 changes: 12 additions & 2 deletions vllm/entrypoints/openai/rpc/__init__.py
@@ -7,8 +7,18 @@
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams

# Success strings used for RPC instructions.
VLLM_RPC_SUCCESS_STR = "SUCCESS"
VLLM_RPC_HEALTHY_STR = "HEALTHY"

# Timeouts.
VLLM_RPC_SERVER_START_TIMEOUT_MS = 1000
VLLM_RPC_HEALTH_TIMEOUT_MS = 10000

# Minimum value of ZMQ.SOCKET_LIMIT required to run the multiprocessing
# frontend.
VLLM_RPC_SOCKET_LIMIT_CUTOFF = 2000

# HWM is set to Infinity (a zmq high-water mark of 0 means "no limit").
VLLM_RPC_ZMQ_HWM = 0


@dataclass
@@ -34,7 +44,7 @@ class RPCUtilityRequest(Enum):
    GET_SCHEDULER_CONFIG = 5
    GET_LORA_CONFIG = 6
    DO_LOG_STATS = 7
    CHECK_HEALTH = 8
    IS_SERVER_HEALTHY = 8
    IS_TRACING_ENABLED = 9


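
To ground the constants above: a zmq high-water mark of 0 disables the per-socket queue cap, and SOCKET_LIMIT is the context-wide ceiling the launcher guards against. A minimal sketch; the socket type and address are assumptions:

import zmq

VLLM_RPC_ZMQ_HWM = 0  # 0 means "no limit" in zmq

ctx = zmq.Context()
# Context-wide ceiling on socket count (the 65536 referenced in launcher.py).
socket_limit = ctx.get(zmq.constants.SOCKET_LIMIT)

sock = ctx.socket(zmq.DEALER)
# The default HWM (1000) drops or blocks messages under burst load;
# 0 removes the cap so a high-concurrency frontend does not stall.
sock.set_hwm(VLLM_RPC_ZMQ_HWM)
sock.connect("ipc:///tmp/demo_rpc")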
