[Bugfix][Frontend] Fix Issues Under High Load With zeromq Frontend #7394

Merged Aug 21, 2024 (88 commits)

Changes shown from 24 commits.

Commits:
b2e29a5  added proxy to limit use of uniz sockets (robertgshaw2-redhat, Aug 10, 2024)
8d31115  Merge branch 'main' into fix-zmq-max-sockets (robertgshaw2-redhat, Aug 10, 2024)
6d2b3df  comment (robertgshaw2-redhat, Aug 10, 2024)
c73e943  use random inproc path (robertgshaw2-redhat, Aug 10, 2024)
f1768fb  format (robertgshaw2-redhat, Aug 10, 2024)
601a461  foamt (robertgshaw2-redhat, Aug 10, 2024)
1a47d94  format (robertgshaw2-redhat, Aug 10, 2024)
eeecb09  Update vllm/entrypoints/openai/rpc/client.py (robertgshaw2-redhat, Aug 10, 2024)
2770e40  cleaning (robertgshaw2-redhat, Aug 14, 2024)
5a85618  Merge branch 'main' into fix-zmq-max-sockets (robertgshaw2-redhat, Aug 18, 2024)
938db1d  Merge branch 'fix-zmq-max-sockets' of https://github.com/neuralmagic/… (robertgshaw2-redhat, Aug 18, 2024)
ea2f03e  remove logging (robertgshaw2-redhat, Aug 18, 2024)
5cebc65  add info message re: concurrency (robertgshaw2-redhat, Aug 18, 2024)
2c12436  update comment (robertgshaw2-redhat, Aug 18, 2024)
9afd6ba  update (robertgshaw2-redhat, Aug 18, 2024)
c262088  format (robertgshaw2-redhat, Aug 18, 2024)
3e580d5  reorder (robertgshaw2-redhat, Aug 18, 2024)
d9e10e0  reverT (robertgshaw2-redhat, Aug 18, 2024)
4e3a63a  fix (robertgshaw2-redhat, Aug 18, 2024)
e54bf8a  fix (robertgshaw2-redhat, Aug 18, 2024)
6544f3a  fix abort logic (robertgshaw2-redhat, Aug 18, 2024)
81f4da8  reduce LOC change (robertgshaw2-redhat, Aug 18, 2024)
b3374bc  cleanup (robertgshaw2-redhat, Aug 18, 2024)
dd1817a  cleanup (robertgshaw2-redhat, Aug 18, 2024)
5b56365  format (robertgshaw2-redhat, Aug 18, 2024)
05ff816  fix client (robertgshaw2-redhat, Aug 18, 2024)
e551d30  revert unneccessary change (robertgshaw2-redhat, Aug 18, 2024)
3d7f65f  revert startup probe changes to separate PR (robertgshaw2-redhat, Aug 18, 2024)
e7e6f1e  stash (robertgshaw2-redhat, Aug 18, 2024)
eaaebcc  Merge branch 'main' into fix-zmq-max-sockets (robertgshaw2-redhat, Aug 18, 2024)
21b5239  stash draining (robertgshaw2-redhat, Aug 19, 2024)
7e15b00  update (robertgshaw2-redhat, Aug 19, 2024)
74c4166  stash (robertgshaw2-redhat, Aug 19, 2024)
450e949  convert RPCServer to use DEALER (robertgshaw2-redhat, Aug 19, 2024)
8348f1f  stash (robertgshaw2-redhat, Aug 19, 2024)
545956e  fix (robertgshaw2-redhat, Aug 19, 2024)
7a34611  cleaning (robertgshaw2-redhat, Aug 19, 2024)
50abb94  stash (robertgshaw2-redhat, Aug 19, 2024)
1723687  remove awk (robertgshaw2-redhat, Aug 19, 2024)
3dfc9ef  nits (robertgshaw2-redhat, Aug 20, 2024)
8d40f2d  format (robertgshaw2-redhat, Aug 20, 2024)
3397460  format (robertgshaw2-redhat, Aug 20, 2024)
ef132dc  nit (robertgshaw2-redhat, Aug 20, 2024)
10ef204  change (robertgshaw2-redhat, Aug 20, 2024)
b67718f  clean (robertgshaw2-redhat, Aug 20, 2024)
c3c1dbe  Update vllm/entrypoints/openai/rpc/server.py (robertgshaw2-redhat, Aug 20, 2024)
ee6efcf  format (robertgshaw2-redhat, Aug 20, 2024)
3fdc2fe  cleanup abort logic (robertgshaw2-redhat, Aug 20, 2024)
4cacb56  nit (robertgshaw2-redhat, Aug 20, 2024)
724eb31  added load test (robertgshaw2-redhat, Aug 21, 2024)
4d5e6b7  update load test (robertgshaw2-redhat, Aug 21, 2024)
b9e4168  updated (robertgshaw2-redhat, Aug 21, 2024)
8f9bc23  format (robertgshaw2-redhat, Aug 21, 2024)
9a2be3f  updated (robertgshaw2-redhat, Aug 21, 2024)
dee38f0  revert suurious change (robertgshaw2-redhat, Aug 21, 2024)
e78f443  convert to even smaller model (robertgshaw2-redhat, Aug 21, 2024)
cc2d7db  20k requests (robertgshaw2-redhat, Aug 21, 2024)
b40e269  convert to 10k requests (robertgshaw2-redhat, Aug 21, 2024)
03eed9c  clean up closing logic (robertgshaw2-redhat, Aug 21, 2024)
f697226  use constant (robertgshaw2-redhat, Aug 21, 2024)
fd642ab  fix bad cleanup (robertgshaw2-redhat, Aug 21, 2024)
762c2ed  remove useless argument (robertgshaw2-redhat, Aug 21, 2024)
c805ed2  up to 20k requests (robertgshaw2-redhat, Aug 21, 2024)
2e1652e  revert to 10k requests (robertgshaw2-redhat, Aug 21, 2024)
3e1ede4  revert suprious argument (robertgshaw2-redhat, Aug 21, 2024)
b3bf7ef  revert to 20k (robertgshaw2-redhat, Aug 21, 2024)
708bd34  format (robertgshaw2-redhat, Aug 21, 2024)
10a88ec  [BugFix] Raise all exception variations in async generator (njhill, Aug 20, 2024)
db8aebc  Fix possible premature generator completion; add tests (njhill, Aug 21, 2024)
b16c64b  format (robertgshaw2-redhat, Aug 21, 2024)
a9ecaa9  added test accuracy (robertgshaw2-redhat, Aug 21, 2024)
6f8d5e8  format (robertgshaw2-redhat, Aug 21, 2024)
bab177f  updated test pipeline (robertgshaw2-redhat, Aug 21, 2024)
7b58281  fix lm eval (robertgshaw2-redhat, Aug 21, 2024)
adf45d1  cleanup (robertgshaw2-redhat, Aug 21, 2024)
9e827b0  updated (robertgshaw2-redhat, Aug 21, 2024)
47dca36  Merge branch 'main' into fix-zmq-max-sockets (robertgshaw2-redhat, Aug 21, 2024)
f84c341  added sleep time (robertgshaw2-redhat, Aug 21, 2024)
0ce78f8  actually sleep (robertgshaw2-redhat, Aug 21, 2024)
8054348  formatting (robertgshaw2-redhat, Aug 21, 2024)
5ddbdab  format (robertgshaw2-redhat, Aug 21, 2024)
1ebbe9e  mypy (robertgshaw2-redhat, Aug 21, 2024)
53d639b  mypy (robertgshaw2-redhat, Aug 21, 2024)
a36b381  format (robertgshaw2-redhat, Aug 21, 2024)
415ee39  remove test load (robertgshaw2-redhat, Aug 21, 2024)
26440e6  stash (robertgshaw2-redhat, Aug 21, 2024)
2442a9d  Merge branch 'fix-zmq-max-sockets' of https://github.com/neuralmagic/… (robertgshaw2-redhat, Aug 21, 2024)
b72f84f  Merge branch 'fix-raise-cancelled' into fix-zmq-max-sockets (robertgshaw2-redhat, Aug 21, 2024)
5 changes: 5 additions & 0 deletions vllm/engine/async_llm_engine.py
@@ -642,6 +642,11 @@ def is_stopped(self) -> bool:
def errored(self) -> bool:
return self._errored_with is not None

@property
def limit_concurrency(self) -> Optional[int]:
"""Maximum number of concurrently running requests."""
return None

def set_errored(self, exc: Exception) -> None:
self._errored_with = exc

4 changes: 4 additions & 0 deletions vllm/engine/protocol.py
@@ -30,6 +30,10 @@ def is_stopped(self) -> bool:
def errored(self) -> bool:
...

@property
def limit_concurrency(self) -> Optional[int]:
"""Maximum number of concurrently running requests."""

def generate(
self,
inputs: PromptInputs,
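The new property is deliberately optional: an in-process engine reports no limit, while a transport-backed client reports a finite one, and callers branch on None. A minimal sketch of that pattern, with hypothetical class names that are not vLLM's:

```python
from typing import Optional, Protocol


class EngineClient(Protocol):
    """Subset of the engine protocol relevant to concurrency limits."""

    @property
    def limit_concurrency(self) -> Optional[int]:
        """Maximum number of concurrently running requests."""
        ...


class InProcessEngine:
    @property
    def limit_concurrency(self) -> Optional[int]:
        # No transport between server and engine, so no cap.
        return None


class RPCClientStub:
    @property
    def limit_concurrency(self) -> Optional[int]:
        # A transport-imposed cap, e.g. zmq.constants.SOCKET_LIMIT.
        return 65536


def describe(engine: EngineClient) -> str:
    limit = engine.limit_concurrency
    return "unbounded" if limit is None else f"limited to {limit}"


print(describe(InProcessEngine()))  # unbounded
print(describe(RPCClientStub()))    # limited to 65536
```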
9 changes: 9 additions & 0 deletions vllm/entrypoints/launcher.py
@@ -26,6 +26,15 @@ async def serve_http(app: FastAPI, engine: AsyncEngineClient,

logger.info("Route: %s, Methods: %s", path, ', '.join(methods))

# Set concurrency limits in uvicorn if running in multiprocessing mode
# since zmq has a maximum socket limit of zmq.constants.SOCKET_LIMIT (65536).
if engine.limit_concurrency is not None:
logger.info(
"Launching Uvicorn with --limit_concurrency %s. To avoid this "
"limit at the expense of performance run with "
"--disable-frontend-multiprocessing", engine.limit_concurrency)
uvicorn_kwargs["limit_concurrency"] = engine.limit_concurrency

config = uvicorn.Config(app, **uvicorn_kwargs)
server = uvicorn.Server(config)
_add_shutdown_handlers(app, server, engine)
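For context, uvicorn's `limit_concurrency` makes the server answer HTTP 503 once concurrent connections or tasks exceed the limit, instead of letting excess requests pile onto the zmq transport. A hedged sketch of the wiring (the app and kwargs here are stand-ins, not vLLM's actual launcher code):

```python
import uvicorn
from fastapi import FastAPI

app = FastAPI()  # stand-in for vLLM's API server app

# Stand-in for engine.limit_concurrency; None would mean "no cap".
limit_concurrency = 65536

uvicorn_kwargs = {"host": "0.0.0.0", "port": 8000}
if limit_concurrency is not None:
    # Uvicorn replies 503 to connections beyond this limit.
    uvicorn_kwargs["limit_concurrency"] = limit_concurrency

config = uvicorn.Config(app, **uvicorn_kwargs)
server = uvicorn.Server(config)

if __name__ == "__main__":
    server.run()
```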
5 changes: 3 additions & 2 deletions vllm/entrypoints/openai/rpc/__init__.py
@@ -8,7 +8,8 @@
from vllm.sampling_params import SamplingParams

VLLM_RPC_SUCCESS_STR = "SUCCESS"
VLLM_RPC_HEALTHY_STR = "HEALTHY"
VLLM_RPC_SERVER_START_TIMEOUT_MS = 1000
VLLM_RPC_HEALTH_TIMEOUT_MS = 10000


@dataclass
@@ -34,7 +35,7 @@ class RPCUtilityRequest(Enum):
GET_SCHEDULER_CONFIG = 5
GET_LORA_CONFIG = 6
DO_LOG_STATS = 7
CHECK_HEALTH = 8
IS_SERVER_HEALTHY = 8
IS_TRACING_ENABLED = 9


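To make the constants concrete, here is an illustrative client-side helper (hypothetical, not part of the PR; assumes a server is bound at `rpc_path`) showing the request/timeout pattern the client uses: cloudpickle the request, send it as one multipart frame, and poll with `VLLM_RPC_HEALTH_TIMEOUT_MS` so a hung server raises instead of blocking forever.

```python
import cloudpickle
import zmq
import zmq.asyncio

VLLM_RPC_HEALTH_TIMEOUT_MS = 10000


async def ping_server(ctx: zmq.asyncio.Context, rpc_path: str,
                      request: object) -> object:
    """Send one picklable request and await the reply with a timeout."""
    sock = ctx.socket(zmq.DEALER)
    try:
        sock.connect(rpc_path)
        await sock.send_multipart([cloudpickle.dumps(request)])
        # poll() returns 0 on timeout rather than blocking forever.
        if await sock.poll(timeout=VLLM_RPC_HEALTH_TIMEOUT_MS) == 0:
            raise TimeoutError("RPC server did not respond in time")
        response = cloudpickle.loads(await sock.recv())
        if isinstance(response, Exception):
            raise response
        return response
    finally:
        sock.close(linger=0)  # discard unsent messages on close
```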
141 changes: 105 additions & 36 deletions vllm/entrypoints/openai/rpc/client.py
@@ -1,5 +1,7 @@
import asyncio
from contextlib import contextmanager
from typing import Any, AsyncGenerator, Mapping, Optional
from uuid import uuid4

import cloudpickle
import zmq
@@ -8,31 +10,108 @@
from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig)
from vllm.entrypoints.openai.rpc import (RPC_REQUEST_TYPE,
VLLM_RPC_HEALTHY_STR,
VLLM_RPC_HEALTH_TIMEOUT_MS,
VLLM_RPC_SERVER_START_TIMEOUT_MS,
VLLM_RPC_SUCCESS_STR, RPCAbortRequest,
RPCGenerateRequest, RPCUtilityRequest)
from vllm.inputs import PromptInputs
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import EmbeddingRequestOutput, RequestOutput
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs

# Time to wait before checking if the server process is alive.
SERVER_START_TIMEOUT_MS = 1000
logger = init_logger(__name__)

# Path used for inprocess proxy.
INPROC_PROXY_PATH = f"inproc://{uuid4()}"


class AsyncEngineRPCClient:
"""
RPCClient that connects to the RPCServer wrapping AsyncLLMEngine.

On startup, the RPCClient:
- makes DEALER socket (to_rpc_server) that connects to the RPCServer
via ipc, which uses unix sockets under the hood
(https://libzmq.readthedocs.io/en/zeromq4-1/zmq_ipc.html)
- makes ROUTER socket (from_api_server) that binds to a random
inproc address, which uses memory under the hood
(https://libzmq.readthedocs.io/en/zeromq3-x/zmq_inproc.html)
- runs a proxy in a background asyncio task between
from_api_server (ROUTER, inproc) and to_rpc_server (DEALER, ipc)

Each request handled by the asyncio api_server calls generate():
- make a DEALER socket that connects to from_api_server via inproc
- send an RPCGenerateRequest to the inproc socket
- background proxy forwards the request from inproc -> ipc
- RPCServer responds to the request one token at a time over ipc
- background proxy forwards the response from ipc -> inproc

The connection looks like this:
DEALER <- inproc -> [ ROUTER | DEALER ] <- ipc -> ROUTER

Message routing is performed via identities that are managed by the
ROUTER socket. A ROUTER socket tracks every connection it has and
tells the caller about them by prepending the connection identity
to each message it receives. When we send a message via a ROUTER,
we must first send an identity frame.
See https://zguide.zeromq.org/docs/chapter3/#The-Extended-Reply-Envelope
for more details on connection identities.

This proxy design enables us to use a single unix socket, which
improves performance by avoiding syscalls (~5%) and avoids resource limits
such as ulimit, which defaults to 1024 on Ubuntu.

See: https://zguide.zeromq.org/docs/chapter3/ for more details on the
Request-Reply pattern of zeromq sockets.
"""

def __init__(self, rpc_path: str):
self.context = zmq.asyncio.Context()
self.rpc_path = rpc_path
self.context.set(zmq.constants.MAX_SOCKETS,
self.context.get(zmq.constants.SOCKET_LIMIT))

# IPC connection to RPC Server (uses unix sockets).
self.to_rcp_server = self.context.socket(zmq.constants.DEALER)
Review comment (Member): typo: rcp -> rpc

self.to_rcp_server.connect(rpc_path)

# In-process proxy to RPC Server (uses memory-based messaging).
self.from_api_server = self.context.socket(zmq.constants.ROUTER)
self.from_api_server.bind(INPROC_PROXY_PATH)

# Asyncio background task for the proxy.
self.proxy_task = asyncio.create_task(
Review comment (robertgshaw2-redhat, Author): @njhill does this need to be explicitly canceled somewhere? (e.g. in close())
Review comment (njhill, Member): @robertgshaw2-neuralmagic yes, we should cancel it there.
self.run_proxy(self.from_api_server, self.to_rcp_server))

# Maximum number of requests that can be active. This value is
# used by uvicorn to launch with --limit-concurrency to limit the
# maximum number of requests being processed at a time.
# Note: https://www.uvicorn.org/server-behavior/#resource-limits
# Note: this value is typically 65536
self.limit_concurrency = self.context.get(zmq.constants.SOCKET_LIMIT)

async def run_proxy(self, socket_from, socket_to):
"""Background task that runs a proxy"""
poller = zmq.asyncio.Poller()
poller.register(socket_from, zmq.constants.POLLIN)
poller.register(socket_to, zmq.constants.POLLIN)
while True:
events = await poller.poll()
events = dict(events)
if socket_from in events:
msg = await socket_from.recv_multipart()
await socket_to.send_multipart(msg)
if socket_to in events:
msg = await socket_to.recv_multipart()
await socket_from.send_multipart(msg)

async def setup(self):
"""Setup the client before it starts sending server requests."""

# Wait until server is ready.
await self.wait_for_server()
await self._wait_for_server_rpc()
self._errored = False

# Get the configs.
@@ -54,15 +133,13 @@ def close(self):
self.context.destroy()

@contextmanager
def socket(self):
# Ensure client sockets are always closed after use

# Connect to RPC socket for Request-Reply pattern,
def to_proxy_socket(self):
# Connect to the proxy.
# Note that we use DEALER to enable asynchronous communication
# to enable streaming.
socket = self.context.socket(zmq.constants.DEALER)
try:
socket.connect(self.rpc_path)
socket.connect(INPROC_PROXY_PATH)
yield socket
finally:
# linger == 0 means discard unsent messages
@@ -81,10 +158,9 @@ async def _send_get_data_rpc_request(self, request: RPCUtilityRequest,
error_message: str) -> Any:
"""Send an RPC request that is expecting data back."""

with self.socket() as socket:

with self.to_proxy_socket() as socket:
# Ping RPCServer with a request.
await socket.send(cloudpickle.dumps(request))
await socket.send_multipart([cloudpickle.dumps(request)])

# Await the data from the Server.
data = cloudpickle.loads(await socket.recv())
Expand All @@ -93,6 +169,9 @@ async def _send_get_data_rpc_request(self, request: RPCUtilityRequest,
# LoRAConfig can be None.
if expected_type == LoRAConfig and data is None:
pass
elif isinstance(data, Exception):
logger.warning(error_message)
raise data
else:
raise ValueError(error_message)

@@ -103,9 +182,9 @@ async def _send_one_way_rpc_request(self,
error_message: str,
timeout: Optional[int] = None):
"""Send one-way RPC request to trigger an action."""
with self.socket() as socket:
with self.to_proxy_socket() as socket:
# Ping RPC Server with request.
await socket.send(cloudpickle.dumps(request))
await socket.send_multipart([cloudpickle.dumps(request)])

# Await acknowledgement from RPCServer.
if timeout is not None and await socket.poll(timeout=timeout) == 0:
@@ -114,6 +193,9 @@ async def _send_one_way_rpc_request(self,
response = cloudpickle.loads(await socket.recv())

if not isinstance(response, str) or response != VLLM_RPC_SUCCESS_STR:
if isinstance(response, Exception):
logger.warning(error_message)
raise response
raise ValueError(error_message)

return response
@@ -130,13 +212,13 @@ async def get_model_config(self) -> ModelConfig:
async def is_tracing_enabled(self) -> bool:
return self.tracing_flag

async def wait_for_server(self):
async def _wait_for_server_rpc(self):
"""Wait for the RPCServer to start up."""

await self._send_one_way_rpc_request(
request=RPCUtilityRequest.IS_SERVER_READY,
error_message="Unable to start RPC Server.",
timeout=SERVER_START_TIMEOUT_MS)
error_message="Unable to start RPC Server",
timeout=VLLM_RPC_SERVER_START_TIMEOUT_MS)

async def _get_model_config_rpc(self) -> ModelConfig:
"""Get the ModelConfig object from the RPC Server"""
@@ -226,7 +308,7 @@ async def generate(

finished = False
try:
with self.socket() as socket:
with self.to_proxy_socket() as socket:

# Send RPCGenerateRequest to the RPCServer.
await socket.send_multipart([
@@ -266,23 +348,10 @@ async def generate(
async def check_health(self) -> None:
"""Raise if unhealthy"""

with self.socket() as socket:

# Ping RPCServer with CHECK_HEALTH request.
await socket.send(cloudpickle.dumps(RPCUtilityRequest.CHECK_HEALTH)
)

# Await the reply from the server.
# TODO: do we need an internal timeout here?
# Or do we expect the external probe to timeout and let this chill?
health_message = cloudpickle.loads(await socket.recv())

if isinstance(health_message, Exception):
raise health_message

if health_message != VLLM_RPC_HEALTHY_STR:
raise ValueError("Expected healthy response from backend but got "
"f{health_message}")
await self._send_one_way_rpc_request(
request=RPCUtilityRequest.IS_SERVER_HEALTHY,
error_message="Got Unhealthy response from RPC Server",
timeout=VLLM_RPC_HEALTH_TIMEOUT_MS)

async def encode(self, *args,
**kwargs) -> AsyncGenerator[EmbeddingRequestOutput, None]:
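To ground the docstring's description of the proxy, below is a self-contained, runnable sketch of the ROUTER/DEALER pattern. It is a toy under stated assumptions: inproc transports on both sides and an echoing ROUTER standing in for the RPCServer; names and paths are illustrative, not vLLM's.

```python
import asyncio

import zmq
import zmq.asyncio

FRONT_PATH = "inproc://front"  # API-facing side (ROUTER binds here)
BACK_PATH = "inproc://back"    # single shared link to the "server"


async def run_proxy(frontend: zmq.asyncio.Socket,
                    backend: zmq.asyncio.Socket) -> None:
    """Forward whole multipart messages in both directions."""
    poller = zmq.asyncio.Poller()
    poller.register(frontend, zmq.POLLIN)
    poller.register(backend, zmq.POLLIN)
    while True:
        events = dict(await poller.poll())
        # The ROUTER prepends an identity frame to everything it
        # receives; forwarding unchanged preserves the reply route.
        if frontend in events:
            await backend.send_multipart(await frontend.recv_multipart())
        if backend in events:
            await frontend.send_multipart(await backend.recv_multipart())


async def main() -> None:
    ctx = zmq.asyncio.Context()

    # Stand-in for the RPCServer: a ROUTER that echoes one request.
    server = ctx.socket(zmq.ROUTER)
    server.bind(BACK_PATH)

    frontend = ctx.socket(zmq.ROUTER)
    frontend.bind(FRONT_PATH)
    backend = ctx.socket(zmq.DEALER)
    backend.connect(BACK_PATH)

    proxy_task = asyncio.create_task(run_proxy(frontend, backend))

    # One "API request": a fresh DEALER per request, as in generate().
    client = ctx.socket(zmq.DEALER)
    client.connect(FRONT_PATH)
    await client.send_multipart([b"hello"])

    # Echo back, keeping the identity frames at the front intact.
    frames = await server.recv_multipart()
    await server.send_multipart(frames)

    print(await client.recv_multipart())  # [b'hello']

    proxy_task.cancel()  # explicit cancel, per the review discussion
    try:
        await proxy_task
    except asyncio.CancelledError:
        pass
    ctx.destroy(linger=0)


asyncio.run(main())
```

The echo step shows why blind forwarding works: the identity frames that each ROUTER prepends ride along unchanged, so the reply finds its way back to the originating DEALER. Note also the explicit cancellation of the proxy task before destroying the context, matching the review discussion above.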