✨ vLLM Backend integration #42

Open · wants to merge 19 commits into base `parfeniukink/features/deepsparse-backend`
15 changes: 15 additions & 0 deletions .dockerignore
@@ -0,0 +1,15 @@
venv*/
Dockerfile
.gitignore
.env
.git
.github/
.ruff_cache/
.pre-commit-config.yaml
docs/
*.md
LICENSE
MANIFEST.in
__pycache__/
*.egg-info/
*log
32 changes: 31 additions & 1 deletion DEVELOPING.md
@@ -33,13 +33,30 @@ cd guidellm
pip install -e .[dev]
```

If you work with `deepsparse` backend, etc it has some other software limitations. In order to install dependencies for the specific backend, run:
If you work with the `deepsparse` backend, note that it comes with additional software constraints. To install the dependencies for this specific backend, run:

```sh
pip install -e .[deepsparse]
# or pip install -e '.[deepsparse]'
```

If you work with the `vllm` backend, note that it comes with additional software constraints. To install the dependencies for this specific backend, run:

```sh
pip install -e .[vllm]
# or pip install -e '.[vllm]'
```

According to the [installation guide](https://docs.vllm.ai/en/v0.4.0.post1/getting_started/installation.html), `vllm` is supported only on **Linux**, so running the application and its tests on other platforms will fail.

As a workaround, run the tests inside Docker:

```sh
cd guidellm/
docker build -t guidellm:latest .
docker run --rm -v "$(pwd)":/app guidellm:latest python -m pytest -s -v tests/unit/backend/test_vllm.py
```

## Project Structure

The project follows a standard Python project structure:
@@ -163,6 +180,19 @@ The end-to-end tests are located in the `tests/e2e` directory. To run the end-to
tox -e test-e2e
```

### Running unsupported tests

Some of the tests may not be supported on your system (_for instance, `vllm` is not supported on macOS yet_). To run them in a Linux environment, you can use **WSL** on Windows, or **Docker** on Windows or macOS.

To run them under Docker, use the commands below:

```sh
docker build --platform linux/amd64 --tag guidellm:latest .
docker run --rm --env-file .env guidellm:latest pytest tests/
```


## Formatting, Linting, and Type Checking

### Running Quality Checks (Linting)
23 changes: 23 additions & 0 deletions Dockerfile
@@ -0,0 +1,23 @@
FROM --platform=linux/amd64 python:3.8-slim

# Environment variables
ENV PYTHONUNBUFFERED=1

RUN : \
&& apt-get update \
# dependencies for building Python packages && cleaning up unused files
&& apt-get install -y \
build-essential \
libcurl4-openssl-dev \
libssl-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* \
&& pip install --upgrade \
pip \
setuptools

WORKDIR /app

# Install project dependencies
COPY ./ ./
RUN pip install -e .[dev,deepsparse,vllm]
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -74,6 +74,9 @@ dev = [
deepsparse = [
"deepsparse; python_version < '3.12'",
]
vllm = [
"vllm; sys_platform == 'linux'",
]


[project.entry-points.console_scripts]
@@ -108,7 +111,7 @@ exclude = ["venv", ".tox"]
follow_imports = 'silent'

[[tool.mypy.overrides]]
module = ["deepsparse.*", "transformers.*"]
module = ["deepsparse.*", "transformers.*", "vllm.*"]
ignore_missing_imports=true


2 changes: 1 addition & 1 deletion src/guidellm/backend/base.py
@@ -15,7 +15,7 @@
__all__ = ["Backend", "BackendEngine", "BackendEnginePublic", "GenerativeResponse"]


BackendEnginePublic = Literal["openai_server", "deepsparse"]
BackendEnginePublic = Literal["openai_server", "deepsparse", "vllm"]
BackendEngine = Union[BackendEnginePublic, Literal["test"]]


2 changes: 1 addition & 1 deletion src/guidellm/backend/openai.py
@@ -10,7 +10,7 @@
__all__ = ["OpenAIBackend"]


@Backend.register("openai_server")
@Backend.register(backend_type="openai_server")
class OpenAIBackend(Backend):
"""
An OpenAI backend implementation for generative AI results.
26 changes: 26 additions & 0 deletions src/guidellm/backend/vllm/__init__.py
@@ -0,0 +1,26 @@
"""
This package encapsulates the "vLLM Backend" implementation.

ref: https://github.com/vllm-project/vllm

The `vllm` package supports Python 3.8 through 3.11, while `guidellm`
requires at least Python 3.8, so the safe version range for the
vLLM Backend implementation is Python 3.8 through 3.11.

Finally, ensure that the `vllm` package is installed.
"""

from guidellm.utils import check_python_version, module_is_available

check_python_version(min_version="3.8", max_version="3.12")

module_is_available(
module="vllm",
helper=("The `vllm` package is not available. Try running: `pip install -e '.[vllm]'`"),
)

from .backend import VllmBackend # noqa: E402

__all__ = ["VllmBackend"]
122 changes: 122 additions & 0 deletions src/guidellm/backend/vllm/backend.py
@@ -0,0 +1,122 @@
from typing import Any, AsyncGenerator, Dict, List, Optional

from loguru import logger
from vllm import LLM, CompletionOutput, SamplingParams

from guidellm.backend import Backend, GenerativeResponse
from guidellm.config import settings
from guidellm.core import TextGenerationRequest


@Backend.register(backend_type="vllm")
class VllmBackend(Backend):
"""
A vLLM Backend implementation for generative AI results.
"""

def __init__(self, model: Optional[str] = None, **request_args):
_model = self._get_model(model)
self._request_args: Dict[str, Any] = request_args
self.llm = LLM(_model)

# NOTE: Must be after all the parameters since ``self.llm`` is going to be used
# by ``make_request`` within ``Backend.test_connection()``
super().__init__(type_="vllm", model=_model, target="not used")

logger.info(f"vLLM Backend uses model '{self._model}'")

def _get_model(self, model_from_cli: Optional[str] = None) -> str:
"""Provides the model by the next priority list:
1. from function argument (comes from CLI)
1. from environment variable
2. `self.default_model` from `self.available_models`
"""

if model_from_cli is not None:
return model_from_cli
elif settings.llm_model is not None:
logger.info(
"Using vLLM model from environment variable: " f"{settings.llm_model}"
)
return settings.llm_model
else:
logger.info(f"Using default vLLM model: {self.default_model}")
return self.default_model

async def make_request(
self, request: TextGenerationRequest
) -> AsyncGenerator[GenerativeResponse, None]:
"""
Make a request to the vLLM Python API client.

:param request: The result request to submit.
:type request: TextGenerationRequest
:return: An async generator over the generative responses.
:rtype: AsyncGenerator[GenerativeResponse, None]
"""

logger.debug(f"Making request to vLLM backend with prompt: {request.prompt}")

token_count = 0
request_args = {
**self._request_args,
"inputs": [request.prompt],
"sampling_params": SamplingParams(max_tokens=request.output_token_count),
}

final_response = GenerativeResponse(
type_="final",
prompt=request.prompt,
prompt_token_count=request.prompt_token_count,
output_token_count=token_count,
)

if not (result := self.llm.generate(**request_args)):
yield final_response
return

try:
generations: List[CompletionOutput] = result[0].outputs
except IndexError:
yield final_response
return

for generation in generations:
if not (token := generation.text):
break
else:
token_count += 1
yield GenerativeResponse(
type_="token_iter",
add_token=token,
prompt=request.prompt,
prompt_token_count=request.prompt_token_count,
output_token_count=token_count,
)

yield GenerativeResponse(
type_="final",
prompt=request.prompt,
prompt_token_count=request.prompt_token_count,
output_token_count=token_count,
)

def available_models(self) -> List[str]:
"""
Get the available models for the backend.

ref: https://docs.vllm.ai/en/v0.4.1/models/supported_models.html

:return: A list of available models.
:rtype: List[str]
"""

return [
"mistralai/Mistral-7B-Instruct-v0.3",
"meta-llama/Meta-Llama-3-8B-Instruct",
]

def _token_count(self, text: str) -> int:
token_count = len(text.split())
logger.debug(f"Token count for text '{text}': {token_count}")
return token_count
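
For reference, below is a minimal usage sketch of the new backend. It assumes a Linux host with the `vllm` extra installed, and it assumes `TextGenerationRequest` accepts `prompt` and `output_token_count` as keyword arguments; that constructor signature is not shown in this diff.

```python
# Minimal usage sketch (not part of this PR). Assumes a Linux host with the
# `vllm` extra installed and that TextGenerationRequest accepts `prompt` and
# `output_token_count` as keyword arguments -- an assumption, not confirmed here.
import asyncio

from guidellm.backend.vllm import VllmBackend
from guidellm.core import TextGenerationRequest


async def main() -> None:
    backend = VllmBackend(model="mistralai/Mistral-7B-Instruct-v0.3")
    request = TextGenerationRequest(
        prompt="Explain KV caching in one sentence.",
        output_token_count=64,
    )

    # `make_request` yields intermediate "token_iter" responses followed by a
    # "final" response carrying the total output token count.
    async for response in backend.make_request(request):
        if response.type_ == "token_iter" and response.add_token:
            print(response.add_token, end="", flush=True)
        elif response.type_ == "final":
            print(f"\n[output tokens: {response.output_token_count}]")


if __name__ == "__main__":
    asyncio.run(main())
```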
8 changes: 5 additions & 3 deletions src/guidellm/utils/progress.py
@@ -162,9 +162,11 @@ def update_benchmark(
total=completed_total,
completed=completed_count if not completed else completed_total,
req_per_sec=(f"{req_per_sec:.2f}" if req_per_sec else "#.##"),
start_time_str=datetime.fromtimestamp(start_time).strftime("%H:%M:%S")
if start_time
else "--:--:--",
start_time_str=(
datetime.fromtimestamp(start_time).strftime("%H:%M:%S")
if start_time
else "--:--:--"
),
)
logger.debug(
"Updated benchmark task at index {}: {}% complete",
5 changes: 1 addition & 4 deletions tests/dummy/__init__.py
@@ -1,8 +1,5 @@
"""
The tests.dummy package package represents dummy data factories and test services.

test.dummy.data.openai_model_factory - openai.types.Model test factory
test.dummy.data.openai_completion_factory - openai.types.Completion test factory
"""

from . import data, services # noqa: F401
from . import data, services, vllm # noqa: F401
3 changes: 0 additions & 3 deletions tests/dummy/data/__init__.py
@@ -1,3 +0,0 @@
from .openai import openai_completion_factory, openai_model_factory

__all__ = ["openai_completion_factory", "openai_model_factory"]
54 changes: 0 additions & 54 deletions tests/dummy/data/openai.py

This file was deleted.
