Abuqader/dolfo #247

Draft · wants to merge 3 commits into base: main
43 changes: 43 additions & 0 deletions mistral/dolfo/config.yaml
@@ -0,0 +1,43 @@
apply_library_patches: true
base_image:
  image: nvcr.io/nvidia/tritonserver:24.03-trtllm-python-py3
  python_executable_path: /usr/bin/python3
bundled_packages_dir: packages
data_dir: data
description: Generate text from a prompt with this Mixtral 8x7B mixture-of-experts
  language model.
environment_variables: {}
examples_filename: examples.yaml
external_data: null
external_package_dirs: []
input_type: Any
live_reload: false
model_class_filename: model.py
model_class_name: Model
model_framework: custom
trt_llm:
  serve:
    engine_repository: baseten/dolphin_i6000_o1024_bs96_tp8-tllm_0.9.0.dev2024032600
    pipeline_parallel_count: 1
    tensor_parallel_count: 8
    tokenizer_repository: cognitivecomputations/dolphin-2.6-mixtral-8x7b
model_metadata:
  engine_repository: baseten/dolphin_i6000_o1024_bs96_tp8-tllm_0.9.0.dev2024032600
  tags:
  - text-generation
  - openai-compatible
model_module_dir: model
model_name: dolfo-new
model_type: Model
python_version: py311
requirements:
- tritonclient[all]
- transformers
- jinja2
resources:
  accelerator: H100:8
  use_gpu: true
runtime:
  num_workers: 1
  predict_concurrency: 1000
secrets: {}
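model.py (below) only reads the trt_llm.serve block of this config. A minimal sketch of pulling those keys out by hand with PyYAML, purely for illustration; at runtime Truss parses the file itself and hands the resulting dict to the Model class:

import yaml  # PyYAML; illustration only, the Truss runtime does this parsing itself

with open("mistral/dolfo/config.yaml") as f:
    config = yaml.safe_load(f)

serve = config["trt_llm"]["serve"]
world_size = serve["tensor_parallel_count"] * serve["pipeline_parallel_count"]  # 8 * 1
print(serve["engine_repository"], serve["tokenizer_repository"], world_size)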
Empty file added mistral/dolfo/model/__init__.py
Empty file.
135 changes: 135 additions & 0 deletions mistral/dolfo/model/model.py
@@ -0,0 +1,135 @@
import os
from itertools import count

from constants import (
    GRPC_SERVICE_PORT,
    HF_AUTH_KEY_CONSTANT,
    HTTP_SERVICE_PORT,
    TOKENIZER_KEY_CONSTANT,
)
from schema import ModelInput
from transformers import AutoTokenizer
from triton_client import TritonClient, TritonServer
from utils import execute_command

APPEND_ASSISTANT_TEMPLATE_TO_PROMPT = True
APPEND_ASSISTANT_TEMPLATE_TO_PROMPT_STR = "<|im_start|>assistant"
STOP_TOKEN = "<|im_end|>"


class Model:
    def __init__(self, data_dir, config, secrets):
        self._data_dir = data_dir
        self._config = config
        self._secrets = secrets
        self._request_id_counter = count(start=1)
        self.triton_client = None
        self.triton_server = None
        self.tokenizer = None
        self.uses_openai_api = None

    def load(self):
        execute_command(["ldconfig"])
        # trtllm_config = TrussTRTLLMConfiguration(**self._config.get("trt_llm", {}))
        trtllm_config = self._config.get("trt_llm", {})
        self.uses_openai_api = "openai-compatible" in self._config.get(
            "model_metadata", {}
        ).get("tags", [])
        hf_access_token = None
        if "hf_access_token" in self._secrets._base_secrets.keys():
            hf_access_token = self._secrets["hf_access_token"]

        self.triton_server = TritonServer(
            grpc_port=GRPC_SERVICE_PORT,
            http_port=HTTP_SERVICE_PORT,
        )

        engine_repository_path = trtllm_config["serve"]["engine_repository"]
        tokenizer_repository = trtllm_config["serve"]["tokenizer_repository"]
        tensor_parallel_count = trtllm_config["serve"]["tensor_parallel_count"]
        pipeline_parallel_count = trtllm_config["serve"]["pipeline_parallel_count"]
        world_size = tensor_parallel_count * pipeline_parallel_count

        # if not trtllm_config.requires_build:
        #     engine_repository_path = trtllm_config.serve.engine_repository
        #     tokenizer_repository = trtllm_config.serve.tokenizer_repository
        #     tensor_parallel_count = trtllm_config.serve.tensor_parallel_count
        #     pipeline_parallel_count = trtllm_config.serve.pipeline_parallel_count
        #     world_size = tensor_parallel_count * pipeline_parallel_count
        # else:
        #     engine_repository_path = None
        #     tokenizer_repository = trtllm_config.build.huggingface_ckpt_repository
        #     tensor_parallel_count = trtllm_config.build.tensor_parallel_count
        #     pipeline_parallel_count = trtllm_config.build.pipeline_parallel_count
        #     world_size = tensor_parallel_count * pipeline_parallel_count

        self.triton_server.create_model_repository(
            truss_data_dir=self._data_dir,
            engine_repository_path=engine_repository_path,
            huggingface_auth_token=hf_access_token,
        )

        env = {}
        if hf_access_token:
            env[HF_AUTH_KEY_CONSTANT] = hf_access_token
        env[TOKENIZER_KEY_CONSTANT] = tokenizer_repository

        self.triton_server.start(
            world_size=world_size,
            env=env,
        )

        self.triton_client = TritonClient(
            grpc_service_port=GRPC_SERVICE_PORT,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_repository, token=hf_access_token
        )
        self.eos_token_id = self.tokenizer.eos_token_id

    async def predict(self, model_input):
        if model_input.get("max_tokens") is None:
            model_input["max_tokens"] = 500

        if model_input.get("max_new_tokens") is None:
            model_input["max_new_tokens"] = 500

        # Request ids must be unique across workers, so combine the process id
        # with a per-process counter.
        model_input["request_id"] = str(os.getpid()) + str(
            next(self._request_id_counter)
        )
        model_input["eos_token_id"] = self.eos_token_id
        messages = model_input.get("messages", [])
        if "messages" in model_input:
            del model_input["messages"]
        prompt = model_input.get("prompt", None)
        if not prompt and messages == []:
            raise ValueError("Prompt or messages must be provided")

        if self.uses_openai_api and not prompt:
            prompt = self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
            )
            model_input["prompt"] = prompt

        if APPEND_ASSISTANT_TEMPLATE_TO_PROMPT:
            model_input["prompt"] = (
                f"{model_input['prompt']}{APPEND_ASSISTANT_TEMPLATE_TO_PROMPT_STR}"
            )

        self.triton_client.start_grpc_stream()
        model_input = ModelInput(**model_input)
        result_iterator = self.triton_client.infer(model_input)

        async def generate():
            async for result in result_iterator:
                if result != STOP_TOKEN:
                    yield result
                else:
                    yield ""

        if model_input.stream:
            return generate()

        # Non-streaming: drain the async generator before returning.
        full_text = "".join([token async for token in generate()])
        if self.uses_openai_api:
            return full_text
        return {"text": full_text}
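For reference, two payload shapes that predict() above accepts; the field names come from the code, the values are made up:

# Plain prompt, non-streaming (returns {"text": ...} unless the
# "openai-compatible" tag is set in config.yaml):
payload = {"prompt": "Why is the sky blue?", "max_tokens": 256, "stream": False}

# Chat messages, streaming; predict() renders these through
# tokenizer.apply_chat_template when the openai-compatible tag is present:
payload = {
    "messages": [{"role": "user", "content": "Why is the sky blue?"}],
    "max_tokens": 256,
    "stream": True,
}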
9 changes: 9 additions & 0 deletions mistral/dolfo/packages/constants.py
@@ -0,0 +1,9 @@
from pathlib import Path

# If changing the model repo path, please update it inside the tensorrt_llm config.pbtxt as well
TENSORRT_LLM_MODEL_REPOSITORY_PATH = Path("/packages/tensorrt_llm_model_repository/")
GRPC_SERVICE_PORT = 8001
HTTP_SERVICE_PORT = 8003
HF_AUTH_KEY_CONSTANT = "HUGGING_FACE_HUB_TOKEN"
TOKENIZER_KEY_CONSTANT = "TRITON_TOKENIZER_REPOSITORY"
ENTRYPOINT_MODEL_NAME = "ensemble"
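Model.load() above exports HUGGING_FACE_HUB_TOKEN and TRITON_TOKENIZER_REPOSITORY into the Triton server's environment. The code that consumes them lives in the tensorrt_llm model repository, which is not part of this diff, so this read-side sketch is an assumption for illustration only:

import os

# Assumption: the Triton-side Python backend reads these variables to fetch the
# tokenizer; only the variable names come from constants.py, the reading code is illustrative.
hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
tokenizer_repo = os.environ.get("TRITON_TOKENIZER_REPOSITORY")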
155 changes: 155 additions & 0 deletions mistral/dolfo/packages/schema.py
@@ -0,0 +1,155 @@
from typing import Optional

import numpy as np
import tritonclient
import tritonclient.grpc.aio as grpcclient


class ModelInput:
    def __init__(
        self,
        prompt: str,
        request_id: int,
        max_tokens: int = 50,
        max_new_tokens: int = 50,
        temperature: float = 1.0,
        top_p: float = 1.0,
        top_k: int = 50,
        beam_width: int = 1,
        bad_words_list: Optional[list] = None,
        stop_words_list: Optional[list] = None,
        repetition_penalty: float = 1.0,
        ignore_eos: bool = False,
        stream: bool = True,
        eos_token_id: int = None,  # type: ignore
    ) -> None:
        self.stream = stream
        self.request_id = request_id
        self._prompt = prompt
        self._max_tokens = max_tokens
        self._beam_width = beam_width
        self._bad_words_list = [""] if bad_words_list is None else bad_words_list
        self._stop_words_list = [""] if stop_words_list is None else stop_words_list
        self._repetition_penalty = repetition_penalty
        self._eos_token_id = eos_token_id
        self._ignore_eos = ignore_eos
        # These variables are passed by OAI proxy but are unused
        # TODO(Abu): Add support for these
        self._max_new_tokens = max_new_tokens
        self._temperature = temperature
        self._top_p = top_p
        self._top_k = top_k

    def _prepare_grpc_tensor(
        self, name: str, input_data: np.ndarray
    ) -> grpcclient.InferInput:
        tensor = grpcclient.InferInput(
            name,
            input_data.shape,
            tritonclient.utils.np_to_triton_dtype(input_data.dtype),
        )
        tensor.set_data_from_numpy(input_data)
        return tensor

    def to_tensors(self):
        if self._eos_token_id is None and self._ignore_eos:
            raise ValueError("eos_token_id is required when ignore_eos is True")

        prompt_data = np.array([[self._prompt]], dtype=object)
        output_len_data = np.ones_like(prompt_data, dtype=np.uint32) * self._max_tokens
        bad_words_data = np.array([self._bad_words_list], dtype=object)
        stop_words_data = np.array([self._stop_words_list], dtype=object)
        stream_data = np.array([[self.stream]], dtype=bool)
        beam_width_data = np.array([[self._beam_width]], dtype=np.uint32)
        repetition_penalty_data = np.array(
            [[self._repetition_penalty]], dtype=np.float32
        )

        inputs = [
            self._prepare_grpc_tensor("text_input", prompt_data),
            self._prepare_grpc_tensor("max_tokens", output_len_data),
            self._prepare_grpc_tensor("bad_words", bad_words_data),
            self._prepare_grpc_tensor("stop_words", stop_words_data),
            self._prepare_grpc_tensor("stream", stream_data),
            self._prepare_grpc_tensor("beam_width", beam_width_data),
            self._prepare_grpc_tensor("repetition_penalty", repetition_penalty_data),
        ]

        if not self._ignore_eos:
            end_id_data = np.array([[self._eos_token_id]], dtype=np.uint32)
            inputs.append(self._prepare_grpc_tensor("end_id", end_id_data))

        return inputs


# The following are duplicated from the underlying base image.
# We list them as a comment for posterity:
#
# class TRTLLMModelArchitecture(Enum):
#     LLAMA: str = "llama"
#     MISTRAL: str = "mistral"
#     DEEPSEEK: str = "deepseek"


# class TRTLLMQuantizationType(Enum):
#     NO_QUANT: str = "no_quant"
#     WEIGHTS_ONLY_INT8: str = "weights_int8"
#     WEIGHTS_KV_INT8: str = "weights_kv_int8"
#     WEIGHTS_ONLY_INT4: str = "weights_int4"
#     WEIGHTS_KV_INT4: str = "weights_kv_int4"
#     SMOOTH_QUANT: str = "smooth_quant"
#     FP8: str = "fp8"
#     FP8_KV: str = "fp8_kv"

# class TrussTRTLLMPluginConfiguration(BaseModel):
#     multi_block_mode: bool = False
#     paged_kv_cache: bool = True
#     use_fused_mlp: bool = False

# class TrussTRTLLMBuildConfiguration(BaseModel):
#     base_model_architecture: TRTLLMModelArchitecture
#     max_input_len: int
#     max_output_len: int
#     max_batch_size: int
#     max_beam_width: int
#     max_prompt_embedding_table_size: int = 0
#     huggingface_ckpt_repository: Optional[str]
#     gather_all_token_logits: bool = False
#     strongly_typed: bool = False
#     quantization_type: TRTLLMQuantizationType = TRTLLMQuantizationType.NO_QUANT
#     tensor_parallel_count: int = 1
#     pipeline_parallel_count: int = 1
#     plugin_configuration: TrussTRTLLMPluginConfiguration = TrussTRTLLMPluginConfiguration()

# class TrussTRTLLMServingConfiguration(BaseModel):
#     engine_repository: str
#     tokenizer_repository: str
#     tensor_parallel_count: int = 1
#     pipeline_parallel_count: int = 1

# class TrussTRTLLMConfiguration(BaseModel):
#     serve: Optional[TrussTRTLLMServingConfiguration] = None
#     build: Optional[TrussTRTLLMBuildConfiguration] = None
#
#     @model_validator(mode="after")
#     def check_minimum_required_configuration(self):
#         if not self.serve and not self.build:
#             raise ValueError(
#                 "Either serve or build configurations must be provided"
#             )
#         if self.serve and self.build:
#             raise ValueError(
#                 "Both serve and build configurations cannot be provided"
#             )
#         if self.serve is not None:
#             if (self.serve.engine_repository is None) ^ (self.serve.tokenizer_repository is None):
#                 raise ValueError(
#                     "Both engine_repository and tokenizer_repository must be provided"
#                 )
#         return self
#
#     @property
#     def requires_build(self):
#         if self.build is not None:
#             return True
#         return False
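A usage sketch for the ModelInput class above, with values that are illustrative rather than taken from this PR: construct it the way model.py does, then call to_tensors() to get the grpcclient.InferInput list that TritonClient.infer presumably sends over gRPC.

# Illustrative values only; request_id mirrors model.py's os.getpid() + counter scheme.
example = ModelInput(
    prompt="Why is the sky blue?<|im_start|>assistant",
    request_id="4217" + "1",
    max_tokens=256,
    eos_token_id=32000,  # assumed eos id; model.py passes tokenizer.eos_token_id
    stream=True,
)
tensors = example.to_tensors()  # list of grpcclient.InferInput ready for inference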