Abuqader/dolfo #247

Draft · wants to merge 3 commits into base: main
43 changes: 43 additions & 0 deletions mistral/dolfo/config.yaml
@@ -0,0 +1,43 @@
apply_library_patches: true
base_image:
  image: nvcr.io/nvidia/tritonserver:24.03-trtllm-python-py3
  python_executable_path: /usr/bin/python3
bundled_packages_dir: packages
data_dir: data
description: Generate text from a prompt with this Mixtral 8x7B mixture-of-experts
  language model.
environment_variables: {}
examples_filename: examples.yaml
external_data: null
external_package_dirs: []
input_type: Any
live_reload: false
model_class_filename: model.py
model_class_name: Model
model_framework: custom
trt_llm:
  serve:
    engine_repository: baseten/dolphin_i6000_o1024_bs96_tp8-tllm_0.9.0.dev2024032600
    pipeline_parallel_count: 1
    tensor_parallel_count: 8
    tokenizer_repository: cognitivecomputations/dolphin-2.6-mixtral-8x7b
model_metadata:
  engine_repository: baseten/dolphin_i6000_o1024_bs96_tp8-tllm_0.9.0.dev2024032600
  tags:
  - text-generation
  - openai-compatible
model_module_dir: model
model_name: dolfo-new
model_type: Model
python_version: py311
requirements:
- tritonclient[all]
- transformers
- jinja2
resources:
  accelerator: H100:8
  use_gpu: true
runtime:
  num_workers: 1
  predict_concurrency: 1000
secrets: {}
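model.py (below) only reads the trt_llm.serve block of this config. A minimal sketch of pulling those keys out by hand with PyYAML, purely for illustration; at runtime Truss parses the file itself and hands the resulting dict to the Model class:

import yaml  # PyYAML; illustration only, the Truss runtime does this parsing itself

with open("mistral/dolfo/config.yaml") as f:
    config = yaml.safe_load(f)

serve = config["trt_llm"]["serve"]
world_size = serve["tensor_parallel_count"] * serve["pipeline_parallel_count"]  # 8 * 1
print(serve["engine_repository"], serve["tokenizer_repository"], world_size)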
Empty file added mistral/dolfo/model/__init__.py
Empty file.
135 changes: 135 additions & 0 deletions mistral/dolfo/model/model.py
@@ -0,0 +1,135 @@
import os
from itertools import count

from constants import (
    GRPC_SERVICE_PORT,
    HF_AUTH_KEY_CONSTANT,
    HTTP_SERVICE_PORT,
    TOKENIZER_KEY_CONSTANT,
)
from schema import ModelInput
from transformers import AutoTokenizer
from triton_client import TritonClient, TritonServer
from utils import execute_command

APPEND_ASSISTANT_TEMPLATE_TO_PROMPT = True
APPEND_ASSISTANT_TEMPLATE_TO_PROMPT_STR = "<|im_start|>assistant"
STOP_TOKEN = "<|im_end|>"


class Model:
    def __init__(self, data_dir, config, secrets):
        self._data_dir = data_dir
        self._config = config
        self._secrets = secrets
        self._request_id_counter = count(start=1)
        self.triton_client = None
        self.triton_server = None
        self.tokenizer = None
        self.uses_openai_api = None

    def load(self):
        execute_command(["ldconfig"])
        # trtllm_config = TrussTRTLLMConfiguration(**self._config.get("trt_llm", {}))
        trtllm_config = self._config.get("trt_llm", {})
        self.uses_openai_api = "openai-compatible" in self._config.get(
            "model_metadata", {}
        ).get("tags", [])
        hf_access_token = None
        if "hf_access_token" in self._secrets._base_secrets.keys():
            hf_access_token = self._secrets["hf_access_token"]

        self.triton_server = TritonServer(
            grpc_port=GRPC_SERVICE_PORT,
            http_port=HTTP_SERVICE_PORT,
        )

        engine_repository_path = trtllm_config["serve"]["engine_repository"]
        tokenizer_repository = trtllm_config["serve"]["tokenizer_repository"]
        tensor_parallel_count = trtllm_config["serve"]["tensor_parallel_count"]
        pipeline_parallel_count = trtllm_config["serve"]["pipeline_parallel_count"]
        world_size = tensor_parallel_count * pipeline_parallel_count

        # if not trtllm_config.requires_build:
        #     engine_repository_path = trtllm_config.serve.engine_repository
        #     tokenizer_repository = trtllm_config.serve.tokenizer_repository
        #     tensor_parallel_count = trtllm_config.serve.tensor_parallel_count
        #     pipeline_parallel_count = trtllm_config.serve.pipeline_parallel_count
        #     world_size = tensor_parallel_count * pipeline_parallel_count
        # else:
        #     engine_repository_path = None
        #     tokenizer_repository = trtllm_config.build.huggingface_ckpt_repository
        #     tensor_parallel_count = trtllm_config.build.tensor_parallel_count
        #     pipeline_parallel_count = trtllm_config.build.pipeline_parallel_count
        #     world_size = tensor_parallel_count * pipeline_parallel_count

        self.triton_server.create_model_repository(
            truss_data_dir=self._data_dir,
            engine_repository_path=engine_repository_path,
            huggingface_auth_token=hf_access_token,
        )

        env = {}
        if hf_access_token:
            env[HF_AUTH_KEY_CONSTANT] = hf_access_token
        env[TOKENIZER_KEY_CONSTANT] = tokenizer_repository

        self.triton_server.start(
            world_size=world_size,
            env=env,
        )

        self.triton_client = TritonClient(
            grpc_service_port=GRPC_SERVICE_PORT,
        )

        self.tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_repository, token=hf_access_token
        )
        self.eos_token_id = self.tokenizer.eos_token_id

    async def predict(self, model_input):
        if model_input.get("max_tokens") is None:
            model_input["max_tokens"] = 500

        if model_input.get("max_new_tokens") is None:
            model_input["max_new_tokens"] = 500

        # Request ids must be unique across workers, so combine the process id
        # with a per-process counter.
        model_input["request_id"] = str(os.getpid()) + str(
            next(self._request_id_counter)
        )
        model_input["eos_token_id"] = self.eos_token_id
        messages = model_input.get("messages", [])
        if "messages" in model_input:
            del model_input["messages"]
        prompt = model_input.get("prompt", None)
        if not prompt and messages == []:
            raise ValueError("Prompt or messages must be provided")

        if self.uses_openai_api and not prompt:
            prompt = self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
            )
            model_input["prompt"] = prompt

        if APPEND_ASSISTANT_TEMPLATE_TO_PROMPT:
            model_input["prompt"] = (
                f"{model_input['prompt']}{APPEND_ASSISTANT_TEMPLATE_TO_PROMPT_STR}"
            )

        self.triton_client.start_grpc_stream()
        model_input = ModelInput(**model_input)
        result_iterator = self.triton_client.infer(model_input)

        async def generate():
            async for result in result_iterator:
                if result != STOP_TOKEN:
                    yield result
                else:
                    yield ""

        if model_input.stream:
            return generate()

        # Non-streaming: drain the async generator before returning.
        full_text = "".join([token async for token in generate()])
        if self.uses_openai_api:
            return full_text
        return {"text": full_text}
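For reference, two payload shapes that predict() above accepts; the field names come from the code, the values are made up:

# Plain prompt, non-streaming (returns {"text": ...} unless the
# "openai-compatible" tag is set in config.yaml):
payload = {"prompt": "Why is the sky blue?", "max_tokens": 256, "stream": False}

# Chat messages, streaming; predict() renders these through
# tokenizer.apply_chat_template when the openai-compatible tag is present:
payload = {
    "messages": [{"role": "user", "content": "Why is the sky blue?"}],
    "max_tokens": 256,
    "stream": True,
}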
9 changes: 9 additions & 0 deletions mistral/dolfo/packages/constants.py
@@ -0,0 +1,9 @@
from pathlib import Path

# If changing the model repo path, please update it inside the tensorrt_llm config.pbtxt as well
TENSORRT_LLM_MODEL_REPOSITORY_PATH = Path("/packages/tensorrt_llm_model_repository/")
GRPC_SERVICE_PORT = 8001
HTTP_SERVICE_PORT = 8003
HF_AUTH_KEY_CONSTANT = "HUGGING_FACE_HUB_TOKEN"
TOKENIZER_KEY_CONSTANT = "TRITON_TOKENIZER_REPOSITORY"
ENTRYPOINT_MODEL_NAME = "ensemble"
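Model.load() above exports HUGGING_FACE_HUB_TOKEN and TRITON_TOKENIZER_REPOSITORY into the Triton server's environment. The code that consumes them lives in the tensorrt_llm model repository, which is not part of this diff, so this read-side sketch is an assumption for illustration only:

import os

# Assumption: the Triton-side Python backend reads these variables to fetch the
# tokenizer; only the variable names come from constants.py, the reading code is illustrative.
hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
tokenizer_repo = os.environ.get("TRITON_TOKENIZER_REPOSITORY")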
155 changes: 155 additions & 0 deletions mistral/dolfo/packages/schema.py
@@ -0,0 +1,155 @@
from typing import Optional

import numpy as np
import tritonclient
import tritonclient.grpc.aio as grpcclient


class ModelInput:
    def __init__(
        self,
        prompt: str,
        request_id: int,
        max_tokens: int = 50,
        max_new_tokens: int = 50,
        temperature: float = 1.0,
        top_p: float = 1.0,
        top_k: int = 50,
        beam_width: int = 1,
        bad_words_list: Optional[list] = None,
        stop_words_list: Optional[list] = None,
        repetition_penalty: float = 1.0,
        ignore_eos: bool = False,
        stream: bool = True,
        eos_token_id: int = None,  # type: ignore
    ) -> None:
        self.stream = stream
        self.request_id = request_id
        self._prompt = prompt
        self._max_tokens = max_tokens
        self._beam_width = beam_width
        self._bad_words_list = [""] if bad_words_list is None else bad_words_list
        self._stop_words_list = [""] if stop_words_list is None else stop_words_list
        self._repetition_penalty = repetition_penalty
        self._eos_token_id = eos_token_id
        self._ignore_eos = ignore_eos
        # These variables are passed by OAI proxy but are unused
        # TODO(Abu): Add support for these
        self._max_new_tokens = max_new_tokens
        self._temperature = temperature
        self._top_p = top_p
        self._top_k = top_k

    def _prepare_grpc_tensor(
        self, name: str, input_data: np.ndarray
    ) -> grpcclient.InferInput:
        tensor = grpcclient.InferInput(
            name,
            input_data.shape,
            tritonclient.utils.np_to_triton_dtype(input_data.dtype),
        )
        tensor.set_data_from_numpy(input_data)
        return tensor

    def to_tensors(self):
        if self._eos_token_id is None and self._ignore_eos:
            raise ValueError("eos_token_id is required when ignore_eos is True")

        prompt_data = np.array([[self._prompt]], dtype=object)
        output_len_data = np.ones_like(prompt_data, dtype=np.uint32) * self._max_tokens
        bad_words_data = np.array([self._bad_words_list], dtype=object)
        stop_words_data = np.array([self._stop_words_list], dtype=object)
        stream_data = np.array([[self.stream]], dtype=bool)
        beam_width_data = np.array([[self._beam_width]], dtype=np.uint32)
        repetition_penalty_data = np.array(
            [[self._repetition_penalty]], dtype=np.float32
        )

        inputs = [
            self._prepare_grpc_tensor("text_input", prompt_data),
            self._prepare_grpc_tensor("max_tokens", output_len_data),
            self._prepare_grpc_tensor("bad_words", bad_words_data),
            self._prepare_grpc_tensor("stop_words", stop_words_data),
            self._prepare_grpc_tensor("stream", stream_data),
            self._prepare_grpc_tensor("beam_width", beam_width_data),
            self._prepare_grpc_tensor("repetition_penalty", repetition_penalty_data),
        ]

        if not self._ignore_eos:
            end_id_data = np.array([[self._eos_token_id]], dtype=np.uint32)
            inputs.append(self._prepare_grpc_tensor("end_id", end_id_data))

        return inputs


# The following are duplicated from the underlying base image.
# We list them as a comment for posterity:
#
# class TRTLLMModelArchitecture(Enum):
#     LLAMA: str = "llama"
#     MISTRAL: str = "mistral"
#     DEEPSEEK: str = "deepseek"


# class TRTLLMQuantizationType(Enum):
#     NO_QUANT: str = "no_quant"
#     WEIGHTS_ONLY_INT8: str = "weights_int8"
#     WEIGHTS_KV_INT8: str = "weights_kv_int8"
#     WEIGHTS_ONLY_INT4: str = "weights_int4"
#     WEIGHTS_KV_INT4: str = "weights_kv_int4"
#     SMOOTH_QUANT: str = "smooth_quant"
#     FP8: str = "fp8"
#     FP8_KV: str = "fp8_kv"

# class TrussTRTLLMPluginConfiguration(BaseModel):
#     multi_block_mode: bool = False
#     paged_kv_cache: bool = True
#     use_fused_mlp: bool = False

# class TrussTRTLLMBuildConfiguration(BaseModel):
#     base_model_architecture: TRTLLMModelArchitecture
#     max_input_len: int
#     max_output_len: int
#     max_batch_size: int
#     max_beam_width: int
#     max_prompt_embedding_table_size: int = 0
#     huggingface_ckpt_repository: Optional[str]
#     gather_all_token_logits: bool = False
#     strongly_typed: bool = False
#     quantization_type: TRTLLMQuantizationType = TRTLLMQuantizationType.NO_QUANT
#     tensor_parallel_count: int = 1
#     pipeline_parallel_count: int = 1
#     plugin_configuration: TrussTRTLLMPluginConfiguration = TrussTRTLLMPluginConfiguration()

# class TrussTRTLLMServingConfiguration(BaseModel):
#     engine_repository: str
#     tokenizer_repository: str
#     tensor_parallel_count: int = 1
#     pipeline_parallel_count: int = 1

# class TrussTRTLLMConfiguration(BaseModel):
#     serve: Optional[TrussTRTLLMServingConfiguration] = None
#     build: Optional[TrussTRTLLMBuildConfiguration] = None
#
#     @model_validator(mode="after")
#     def check_minimum_required_configuration(self):
#         if not self.serve and not self.build:
#             raise ValueError(
#                 "Either serve or build configurations must be provided"
#             )
#         if self.serve and self.build:
#             raise ValueError(
#                 "Both serve and build configurations cannot be provided"
#             )
#         if self.serve is not None:
#             if (self.serve.engine_repository is None) ^ (self.serve.tokenizer_repository is None):
#                 raise ValueError(
#                     "Both engine_repository and tokenizer_repository must be provided"
#                 )
#         return self
#
#     @property
#     def requires_build(self):
#         if self.build is not None:
#             return True
#         return False
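A usage sketch for the ModelInput class above, with values that are illustrative rather than taken from this PR: construct it the way model.py does, then call to_tensors() to get the grpcclient.InferInput list that TritonClient.infer presumably sends over gRPC.

# Illustrative values only; request_id mirrors model.py's os.getpid() + counter scheme.
example = ModelInput(
    prompt="Why is the sky blue?<|im_start|>assistant",
    request_id="4217" + "1",
    max_tokens=256,
    eos_token_id=32000,  # assumed eos id; model.py passes tokenizer.eos_token_id
    stream=True,
)
tensors = example.to_tensors()  # list of grpcclient.InferInput ready for inference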