From 6dbfb271e3f4e7162b8fb45580272a2ecf9a4a3c Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Thu, 24 Aug 2023 16:13:21 -0700 Subject: [PATCH 01/21] Add TRT ComposerModel inference wrapper --- .../models/inference_api_wrapper/__init__.py | 6 + .../models/inference_api_wrapper/trtllm.py | 166 ++++++++++++++++++ llmfoundry/models/model_registry.py | 4 +- 3 files changed, 175 insertions(+), 1 deletion(-) create mode 100644 llmfoundry/models/inference_api_wrapper/__init__.py create mode 100644 llmfoundry/models/inference_api_wrapper/trtllm.py diff --git a/llmfoundry/models/inference_api_wrapper/__init__.py b/llmfoundry/models/inference_api_wrapper/__init__.py new file mode 100644 index 0000000000..dbb5530f11 --- /dev/null +++ b/llmfoundry/models/inference_api_wrapper/__init__.py @@ -0,0 +1,6 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from llmfoundry.models.inference_api_wrapper.trtllm import TRTLLMEvalWrapper + +__all__ = ['TRTLLMEvalWrapper'] diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py new file mode 100644 index 0000000000..2a51d590cc --- /dev/null +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -0,0 +1,166 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +"""Implements a TRT-LLM evaluation model wrapped around a +:class:`.ComposerModel`.""" + +import json +from pathlib import Path +from typing import Any, Optional + +import tensorrt_llm +import torch +from omegaconf import DictConfig +from tensorrt_llm.runtime import ModelConfig, SamplingConfig +from transformers import PreTrainedTokenizer + +from llmfoundry.models.inference_api_wrapper.interface import \ + InferenceAPIEvalWrapper + +__all__ = ['TRTLLMEvalWrapper'] + + +# From tensorrt_llm/examples/{model_name}/build.py +def get_engine_name(model: str, dtype: str, tp_size: int, rank: int): + return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) + + +class TRTLLMEvalWrapper(InferenceAPIEvalWrapper): + + def __init__( + self, + model_cfg: DictConfig, + tokenizer: PreTrainedTokenizer, + ): + + super().__init__(model_cfg, tokenizer) + + tensorrt_llm.logger.set_level(model_cfg['log_level']) + + # Load TRT config from file + engine_dir = Path(model_cfg['engine_dir']) + config_path = engine_dir / 'config.json' + with open(config_path, 'r') as f: + config = json.load(f) + + # Set vars from config + use_gpt_attention_plugin = config['plugin_config'][ + 'gpt_attention_plugin'] + inflight_batching_gpt_attention_plugin = config['plugin_config'][ + 'inflight_batching_gpt_attention_plugin'] + remove_input_padding = config['plugin_config']['remove_input_padding'] + if remove_input_padding: + raise ValueError( + 'TRT-LLM Evaluation Wrapper does not support remove_input_padding.' 
+ ) + dtype = config['builder_config']['precision'] + world_size = config['builder_config']['tensor_parallel'] + assert world_size == tensorrt_llm.mpi_world_size(), \ + f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' + num_heads = config['builder_config']['num_heads'] // world_size + hidden_size = config['builder_config']['hidden_size'] // world_size + vocab_size = config['builder_config']['vocab_size'] + num_layers = config['builder_config']['num_layers'] + multi_query_mode = config['builder_config']['multi_query_mode'] + paged_kv_cache = config['builder_config'].get('paged_kv_cache', False) + tokens_per_block = config['builder_config'].get('tokens_per_block', 64) + use_prompt_tuning = config['builder_config'].get( + 'use_prompt_tuning', False) + + self.hidden_size = hidden_size + self.vocab_size = vocab_size + + # Device and rank + runtime_rank = tensorrt_llm.mpi_rank() + runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank) + torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) + + # Tokenization and sampling + self.END_ID = model_cfg.get('eos_token_id', self.tokenizer.eos_token_id) + self.PAD_ID = model_cfg.get('pad_token_id', self.tokenizer.pad_token_id) + if self.PAD_ID == None: + self.tokenizer.pad_token = self.tokenizer.eos_token + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + + print('EOS TOKEN:', self.END_ID) + print('Pad token:', self.PAD_ID) + + self.sampling_config = SamplingConfig(end_id=self.END_ID, + pad_id=self.PAD_ID, + num_beams=1) + + # Load TRT engine + engine_name = get_engine_name(model_cfg['version'], dtype, world_size, + runtime_rank) + serialize_path = engine_dir / engine_name + with open(serialize_path, 'rb') as f: + engine_buffer = f.read() + + # Initialize generation session for model + trt_model_config = ModelConfig( + num_heads=num_heads, + hidden_size=self.hidden_size, + vocab_size=self.vocab_size, + num_layers=num_layers, + gpt_attention_plugin=use_gpt_attention_plugin, + inflight_batching_gpt_attention_plugin= + inflight_batching_gpt_attention_plugin, + multi_query_mode=multi_query_mode, + remove_input_padding=remove_input_padding, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + use_prompt_tuning=use_prompt_tuning) + self.decoder = tensorrt_llm.runtime.GenerationSession( + trt_model_config, engine_buffer, runtime_mapping) + + def eval_forward(self, batch, outputs: Optional[Any] = None): + # If the batch mode is generate, we will generate a requested number of tokens using the underlying + # model's generate function. 
Strings will be returned from eval_forward + output_logits_batch = [] + batch = self.rebatch(batch) + for tokens, cont_idxs in zip(batch['input_ids'], + batch['continuation_indices']): + + seqlen = tokens.shape[0] + tokens = tokens.tolist() + cont_idxs = cont_idxs.tolist() + expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] + + prompt = tokens[:cont_idxs[0]] + input_ids = torch.tensor([prompt], dtype=torch.int, device='cuda') + input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device='cuda') + #print("prompt:", self.tokenizer.decode(prompt)) + #print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) + #print("Input lengths:", input_lengths) + #print(cont_idxs[0]) + #print("Expected continuation tokens:", len(expected_cont_tokens)) + self.decoder.setup(input_lengths.size(0), + torch.max(input_lengths).item(), + len(expected_cont_tokens)) + + output_idsg, output_logits_list = self.decoder.decode( + input_ids, input_lengths, self.sampling_config) + + #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) + + output_logits = torch.nn.functional.one_hot( + torch.tensor(tokens[1:cont_idxs[0]], device='cuda'), + num_classes=self.vocab_size) + + for i in range(len(output_logits_list)): + output_logits_list[i] = output_logits_list[i].squeeze() + + next_logit_tensor = torch.stack(output_logits_list) + output_logits = torch.cat([output_logits, next_logit_tensor]) + #print(output_logits.shape) + #print(output_ids[0][0][cont_idxs[0]:].tolist()) + padding = torch.nn.functional.one_hot(torch.full( + (seqlen - output_logits.shape[0],), + self.PAD_ID, + device=output_logits.device), + num_classes=self.vocab_size) + output_logits = torch.cat([output_logits, padding]) + #print("Output logits shape:", output_logits.shape) + output_logits_batch.append(output_logits) + + return torch.stack(output_logits_batch).to(batch['input_ids'].device) diff --git a/llmfoundry/models/model_registry.py b/llmfoundry/models/model_registry.py index 02a709740e..557ce17cc3 100644 --- a/llmfoundry/models/model_registry.py +++ b/llmfoundry/models/model_registry.py @@ -4,10 +4,12 @@ from llmfoundry.models.hf import (ComposerHFCausalLM, ComposerHFPrefixLM, ComposerHFT5) from llmfoundry.models.mpt import ComposerMPTCausalLM +from llmfoundry.models.inference_api_wrapper import TRTLLMEvalWrapper COMPOSER_MODEL_REGISTRY = { - 'mpt_causal_lm': ComposerMPTCausalLM, + l 'mpt_causal_lm': ComposerMPTCausalLM, 'hf_causal_lm': ComposerHFCausalLM, 'hf_prefix_lm': ComposerHFPrefixLM, 'hf_t5': ComposerHFT5, + 'trtllm': TRTLLMEvalWrapper } From c312d21b9ef3bb5bb4643ababf16cd7ee5063cee Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Thu, 24 Aug 2023 16:32:29 -0700 Subject: [PATCH 02/21] Fix precommit --- .../models/inference_api_wrapper/trtllm.py | 24 +++++++------------ llmfoundry/models/model_registry.py | 4 ++-- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index 2a51d590cc..34a2103fc2 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -6,7 +6,7 @@ import json from pathlib import Path -from typing import Any, Optional +from typing import Any, Optional, Dict import tensorrt_llm import torch @@ -113,7 +113,7 @@ def __init__( self.decoder = tensorrt_llm.runtime.GenerationSession( trt_model_config, engine_buffer, runtime_mapping) - def eval_forward(self, batch, outputs: Optional[Any] 
= None): + def eval_forward(self, batch: Dict, outputs: Optional[Any] = None): # If the batch mode is generate, we will generate a requested number of tokens using the underlying # model's generate function. Strings will be returned from eval_forward output_logits_batch = [] @@ -128,20 +128,17 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): prompt = tokens[:cont_idxs[0]] input_ids = torch.tensor([prompt], dtype=torch.int, device='cuda') - input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device='cuda') - #print("prompt:", self.tokenizer.decode(prompt)) - #print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) - #print("Input lengths:", input_lengths) - #print(cont_idxs[0]) - #print("Expected continuation tokens:", len(expected_cont_tokens)) + input_lengths = torch.tensor([input_ids.size(1)], + dtype=torch.int, + device='cuda') + self.decoder.setup(input_lengths.size(0), torch.max(input_lengths).item(), len(expected_cont_tokens)) - output_idsg, output_logits_list = self.decoder.decode( - input_ids, input_lengths, self.sampling_config) - - #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) + _, output_logits_list = self.decoder.decode(input_ids, + input_lengths, + self.sampling_config) output_logits = torch.nn.functional.one_hot( torch.tensor(tokens[1:cont_idxs[0]], device='cuda'), @@ -152,15 +149,12 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): next_logit_tensor = torch.stack(output_logits_list) output_logits = torch.cat([output_logits, next_logit_tensor]) - #print(output_logits.shape) - #print(output_ids[0][0][cont_idxs[0]:].tolist()) padding = torch.nn.functional.one_hot(torch.full( (seqlen - output_logits.shape[0],), self.PAD_ID, device=output_logits.device), num_classes=self.vocab_size) output_logits = torch.cat([output_logits, padding]) - #print("Output logits shape:", output_logits.shape) output_logits_batch.append(output_logits) return torch.stack(output_logits_batch).to(batch['input_ids'].device) diff --git a/llmfoundry/models/model_registry.py b/llmfoundry/models/model_registry.py index 557ce17cc3..1b0f4210ac 100644 --- a/llmfoundry/models/model_registry.py +++ b/llmfoundry/models/model_registry.py @@ -3,11 +3,11 @@ from llmfoundry.models.hf import (ComposerHFCausalLM, ComposerHFPrefixLM, ComposerHFT5) -from llmfoundry.models.mpt import ComposerMPTCausalLM from llmfoundry.models.inference_api_wrapper import TRTLLMEvalWrapper +from llmfoundry.models.mpt import ComposerMPTCausalLM COMPOSER_MODEL_REGISTRY = { - l 'mpt_causal_lm': ComposerMPTCausalLM, + 'mpt_causal_lm': ComposerMPTCausalLM, 'hf_causal_lm': ComposerHFCausalLM, 'hf_prefix_lm': ComposerHFPrefixLM, 'hf_t5': ComposerHFT5, From 170359147faf360149f9e0c3c6a45e8a17f4abd4 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Thu, 24 Aug 2023 17:30:01 -0700 Subject: [PATCH 03/21] Add base wrapper --- .../models/inference_api_wrapper/__init__.py | 4 +- .../models/inference_api_wrapper/interface.py | 110 ++++++++++++++++++ .../models/inference_api_wrapper/trtllm.py | 37 ++++-- 3 files changed, 142 insertions(+), 9 deletions(-) create mode 100644 llmfoundry/models/inference_api_wrapper/interface.py diff --git a/llmfoundry/models/inference_api_wrapper/__init__.py b/llmfoundry/models/inference_api_wrapper/__init__.py index dbb5530f11..24b0af9ca2 100644 --- a/llmfoundry/models/inference_api_wrapper/__init__.py +++ b/llmfoundry/models/inference_api_wrapper/__init__.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML LLM Foundry authors # 
SPDX-License-Identifier: Apache-2.0 +from llmfoundry.models.inference_api_wrapper.interface import \ + InferenceAPIEvalWrapper from llmfoundry.models.inference_api_wrapper.trtllm import TRTLLMEvalWrapper -__all__ = ['TRTLLMEvalWrapper'] +__all__ = ['InferenceAPIEvalWrapper', 'TRTLLMEvalWrapper'] diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py new file mode 100644 index 0000000000..3ee896648a --- /dev/null +++ b/llmfoundry/models/inference_api_wrapper/interface.py @@ -0,0 +1,110 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Dict, Optional + +import torch +from composer.core.types import Batch +from composer.metrics import InContextLearningMetric +# required for loading a python model into composer +from composer.metrics.nlp import (InContextLearningLMAccuracy, + InContextLearningLMExpectedCalibrationError, + InContextLearningMCExpectedCalibrationError, + InContextLearningMultipleChoiceAccuracy, + InContextLearningQAAccuracy, + LanguageCrossEntropy, LanguagePerplexity) +from composer.models import ComposerModel +from torchmetrics import Metric +from transformers import AutoTokenizer + + +class InferenceAPIEvalWrapper(ComposerModel): + + def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer): + self.model_name = model_cfg['version'] + self.tokenizer = tokenizer + self.labels = None + # set up training and eval metrics + eval_metrics = [ + LanguageCrossEntropy(), + LanguagePerplexity(), + InContextLearningLMAccuracy(), + InContextLearningMultipleChoiceAccuracy(), + InContextLearningQAAccuracy(), + InContextLearningLMExpectedCalibrationError(), + InContextLearningMCExpectedCalibrationError() + ] + self.eval_metrics = { + metric.__class__.__name__: metric for metric in eval_metrics + } + super(InferenceAPIEvalWrapper, self).__init__() + self.mocked_layer = torch.nn.Linear(2, 3) + + def get_metrics(self, is_train: bool = False): + if is_train: + metrics = [] + else: + metrics = self.eval_metrics + + return metrics if metrics else {} + + def get_next_token_logit_tensor(self, prompt: str): + raise NotImplementedError + + def rebatch(self, batch: Batch): + # default is a no-op, but Chat API modifies these + return batch + + def eval_forward(self, batch: Batch, outputs: Optional[Any] = None): + # If the batch mode is generate, we will generate a requested number of tokens using the underlying + # model's generate function. Extra generation kwargs can be passed in via the batch. 
Strings will + # be returned from eval_forward + output_logits_batch = [] + for tokens, cont_idxs in zip(batch['input_ids'], + batch['continuation_indices']): + + seqlen = tokens.shape[0] + tokens = tokens.tolist() + cont_idxs = cont_idxs.tolist() + expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] + output_logits = torch.nn.functional.one_hot( + torch.tensor(tokens[1:cont_idxs[0]]), + num_classes=self.tokenizer.pad_token_id + 1) + for i in range(len(expected_cont_tokens)): + # decode one token at a time + prompt = self.tokenizer.decode(tokens[:cont_idxs[0]] + + expected_cont_tokens[0:i]) + next_logit_tensor = self.get_next_token_logit_tensor(prompt) + if next_logit_tensor is None: + continue + output_logits = torch.cat( + [output_logits, + next_logit_tensor.reshape(1, -1)]) + padding = torch.nn.functional.one_hot( + torch.full((seqlen - output_logits.shape[0],), + self.tokenizer.pad_token_id), + num_classes=self.tokenizer.pad_token_id + 1) + output_logits = torch.cat([output_logits, padding]) + output_logits_batch.append(output_logits) + + return torch.stack(output_logits_batch).to(batch['input_ids'].device) + + def update_metric(self, batch: Any, outputs: Any, metric: Metric) -> None: + batch = self.rebatch(batch) + self.labels = batch.pop('labels') + self.labels[:, :-1] = self.labels[:, 1:].clone() + self.labels[:, -1] = -100 + if isinstance(metric, InContextLearningMetric) and batch.get( + 'mode', None) == 'icl_task': + assert self.labels is not None + metric.update(batch, outputs, self.labels) + else: + metric.update( + outputs, + self.labels) # pyright: ignore [reportGeneralTypeIssues] + + def forward(self): + pass + + def loss(self): + pass diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index 34a2103fc2..7f6fee9c7a 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -6,12 +6,10 @@ import json from pathlib import Path -from typing import Any, Optional, Dict +from typing import Any, Optional -import tensorrt_llm import torch from omegaconf import DictConfig -from tensorrt_llm.runtime import ModelConfig, SamplingConfig from transformers import PreTrainedTokenizer from llmfoundry.models.inference_api_wrapper.interface import \ @@ -19,6 +17,20 @@ __all__ = ['TRTLLMEvalWrapper'] +try: + import tensorrt_llm + from tensorrt_llm.runtime import ModelConfig, SamplingConfig + TRT_LLM_INSTALLED = True +except ImportError: + TRT_LLM_INSTALLED = False + + +def check_if_trt_llm_installed(): + if not TRT_LLM_INSTALLED: + raise ImportError( + 'TRT-LLM is not installed. It must be installed to use the TRTLLMEValWrapper.' + ) + # From tensorrt_llm/examples/{model_name}/build.py def get_engine_name(model: str, dtype: str, tp_size: int, rank: int): @@ -32,6 +44,7 @@ def __init__( model_cfg: DictConfig, tokenizer: PreTrainedTokenizer, ): + check_if_trt_llm_installed() super().__init__(model_cfg, tokenizer) @@ -113,7 +126,7 @@ def __init__( self.decoder = tensorrt_llm.runtime.GenerationSession( trt_model_config, engine_buffer, runtime_mapping) - def eval_forward(self, batch: Dict, outputs: Optional[Any] = None): + def eval_forward(self, batch, outputs: Optional[Any] = None): # If the batch mode is generate, we will generate a requested number of tokens using the underlying # model's generate function. 
Strings will be returned from eval_forward output_logits_batch = [] @@ -131,14 +144,19 @@ def eval_forward(self, batch: Dict, outputs: Optional[Any] = None): input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device='cuda') - + #print("prompt:", self.tokenizer.decode(prompt)) + #print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) + #print("Input lengths:", input_lengths) + #print(cont_idxs[0]) + #print("Expected continuation tokens:", len(expected_cont_tokens)) self.decoder.setup(input_lengths.size(0), torch.max(input_lengths).item(), len(expected_cont_tokens)) - _, output_logits_list = self.decoder.decode(input_ids, - input_lengths, - self.sampling_config) + output_ids, output_logits_list = self.decoder.decode( + input_ids, input_lengths, self.sampling_config) + + #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) output_logits = torch.nn.functional.one_hot( torch.tensor(tokens[1:cont_idxs[0]], device='cuda'), @@ -149,12 +167,15 @@ def eval_forward(self, batch: Dict, outputs: Optional[Any] = None): next_logit_tensor = torch.stack(output_logits_list) output_logits = torch.cat([output_logits, next_logit_tensor]) + #print(output_logits.shape) + #print(output_ids[0][0][cont_idxs[0]:].tolist()) padding = torch.nn.functional.one_hot(torch.full( (seqlen - output_logits.shape[0],), self.PAD_ID, device=output_logits.device), num_classes=self.vocab_size) output_logits = torch.cat([output_logits, padding]) + #print("Output logits shape:", output_logits.shape) output_logits_batch.append(output_logits) return torch.stack(output_logits_batch).to(batch['input_ids'].device) From 33e52897e868c5e139a9133a4d41bc2812b1bc99 Mon Sep 17 00:00:00 2001 From: nik-mosaic <101217697+nik-mosaic@users.noreply.github.com> Date: Tue, 12 Dec 2023 10:47:07 -0600 Subject: [PATCH 04/21] Update model_registry.py --- llmfoundry/models/model_registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/models/model_registry.py b/llmfoundry/models/model_registry.py index 332804a44d..25ca5442f3 100644 --- a/llmfoundry/models/model_registry.py +++ b/llmfoundry/models/model_registry.py @@ -13,7 +13,7 @@ 'hf_causal_lm': ComposerHFCausalLM, 'hf_prefix_lm': ComposerHFPrefixLM, 'hf_t5': ComposerHFT5, - 'trtllm': TRTLLMEvalWrapper + 'trtllm': TRTLLMEvalWrapper, 'openai_causal_lm': OpenAICausalLMEvalWrapper, 'openai_chat': OpenAIChatAPIEvalWrapper } From da7b235833fbae49cc017fc95e1e5cf070cd96c9 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Wed, 13 Dec 2023 06:39:38 -0800 Subject: [PATCH 05/21] add changes to make llmfoundry install and test trtllm --- llmfoundry/__init__.py | 7 +- .../inference_api_wrapper/.trtllm.py.swo | Bin 0 -> 16384 bytes .../inference_api_wrapper/.trtllm.py.swp | Bin 0 -> 20480 bytes .../models/inference_api_wrapper/__init__.py | 1 + .../models/inference_api_wrapper/trtllm.py | 31 +++---- llmfoundry/models/mpt/modeling_mpt.py | 3 +- scripts/eval/eval.py | 2 +- scripts/eval/evaluate_trtllm_test.py | 78 ++++++++++++++++++ setup.py | 2 +- 9 files changed, 103 insertions(+), 21 deletions(-) create mode 100644 llmfoundry/models/inference_api_wrapper/.trtllm.py.swo create mode 100644 llmfoundry/models/inference_api_wrapper/.trtllm.py.swp create mode 100644 scripts/eval/evaluate_trtllm_test.py diff --git a/llmfoundry/__init__.py b/llmfoundry/__init__.py index 85f96aadb9..023bfa2372 100644 --- a/llmfoundry/__init__.py +++ b/llmfoundry/__init__.py @@ -39,9 +39,10 @@ is_cuda_available = False extras = '.[gpu]' if 
is_cuda_available else '.' - raise ImportError( - f'Please make sure to pip install {extras} to get the requirements for the LLM example.' - ) from e + print("ImportError:", e) + # raise ImportError( + # f'Please make sure to pip install {extras} to get the requirements for the LLM example.' + #) from e __all__ = [ 'build_text_denoising_dataloader', diff --git a/llmfoundry/models/inference_api_wrapper/.trtllm.py.swo b/llmfoundry/models/inference_api_wrapper/.trtllm.py.swo new file mode 100644 index 0000000000000000000000000000000000000000..65e810630861a82ffa1e760d1eee34e247eae123 GIT binary patch literal 16384 zcmeHO&yO5O6)s{YI1n2|;1q<)J3>8+W_s2e$8rYtu=1|OhF$L}y9tQ3snm4W%rx86 z-L9@)du*-X!i66PB#^jpK}g}(f#eT}xFjM-K>_6g9FVvnt{{aA@Kts9bkFQ~_6S1N z^0(7n_3G8DuU@^Xt}gdhUf$ZF*WFcu>!XBRdF#FiE(&HzAtY_ zu`j~36GVL>MdS&_lYrfmJV}J?D5)UoCgZd0vH6w(%fLJaE|OK0pA3MzzaYJc<(7fegnJ>{2KT^unRm6oCnSUzxyyD?*eZCzXZMu1V9g113m{_ z1fBwZ`yoQ!0e%V`1Dn7zz(1cP0u__;LBHM(y0%7d#gXVv@@tKRI&%_*dHsPq z5TK)OZ%N4f)>H%;g*h3g71o=0T46o6QrMsex<`TsTm^ANdF0cwuPd@uEWM%YWj=RP zp_ng@15Y$%7O7w)nB>tB{WL8fM>tsnPjLrHmNJ~NVG$1CR29yMp%qF*X)L8;`XtUy zUb=~6J)zH1bZl#)Syyygk*R)0%Lqq&bb7NAZ_o{6wrQfytD0v>J&rxzGZn(5>S4Mo zhk@^lsFbQbRmvK4-S_F;j7Q3JI342H(L@Wy zKJ*TqUKWHtc+H=kJI0$PH5d=diSLE6cXU=kXSM*0xpuYUE#U*!Et!(B{Lc(vxL+d5yl957}vr zPF1S8e6s9J?B!`Hq^h+<2fT%)TlDs%bftA6ak)O0aLL0^gf;4YHBJ`$OCp6Aq=;j+ zQVq>c<<90&CpiZ%9<;Z&cj)FZ53@Y_eK9{3Nc~u(G>R2Xvm}YpWGP1Rv0y=zWD2JF zz7B8|F-EE6K`Q9WP$o8|jAfJPYV_5>G%-&@HBZdTt+AAZ8TxiI;18X7^0hYWN}U11 z3s<012X>aLsLALEGf!kewM6UHSA*^gMUOcKUmtO-h)h)s)H0KW!dKHca>JPW&3@Gl z_X0JfaU!B-E(cQ8utgo|EK?4x6*aYdm?n&qGe3~DL!CU&)iqR|Q4?9Cdr;e)%)5t9 zmFe86tZC3~*hk?JrP-!g{U8)oqoy1^j0XcOIrMB8ienMh>p)(fsH-5F3w^V{dmRoQwT%%6PGrxQc5fDbYBH9Z(Fdbkwhq`6g|sa@aOs-?rS zyxmf$Oh(jsc*5{cCHSLf-iIe{-Vcu3kL8G~oQoDUBOG)j<-~6$d~FOCOswd;h>t{G zpi5IFi&DjXQM603;KafK<;^WDYS5e7RqMUj5%={XR-~PWIWt=INyp5u`H|p~xXM*@ z$|Q6Id6by+j7zsL>1U~;Jpl`f{zCUsj5ew;eHI)dN)#N{qDUSzP-J<(IuLgMQ&1d!>OVeNM^l3#C=aBiq`7PooZf*1DWOP7&A8NJi5i2 zjt0hdx}#5A7tS^Fo*$%skBX>G&50y;4HQtx}H;Z+Q7NgRuLGdjku{7!$E!`4x9*X^8+^+L1>H_9^As zSIpEVs;SKoMe}3tKIKxMlcyYKcIp1!#+TaLSkOnQXm4S8tAc(YWS6evRQ*u2SKSpz z-H4O139S^!xv-`u!Zx@BetEHP^53UjKFA z1z-*MTOspDEoU#wfMvikU>UFsSOzQumI2FvWxz6E8L$le?=YaZFUm{5d6)P4x7h8< zKbB$d$2)4qnbJGP*bxj=%6eRR!(~^+O>=kXHWh&j?sx;ZWmo{?`~!H03Ft*XS!yl-Jb5g z?wZ`A+>-(TP_rUT=cGOxhaDB)yzWve;f7-b2)*pG-7;zY;^!T7_ z{_cC+x$fP(*}bdVZ62BDPP5xT%A3y7e$VZ3=dOi2k2L2DV|F`?H zaN}M2o&Ow#F87mJFX;1_2R&{(k!NqmP89KYHjPshwW8gF^r_xg3{(tEFfdO0&7+6T z+_^B1nP?a@hi_*$-*jf8P<2}|P%%(3P%%(3P%%(3P%%(3P%-fT%s?7lZF~(Kd{5R% zC;NNV#NTVO`xDt;cjEok+5Pp|-{&XZXSda#ih+uOih+uOih+uOih+uOih+uOih+uO zih+uO{{aIIKIFfFcJH7A0L1_E@Bd$a&@g@gJO`Ww&H(Gc7l2vdM&J-&0DrjAFkS$D z2y}oi11;bl;0W+h;H^W3@iOo%@Hp@-;G4h@xDS{G-nhXqeh&N$_$lxta2mK4xCXcy z_|*pt<8|O=;CbLXfCqGeHt-qX5O4$V%l8|`FM#g@Tfh=w&-G0OJYZG2k?CBk=lrp(ntAH?A{`r-6roTY%TDHHG4R{WC z6u1+38;2x+27U~j1sGreuYmh=z%H-~m;jB(KX3^1Y<>`f;@K6&UXa$zRmWv%xXFW- z*h@!sNFLTJ+ZFVXQc&m6{A{gRCo{HV;53B*lP?Yt} z2V;6~)Zhq#l8n6q%U#?nu>4jDoMG!^4c<8?^}>KTfy)wS6!~5-P_~nXac^k##;#)} zJhffE<@Ihrz;wSO6o=* z%Ax1FJjhFx?vyZw%N!@%j^c0>rFJ?Fzy^lnUdK;LJnSigW_w}K_XZuaJNA4R%*lIm zL)c59iLen#ZPyQbn+H`iCo4diyN@VlB4@x|dvnX~IlUo2P?}zRZ^uSsKlSXhV;=9? 
zWbyk5?3j~<&5fd)!FXi*&MuFWeI=-3rMhbT_N}XmmFkv8b>GURa^?n`on^{owLT2F z;~v}vQR+3jt1znqkMA#H}y+)%Y@^$T^g(AP;4 z9;cZgt{E%@(sVA|UfOOZdu*cKDMV@@ky>N7QJ4&Yv#6m*j7wxx(t z8S*gGitN*@L2~JuVQV~PTMi~)AAT(JOuCno-U-sanuLMnhmKq8t6|vo(jf~Y9@Jzx zkeUt~%#6)Cb0AewQ+f|!To}3SdNG@2rhI2n3u-V*McQl?Vr#|9+A-Bjb35THUf?MP6##szyU@ zxoDSCDi3Gd+qAvDjTBW{`%o=OX-c@+vQ<|c75jn~TaEcTEHU72wsP~hQRq+*(PE2U zFFh_66!n@`I(g^vJ}VS8jGm9FPxN5E&~(Y>No_gLsXey3TzU@8WWQscj^lut=P%g! zCynqgT3gRwu;e(HRzHkKPAa)*FfqeHN0cw^Vo4`uuqCpyu?w6LmlawpN#m?m_B*TE zV%W}NYk>OVCp4O2_mZiWM{$<#&}N1wv*#rKXvSYKsIa6Wj-ROm_JIy~tmijlF{ za>_>IBxPL=4YKjV@G{IcmSXwLIE0tTI4pS)=|Gl8(np3t40q>Ss# zxY$~@UzGPkKvt|Q!bsy?sB7D>aNCA{9U@sbH#jdtw5^i6EirD1H8r^#Pg5TEks6*W zb_sjU#HKKpgbb246`77xrjyhm9Pxoe{wFV~CX|u)kM_m6>|NUR=4FosyA5_aBFF6b zQXcs~-|OZj$ZzFx$i3m?S4_gzBi3->u`fH9y#IK4!1W_;S7k6Zkvg^%sH10S8zBt^(dfZ2l?WA>cva!vMwTzXTiw zZUNpxEdD9rNgx7F01WsI4k+FPo&`F}o$I(;bx%Tc=GR^Pi|iw{pBj&Q48Jyl#0ZAJrztM$-+^wsEA_vMA79o13VW55!WY zAVu|3FUX?2Qc)U~%j`rZu53~|p?Q1DG!i70MuHm9duWu|(CfeknCqmDs)Q#}K`8^b z^<17bm}y=l7b>$$E^T@8{lR0U6xJt@))Q4|$}bVpYs)dGn4DHqqo<-%U3nahEkH!oZn!dV)H78 zh3G2EqI&VWSW7eorQH_IOfQtJjKm1_LK+DcC|!SQdFqvIkx7$$z98_7T{XvUxfF3G z-!5BdD7DGgpYx(A`)oCEq5uyQ93cPwIu6cV$c{k9XkMoz&DTkebStkjNTWel<1TzG?RBd>@>*LWJNP!Cs2>QMeaDHXi%CxD{17_ik*7v<$9?>du2foFmT;%oExS}QrgDB=7Ra?eUwlBV z@q`j;^c~Rv(McqrQLI*UIs2x8Gzt={rikA?nQZMnPn2~~1xcdF4HOA%n7(Am>VaNd zPdIDqw|dcOSu^cv5A|x3S!nIa2YT^oNj0L+(_ZLRRkm7N{nU%7q_oK$w_dK4e=0.20,<0.21', # for HF inference `device_map` 'transformers>=4.34.1,<4.35', 'mosaicml-streaming>=0.7.1,<0.8', - 'torch>=2.1,<2.1.1', + # 'torch>=2.1,<2.1.1', # Already installed in TRT-LLM container 'datasets>=2.14.5,<2.15', 'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data 'sentencepiece==0.1.97', From a872c3d4febd698344738f8b1ef171881d7c189c Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Wed, 13 Dec 2023 08:25:57 -0800 Subject: [PATCH 06/21] add new yamls, fix trt bugs --- .../models/inference_api_wrapper/interface.py | 5 +- .../models/inference_api_wrapper/trtllm.py | 100 +++++++++++------- scripts/eval/evaluate_trtllm_test.py | 4 +- .../eval/yamls/mini_eval_gauntlet_v0.2.yaml | 13 +++ scripts/eval/yamls/mini_tasks_v0.2.yaml | 8 ++ 5 files changed, 88 insertions(+), 42 deletions(-) create mode 100644 scripts/eval/yamls/mini_eval_gauntlet_v0.2.yaml create mode 100644 scripts/eval/yamls/mini_tasks_v0.2.yaml diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py index 6a6fc14888..1d30b57b60 100644 --- a/llmfoundry/models/inference_api_wrapper/interface.py +++ b/llmfoundry/models/inference_api_wrapper/interface.py @@ -41,8 +41,9 @@ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer): def get_metrics(self, is_train: bool = False): if is_train: - raise NotImplementedError( - 'You cannot use inference wrappers for training') + metrics = {} # Cannot use inference wrappers for training + # raise NotImplementedError( + # 'You cannot use inference wrappers for training') else: metrics = self.eval_metrics diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index b1226ca6cd..b4490a6fb4 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -19,6 +19,7 @@ try: import tensorrt_llm from tensorrt_llm.runtime import ModelConfig, SamplingConfig + from tensorrt_llm.quantization import QuantMode TRT_LLM_INSTALLED = True except ImportError: TRT_LLM_INSTALLED = False @@ -32,8 
+33,11 @@ def check_if_trt_llm_installed(): # From tensorrt_llm/examples/{model_name}/build.py -def get_engine_name(model: str, dtype: str, tp_size: int, rank: int): - return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) +def get_engine_name(model: str, dtype: str, tp_size: int, pp_size: int, rank: int): + if pp_size == 1: + return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) + return '{}_{}_tp{}_pp{}_rank{}.engine'.format(model, dtype, tp_size, + pp_size, rank) class TRTLLMEvalWrapper(InferenceAPIEvalWrapper): @@ -54,37 +58,67 @@ def __init__( config_path = engine_dir / 'config.json' with open(config_path, 'r') as f: config = json.load(f) + + dtype = config['builder_config']['precision'] + tp_size = config['builder_config']['tensor_parallel'] + pp_size = config['builder_config'].get('pipeline_parallel', 1) + world_size = tp_size * pp_size - # Set vars from config - use_gpt_attention_plugin = config['plugin_config'][ - 'gpt_attention_plugin'] + assert world_size == tensorrt_llm.mpi_world_size(), \ + f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' + + num_heads = config['builder_config']['num_heads'] // tp_size + hidden_size = config['builder_config']['hidden_size'] // tp_size + vocab_size = config['builder_config']['vocab_size'] + num_layers = config['builder_config']['num_layers'] + use_gpt_attention_plugin = bool( + config['plugin_config']['gpt_attention_plugin']) remove_input_padding = config['plugin_config']['remove_input_padding'] #if remove_input_padding: # raise ValueError( # 'TRT-LLM Evaluation Wrapper does not support remove_input_padding.' # ) - dtype = config['builder_config']['precision'] - world_size = config['builder_config']['tensor_parallel'] - assert world_size == tensorrt_llm.mpi_world_size(), \ - f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - num_heads = config['builder_config']['num_heads'] // world_size - hidden_size = config['builder_config']['hidden_size'] // world_size - vocab_size = config['builder_config']['vocab_size'] - num_layers = config['builder_config']['num_layers'] - multi_query_mode = config['builder_config']['multi_query_mode'] - paged_kv_cache = config['builder_config'].get('paged_kv_cache', False) - tokens_per_block = config['builder_config'].get('tokens_per_block', 64) - use_prompt_tuning = config['builder_config'].get( - 'use_prompt_tuning', False) - # Add quant mode here + + num_kv_heads = config['builder_config'].get('num_kv_heads', num_heads) + paged_kv_cache = config['plugin_config']['paged_kv_cache'] + tokens_per_block = config['plugin_config']['tokens_per_block'] + use_custom_all_reduce = config['plugin_config'].get('use_custom_all_reduce', + False) + + quant_mode = QuantMode(config['builder_config']['quant_mode']) + if config['builder_config'].get('multi_query_mode', False): + tensorrt_llm.logger.warning( + "`multi_query_mode` config is deprecated. Please rebuild the engine." 
+ ) + num_kv_heads = 1 + num_kv_heads = (num_kv_heads + tp_size - 1) // tp_size + + model_config = tensorrt_llm.runtime.ModelConfig( + vocab_size=vocab_size, + num_layers=num_layers, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + hidden_size=hidden_size, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + gpt_attention_plugin=use_gpt_attention_plugin, + remove_input_padding=remove_input_padding, + use_custom_all_reduce=use_custom_all_reduce, + dtype=dtype, + quant_mode=quant_mode, + gather_all_token_logits=True) + self.hidden_size = hidden_size self.vocab_size = vocab_size # Device and rank runtime_rank = tensorrt_llm.mpi_rank() - runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank) - torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) + runtime_mapping = tensorrt_llm.Mapping(world_size, + runtime_rank, + tp_size=tp_size, + pp_size=pp_size) + torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) # Tokenization and sampling self.END_ID = model_cfg.get('eos_token_id', self.tokenizer.eos_token_id) @@ -92,36 +126,26 @@ def __init__( if self.PAD_ID == None: self.tokenizer.pad_token = self.tokenizer.eos_token self.tokenizer.pad_token_id = self.tokenizer.eos_token_id - + self.PAD_ID = self.tokenizer.eos_token_id + print('EOS TOKEN:', self.END_ID) print('Pad token:', self.PAD_ID) self.sampling_config = SamplingConfig(end_id=self.END_ID, pad_id=self.PAD_ID, - num_beams=1) + num_beams=1, + return_dict=True) # Load TRT engine - engine_name = get_engine_name(model_cfg['version'], dtype, world_size, + engine_name = get_engine_name(model_cfg['version'], dtype, tp_size, pp_size, runtime_rank) serialize_path = engine_dir / engine_name with open(serialize_path, 'rb') as f: engine_buffer = f.read() - # Initialize generation session for model - trt_model_config = ModelConfig( - num_heads=num_heads, - hidden_size=self.hidden_size, - vocab_size=self.vocab_size, - num_layers=num_layers, - gpt_attention_plugin=use_gpt_attention_plugin, - multi_query_mode=multi_query_mode, - remove_input_padding=remove_input_padding, - paged_kv_cache=paged_kv_cache, - tokens_per_block=tokens_per_block, - use_prompt_tuning=use_prompt_tuning, - gather_all_token_logits = True) self.decoder = tensorrt_llm.runtime.GenerationSession( - trt_model_config, engine_buffer, runtime_mapping) + model_config, engine_buffer, runtime_mapping) + def eval_forward(self, batch, outputs: Optional[Any] = None): # If the batch mode is generate, we will generate a requested number of tokens using the underlying diff --git a/scripts/eval/evaluate_trtllm_test.py b/scripts/eval/evaluate_trtllm_test.py index 832f50f0e4..a0cfc7ac7d 100644 --- a/scripts/eval/evaluate_trtllm_test.py +++ b/scripts/eval/evaluate_trtllm_test.py @@ -33,8 +33,8 @@ } } ], - 'icl_tasks': './eval/yamls/tasks_v0.2.yaml', - 'eval_gauntlet': './eval/yamls/eval_gauntlet_v0.2.yaml', + 'icl_tasks': './eval/yamls/mini_tasks_v0.2.yaml', + 'eval_gauntlet': './eval/yamls/mini_eval_gauntlet_v0.2.yaml', } trt_llama_config = { diff --git a/scripts/eval/yamls/mini_eval_gauntlet_v0.2.yaml b/scripts/eval/yamls/mini_eval_gauntlet_v0.2.yaml new file mode 100644 index 0000000000..b35c0f2873 --- /dev/null +++ b/scripts/eval/yamls/mini_eval_gauntlet_v0.2.yaml @@ -0,0 +1,13 @@ +eval_gauntlet: + weighting: EQUAL + subtract_random_baseline: true + rescale_accuracy: true + averages: + core_average: + - world_knowledge + categories: + - name: world_knowledge + benchmarks: + - name: jeopardy + num_fewshot: 3 + random_baseline: 0 diff --git 
a/scripts/eval/yamls/mini_tasks_v0.2.yaml b/scripts/eval/yamls/mini_tasks_v0.2.yaml new file mode 100644 index 0000000000..2366ccfb8d --- /dev/null +++ b/scripts/eval/yamls/mini_tasks_v0.2.yaml @@ -0,0 +1,8 @@ +icl_tasks: +- + label: jeopardy + dataset_uri: eval/local_data/world_knowledge/jeopardy_all.jsonl + num_fewshot: [3] + icl_task_type: language_modeling + continuation_delimiter: "\nAnswer: " # this separates questions from answers + has_categories: true From f01be4271c24b65070b2f73841cdd61d94892a4e Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Wed, 13 Dec 2023 10:25:34 -0800 Subject: [PATCH 07/21] update trt wrapper for new logit format --- .../models/inference_api_wrapper/trtllm.py | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index b4490a6fb4..a862b6b026 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -175,9 +175,33 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): len(expected_cont_tokens)) output_dict = self.decoder.decode( - input_ids, input_lengths, self.sampling_config) + input_ids, input_lengths, self.sampling_config, return_dict=True) + + context_logits = output_dict['context_logits'] + context_logits = context_logits.squeeze() + output_logits_list = output_dict['generation_logits'] + for i in range(len(output_logits_list)): + output_logits_list[i] = output_logits_list[i].squeeze() + print("Context logits:", context_logits.shape) + print("Output logits list:", output_logits_list) + print("Output logits 0 shape:", output_logits_list[0].shape) + output_logits_tensor = torch.stack(output_logits_list) + print("Output logits stacked:", output_logits_tensor.shape) + combined_logits = torch.cat([context_logits, output_logits_tensor]) + print("Combined logits shape:", combined_logits.shape) + + padding = torch.nn.functional.one_hot( + torch.full( + (seqlen - combined_logits.shape[0],), + self.PAD_ID, + device=combined_logits.device + ), + num_classes=self.vocab_size) + padded_combined_logits = torch.cat([combined_logits, padding]) + + output_logits_batch.append(padded_combined_logits) - return output_dict['generation_logits'] + return torch.stack(output_logits_batch).to(batch['input_ids'].device) #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) """ From c5a79da7f7bea0c8239c38c570cce2efc6b923dc Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Wed, 13 Dec 2023 11:10:37 -0800 Subject: [PATCH 08/21] more padding and shape fixes --- .../models/inference_api_wrapper/trtllm.py | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index a862b6b026..103b3e862f 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -182,13 +182,17 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): output_logits_list = output_dict['generation_logits'] for i in range(len(output_logits_list)): output_logits_list[i] = output_logits_list[i].squeeze() - print("Context logits:", context_logits.shape) - print("Output logits list:", output_logits_list) - print("Output logits 0 shape:", output_logits_list[0].shape) - output_logits_tensor = torch.stack(output_logits_list) - print("Output logits stacked:", output_logits_tensor.shape) - 
combined_logits = torch.cat([context_logits, output_logits_tensor]) - print("Combined logits shape:", combined_logits.shape) + # print("Context logits:", context_logits.shape) + # print("Output logits list:", output_logits_list) + if len(output_logits_list) > 0: + # print("Output logits 0 shape:", output_logits_list[0].shape) + output_logits_tensor = torch.stack(output_logits_list) + # print("Output logits stacked:", output_logits_tensor.shape) + combined_logits = torch.cat([context_logits, output_logits_tensor]) + else: + combined_logits = context_logits + + # print("Combined logits shape:", combined_logits.shape) padding = torch.nn.functional.one_hot( torch.full( @@ -199,12 +203,14 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): num_classes=self.vocab_size) padded_combined_logits = torch.cat([combined_logits, padding]) - output_logits_batch.append(padded_combined_logits) + # print("Padded combined logits shape:", padded_combined_logits.shape) - return torch.stack(output_logits_batch).to(batch['input_ids'].device) + output_logits_batch.append(padded_combined_logits) + + return torch.stack(output_logits_batch).to(batch['input_ids'].device) - #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) - """ + #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) + """ # Old logits logic, back before TRT-LLM natively returned logits output_logits = torch.nn.functional.one_hot( torch.tensor(tokens[1:cont_idxs[0]], device='cuda'), @@ -226,5 +232,5 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): #print("Output logits shape:", output_logits.shape) output_logits_batch.append(output_logits) - return torch.stack(output_logits_batch).to(batch['input_ids'].device) - """ + return torch.stack(output_logits_batch).to(batch['input_ids'].device) + """ From 19abfe2fc44542f0f240e07f1a674c9518227165 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Wed, 13 Dec 2023 15:01:42 -0800 Subject: [PATCH 09/21] update run script --- scripts/eval/evaluate_trtllm_test.py | 46 +++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/scripts/eval/evaluate_trtllm_test.py b/scripts/eval/evaluate_trtllm_test.py index a0cfc7ac7d..682fb5295b 100644 --- a/scripts/eval/evaluate_trtllm_test.py +++ b/scripts/eval/evaluate_trtllm_test.py @@ -9,6 +9,7 @@ from omegaconf import OmegaConf as om from omegaconf import DictConfig +# GPT config is just for quick initial testing purposes trt_gpt_config = { 'run_name': 'trtllm-eval', 'seed': 0, @@ -52,7 +53,43 @@ { 'name': 'trtllm', 'version': 'llama', - 'engine_dir': '/workspace/tensorrt-llm-private/examples/llama/tmp/trt-models/llama-2-7b-chat/bf16/1-gpu', + 'engine_dir': '/workspace/tensorrt-llm-private/examples/llama/tmp/llama/7B-chat-quality-eval/trt_engines/int8_kv_cache_weight_only/1-gpu', + 'log_level': 'error', + 'eos_token_id': 2, + 'pad_token_id': 2 + }, + 'tokenizer': + { + 'name': '/workspace/llama-7b-chat-hf/' + } + } + ], + 'icl_tasks': './eval/yamls/mini_tasks_v0.2.yaml', + 'eval_gauntlet': './eval/yamls/mini_eval_gauntlet_v0.2.yaml', + 'loggers': { + 'wandb': { + 'project': 'nik-quant-eval' + } + } +} + + +trt_llama70b_config = { + 'run_name': 'trtllm-eval', + 'seed': 0, + 'max_seq_len': 2048, + 'device_eval_batch_size': 4, + 'precision': 'amp_bf16', + 'dist_timeout': 6000, + 'models': + [ + { + 'model_name': 'trtllm/llama', + 'model': + { + 'name': 'trtllm', + 'version': 'llama', + 'engine_dir': 
'/workspace/tensorrt-llm-private/examples/llama/tmp/llama/70B-chat-quality-eval/trt_engines/int8_kv_cache_weight_only/8-gpu', 'log_level': 'error', 'eos_token_id': 2, 'pad_token_id': 2 @@ -63,8 +100,8 @@ } } ], - 'icl_tasks': './eval/yamls/tasks_v0.2.yaml', - 'eval_gauntlet': './eval/yamls/eval_gauntlet_v0.2.yaml', + 'icl_tasks': './eval/yamls/mini_tasks_v0.2.yaml', + 'eval_gauntlet': './eval/yamls/mini_eval_gauntlet_v0.2.yaml', 'loggers': { 'wandb': { 'project': 'nik-quant-eval' @@ -72,7 +109,8 @@ } } -om_dict_config: DictConfig = om.create(trt_gpt_config) + +om_dict_config: DictConfig = om.create(trt_llama70b_config) print("OmegaConfig dictionary", om.to_yaml(om_dict_config)) run_evaluation(om_dict_config) From 3a3b334445389c038ac9a53dfcead48b67e33c4c Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Mon, 18 Dec 2023 23:00:53 -0800 Subject: [PATCH 10/21] update utils for multigpu trt models --- .../inference_api_wrapper/.trtllm.py.swo | Bin 16384 -> 0 bytes .../models/inference_api_wrapper/interface.py | 7 +- .../models/inference_api_wrapper/trtllm.py | 28 +- llmfoundry/utils/builders.py | 15 +- scripts/eval/eval_trt_multigpu.py | 407 ++++++++++++++++++ scripts/eval/evaluate_trtllm_test.py | 12 +- 6 files changed, 451 insertions(+), 18 deletions(-) delete mode 100644 llmfoundry/models/inference_api_wrapper/.trtllm.py.swo create mode 100644 scripts/eval/eval_trt_multigpu.py diff --git a/llmfoundry/models/inference_api_wrapper/.trtllm.py.swo b/llmfoundry/models/inference_api_wrapper/.trtllm.py.swo deleted file mode 100644 index 65e810630861a82ffa1e760d1eee34e247eae123..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeHO&yO5O6)s{YI1n2|;1q<)J3>8+W_s2e$8rYtu=1|OhF$L}y9tQ3snm4W%rx86 z-L9@)du*-X!i66PB#^jpK}g}(f#eT}xFjM-K>_6g9FVvnt{{aA@Kts9bkFQ~_6S1N z^0(7n_3G8DuU@^Xt}gdhUf$ZF*WFcu>!XBRdF#FiE(&HzAtY_ zu`j~36GVL>MdS&_lYrfmJV}J?D5)UoCgZd0vH6w(%fLJaE|OK0pA3MzzaYJc<(7fegnJ>{2KT^unRm6oCnSUzxyyD?*eZCzXZMu1V9g113m{_ z1fBwZ`yoQ!0e%V`1Dn7zz(1cP0u__;LBHM(y0%7d#gXVv@@tKRI&%_*dHsPq z5TK)OZ%N4f)>H%;g*h3g71o=0T46o6QrMsex<`TsTm^ANdF0cwuPd@uEWM%YWj=RP zp_ng@15Y$%7O7w)nB>tB{WL8fM>tsnPjLrHmNJ~NVG$1CR29yMp%qF*X)L8;`XtUy zUb=~6J)zH1bZl#)Syyygk*R)0%Lqq&bb7NAZ_o{6wrQfytD0v>J&rxzGZn(5>S4Mo zhk@^lsFbQbRmvK4-S_F;j7Q3JI342H(L@Wy zKJ*TqUKWHtc+H=kJI0$PH5d=diSLE6cXU=kXSM*0xpuYUE#U*!Et!(B{Lc(vxL+d5yl957}vr zPF1S8e6s9J?B!`Hq^h+<2fT%)TlDs%bftA6ak)O0aLL0^gf;4YHBJ`$OCp6Aq=;j+ zQVq>c<<90&CpiZ%9<;Z&cj)FZ53@Y_eK9{3Nc~u(G>R2Xvm}YpWGP1Rv0y=zWD2JF zz7B8|F-EE6K`Q9WP$o8|jAfJPYV_5>G%-&@HBZdTt+AAZ8TxiI;18X7^0hYWN}U11 z3s<012X>aLsLALEGf!kewM6UHSA*^gMUOcKUmtO-h)h)s)H0KW!dKHca>JPW&3@Gl z_X0JfaU!B-E(cQ8utgo|EK?4x6*aYdm?n&qGe3~DL!CU&)iqR|Q4?9Cdr;e)%)5t9 zmFe86tZC3~*hk?JrP-!g{U8)oqoy1^j0XcOIrMB8ienMh>p)(fsH-5F3w^V{dmRoQwT%%6PGrxQc5fDbYBH9Z(Fdbkwhq`6g|sa@aOs-?rS zyxmf$Oh(jsc*5{cCHSLf-iIe{-Vcu3kL8G~oQoDUBOG)j<-~6$d~FOCOswd;h>t{G zpi5IFi&DjXQM603;KafK<;^WDYS5e7RqMUj5%={XR-~PWIWt=INyp5u`H|p~xXM*@ z$|Q6Id6by+j7zsL>1U~;Jpl`f{zCUsj5ew;eHI)dN)#N{qDUSzP-J<(IuLgMQ&1d!>OVeNM^l3#C=aBiq`7PooZf*1DWOP7&A8NJi5i2 zjt0hdx}#5A7tS^Fo*$%skBX>G&50y;4HQtx}H;Z+Q7NgRuLGdjku{7!$E!`4x9*X^8+^+L1>H_9^As zSIpEVs;SKoMe}3tKIKxMlcyYKcIp1!#+TaLSkOnQXm4S8tAc(YWS6evRQ*u2SKSpz z-H4O139S^!xv-`u!Zx@BetEHP^53UjKFA z1z-*MTOspDEoU#wfMvikU>UFsSOzQumI2FvWxz6E8L$le?=YaZFUm{5d6)P4x7h8< zKbB$d$2)4qnbJGP*bxj=%6eRR!(~^+O>=kX None: 'mode', None) == 'icl_task': assert self.labels is not None metric.update(batch, outputs, self.labels) + for batch_idx, cont_idx in enumerate(batch['continuation_indices']): + cont_tok_pred = 
outputs[batch_idx].index_select(dim=0, index=cont_idx - 1).argmax(dim=-1) + cont_tok_targ = self.labels[batch_idx].index_select(dim=0, index=cont_idx - 1) + print("Ground Truth Label:", self.tokenizer.decode(self.labels[batch_idx].tolist()[:-1])) + print("Model output:", self.tokenizer.decode(cont_tok_pred)) else: raise NotImplementedError( 'Inference API wrapper only supports InContextLearningMetrics and mode=icl_task' diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index 103b3e862f..ccf5f728e6 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -118,7 +118,8 @@ def __init__( runtime_rank, tp_size=tp_size, pp_size=pp_size) - torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) + self.device_num = runtime_rank % runtime_mapping.gpus_per_node + torch.cuda.set_device(self.device_num) # Tokenization and sampling self.END_ID = model_cfg.get('eos_token_id', self.tokenizer.eos_token_id) @@ -144,7 +145,10 @@ def __init__( engine_buffer = f.read() self.decoder = tensorrt_llm.runtime.GenerationSession( - model_config, engine_buffer, runtime_mapping) + model_config, engine_buffer, runtime_mapping, debug_mode=False) + + print("!!! Initialized generation session for rank:", runtime_rank) + torch.cuda.synchronize() def eval_forward(self, batch, outputs: Optional[Any] = None): @@ -161,27 +165,35 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] prompt = tokens[:cont_idxs[0]] - input_ids = torch.tensor([prompt], dtype=torch.int, device='cuda') + + + input_ids = torch.tensor([prompt], dtype=torch.int, device="cuda:" + str(self.device_num)) input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, - device='cuda') - #print("prompt:", self.tokenizer.decode(prompt)) + device="cuda:"+ str(self.device_num)) + # print("prompt:", self.tokenizer.decode(prompt)) + # print("Input device:", input_ids.get_device()) #print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) #print("Input lengths:", input_lengths) #print(cont_idxs[0]) #print("Expected continuation tokens:", len(expected_cont_tokens)) - self.decoder.setup(input_lengths.size(0), + with torch.no_grad(): + self.decoder.setup(input_lengths.size(0), torch.max(input_lengths).item(), len(expected_cont_tokens)) - output_dict = self.decoder.decode( - input_ids, input_lengths, self.sampling_config, return_dict=True) + output_dict = self.decoder.decode( + input_ids, input_lengths, self.sampling_config, return_dict=True) + torch.cuda.synchronize() + context_logits = output_dict['context_logits'] context_logits = context_logits.squeeze() output_logits_list = output_dict['generation_logits'] + # print("Output ids:", output_dict['output_ids'][0][0][cont_idxs[0]:].tolist()) for i in range(len(output_logits_list)): output_logits_list[i] = output_logits_list[i].squeeze() + print("Output ids:", self.tokenizer.decode(output_dict['output_ids'][0][0].tolist())) # print("Context logits:", context_logits.shape) # print("Output logits list:", output_logits_list) if len(output_logits_list) > 0: diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index a672fbee55..58987fb71d 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -43,6 +43,11 @@ log = logging.getLogger(__name__) +try: + import tensorrt_llm + TENSORRT_LLM = True +except: + TENSORRT_LLM = False def build_evaluators( 
eval_loader_config: Optional[Union[DictConfig, ListConfig]], @@ -491,9 +496,13 @@ def _validate_cfg(icl_cfg: DictConfig): metric_names = list(icl_cfg.metric_names) # TODO: fix Composer bug when copying local paths and destination exists destination_path = f'{destination_dir}/{icl_cfg.label}-{num_fewshot}.jsonl' - if dist.get_local_rank() == 0 and os.path.exists(destination_path): - os.remove(destination_path) - dist.barrier() + if TENSORRT_LLM: + if tensorrt_llm.mpi_rank() == 0 and os.path.exists(destination_path): + os.remove(destination_path) + else: + if dist.get_global_rank() == 0 and os.path.exists(destination_path): + os.remove(destination_path) + dist.barrier() dataloaders = get_icl_task_dataloader( icl_cfg.icl_task_type, diff --git a/scripts/eval/eval_trt_multigpu.py b/scripts/eval/eval_trt_multigpu.py new file mode 100644 index 0000000000..809b1a53e3 --- /dev/null +++ b/scripts/eval/eval_trt_multigpu.py @@ -0,0 +1,407 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import logging +import os +import sys +import time +import warnings +from typing import Any, Dict, List, Optional, Tuple, Union + +import pandas as pd +import torch +from composer.loggers.logger_destination import LoggerDestination +from composer.models.base import ComposerModel +from composer.trainer import Trainer +from composer.utils import dist, get_device, reproducibility +from omegaconf import DictConfig, ListConfig +from omegaconf import OmegaConf as om +from transformers import (AutoModelForCausalLM, PreTrainedTokenizerBase, + T5ForConditionalGeneration) + +# from llmfoundry.models import MPTForCausalLM +from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY +from llmfoundry.utils.builders import (add_metrics_to_eval_loaders, + build_evaluators, build_logger, + build_tokenizer) +from llmfoundry.utils.config_utils import pop_config, process_init_device + + + +def load_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, + fsdp_config: Optional[Dict], num_retries: int) -> ComposerModel: + init_context = process_init_device(model_cfg, fsdp_config) + + retries = 0 + composer_model = None + with init_context: + while retries < num_retries and composer_model is None: + try: + composer_model = COMPOSER_MODEL_REGISTRY[model_cfg.name]( + model_cfg, tokenizer) + except Exception as e: + retries += 1 + if retries >= num_retries: + raise e + else: + print( + f'Got exception {str(e)} while loading model {model_cfg.name}. 
{num_retries-retries} retries remaining' + ) + + assert composer_model is not None + return composer_model + + +def evaluate_model( + model_cfg: DictConfig, + dist_timeout: Union[float, int], + run_name: str, + seed: int, + icl_tasks: Union[str, ListConfig], + max_seq_len: int, + device_eval_batch_size: int, + eval_gauntlet_config: Optional[Union[str, DictConfig]], + eval_loader_config: Optional[Union[DictConfig, ListConfig]], + fsdp_config: Optional[Dict], + num_retries: int, + loggers_cfg: Dict[str, Any], + python_log_level: Optional[str], + precision: str, + eval_gauntlet_df: Optional[pd.DataFrame], + icl_subset_num_batches: Optional[int], +): + + print(f'Evaluating model: {model_cfg.model_name}', flush=True) + # Build tokenizer and model + tokenizer_cfg: Dict[str, + Any] = om.to_container(model_cfg.tokenizer, + resolve=True) # type: ignore + tokenizer_name = tokenizer_cfg['name'] + tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) + + evaluators, logger_keys, eval_gauntlet_callback = build_evaluators( + eval_loader_config, + icl_tasks, + eval_gauntlet_config, + tokenizer=tokenizer, + device_eval_batch_size=device_eval_batch_size, + icl_seq_len=max_seq_len, + icl_subset_num_batches=icl_subset_num_batches, + ) + + callbacks = [] + if eval_gauntlet_callback is not None: + callbacks.append(eval_gauntlet_callback) + + loggers: List[LoggerDestination] = [ + build_logger(name, logger_cfg) + for name, logger_cfg in loggers_cfg.items() + ] + + if fsdp_config and model_cfg.model.get('load_in_8bit', False): + raise ValueError( + 'The FSDP config block is not supported when loading ' + + 'Hugging Face models in 8bit.') + + if hasattr(model_cfg.model, 'pretrained_lora_id_or_path'): + composer_model = load_peft_model(model_cfg.model, tokenizer, + num_retries) + else: + composer_model = load_model(model_cfg.model, tokenizer, fsdp_config, + num_retries) + + # Now add the eval metrics + if eval_loader_config is not None: + train_metrics = composer_model.get_metrics(is_train=True) + evaluators = add_metrics_to_eval_loaders(evaluators, train_metrics) + + if eval_gauntlet_df is None and eval_gauntlet_callback is not None: + eval_gauntlet_df = pd.DataFrame( + columns=['model_name'] + + [avg for avg in eval_gauntlet_callback.averages] + + [t.name for t in eval_gauntlet_callback.categories]) + + load_path = model_cfg.get('load_path', None) + if model_cfg.model.name == 'mpt_causal_lm' and load_path is None: + raise ValueError( + 'MPT causal LMs require a load_path to the checkpoint for model evaluation.' + + + ' Please check your yaml and the model_cfg to ensure that load_path is set.' 
+ ) + + assert composer_model is not None + + trainer = Trainer( + run_name=run_name, + seed=seed, + model=composer_model, + callbacks=callbacks, + loggers=loggers, + precision=precision, + fsdp_config=fsdp_config, + load_path=load_path, + load_weights_only=True, + progress_bar=False, + log_to_console=True, + dist_timeout=dist_timeout, + python_log_level=python_log_level, + ) + + if torch.cuda.is_available(): + torch.cuda.synchronize() + a = time.time() + trainer.eval(eval_dataloader=evaluators) + if torch.cuda.is_available(): + torch.cuda.synchronize() + b = time.time() + print(f'Ran {model_cfg.model_name} eval in: {b-a} seconds') + return (trainer, logger_keys, eval_gauntlet_callback, eval_gauntlet_df) + + +def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: + om.resolve(cfg) + model_configs: ListConfig = pop_config(cfg, 'models', must_exist=True) + eval_gauntlet_config: Optional[Union[str, DictConfig]] = pop_config( + cfg, 'eval_gauntlet', must_exist=False, default_value=None) + if eval_gauntlet_config is None: + eval_gauntlet_config = pop_config(cfg, + 'model_gauntlet', + must_exist=False, + default_value=None) + if eval_gauntlet_config: + print( + 'Use of the key `model_gauntlet` is deprecated, please use the key `eval_gauntlet`' + ) + + fsdp_dict_cfg: Optional[DictConfig] = pop_config(cfg, + 'fsdp_config', + must_exist=False, + default_value=None) + fsdp_config: Optional[Dict] = om.to_container( + fsdp_dict_cfg, + resolve=True) if fsdp_dict_cfg is not None else None # type: ignore + assert isinstance(fsdp_config, Dict) or fsdp_config is None + + # Mandatory Evaluation Parameters + icl_tasks: Union[str, ListConfig] = pop_config(cfg, + 'icl_tasks', + must_exist=True) + max_seq_len: int = pop_config(cfg, 'max_seq_len', must_exist=True) + device_eval_batch_size: int = pop_config(cfg, + 'device_eval_batch_size', + must_exist=True) + precision: str = pop_config(cfg, + 'precision', + must_exist=False, + default_value=None) + python_log_level: Optional[str] = pop_config(cfg, + 'python_log_level', + must_exist=False, + default_value='debug') + + # Optional Evaluation Parameters with default values + eval_loader_config: Optional[Union[DictConfig, ListConfig]] = pop_config( + cfg, 'eval_loader', must_exist=False, default_value=None) + seed: int = pop_config(cfg, 'seed', must_exist=False, default_value=17) + dist_timeout: Union[float, int] = pop_config(cfg, + 'dist_timeout', + must_exist=False, + default_value=600.0) + default_run_name: str = os.environ.get('RUN_NAME', 'llm') + run_name: str = pop_config(cfg, + 'run_name', + must_exist=False, + default_value=default_run_name) + num_retries: int = pop_config(cfg, + 'num_retries', + must_exist=False, + default_value=3) + loggers_cfg: Dict[str, Any] = pop_config(cfg, + 'loggers', + must_exist=False, + default_value={}) + icl_subset_num_batches: int = pop_config(cfg, + 'icl_subset_num_batches', + must_exist=False, + default_value=None) + # Pop out interpolation variables. + pop_config(cfg, 'model_name_or_path', must_exist=False, default_value=None) + + # Warn for unused parameters + for key in cfg: + warnings.warn( + f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary.' 
+ ) + + reproducibility.seed_all(seed) + # dist.initialize_dist(get_device(None), timeout=dist_timeout) + + if python_log_level is not None: + logging.basicConfig( + # Example of format string + # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here + format= + f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' + ) + logging.getLogger('llmfoundry').setLevel(python_log_level.upper()) + + eval_gauntlet_df = None + models_df = None + composite_scores = None + trainers = [] + for model_cfg in model_configs: + (trainer, logger_keys, eval_gauntlet_callback, + eval_gauntlet_df) = evaluate_model( + model_cfg=model_cfg, + dist_timeout=dist_timeout, + run_name=run_name, + seed=seed, + icl_tasks=icl_tasks, + max_seq_len=max_seq_len, + device_eval_batch_size=device_eval_batch_size, + eval_gauntlet_config=eval_gauntlet_config, + eval_loader_config=eval_loader_config, + fsdp_config=fsdp_config, + num_retries=num_retries, + loggers_cfg=loggers_cfg, + python_log_level=python_log_level, + precision=precision, + eval_gauntlet_df=eval_gauntlet_df, + icl_subset_num_batches=icl_subset_num_batches) + trainers.append(trainer) + + if eval_gauntlet_callback is not None: + composite_scores = eval_gauntlet_callback.eval_after_all( + trainer.state, trainer.logger) + + benchmark_to_taxonomy = {} + if eval_gauntlet_callback is not None: + for t in eval_gauntlet_callback.categories: + for b in t.benchmarks: + benchmark_to_taxonomy[b.name] = t.name + + model_results = calculate_markdown_results(logger_keys, trainer, + benchmark_to_taxonomy, + model_cfg.model_name) + + if models_df is None: + models_df = model_results + else: + models_df = pd.concat([models_df, model_results], ignore_index=True) + + if eval_gauntlet_df is not None and eval_gauntlet_callback is not None: + assert composite_scores is not None + row = {'model_name': model_cfg['model_name']} + row.update( + {k.split('/')[-1]: v for k, v in composite_scores.items()}) + eval_gauntlet_df = pd.concat( + [eval_gauntlet_df, pd.DataFrame([row])], ignore_index=True) + + print(f'Printing gauntlet results for all models') + + print( + eval_gauntlet_df.sort_values( + list(eval_gauntlet_callback.averages.keys())[0], + ascending=False).to_markdown(index=False)) + print(f'Printing complete results for all models') + assert models_df is not None + print(models_df.to_markdown(index=False)) + + return trainers, eval_gauntlet_df + + +def calculate_markdown_results(logger_keys: List[str], trainer: Trainer, + benchmark_to_taxonomy: Dict[str, str], + model_name: str): + results = {} + + for key in logger_keys: + # dl_name is either 2-tuple (benchmark_name, num_fewshot) + # or 3-tuple (benchmark_name, num_fewshot, subcategory) + dl_name, metric_name = key.split('/')[1:-1], key.split('/')[-1] + if 'Accuracy' not in metric_name: + continue + + metric = trainer.state.eval_metrics.get('/'.join(dl_name), + {}).get(metric_name, None) + + if metric is None: + continue + if dl_name[1] not in results: + results[dl_name[1]] = {} + + if dl_name[0] not in results[dl_name[1]]: + results[dl_name[1]][dl_name[0]] = {} + + if metric_name not in results[dl_name[1]][dl_name[0]]: + results[dl_name[1]][dl_name[0]][metric_name] = [] + + results[dl_name[1]][dl_name[0]][metric_name].append({ + 'val': metric.compute(), + 'subcat': dl_name[-1] if len(dl_name) == 3 else 'no_subcat' + }) + + df = pd.DataFrame(columns=[ + 'Category', 'Benchmark', 'Subtask', 'Accuracy', 'Number few shot', + 'Model' + ]) + + for num_shot in results: 
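    # For orientation, the nested structure walked below, assuming logger keys
    # shaped roughly like 'metrics/<benchmark>/<num_fewshot>/[<subcategory>/]<MetricName>'
    # (benchmark names and values here are illustrative, not from a real run):
    #
    #   results = {
    #       '3': {                                    # num_fewshot segment
    #           'jeopardy': {
    #               'InContextLearningLMAccuracy': [
    #                   {'val': 0.41, 'subcat': 'science'},
    #                   {'val': 0.38, 'subcat': 'history'},
    #               ],
    #           },
    #       },
    #   }
    #
    # A single-entry list becomes one plain row; a longer list produces an
    # 'Average' row plus one row per subcategory, as handled below.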
+ for benchmark in results[num_shot]: + for metric in results[num_shot][benchmark]: + subscores = results[num_shot][benchmark][metric] + if len(subscores) == 1: + row = { + 'Category': benchmark_to_taxonomy.get(benchmark, ''), + 'Benchmark': benchmark, + 'Subtask': None, + 'Accuracy': subscores[0]['val'], + 'Number few shot': num_shot, + 'Model': model_name + } + df = pd.concat([df, pd.DataFrame([row])], ignore_index=True) + else: + row = { + 'Category': + benchmark_to_taxonomy.get(benchmark, ''), + 'Benchmark': + benchmark, + 'Subtask': + 'Average', + 'Accuracy': + sum(s['val'] for s in subscores) / len(subscores), + 'Number few shot': + num_shot, + 'Model': + model_name + } + df = pd.concat([df, pd.DataFrame([row])], ignore_index=True) + for sub in subscores: + row = { + 'Category': + benchmark_to_taxonomy.get(benchmark, ''), + 'Benchmark': + None, + 'Subtask': + sub['subcat'], + 'Accuracy': + sub['val'], + 'Number few shot': + num_shot, + 'Model': + model_name + } + df = pd.concat([df, pd.DataFrame([row])], + ignore_index=True) + return df + + +if __name__ == '__main__': + yaml_path, args_list = sys.argv[1], sys.argv[2:] + with open(yaml_path) as f: + yaml_cfg = om.load(f) + cli_cfg = om.from_cli(args_list) + cfg = om.merge(yaml_cfg, cli_cfg) + assert isinstance(cfg, DictConfig) + main(cfg) diff --git a/scripts/eval/evaluate_trtllm_test.py b/scripts/eval/evaluate_trtllm_test.py index 682fb5295b..8b1ab7b8ef 100644 --- a/scripts/eval/evaluate_trtllm_test.py +++ b/scripts/eval/evaluate_trtllm_test.py @@ -5,7 +5,7 @@ # All this can be written in YAML form. -from eval import main as run_evaluation +from eval_trt_multigpu import main as run_evaluation from omegaconf import OmegaConf as om from omegaconf import DictConfig @@ -38,12 +38,12 @@ 'eval_gauntlet': './eval/yamls/mini_eval_gauntlet_v0.2.yaml', } -trt_llama_config = { +trt_llama7b_config = { 'run_name': 'trtllm-eval', 'seed': 0, 'max_seq_len': 2048, - 'device_eval_batch_size': 4, - 'precision': 'amp_bf16', + 'device_eval_batch_size': 32, + 'precision': 'amp_fp16', 'dist_timeout': 6000, 'models': [ @@ -79,7 +79,7 @@ 'seed': 0, 'max_seq_len': 2048, 'device_eval_batch_size': 4, - 'precision': 'amp_bf16', + 'precision': 'amp_fp16', 'dist_timeout': 6000, 'models': [ @@ -89,7 +89,7 @@ { 'name': 'trtllm', 'version': 'llama', - 'engine_dir': '/workspace/tensorrt-llm-private/examples/llama/tmp/llama/70B-chat-quality-eval/trt_engines/int8_kv_cache_weight_only/8-gpu', + 'engine_dir': '/workspace/tensorrt-llm-private/examples/llama/tmp/llama/70B-chat-quality-eval/trt_engines/fp8/8-gpu', 'log_level': 'error', 'eos_token_id': 2, 'pad_token_id': 2 From f646eddf762333cdfb91baedac15b72a5cbe4425 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Fri, 22 Dec 2023 02:45:05 -0800 Subject: [PATCH 11/21] Metric device updates --- .../models/inference_api_wrapper/interface.py | 9 ++---- .../models/inference_api_wrapper/trtllm.py | 29 ++++++++++++++++--- ...uate_trtllm_test.py => run_trtllm_eval.py} | 16 ++++++---- 3 files changed, 38 insertions(+), 16 deletions(-) rename scripts/eval/{evaluate_trtllm_test.py => run_trtllm_eval.py} (84%) diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py index 6ebc472f91..78a63d202a 100644 --- a/llmfoundry/models/inference_api_wrapper/interface.py +++ b/llmfoundry/models/inference_api_wrapper/interface.py @@ -28,7 +28,7 @@ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer): eval_metrics = [ LanguageCrossEntropy(), LanguagePerplexity(), - 
InContextLearningLMAccuracy() + InContextLearningLMAccuracy(), InContextLearningMultipleChoiceAccuracy(), InContextLearningQAAccuracy(), InContextLearningLMExpectedCalibrationError(), @@ -94,18 +94,15 @@ def eval_forward(self, batch: Batch, outputs: Optional[Any] = None): def update_metric(self, batch: Any, outputs: Any, metric: Metric) -> None: batch = self.rebatch(batch) + metric = metric.to(device=outputs.device) self.labels = batch.pop('labels') self.labels[:, :-1] = self.labels[:, 1:].clone() self.labels[:, -1] = -100 + # print("Devices:", outputs.get_device(), self.labels.get_device(), metric.device) if isinstance(metric, InContextLearningMetric) and batch.get( 'mode', None) == 'icl_task': assert self.labels is not None metric.update(batch, outputs, self.labels) - for batch_idx, cont_idx in enumerate(batch['continuation_indices']): - cont_tok_pred = outputs[batch_idx].index_select(dim=0, index=cont_idx - 1).argmax(dim=-1) - cont_tok_targ = self.labels[batch_idx].index_select(dim=0, index=cont_idx - 1) - print("Ground Truth Label:", self.tokenizer.decode(self.labels[batch_idx].tolist()[:-1])) - print("Model output:", self.tokenizer.decode(cont_tok_pred)) else: raise NotImplementedError( 'Inference API wrapper only supports InContextLearningMetrics and mode=icl_task' diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index ccf5f728e6..4e77184a24 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -119,6 +119,7 @@ def __init__( tp_size=tp_size, pp_size=pp_size) self.device_num = runtime_rank % runtime_mapping.gpus_per_node + self.device = torch.device('cuda:' + str(self.device_num)) torch.cuda.set_device(self.device_num) # Tokenization and sampling @@ -149,6 +150,26 @@ def __init__( print("!!! Initialized generation session for rank:", runtime_rank) torch.cuda.synchronize() + + # Move metrics to proper device (doesn't help, have to do this in update_metric()) + # for key, value in self.eval_metrics.items(): + # self.eval_metrics[key] = value.to(device=self.device) + # print("Eval metric now at:", self.eval_metrics[key].device) + + def rebatch(self, batch): + """ + Move tensors in batch to the correct GPU. 
+ """ + if isinstance(batch, dict): + for key, value in batch.items(): + batch[key] = self.rebatch(value) + return batch + elif isinstance(batch, torch.Tensor): + return batch.to(device=self.device) + elif isinstance(batch, list): + return [self.rebatch(b) for b in batch] + + return batch def eval_forward(self, batch, outputs: Optional[Any] = None): @@ -167,10 +188,10 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): prompt = tokens[:cont_idxs[0]] - input_ids = torch.tensor([prompt], dtype=torch.int, device="cuda:" + str(self.device_num)) + input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, - device="cuda:"+ str(self.device_num)) + device=self.device) # print("prompt:", self.tokenizer.decode(prompt)) # print("Input device:", input_ids.get_device()) #print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) @@ -193,7 +214,7 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): # print("Output ids:", output_dict['output_ids'][0][0][cont_idxs[0]:].tolist()) for i in range(len(output_logits_list)): output_logits_list[i] = output_logits_list[i].squeeze() - print("Output ids:", self.tokenizer.decode(output_dict['output_ids'][0][0].tolist())) + # print("Output ids:", self.tokenizer.decode(output_dict['output_ids'][0][0].tolist())) # print("Context logits:", context_logits.shape) # print("Output logits list:", output_logits_list) if len(output_logits_list) > 0: @@ -219,7 +240,7 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): output_logits_batch.append(padded_combined_logits) - return torch.stack(output_logits_batch).to(batch['input_ids'].device) + return torch.stack(output_logits_batch).to(self.device) #(batch['input_ids'].device) #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) """ diff --git a/scripts/eval/evaluate_trtllm_test.py b/scripts/eval/run_trtllm_eval.py similarity index 84% rename from scripts/eval/evaluate_trtllm_test.py rename to scripts/eval/run_trtllm_eval.py index 8b1ab7b8ef..4bce4d4170 100644 --- a/scripts/eval/evaluate_trtllm_test.py +++ b/scripts/eval/run_trtllm_eval.py @@ -9,6 +9,10 @@ from omegaconf import OmegaConf as om from omegaconf import DictConfig + +trt_folder_path = '/workspace/TensorRT-LLM/' + + # GPT config is just for quick initial testing purposes trt_gpt_config = { 'run_name': 'trtllm-eval', @@ -25,7 +29,7 @@ { 'name': 'trtllm', 'version': 'gpt', - 'engine_dir': '/workspace/tensorrt-llm-private/examples/gpt/engine_outputs', + 'engine_dir': trt_folder_path + 'examples/gpt/engine_outputs', 'log_level': 'error' }, 'tokenizer': @@ -53,7 +57,7 @@ { 'name': 'trtllm', 'version': 'llama', - 'engine_dir': '/workspace/tensorrt-llm-private/examples/llama/tmp/llama/7B-chat-quality-eval/trt_engines/int8_kv_cache_weight_only/1-gpu', + 'engine_dir': trt_folder_path + 'examples/llama/tmp/llama/7B-chat-quality-eval/trt_engines/int8_kv_cache_weight_only/1-gpu', 'log_level': 'error', 'eos_token_id': 2, 'pad_token_id': 2 @@ -78,7 +82,7 @@ 'run_name': 'trtllm-eval', 'seed': 0, 'max_seq_len': 2048, - 'device_eval_batch_size': 4, + 'device_eval_batch_size': 8, 'precision': 'amp_fp16', 'dist_timeout': 6000, 'models': @@ -89,7 +93,7 @@ { 'name': 'trtllm', 'version': 'llama', - 'engine_dir': '/workspace/tensorrt-llm-private/examples/llama/tmp/llama/70B-chat-quality-eval/trt_engines/fp8/8-gpu', + 'engine_dir': trt_folder_path + 'examples/llama/tmp/llama/70B-chat-quality-eval/trt_engines/fp8/8-gpu', 
'log_level': 'error', 'eos_token_id': 2, 'pad_token_id': 2 @@ -100,8 +104,8 @@ } } ], - 'icl_tasks': './eval/yamls/mini_tasks_v0.2.yaml', - 'eval_gauntlet': './eval/yamls/mini_eval_gauntlet_v0.2.yaml', + 'icl_tasks': './eval/yamls/tasks_v0.2.yaml', + 'eval_gauntlet': './eval/yamls/eval_gauntlet_v0.2.yaml', 'loggers': { 'wandb': { 'project': 'nik-quant-eval' From dfa30b813d3e4b2884147358d1f1df0bfc2f7ae2 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Wed, 3 Jan 2024 23:38:32 -0800 Subject: [PATCH 12/21] Update interface to support QA tasks --- .../inference_api_wrapper/.trtllm.py.swp | Bin 20480 -> 0 bytes .../models/inference_api_wrapper/interface.py | 10 +- .../models/inference_api_wrapper/trtllm.py | 95 ++++++++---- scripts/eval/run_trtllm_eval.py | 128 ++++++++-------- scripts/eval/yamls/lm_tasks_v0.2.yaml | 59 ++++++++ scripts/eval/yamls/mini_tasks_v0.2.yaml | 23 ++- scripts/eval/yamls/qa_mc_tasks_v0.2.yaml | 138 ++++++++++++++++++ scripts/eval/yamls/tasks_v0.2.yaml | 61 ++++---- 8 files changed, 383 insertions(+), 131 deletions(-) delete mode 100644 llmfoundry/models/inference_api_wrapper/.trtllm.py.swp create mode 100644 scripts/eval/yamls/lm_tasks_v0.2.yaml create mode 100644 scripts/eval/yamls/qa_mc_tasks_v0.2.yaml diff --git a/llmfoundry/models/inference_api_wrapper/.trtllm.py.swp b/llmfoundry/models/inference_api_wrapper/.trtllm.py.swp deleted file mode 100644 index f7844d9b9842f3444bea914762c6b151c28895b9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20480 zcmeHPUyLM08E@o|s2m!=2ZNa6R_Pg?>HWh&j?sx;ZWmo{?`~!H03Ft*XS!yl-Jb5g z?wZ`A+>-(TP_rUT=cGOxhaDB)yzWve;f7-b2)*pG-7;zY;^!T7_ z{_cC+x$fP(*}bdVZ62BDPP5xT%A3y7e$VZ3=dOi2k2L2DV|F`?H zaN}M2o&Ow#F87mJFX;1_2R&{(k!NqmP89KYHjPshwW8gF^r_xg3{(tEFfdO0&7+6T z+_^B1nP?a@hi_*$-*jf8P<2}|P%%(3P%%(3P%%(3P%%(3P%-fT%s?7lZF~(Kd{5R% zC;NNV#NTVO`xDt;cjEok+5Pp|-{&XZXSda#ih+uOih+uOih+uOih+uOih+uOih+uO zih+uO{{aIIKIFfFcJH7A0L1_E@Bd$a&@g@gJO`Ww&H(Gc7l2vdM&J-&0DrjAFkS$D z2y}oi11;bl;0W+h;H^W3@iOo%@Hp@-;G4h@xDS{G-nhXqeh&N$_$lxta2mK4xCXcy z_|*pt<8|O=;CbLXfCqGeHt-qX5O4$V%l8|`FM#g@Tfh=w&-G0OJYZG2k?CBk=lrp(ntAH?A{`r-6roTY%TDHHG4R{WC z6u1+38;2x+27U~j1sGreuYmh=z%H-~m;jB(KX3^1Y<>`f;@K6&UXa$zRmWv%xXFW- z*h@!sNFLTJ+ZFVXQc&m6{A{gRCo{HV;53B*lP?Yt} z2V;6~)Zhq#l8n6q%U#?nu>4jDoMG!^4c<8?^}>KTfy)wS6!~5-P_~nXac^k##;#)} zJhffE<@Ihrz;wSO6o=* z%Ax1FJjhFx?vyZw%N!@%j^c0>rFJ?Fzy^lnUdK;LJnSigW_w}K_XZuaJNA4R%*lIm zL)c59iLen#ZPyQbn+H`iCo4diyN@VlB4@x|dvnX~IlUo2P?}zRZ^uSsKlSXhV;=9? zWbyk5?3j~<&5fd)!FXi*&MuFWeI=-3rMhbT_N}XmmFkv8b>GURa^?n`on^{owLT2F z;~v}vQR+3jt1znqkMA#H}y+)%Y@^$T^g(AP;4 z9;cZgt{E%@(sVA|UfOOZdu*cKDMV@@ky>N7QJ4&Yv#6m*j7wxx(t z8S*gGitN*@L2~JuVQV~PTMi~)AAT(JOuCno-U-sanuLMnhmKq8t6|vo(jf~Y9@Jzx zkeUt~%#6)Cb0AewQ+f|!To}3SdNG@2rhI2n3u-V*McQl?Vr#|9+A-Bjb35THUf?MP6##szyU@ zxoDSCDi3Gd+qAvDjTBW{`%o=OX-c@+vQ<|c75jn~TaEcTEHU72wsP~hQRq+*(PE2U zFFh_66!n@`I(g^vJ}VS8jGm9FPxN5E&~(Y>No_gLsXey3TzU@8WWQscj^lut=P%g! 
zCynqgT3gRwu;e(HRzHkKPAa)*FfqeHN0cw^Vo4`uuqCpyu?w6LmlawpN#m?m_B*TE zV%W}NYk>OVCp4O2_mZiWM{$<#&}N1wv*#rKXvSYKsIa6Wj-ROm_JIy~tmijlF{ za>_>IBxPL=4YKjV@G{IcmSXwLIE0tTI4pS)=|Gl8(np3t40q>Ss# zxY$~@UzGPkKvt|Q!bsy?sB7D>aNCA{9U@sbH#jdtw5^i6EirD1H8r^#Pg5TEks6*W zb_sjU#HKKpgbb246`77xrjyhm9Pxoe{wFV~CX|u)kM_m6>|NUR=4FosyA5_aBFF6b zQXcs~-|OZj$ZzFx$i3m?S4_gzBi3->u`fH9y#IK4!1W_;S7k6Zkvg^%sH10S8zBt^(dfZ2l?WA>cva!vMwTzXTiw zZUNpxEdD9rNgx7F01WsI4k+FPo&`F}o$I(;bx%Tc=GR^Pi|iw{pBj&Q48Jyl#0ZAJrztM$-+^wsEA_vMA79o13VW55!WY zAVu|3FUX?2Qc)U~%j`rZu53~|p?Q1DG!i70MuHm9duWu|(CfeknCqmDs)Q#}K`8^b z^<17bm}y=l7b>$$E^T@8{lR0U6xJt@))Q4|$}bVpYs)dGn4DHqqo<-%U3nahEkH!oZn!dV)H78 zh3G2EqI&VWSW7eorQH_IOfQtJjKm1_LK+DcC|!SQdFqvIkx7$$z98_7T{XvUxfF3G z-!5BdD7DGgpYx(A`)oCEq5uyQ93cPwIu6cV$c{k9XkMoz&DTkebStkjNTWel<1TzG?RBd>@>*LWJNP!Cs2>QMeaDHXi%CxD{17_ik*7v<$9?>du2foFmT;%oExS}QrgDB=7Ra?eUwlBV z@q`j;^c~Rv(McqrQLI*UIs2x8Gzt={rikA?nQZMnPn2~~1xcdF4HOA%n7(Am>VaNd zPdIDqw|dcOSu^cv5A|x3S!nIa2YT^oNj0L+(_ZLRRkm7N{nU%7q_oK$w_dK4e None: batch = self.rebatch(batch) - metric = metric.to(device=outputs.device) + metric = metric.to(device=self.device) + self.labels = batch.pop('labels') + if isinstance(metric, InContextLearningQAAccuracy): + # Labels are strings, not tokens, for QA tasks. + metric.update(outputs, self.labels, batch) + return + self.labels[:, :-1] = self.labels[:, 1:].clone() self.labels[:, -1] = -100 - # print("Devices:", outputs.get_device(), self.labels.get_device(), metric.device) if isinstance(metric, InContextLearningMetric) and batch.get( 'mode', None) == 'icl_task': assert self.labels is not None diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index 4e77184a24..bcd158f731 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -111,6 +111,7 @@ def __init__( self.hidden_size = hidden_size self.vocab_size = vocab_size + self.max_seq_len = 2048 #TODO: Do Not hardcode # Device and rank runtime_rank = tensorrt_llm.mpi_rank() @@ -177,16 +178,80 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): # model's generate function. 
Strings will be returned from eval_forward output_logits_batch = [] batch = self.rebatch(batch) + + # Question-answering tasks + if 'continuation_indices' not in batch: + # batch['continuation_indices'] = torch.tensor([], dtype=torch.int, device=self.device) + # print("Batch:", batch) + output_strs = [] + + for tokens in batch['input_ids']: + seqlen = tokens.shape[0] + prompt = tokens.tolist() + eos_occurence = (tokens == 2).nonzero(as_tuple=True)[0] + end_prompt_idx = len(prompt) + if eos_occurence.shape[0] >= 1: + end_prompt_idx = eos_occurence[0] + prompt = prompt[:end_prompt_idx] + input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) + input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device=self.device) + #print("prompt:", self.tokenizer.decode(prompt)) + #print("promp tokens:", prompt) + #print("Input lengths:", input_lengths) + #print("Generation Length:", batch['generation_length']) + + with torch.no_grad(): + self.decoder.setup(input_lengths.size(0), + torch.max(input_lengths).item(), + batch['generation_length']) + + output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) + output_logits_list = output_dict['generation_logits'] + + for i in range(len(output_logits_list)): + output_logits_list[i] = output_logits_list[i].squeeze() + + #print("Shape:", output_dict['output_ids'].shape) + decoded_str = self.tokenizer.decode(output_dict['output_ids'][0][0].tolist()[len(prompt):]) + output_strs.append(decoded_str) + #print("Decoded OUTPUT:", decoded_str) + #print("-------------") + # print("Output ids:", output_dict['output_ids'][0][0].tolist()) + """ + context_logits = output_dict['context_logits'].squeeze() + output_logits_tensor = torch.stack(output_logits_list) + print("Context logits shape:", context_logits.shape) + print("Output logits shape:", output_logits_tensor.shape) + combined_logits = torch.cat([context_logits, output_logits_tensor]) + + padding = torch.nn.functional.one_hot( + torch.full( + (self.max_seq_len - combined_logits.shape[0],), + self.PAD_ID, + device=self.device + ), + num_classes=self.vocab_size) + padded_combined_logits = torch.cat([combined_logits, padding]) + + output_logits_batch.append(padded_combined_logits) + """ + return output_strs + + # Language modeling and multiple choice tasks for tokens, cont_idxs in zip(batch['input_ids'], batch['continuation_indices']): seqlen = tokens.shape[0] tokens = tokens.tolist() + # print("Continuation indices:", cont_idxs) cont_idxs = cont_idxs.tolist() - expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] + if len(cont_idxs) > 1: + expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] + prompt = tokens[:cont_idxs[0]] + else: + prompt = tokens + expected_cont_tokens = [tokens[-1]] - prompt = tokens[:cont_idxs[0]] - input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) input_lengths = torch.tensor([input_ids.size(1)], @@ -243,27 +308,3 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): return torch.stack(output_logits_batch).to(self.device) #(batch['input_ids'].device) #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) - """ - # Old logits logic, back before TRT-LLM natively returned logits - output_logits = torch.nn.functional.one_hot( - torch.tensor(tokens[1:cont_idxs[0]], device='cuda'), - num_classes=self.vocab_size) - - for i in range(len(output_logits_list)): - output_logits_list[i] = output_logits_list[i].squeeze() - - next_logit_tensor = 
torch.stack(output_logits_list) - output_logits = torch.cat([output_logits, next_logit_tensor]) - #print(output_logits.shape) - #print(output_ids[0][0][cont_idxs[0]:].tolist()) - padding = torch.nn.functional.one_hot(torch.full( - (seqlen - output_logits.shape[0],), - self.PAD_ID, - device=output_logits.device), - num_classes=self.vocab_size) - output_logits = torch.cat([output_logits, padding]) - #print("Output logits shape:", output_logits.shape) - output_logits_batch.append(output_logits) - - return torch.stack(output_logits_batch).to(batch['input_ids'].device) - """ diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index 4bce4d4170..5ed4e88f57 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -12,6 +12,10 @@ trt_folder_path = '/workspace/TensorRT-LLM/' +MINI_TASKS = './eval/yamls/mini_tasks_v0.2.yaml' +QA_MC_TASKS = './eval/yamls/qa_mc_tasks_v0.2.yaml' +ALL_TASKS = './eval/yamls/tasks_v0.2.yaml' +LM_TASKS = './eval/yamls/lm_tasks_v0.2.yaml' # GPT config is just for quick initial testing purposes trt_gpt_config = { @@ -42,79 +46,69 @@ 'eval_gauntlet': './eval/yamls/mini_eval_gauntlet_v0.2.yaml', } -trt_llama7b_config = { - 'run_name': 'trtllm-eval', - 'seed': 0, - 'max_seq_len': 2048, - 'device_eval_batch_size': 32, - 'precision': 'amp_fp16', - 'dist_timeout': 6000, - 'models': - [ - { - 'model_name': 'trtllm/llama', - 'model': - { - 'name': 'trtllm', - 'version': 'llama', - 'engine_dir': trt_folder_path + 'examples/llama/tmp/llama/7B-chat-quality-eval/trt_engines/int8_kv_cache_weight_only/1-gpu', - 'log_level': 'error', - 'eos_token_id': 2, - 'pad_token_id': 2 - }, - 'tokenizer': + +def get_llama_config(engine_dir, tokenizer_name, icl_tasks=QA_MC_TASKS): + return { + 'run_name': 'trtllm-eval', + 'seed': 0, + 'max_seq_len': 2048, + 'device_eval_batch_size': 8, # Llama-7B should be batch size 32 + 'precision': 'amp_fp16', + 'dist_timeout': 6000, + 'models': + [ { - 'name': '/workspace/llama-7b-chat-hf/' + 'model_name': 'trtllm/llama', + 'model': + { + 'name': 'trtllm', + 'version': 'llama', + 'engine_dir': engine_dir, + 'log_level': 'error', + 'eos_token_id': 2, + 'pad_token_id': 2 + }, + 'tokenizer': + { + 'name': tokenizer_name, + } + } + ], + 'icl_tasks': icl_tasks, + 'eval_gauntlet': './eval/yamls/eval_gauntlet_v0.2.yaml', + 'loggers': { + 'wandb': { + 'project': 'nik-quant-eval' } - } - ], - 'icl_tasks': './eval/yamls/mini_tasks_v0.2.yaml', - 'eval_gauntlet': './eval/yamls/mini_eval_gauntlet_v0.2.yaml', - 'loggers': { - 'wandb': { - 'project': 'nik-quant-eval' } } -} +def engine_dir_str(model_type, model_dir, variant, ngpus=8): + return f"{trt_folder_path}examples/{model_type}/tmp/{model_type}/{model_dir}/trt_engines/{variant}/{ngpus}-gpu" + + +LLAMA_TOK_DIR = '/workspace/llama-70b-chat-hf/' +LLAMA_7B_DIR = '7B-chat-quality-eval' +LLAMA_70B_DIR = '70B-chat-quality-eval' + + +llama7b_int8_config = get_llama_config(engine_dir_str('llama', LLAMA_7B_DIR, 'int8_kv_cache_weight_only', 1), LLAMA_TOK_DIR) +llama70b_fp8_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'fp8'), LLAMA_TOK_DIR) +llama70b_int8_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'int8_kv_cache_weight_only'), LLAMA_TOK_DIR) +llama70b_smoothquant_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'sq0.8'), LLAMA_TOK_DIR) +llama70b_fp16_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'fp16'), LLAMA_TOK_DIR) + + +def run_eval(config): + print("RUNNING EVAL") + om_dict_config: DictConfig = 
om.create(config) + print("OmegaConfig dictionary", om.to_yaml(om_dict_config)) + run_evaluation(om_dict_config) -trt_llama70b_config = { - 'run_name': 'trtllm-eval', - 'seed': 0, - 'max_seq_len': 2048, - 'device_eval_batch_size': 8, - 'precision': 'amp_fp16', - 'dist_timeout': 6000, - 'models': - [ - { - 'model_name': 'trtllm/llama', - 'model': - { - 'name': 'trtllm', - 'version': 'llama', - 'engine_dir': trt_folder_path + 'examples/llama/tmp/llama/70B-chat-quality-eval/trt_engines/fp8/8-gpu', - 'log_level': 'error', - 'eos_token_id': 2, - 'pad_token_id': 2 - }, - 'tokenizer': - { - 'name': '/workspace/llama-70b-chat-hf/' - } - } - ], - 'icl_tasks': './eval/yamls/tasks_v0.2.yaml', - 'eval_gauntlet': './eval/yamls/eval_gauntlet_v0.2.yaml', - 'loggers': { - 'wandb': { - 'project': 'nik-quant-eval' - } - } -} +run_eval(llama70b_int8_config) +run_eval(llama70b_fp16_config) +run_eval(llama70b_fp8_config) +run_eval(llama70b_smoothquant_config) -om_dict_config: DictConfig = om.create(trt_llama70b_config) -print("OmegaConfig dictionary", om.to_yaml(om_dict_config)) -run_evaluation(om_dict_config) diff --git a/scripts/eval/yamls/lm_tasks_v0.2.yaml b/scripts/eval/yamls/lm_tasks_v0.2.yaml new file mode 100644 index 0000000000..1f550aba6e --- /dev/null +++ b/scripts/eval/yamls/lm_tasks_v0.2.yaml @@ -0,0 +1,59 @@ +icl_tasks: +- + label: jeopardy + dataset_uri: eval/local_data/world_knowledge/jeopardy_all.jsonl + num_fewshot: [3] + icl_task_type: language_modeling + continuation_delimiter: "\nAnswer: " # this separates questions from answers + has_categories: true +- + label: bigbench_qa_wikidata + dataset_uri: eval/local_data/world_knowledge/bigbench_qa_wikidata.jsonl + num_fewshot: [3] + icl_task_type: language_modeling +- + label: bigbench_dyck_languages + dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_dyck_languages.jsonl + num_fewshot: [5] + icl_task_type: language_modeling +- + label: lambada_openai + dataset_uri: eval/local_data/language_understanding/lambada_openai.jsonl + num_fewshot: [0] + icl_task_type: language_modeling +- + label: bigbench_cs_algorithms + dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_cs_algorithms.jsonl + num_fewshot: [10] + icl_task_type: language_modeling +- + label: bigbench_operators + dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_operators.jsonl + num_fewshot: [3] + icl_task_type: language_modeling +- + label: simple_arithmetic_nospaces + dataset_uri: eval/local_data/symbolic_problem_solving/simple_arithmetic_nospaces.jsonl + num_fewshot: [5] + icl_task_type: language_modeling +- + label: simple_arithmetic_withspaces + dataset_uri: eval/local_data/symbolic_problem_solving/simple_arithmetic_withspaces.jsonl + num_fewshot: [5] + icl_task_type: language_modeling +- + label: pubmed_qa_labeled + dataset_uri: eval/local_data/reading_comprehension/pubmed_qa_labeled.jsonl + num_fewshot: [10] + icl_task_type: language_modeling +- + label: squad + dataset_uri: eval/local_data/reading_comprehension/squad.jsonl + num_fewshot: [3] + icl_task_type: language_modeling +- + label: coqa + dataset_uri: eval/local_data/reading_comprehension/coqa.jsonl + num_fewshot: [0] + icl_task_type: language_modeling + diff --git a/scripts/eval/yamls/mini_tasks_v0.2.yaml b/scripts/eval/yamls/mini_tasks_v0.2.yaml index 2366ccfb8d..e5e7c459d9 100644 --- a/scripts/eval/yamls/mini_tasks_v0.2.yaml +++ b/scripts/eval/yamls/mini_tasks_v0.2.yaml @@ -1,8 +1,21 @@ icl_tasks: +#- +# label: jeopardy +# dataset_uri: 
eval/local_data/world_knowledge/jeopardy_all.jsonl +# num_fewshot: [3] +# icl_task_type: language_modeling +# continuation_delimiter: "\nAnswer: " # this separates questions from answers +# has_categories: true - - label: jeopardy - dataset_uri: eval/local_data/world_knowledge/jeopardy_all.jsonl + label: triviaqa_sm_sub + dataset_uri: eval/local_data/world_knowledge/triviaqa_sm_sub.jsonl num_fewshot: [3] - icl_task_type: language_modeling - continuation_delimiter: "\nAnswer: " # this separates questions from answers - has_categories: true + icl_task_type: question_answering +#- +# label: gsm8k +# dataset_uri: eval/local_data/symbolic_problem_solving/gsm8k.jsonl +# num_fewshot: [8, 5] +# icl_task_type: question_answering +# cot_delimiter: ' #### ' +# continuation_delimiter: "\nA: Let's think step by step. " +# question_prelimiter: "Q: " diff --git a/scripts/eval/yamls/qa_mc_tasks_v0.2.yaml b/scripts/eval/yamls/qa_mc_tasks_v0.2.yaml new file mode 100644 index 0000000000..f566e593d0 --- /dev/null +++ b/scripts/eval/yamls/qa_mc_tasks_v0.2.yaml @@ -0,0 +1,138 @@ +icl_tasks: +- + label: triviaqa_sm_sub + dataset_uri: eval/local_data/world_knowledge/triviaqa_sm_sub.jsonl + num_fewshot: [3] + icl_task_type: question_answering +#BROKEN: https://github.com/mosaicml/llm-foundry/pull/824 +#- +# label: gsm8k +# dataset_uri: eval/local_data/symbolic_problem_solving/gsm8k.jsonl +# num_fewshot: [8, 5] +# icl_task_type: question_answering +# cot_delimiter: ' #### ' +# continuation_delimiter: "\nA: Let's think step by step. " +# question_prelimiter: "Q: " +#- +# label: agi_eval_sat_math +# dataset_uri: eval/local_data/symbolic_problem_solving/agi_eval_sat_math.jsonl +# num_fewshot: [3] +# icl_task_type: question_answering +# cot_delimiter: ' #### ' +# continuation_delimiter: "\nA: Let's think step by step. " +#- +# label: aqua +# dataset_uri: eval/local_data/symbolic_problem_solving/aqua.jsonl +# num_fewshot: [3] +# icl_task_type: question_answering +# cot_delimiter: ' #### ' +# continuation_delimiter: "\nA: Let's think step by step. 
" +#- +# label: svamp +# dataset_uri: eval/local_data/symbolic_problem_solving/svamp.jsonl +# num_fewshot: [5] +# icl_task_type: question_answering +# continuation_delimiter: "\nUsing the formula below:\n" +# cot_delimiter: ' #### ' +# question_prelimiter: "Q: " +- + label: arc_easy + dataset_uri: eval/local_data/world_knowledge/arc_easy.jsonl + num_fewshot: [3] + icl_task_type: multiple_choice + continuation_delimiter: "\nAnswer: " # this separates questions from answers +- + label: arc_challenge + dataset_uri: eval/local_data/world_knowledge/arc_challenge.jsonl + num_fewshot: [3, 25] + icl_task_type: multiple_choice + continuation_delimiter: "\nAnswer: " # this separates questions from answers +- + label: mmlu + dataset_uri: eval/local_data/world_knowledge/mmlu.jsonl + num_fewshot: [5] + icl_task_type: multiple_choice + continuation_delimiter: "\nAnswer: " # this separates questions from answers + has_categories: true +- + label: copa + dataset_uri: eval/local_data/commonsense_reasoning/copa.jsonl + num_fewshot: [0] + icl_task_type: multiple_choice +- + label: siqa + dataset_uri: eval/local_data/commonsense_reasoning/siqa.jsonl + num_fewshot: [3] + icl_task_type: multiple_choice +- + label: commonsense_qa + dataset_uri: eval/local_data/commonsense_reasoning/commonsense_qa.jsonl + num_fewshot: [0] + icl_task_type: multiple_choice +- + label: piqa + dataset_uri: eval/local_data/commonsense_reasoning/piqa.jsonl + num_fewshot: [0] + icl_task_type: multiple_choice + continuation_delimiter: "\nAnswer: " # this separates questions from answers +- + label: openbook_qa + dataset_uri: eval/local_data/commonsense_reasoning/openbook_qa.jsonl + num_fewshot: [10] + icl_task_type: multiple_choice +- + label: bigbench_strange_stories + dataset_uri: eval/local_data/commonsense_reasoning/bigbench_strange_stories.jsonl + num_fewshot: [0] + icl_task_type: multiple_choice +- + label: bigbench_strategy_qa + dataset_uri: eval/local_data/commonsense_reasoning/bigbench_strategy_qa.jsonl + num_fewshot: [0] + icl_task_type: multiple_choice +- + label: hellaswag + dataset_uri: eval/local_data/language_understanding/hellaswag.jsonl + num_fewshot: [0, 10] + icl_task_type: multiple_choice +- + label: winograd + dataset_uri: eval/local_data/language_understanding/winograd_wsc.jsonl + num_fewshot: [3] + icl_task_type: schema +- + label: winogrande + dataset_uri: eval/local_data/language_understanding/winogrande.jsonl + num_fewshot: [5] + icl_task_type: schema +- + label: bigbench_elementary_math_qa + dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_elementary_math_qa.jsonl + num_fewshot: [1] + icl_task_type: multiple_choice +- + label: agi_eval_lsat_ar + dataset_uri: eval/local_data/symbolic_problem_solving/agi_eval_lsat_ar.jsonl + num_fewshot: [5] + icl_task_type: multiple_choice +- + label: agi_eval_lsat_rc + dataset_uri: eval/local_data/reading_comprehension/agi_eval_lsat_rc.jsonl + num_fewshot: [5] + icl_task_type: multiple_choice +- + label: agi_eval_lsat_lr + dataset_uri: eval/local_data/reading_comprehension/agi_eval_lsat_lr.jsonl + num_fewshot: [5] + icl_task_type: multiple_choice +- + label: boolq + dataset_uri: eval/local_data/reading_comprehension/boolq.jsonl + num_fewshot: [0] + icl_task_type: multiple_choice + continuation_delimiter: "\nAnswer: " # this separates questions from answers +- + label: agi_eval_sat_en + dataset_uri: eval/local_data/reading_comprehension/agi_eval_sat_en.jsonl + num_fewshot: [5] + icl_task_type: multiple_choice diff --git a/scripts/eval/yamls/tasks_v0.2.yaml 
b/scripts/eval/yamls/tasks_v0.2.yaml index f5c9f20880..ea74f84a2f 100644 --- a/scripts/eval/yamls/tasks_v0.2.yaml +++ b/scripts/eval/yamls/tasks_v0.2.yaml @@ -11,36 +11,37 @@ icl_tasks: dataset_uri: eval/local_data/world_knowledge/triviaqa_sm_sub.jsonl num_fewshot: [3] icl_task_type: question_answering -- - label: gsm8k - dataset_uri: eval/local_data/symbolic_problem_solving/gsm8k.jsonl - num_fewshot: [8, 5] - icl_task_type: question_answering - cot_delimiter: ' #### ' - continuation_delimiter: "\nA: Let's think step by step. " - question_prelimiter: "Q: " -- - label: agi_eval_sat_math - dataset_uri: eval/local_data/symbolic_problem_solving/agi_eval_sat_math.jsonl - num_fewshot: [3] - icl_task_type: question_answering - cot_delimiter: ' #### ' - continuation_delimiter: "\nA: Let's think step by step. " -- - label: aqua - dataset_uri: eval/local_data/symbolic_problem_solving/aqua.jsonl - num_fewshot: [3] - icl_task_type: question_answering - cot_delimiter: ' #### ' - continuation_delimiter: "\nA: Let's think step by step. " -- - label: svamp - dataset_uri: eval/local_data/symbolic_problem_solving/svamp.jsonl - num_fewshot: [5] - icl_task_type: question_answering - continuation_delimiter: "\nUsing the formula below:\n" - cot_delimiter: ' #### ' - question_prelimiter: "Q: " +#BROKEN +#- +# label: gsm8k +# dataset_uri: eval/local_data/symbolic_problem_solving/gsm8k.jsonl +# num_fewshot: [8, 5] +# icl_task_type: question_answering +# cot_delimiter: ' #### ' +# continuation_delimiter: "\nA: Let's think step by step. " +# question_prelimiter: "Q: " +#- +# label: agi_eval_sat_math +# dataset_uri: eval/local_data/symbolic_problem_solving/agi_eval_sat_math.jsonl +# num_fewshot: [3] +# icl_task_type: question_answering +# cot_delimiter: ' #### ' +# continuation_delimiter: "\nA: Let's think step by step. " +#- +# label: aqua +# dataset_uri: eval/local_data/symbolic_problem_solving/aqua.jsonl +# num_fewshot: [3] +# icl_task_type: question_answering +# cot_delimiter: ' #### ' +# continuation_delimiter: "\nA: Let's think step by step. 
" +#- +# label: svamp +# dataset_uri: eval/local_data/symbolic_problem_solving/svamp.jsonl +# num_fewshot: [5] +# icl_task_type: question_answering +# continuation_delimiter: "\nUsing the formula below:\n" +# cot_delimiter: ' #### ' +# question_prelimiter: "Q: " - label: bigbench_qa_wikidata dataset_uri: eval/local_data/world_knowledge/bigbench_qa_wikidata.jsonl From 1c6037cb15c932f926ce473b893cd7ec23570e77 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Sat, 6 Jan 2024 02:01:42 -0800 Subject: [PATCH 13/21] Update scripts, fix batching --- .../models/inference_api_wrapper/trtllm.py | 56 ++++++++++++++++--- scripts/eval/eval_trt_multigpu.py | 5 +- scripts/eval/run_trtllm_eval.py | 10 ++-- 3 files changed, 58 insertions(+), 13 deletions(-) diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index bcd158f731..3c3beb9309 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -181,10 +181,56 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): # Question-answering tasks if 'continuation_indices' not in batch: - # batch['continuation_indices'] = torch.tensor([], dtype=torch.int, device=self.device) - # print("Batch:", batch) - output_strs = [] + # Batched version + batch_size = len(batch['input_ids']) + prompt_lens = [] + output_strs = [] # QA tasks return strings, not logits + max_prompt_len = 0 + + for tokens in batch['input_ids']: + prompt = tokens.tolist() + eos_occurence = (tokens == 2).nonzero(as_tuple=True)[0] + end_prompt_idx = len(prompt) + if eos_occurence.shape[0] >= 1: + end_prompt_idx = eos_occurence[0] + prompt_lens.append(end_prompt_idx) + if end_prompt_idx > max_prompt_len: + max_prompt_len = end_prompt_idx + + #if batch_size == 1: + # # Remove pad tokens + # prompt = batch['input_ids'][0].tolist()[:prompt_lens[0]] + # input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) + # input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device=self.device) + # else: + # Keep padding + input_ids = torch.narrow(batch['input_ids'], 1, 0, max_prompt_len).to(dtype=torch.int, device=self.device) + # input_ids = batch['input_ids'].to(dtype=torch.int, device=self.device) + input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) + + #if tensorrt_llm.mpi_rank() == 0: + # print("Prompt:", input_ids[7]) + #print("Input shape:", input_ids.shape) + #print("Input lengths:", input_lengths) + with torch.no_grad(): + self.decoder.setup(batch_size, + input_lengths.max().item(), + batch['generation_length']) + output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) + + #if tensorrt_llm.mpi_rank() == 0: + # print("Output:", [output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch['generation_length']] for i in range(batch_size)]) + # print("Output shape:", output_dict['output_ids'].shape) + #inp_len = input_ids.size(1) + + decoded_strs = [self.tokenizer.decode(output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch['generation_length']]) for i in range(batch_size)] + output_strs += decoded_strs + print("decoded strs:", decoded_strs) + return output_strs + + # Non-batched version + output_strs = [] for tokens in batch['input_ids']: seqlen = tokens.shape[0] prompt = tokens.tolist() @@ -206,10 +252,6 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): batch['generation_length']) output_dict = 
self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) - output_logits_list = output_dict['generation_logits'] - - for i in range(len(output_logits_list)): - output_logits_list[i] = output_logits_list[i].squeeze() #print("Shape:", output_dict['output_ids'].shape) decoded_str = self.tokenizer.decode(output_dict['output_ids'][0][0].tolist()[len(prompt):]) diff --git a/scripts/eval/eval_trt_multigpu.py b/scripts/eval/eval_trt_multigpu.py index 809b1a53e3..1a63fac90c 100644 --- a/scripts/eval/eval_trt_multigpu.py +++ b/scripts/eval/eval_trt_multigpu.py @@ -25,7 +25,7 @@ build_evaluators, build_logger, build_tokenizer) from llmfoundry.utils.config_utils import pop_config, process_init_device - +import tensorrt_llm def load_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, @@ -132,6 +132,9 @@ def evaluate_model( assert composer_model is not None + if tensorrt_llm.mpi_rank() > 0: + loggers = None + trainer = Trainer( run_name=run_name, seed=seed, diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index 5ed4e88f57..d37168e900 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -47,7 +47,7 @@ } -def get_llama_config(engine_dir, tokenizer_name, icl_tasks=QA_MC_TASKS): +def get_llama_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): return { 'run_name': 'trtllm-eval', 'seed': 0, @@ -105,10 +105,10 @@ def run_eval(config): print("OmegaConfig dictionary", om.to_yaml(om_dict_config)) run_evaluation(om_dict_config) - -run_eval(llama70b_int8_config) -run_eval(llama70b_fp16_config) -run_eval(llama70b_fp8_config) +#run_eval(llama7b_int8_config) +#run_eval(llama70b_int8_config) +#run_eval(llama70b_fp16_config) +#run_eval(llama70b_fp8_config) run_eval(llama70b_smoothquant_config) From 3e5b5eedd4a75c401d5df9a8a47f6524fd550cbd Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Thu, 11 Jan 2024 04:55:42 -0800 Subject: [PATCH 14/21] Update foundry: --- .../models/inference_api_wrapper/trtllm.py | 101 ++++++++---------- scripts/eval/run_trtllm_eval.py | 37 +------ 2 files changed, 50 insertions(+), 88 deletions(-) diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index 3c3beb9309..1bc2b8a1eb 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -4,10 +4,13 @@ """Implements a TRT-LLM evaluation model wrapped around a :class:`.ComposerModel`.""" +import os +import sys import json from pathlib import Path -from typing import Any, Optional +from typing import Any, Optional, List, Tuple +import warnings import torch from omegaconf import DictConfig from transformers import PreTrainedTokenizer @@ -49,6 +52,10 @@ def __init__( ): check_if_trt_llm_installed() + if tensorrt_llm.mpi_rank() != 0: + f = open(os.devnull, 'w') + sys.stdout = f + sys.stderr = f super().__init__(model_cfg, tokenizer) tensorrt_llm.logger.set_level(model_cfg['log_level']) @@ -74,10 +81,10 @@ def __init__( use_gpt_attention_plugin = bool( config['plugin_config']['gpt_attention_plugin']) remove_input_padding = config['plugin_config']['remove_input_padding'] - #if remove_input_padding: - # raise ValueError( - # 'TRT-LLM Evaluation Wrapper does not support remove_input_padding.' - # ) + if remove_input_padding: + raise ValueError( + 'TRT-LLM Evaluation Wrapper does not support remove_input_padding.' 
+ ) num_kv_heads = config['builder_config'].get('num_kv_heads', num_heads) paged_kv_cache = config['plugin_config']['paged_kv_cache'] @@ -111,7 +118,6 @@ def __init__( self.hidden_size = hidden_size self.vocab_size = vocab_size - self.max_seq_len = 2048 #TODO: Do Not hardcode # Device and rank runtime_rank = tensorrt_llm.mpi_rank() @@ -151,11 +157,6 @@ def __init__( print("!!! Initialized generation session for rank:", runtime_rank) torch.cuda.synchronize() - - # Move metrics to proper device (doesn't help, have to do this in update_metric()) - # for key, value in self.eval_metrics.items(): - # self.eval_metrics[key] = value.to(device=self.device) - # print("Eval metric now at:", self.eval_metrics[key].device) def rebatch(self, batch): """ @@ -169,24 +170,42 @@ def rebatch(self, batch): return batch.to(device=self.device) elif isinstance(batch, list): return [self.rebatch(b) for b in batch] - return batch + + + # Remove potential additional dim, cast to int32 + batch_input_ids = [ + x.flatten().type(torch.int32) for x in batch_input_ids + ] + input_lengths = [x.size(0) for x in batch_input_ids] + max_length = max(input_lengths) + # Right padding for trt-llm + paddings = [ + torch.ones(max_length - l, dtype=torch.int32, device=self.device) * pad_id + for l in input_lengths + ] + batch_input_ids = [ + torch.cat([x, pad]) for x, pad in zip(batch_input_ids, paddings) + ] + batch_input_ids = torch.stack(batch_input_ids) + input_lengths = torch.tensor(input_lengths, dtype=torch.int32, device=self.device) + return batch_input_ids, input_lengths def eval_forward(self, batch, outputs: Optional[Any] = None): - # If the batch mode is generate, we will generate a requested number of tokens using the underlying - # model's generate function. Strings will be returned from eval_forward + # Run TRTLLM forward pass output_logits_batch = [] batch = self.rebatch(batch) # Question-answering tasks if 'continuation_indices' not in batch: + """ # Batched version batch_size = len(batch['input_ids']) prompt_lens = [] - output_strs = [] # QA tasks return strings, not logits max_prompt_len = 0 - + # prompt_list = [] + for tokens in batch['input_ids']: prompt = tokens.tolist() eos_occurence = (tokens == 2).nonzero(as_tuple=True)[0] @@ -197,21 +216,12 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): if end_prompt_idx > max_prompt_len: max_prompt_len = end_prompt_idx - #if batch_size == 1: - # # Remove pad tokens - # prompt = batch['input_ids'][0].tolist()[:prompt_lens[0]] - # input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) - # input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device=self.device) - # else: - # Keep padding input_ids = torch.narrow(batch['input_ids'], 1, 0, max_prompt_len).to(dtype=torch.int, device=self.device) - # input_ids = batch['input_ids'].to(dtype=torch.int, device=self.device) input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) - #if tensorrt_llm.mpi_rank() == 0: - # print("Prompt:", input_ids[7]) - #print("Input shape:", input_ids.shape) - #print("Input lengths:", input_lengths) + print("Prompt:", input_ids) + print("Input shape:", input_ids.shape) + print("Input lengths:", input_lengths) with torch.no_grad(): self.decoder.setup(batch_size, @@ -219,15 +229,14 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): batch['generation_length']) output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) - #if tensorrt_llm.mpi_rank() == 0: - # print("Output:", 
[output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch['generation_length']] for i in range(batch_size)]) - # print("Output shape:", output_dict['output_ids'].shape) - #inp_len = input_ids.size(1) - - decoded_strs = [self.tokenizer.decode(output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch['generation_length']]) for i in range(batch_size)] - output_strs += decoded_strs - print("decoded strs:", decoded_strs) - return output_strs + output_ids = [output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch['generation_length']] for i in range(batch_size)] + + print("Output:", output_ids) + + decoded_strs = [self.tokenizer.decode(out) for out in output_ids] + # print("decoded strs:", decoded_strs) + return decoded_strs + """ # Non-batched version output_strs = [] @@ -259,24 +268,6 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): #print("Decoded OUTPUT:", decoded_str) #print("-------------") # print("Output ids:", output_dict['output_ids'][0][0].tolist()) - """ - context_logits = output_dict['context_logits'].squeeze() - output_logits_tensor = torch.stack(output_logits_list) - print("Context logits shape:", context_logits.shape) - print("Output logits shape:", output_logits_tensor.shape) - combined_logits = torch.cat([context_logits, output_logits_tensor]) - - padding = torch.nn.functional.one_hot( - torch.full( - (self.max_seq_len - combined_logits.shape[0],), - self.PAD_ID, - device=self.device - ), - num_classes=self.vocab_size) - padded_combined_logits = torch.cat([combined_logits, padding]) - - output_logits_batch.append(padded_combined_logits) - """ return output_strs # Language modeling and multiple choice tasks diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index d37168e900..cd53857179 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -17,35 +17,6 @@ ALL_TASKS = './eval/yamls/tasks_v0.2.yaml' LM_TASKS = './eval/yamls/lm_tasks_v0.2.yaml' -# GPT config is just for quick initial testing purposes -trt_gpt_config = { - 'run_name': 'trtllm-eval', - 'seed': 0, - 'max_seq_len': 1024, - 'device_eval_batch_size': 4, - 'precision': 'amp_fp16', - 'dist_timeout': 6000, - 'models': - [ - { - 'model_name': 'trtllm/gpt', - 'model': - { - 'name': 'trtllm', - 'version': 'gpt', - 'engine_dir': trt_folder_path + 'examples/gpt/engine_outputs', - 'log_level': 'error' - }, - 'tokenizer': - { - 'name': 'gpt2' - } - } - ], - 'icl_tasks': './eval/yamls/mini_tasks_v0.2.yaml', - 'eval_gauntlet': './eval/yamls/mini_eval_gauntlet_v0.2.yaml', -} - def get_llama_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): return { @@ -105,10 +76,10 @@ def run_eval(config): print("OmegaConfig dictionary", om.to_yaml(om_dict_config)) run_evaluation(om_dict_config) -#run_eval(llama7b_int8_config) -#run_eval(llama70b_int8_config) -#run_eval(llama70b_fp16_config) +# run_eval(llama7b_int8_config) +run_eval(llama70b_int8_config) +# run_eval(llama70b_fp16_config) #run_eval(llama70b_fp8_config) -run_eval(llama70b_smoothquant_config) +# run_eval(llama70b_smoothquant_config) From 9cc6deaa12bf793e4ac9b82191b2d4e4fa15b1c7 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Sun, 24 Mar 2024 17:29:04 -0700 Subject: [PATCH 15/21] update wrappers --- .../models/inference_api_wrapper/interface.py | 4 +- .../models/inference_api_wrapper/trtllm.py | 255 +++++++++++++----- scripts/eval/eval.py | 19 +- scripts/eval/eval_trt_multigpu.py | 4 +- 4 files changed, 207 insertions(+), 75 deletions(-) diff 
--git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py index b0e0f4adee..493ee14f02 100644 --- a/llmfoundry/models/inference_api_wrapper/interface.py +++ b/llmfoundry/models/inference_api_wrapper/interface.py @@ -102,6 +102,8 @@ def update_metric(self, batch: Any, outputs: Any, metric: Metric) -> None: return self.labels[:, :-1] = self.labels[:, 1:].clone() + #print("**** Labels:",self.tokenizer.decode(self.labels[0])) + #print("*******") self.labels[:, -1] = -100 if isinstance(metric, InContextLearningMetric) and batch.get( 'mode', None) == 'icl_task': @@ -111,7 +113,7 @@ def update_metric(self, batch: Any, outputs: Any, metric: Metric) -> None: raise NotImplementedError( 'Inference API wrapper only supports InContextLearningMetrics and mode=icl_task' ) - + def forward(self): raise NotImplementedError( "Inference API wrapper doesn't support forward") diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index 1bc2b8a1eb..efe62fa85e 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -37,10 +37,7 @@ def check_if_trt_llm_installed(): # From tensorrt_llm/examples/{model_name}/build.py def get_engine_name(model: str, dtype: str, tp_size: int, pp_size: int, rank: int): - if pp_size == 1: - return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) - return '{}_{}_tp{}_pp{}_rank{}.engine'.format(model, dtype, tp_size, - pp_size, rank) + return 'rank{}.engine'.format(rank) class TRTLLMEvalWrapper(InferenceAPIEvalWrapper): @@ -66,34 +63,43 @@ def __init__( with open(config_path, 'r') as f: config = json.load(f) - dtype = config['builder_config']['precision'] - tp_size = config['builder_config']['tensor_parallel'] - pp_size = config['builder_config'].get('pipeline_parallel', 1) + pretrained_config = config['pretrained_config'] + quantization_config = pretrained_config['quantization'] + build_config = config['build_config'] + plugin_config = build_config['plugin_config'] + + dtype = pretrained_config['dtype'] + tp_size = pretrained_config['mapping']['tp_size'] + pp_size = pretrained_config['mapping'].get('pp_size', 1) world_size = tp_size * pp_size assert world_size == tensorrt_llm.mpi_world_size(), \ f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - num_heads = config['builder_config']['num_heads'] // tp_size - hidden_size = config['builder_config']['hidden_size'] // tp_size - vocab_size = config['builder_config']['vocab_size'] - num_layers = config['builder_config']['num_layers'] - use_gpt_attention_plugin = bool( - config['plugin_config']['gpt_attention_plugin']) - remove_input_padding = config['plugin_config']['remove_input_padding'] + num_heads = pretrained_config['num_attention_heads'] // tp_size + hidden_size = pretrained_config['hidden_size'] // tp_size + + max_batch_size = build_config['max_batch_size'] + vocab_size = pretrained_config['vocab_size'] + num_layers = pretrained_config['num_hidden_layers'] + + use_gpt_attention_plugin = bool(plugin_config['gpt_attention_plugin']) + remove_input_padding = plugin_config['remove_input_padding'] if remove_input_padding: raise ValueError( 'TRT-LLM Evaluation Wrapper does not support remove_input_padding.' 
) - num_kv_heads = config['builder_config'].get('num_kv_heads', num_heads) - paged_kv_cache = config['plugin_config']['paged_kv_cache'] - tokens_per_block = config['plugin_config']['tokens_per_block'] - use_custom_all_reduce = config['plugin_config'].get('use_custom_all_reduce', + num_kv_heads = build_config.get('num_key_value_heads', num_heads) + paged_kv_cache = plugin_config['paged_kv_cache'] + tokens_per_block = plugin_config['tokens_per_block'] + use_custom_all_reduce = plugin_config.get('use_custom_all_reduce', False) - - quant_mode = QuantMode(config['builder_config']['quant_mode']) - if config['builder_config'].get('multi_query_mode', False): + quant_mode = QuantMode.from_quant_algo( + quant_algo=quantization_config['quant_algo'], + kv_cache_quant_algo=quantization_config['kv_cache_quant_algo']) + + if pretrained_config.get('multi_query_mode', False): tensorrt_llm.logger.warning( "`multi_query_mode` config is deprecated. Please rebuild the engine." ) @@ -101,6 +107,8 @@ def __init__( num_kv_heads = (num_kv_heads + tp_size - 1) // tp_size model_config = tensorrt_llm.runtime.ModelConfig( + max_batch_size=max_batch_size, + max_beam_width=1, vocab_size=vocab_size, num_layers=num_layers, num_heads=num_heads, @@ -113,11 +121,14 @@ def __init__( use_custom_all_reduce=use_custom_all_reduce, dtype=dtype, quant_mode=quant_mode, - gather_all_token_logits=True) + gather_context_logits=build_config.get('gather_context_logits', False), + gather_generation_logits=build_config.get('gather_generation_logits', False), + ) self.hidden_size = hidden_size self.vocab_size = vocab_size + self.max_output_len = build_config['max_output_len'] # Device and rank runtime_rank = tensorrt_llm.mpi_rank() @@ -129,6 +140,7 @@ def __init__( self.device = torch.device('cuda:' + str(self.device_num)) torch.cuda.set_device(self.device_num) + print("My rank:", runtime_rank) # Tokenization and sampling self.END_ID = model_cfg.get('eos_token_id', self.tokenizer.eos_token_id) self.PAD_ID = model_cfg.get('pad_token_id', self.tokenizer.pad_token_id) @@ -144,10 +156,8 @@ def __init__( pad_id=self.PAD_ID, num_beams=1, return_dict=True) - # Load TRT engine - engine_name = get_engine_name(model_cfg['version'], dtype, tp_size, pp_size, - runtime_rank) + engine_name = 'rank{}.engine'.format(runtime_rank) serialize_path = engine_dir / engine_name with open(serialize_path, 'rb') as f: engine_buffer = f.read() @@ -158,6 +168,7 @@ def __init__( print("!!! Initialized generation session for rank:", runtime_rank) torch.cuda.synchronize() + def rebatch(self, batch): """ Move tensors in batch to the correct GPU. 
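The hunks above move the wrapper to the newer engine layout, where config.json splits into pretrained_config and build_config and each rank's engine is serialized as rank<N>.engine. The sketch below condenses that bring-up path into one hypothetical helper (load_generation_session is not part of the patch); it reuses only runtime calls that already appear in this diff (Mapping, ModelConfig, GenerationSession), but exact field names and constructor signatures differ across TRT-LLM releases, so treat it as an illustration rather than a drop-in.

import json
from pathlib import Path

import tensorrt_llm
from tensorrt_llm.runtime import GenerationSession, ModelConfig


def load_generation_session(engine_dir: str) -> GenerationSession:
    engine_dir = Path(engine_dir)
    cfg = json.loads((engine_dir / 'config.json').read_text())
    pretrained, build = cfg['pretrained_config'], cfg['build_config']

    # Parallelism layout is recorded with the checkpoint, not the build.
    tp_size = pretrained['mapping']['tp_size']
    pp_size = pretrained['mapping'].get('pp_size', 1)
    world_size = tp_size * pp_size
    rank = tensorrt_llm.mpi_rank()
    assert world_size == tensorrt_llm.mpi_world_size(), \
        'engine tp_size * pp_size must match the launched world size'
    mapping = tensorrt_llm.Mapping(world_size, rank,
                                   tp_size=tp_size, pp_size=pp_size)

    model_config = ModelConfig(
        max_batch_size=build['max_batch_size'],
        max_beam_width=1,
        vocab_size=pretrained['vocab_size'],
        num_layers=pretrained['num_hidden_layers'],
        num_heads=pretrained['num_attention_heads'] // tp_size,
        # GQA engines record KV heads separately; fall back to MHA otherwise.
        num_kv_heads=pretrained.get('num_key_value_heads',
                                    pretrained['num_attention_heads']) // tp_size,
        hidden_size=pretrained['hidden_size'] // tp_size,
        gpt_attention_plugin=bool(build['plugin_config']['gpt_attention_plugin']),
        remove_input_padding=build['plugin_config']['remove_input_padding'],
        dtype=pretrained['dtype'],
    )

    # Newer builders emit one serialized engine per rank.
    engine_buffer = (engine_dir / f'rank{rank}.engine').read_bytes()
    return GenerationSession(model_config, engine_buffer, mapping)

From there, scoring a batch only needs decoder.setup(batch_size, max_context_length, max_new_tokens) followed by decoder.decode(input_ids, input_lengths, sampling_config, return_dict=True), which is what the eval_forward hunks below do.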
@@ -174,6 +185,7 @@ def rebatch(self, batch): # Remove potential additional dim, cast to int32 + """ batch_input_ids = [ x.flatten().type(torch.int32) for x in batch_input_ids ] @@ -187,10 +199,10 @@ def rebatch(self, batch): batch_input_ids = [ torch.cat([x, pad]) for x, pad in zip(batch_input_ids, paddings) ] - batch_input_ids = torch.stack(batch_input_ids) + batch_input_ids = torch.stack(batch_input_ids).to(device=self.device) input_lengths = torch.tensor(input_lengths, dtype=torch.int32, device=self.device) return batch_input_ids, input_lengths - + """ def eval_forward(self, batch, outputs: Optional[Any] = None): # Run TRTLLM forward pass @@ -199,16 +211,13 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): # Question-answering tasks if 'continuation_indices' not in batch: - """ # Batched version batch_size = len(batch['input_ids']) prompt_lens = [] max_prompt_len = 0 - # prompt_list = [] - for tokens in batch['input_ids']: prompt = tokens.tolist() - eos_occurence = (tokens == 2).nonzero(as_tuple=True)[0] + eos_occurence = (tokens == self.END_ID).nonzero(as_tuple=True)[0] end_prompt_idx = len(prompt) if eos_occurence.shape[0] >= 1: end_prompt_idx = eos_occurence[0] @@ -220,27 +229,51 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) print("Prompt:", input_ids) - print("Input shape:", input_ids.shape) - print("Input lengths:", input_lengths) - + #print("Input shape:", input_ids.shape) + #print("Input lengths:", input_lengths) + max_generation_length = 256 with torch.no_grad(): self.decoder.setup(batch_size, input_lengths.max().item(), - batch['generation_length']) + batch.get('generation_length', max_generation_length)) output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) - - output_ids = [output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch['generation_length']] for i in range(batch_size)] + #self.decoder.setup(1, + # input_lengths[:1].max().item(), + # batch.get('generation_length', max_generation_length)) + #output_dict2 = self.decoder.decode(input_ids[:1,:], input_lengths[:1], self.sampling_config, return_dict=True) + + #answer1 = output_dict['output_ids'][0].squeeze()[prompt_lens[0]:prompt_lens[0]+max_generation_length] + #answer2 = output_dict2['output_ids'][0].squeeze()[prompt_lens[0]:prompt_lens[0]+max_generation_length] + #all_equal = torch.equal(answer1, answer2) + """ + if not all_equal: + print("Prompt:", input_ids[0]) + print("Answer 1:", self.tokenizer.decode(answer1)) + print("Answer 2:", self.tokenizer.decode(answer2)) + print("Shape 1:", answer1.shape) + print("Shape 2", answer2.shape) + difference = answer1 - answer2 + nonzero_indices = difference.nonzero(as_tuple=True) + nonzero = difference[difference.nonzero(as_tuple=True)] + print("EQUAL?", all_equal) + print("Difference:", difference) + print("nonzero indices:", nonzero_indices) + print("Nonzero Elements", nonzero) + quit() + """ + output_ids = [output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch.get('generation_length', max_generation_length)] for i in range(batch_size)] - print("Output:", output_ids) + #print("Output:", output_ids) decoded_strs = [self.tokenizer.decode(out) for out in output_ids] - # print("decoded strs:", decoded_strs) + print("decoded strs:", decoded_strs) return decoded_strs - """ - + # Non-batched version + """ output_strs = [] for tokens in batch['input_ids']: + #print("RAW Tokens:", 
tokens) seqlen = tokens.shape[0] prompt = tokens.tolist() eos_occurence = (tokens == 2).nonzero(as_tuple=True)[0] @@ -250,51 +283,133 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): prompt = prompt[:end_prompt_idx] input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device=self.device) - #print("prompt:", self.tokenizer.decode(prompt)) + print("prompt:", self.tokenizer.decode(prompt)) #print("promp tokens:", prompt) #print("Input lengths:", input_lengths) #print("Generation Length:", batch['generation_length']) - + #print("Batch keys:", batch.keys()) + torch.cuda.synchronize() with torch.no_grad(): - self.decoder.setup(input_lengths.size(0), - torch.max(input_lengths).item(), - batch['generation_length']) - - output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) + self.decoder.setup(batch_size=input_lengths.size(0), + max_context_length=torch.max(input_lengths).item(), + max_new_tokens=batch.get('generation_length', 200)) + output_dict = self.decoder.decode( + input_ids, + input_lengths, + self.sampling_config, + #stopping_criteria=batch['stopping_criteria'], + return_dict=True) #print("Shape:", output_dict['output_ids'].shape) decoded_str = self.tokenizer.decode(output_dict['output_ids'][0][0].tolist()[len(prompt):]) output_strs.append(decoded_str) - #print("Decoded OUTPUT:", decoded_str) + print("Decoded OUTPUT:", decoded_str) #print("-------------") - # print("Output ids:", output_dict['output_ids'][0][0].tolist()) + #print("Output ids:", output_dict['output_ids'][0][0].tolist()) return output_strs + """ + ################# + # Batched version of language modeling/multiple choice tasks + batch_size = len(batch['input_ids']) + seqlen = batch['input_ids'].shape[1] + #print("Seq len:", seqlen) + prompt_lens = [] + continuation_lens = [] + for tokens, cont_idxs in zip(batch['input_ids'], + batch['continuation_indices']): + tokens = tokens.tolist() + cont_idxs = cont_idxs.tolist() + expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] + prompt = tokens[:cont_idxs[0]] + prompt_lens.append(cont_idxs[0]) + continuation_lens.append(len(expected_cont_tokens)) + + input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) + input_ids = torch.full((batch_size, max(prompt_lens)), fill_value=self.PAD_ID, device=self.device, dtype=torch.int) + for i in range(batch_size): + input_ids[i][:prompt_lens[i]] = batch['input_ids'][i][:prompt_lens[i]] + + #print("New batch shape", input_ids.shape) + #print("Continuation lengths:", continuation_lens) + #print("Prompt:", input_ids) + #print("Input shape:", input_ids.shape) + #print("Input lengths:", input_lengths) + with torch.no_grad(): + self.decoder.setup(batch_size, + input_lengths.max().item(), + max(continuation_lens)) + output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) + torch.cuda.synchronize() + + output_logits_list = output_dict['generation_logits'] + #print("Output logits list", output_logits_list) + # print("Output ids:", output_dict['output_ids'][0][0][cont_idxs[0]:].tolist()) + + # output_logits_list length is == max(continuation_lens) + # Output logits_list[i] is of shape (batch_size, vocab_size) + if len(output_logits_list) > 0: + #print("Shape:", output_logits_list[0].shape) + output_logits_tensor = torch.stack(output_logits_list, dim=1) + else: + output_logits_tensor = None + + #if output_logits_tensor is not None: + 
#print("Output logits tensor shape:", output_logits_tensor.shape) + + # Put together logits + # We loop through batch_size dimension rather than deal with NestedTensor + output_logits_batch = [] + for i in range(batch_size): + # First create context "logits" (one-hot vector with 1 at token position) + tokens = input_ids[i].tolist() + context_psuedologits = torch.nn.functional.one_hot( + torch.tensor(tokens[1:prompt_lens[i]], device=self.device), + num_classes=self.vocab_size) + # Then add generation logits (up to continuation_length) + if output_logits_tensor is not None: + output_logits_trimmed = output_logits_tensor[i][:continuation_lens[i]] + # print("Output logits trimmed shape:", output_logits_trimmed.shape) + combined_logits = torch.cat([context_psuedologits, output_logits_trimmed]) + else: + combined_logits = context_psuedologits + # Then pad with Padding token "logits" to end of sequence length + padding = torch.nn.functional.one_hot( + torch.full( + (seqlen - combined_logits.shape[0],), + self.PAD_ID, + device=self.device + ), + num_classes=self.vocab_size) + padded_combined_logits = torch.cat([combined_logits, padding]) + output_logits_batch.append(padded_combined_logits) + + return torch.stack(output_logits_batch).to(self.device) + ############################################### + # NON BATCHED VERSION # Language modeling and multiple choice tasks + """ for tokens, cont_idxs in zip(batch['input_ids'], batch['continuation_indices']): - + # print("******************************") seqlen = tokens.shape[0] tokens = tokens.tolist() + # print("Tokens:", tokens) # print("Continuation indices:", cont_idxs) cont_idxs = cont_idxs.tolist() - if len(cont_idxs) > 1: - expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] - prompt = tokens[:cont_idxs[0]] - else: - prompt = tokens - expected_cont_tokens = [tokens[-1]] + expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] + # print("Expected continuation tokens:", expected_cont_tokens) + prompt = tokens[:cont_idxs[0]] input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device=self.device) - # print("prompt:", self.tokenizer.decode(prompt)) + # print("*** PROMPT:", self.tokenizer.decode(prompt)) # print("Input device:", input_ids.get_device()) - #print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) - #print("Input lengths:", input_lengths) - #print(cont_idxs[0]) + # print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) + # print("Input lengths:", input_lengths) #print("Expected continuation tokens:", len(expected_cont_tokens)) with torch.no_grad(): self.decoder.setup(input_lengths.size(0), @@ -306,23 +421,22 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): torch.cuda.synchronize() - context_logits = output_dict['context_logits'] - context_logits = context_logits.squeeze() + context_psuedologits = torch.nn.functional.one_hot( + torch.tensor(tokens[1:cont_idxs[0]], device=self.device), + num_classes=self.vocab_size) output_logits_list = output_dict['generation_logits'] # print("Output ids:", output_dict['output_ids'][0][0][cont_idxs[0]:].tolist()) for i in range(len(output_logits_list)): output_logits_list[i] = output_logits_list[i].squeeze() - # print("Output ids:", self.tokenizer.decode(output_dict['output_ids'][0][0].tolist())) - # print("Context logits:", context_logits.shape) - # print("Output logits list:", output_logits_list) + # print("*** Output string:", 
self.tokenizer.decode(output_dict['output_ids'][0][0][cont_idxs[0]:].tolist())) + #print("Context logits:", context_psuedologits.shape) if len(output_logits_list) > 0: - # print("Output logits 0 shape:", output_logits_list[0].shape) output_logits_tensor = torch.stack(output_logits_list) # print("Output logits stacked:", output_logits_tensor.shape) - combined_logits = torch.cat([context_logits, output_logits_tensor]) + combined_logits = torch.cat([context_psuedologits, output_logits_tensor]) else: - combined_logits = context_logits - + combined_logits = context_psuedologits + #print("Seqlen", seqlen) # print("Combined logits shape:", combined_logits.shape) padding = torch.nn.functional.one_hot( @@ -341,3 +455,4 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): return torch.stack(output_logits_batch).to(self.device) #(batch['input_ids'].device) #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) + """ \ No newline at end of file diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index e36e08575b..12f8c631f6 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -28,6 +28,16 @@ build_tokenizer) from llmfoundry.utils.config_utils import (log_config, pop_config, process_init_device) +try: + import tensorrt_llm + TRTLLM = True + if tensorrt_llm.mpi_world_size() > 1: + TRTLLM_MULTIGPU = True + else: + TRTLLM_MULTIGPU = False +except: + TRTLLM = False + TRTLLM_MULTIGPU = False log = logging.getLogger(__name__) @@ -97,7 +107,7 @@ def evaluate_model( icl_seq_len=max_seq_len, icl_subset_num_batches=icl_subset_num_batches, ) - + callbacks = [] if eval_gauntlet_callback is not None: callbacks.append(eval_gauntlet_callback) @@ -153,6 +163,9 @@ def evaluate_model( log.info(f'Building trainer for {model_cfg.model_name}...') + #if TRTLLM_MULTIGPU and tensorrt_llm.mpi_rank() > 0: + # loggers = None + trainer = Trainer( run_name=run_name, seed=seed, @@ -272,7 +285,9 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: ) reproducibility.seed_all(seed) - dist.initialize_dist(get_device(None), timeout=dist_timeout) + + if not TRTLLM_MULTIGPU: + dist.initialize_dist(get_device(None), timeout=dist_timeout) if python_log_level is not None: logging.basicConfig( diff --git a/scripts/eval/eval_trt_multigpu.py b/scripts/eval/eval_trt_multigpu.py index 1a63fac90c..6d850dcc3c 100644 --- a/scripts/eval/eval_trt_multigpu.py +++ b/scripts/eval/eval_trt_multigpu.py @@ -132,8 +132,8 @@ def evaluate_model( assert composer_model is not None - if tensorrt_llm.mpi_rank() > 0: - loggers = None + # if tensorrt_llm.mpi_rank() > 0: + # loggers = None trainer = Trainer( run_name=run_name, From 86de60198005f05d67e98fe62b7763de92c728f4 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Sun, 24 Mar 2024 21:35:05 -0700 Subject: [PATCH 16/21] update runner --- scripts/eval/run_trtllm_eval.py | 74 ++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 15 deletions(-) diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index cd53857179..bd4d4c2a30 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -5,6 +5,7 @@ # All this can be written in YAML form. 
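# Illustrative sketch of the YAML round-trip alluded to by the comment above: the config
# dicts built below are interchangeable with YAML via standard OmegaConf APIs
# (`OmegaConf.create` / `OmegaConf.to_yaml`). Field values here are a trimmed toy example,
# not the runner's real configuration.
from omegaconf import OmegaConf

example_cfg = {
    'run_name': 'trtllm-eval',
    'seed': 0,
    'max_seq_len': 2048,
    'device_eval_batch_size': 8,
    'precision': 'amp_bf16',
    'icl_tasks': './eval/yamls/mini_tasks_v0.2.yaml',
}

om_cfg = OmegaConf.create(example_cfg)      # dict -> DictConfig
yaml_text = OmegaConf.to_yaml(om_cfg)       # DictConfig -> YAML text
om_cfg_again = OmegaConf.create(yaml_text)  # YAML text -> equivalent DictConfig
print(yaml_text)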
+#from eval import main as run_evaluation from eval_trt_multigpu import main as run_evaluation from omegaconf import OmegaConf as om from omegaconf import DictConfig @@ -13,12 +14,50 @@ trt_folder_path = '/workspace/TensorRT-LLM/' MINI_TASKS = './eval/yamls/mini_tasks_v0.2.yaml' -QA_MC_TASKS = './eval/yamls/qa_mc_tasks_v0.2.yaml' -ALL_TASKS = './eval/yamls/tasks_v0.2.yaml' -LM_TASKS = './eval/yamls/lm_tasks_v0.2.yaml' +QA_MC_TASKS = './eval/yamls/qa_mc_tasks_v0.3.yaml' +ALL_TASKS = './eval/yamls/tasks_v0.3.yaml' +LM_TASKS = './eval/yamls/lm_tasks_v0.3.yaml' +GAUNTLET = './eval/yamls/eval_gauntlet_v0.3.yaml' + +def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS): + return { + 'run_name': 'trtllm-eval', + 'seed': 0, + 'max_seq_len': 2048, + 'device_eval_batch_size': 8, + 'precision': 'amp_bf16', + 'dist_timeout': 6000, + 'models': + [ + { + 'model_name': 'trtllm/dbrx', + 'model': + { + 'name': 'trtllm', + 'version': 'dbrx', + 'engine_dir': engine_dir, + 'log_level': 'error', + 'eos_token_id': 2, + 'pad_token_id': 2 + }, + 'tokenizer': + { + 'name': tokenizer_name, + } + } + ], + 'icl_tasks': icl_tasks, + 'eval_gauntlet': EVAL_GAUNTLET, + 'loggers': { + 'wandb': { + 'project': 'nik-dbrx-quant-eval' + } + } + } -def get_llama_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): + +def get_llama_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS): return { 'run_name': 'trtllm-eval', 'seed': 0, @@ -46,7 +85,7 @@ def get_llama_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): } ], 'icl_tasks': icl_tasks, - 'eval_gauntlet': './eval/yamls/eval_gauntlet_v0.2.yaml', + 'eval_gauntlet': EVAL_GAUNTLET, 'loggers': { 'wandb': { 'project': 'nik-quant-eval' @@ -58,17 +97,22 @@ def engine_dir_str(model_type, model_dir, variant, ngpus=8): return f"{trt_folder_path}examples/{model_type}/tmp/{model_type}/{model_dir}/trt_engines/{variant}/{ngpus}-gpu" -LLAMA_TOK_DIR = '/workspace/llama-70b-chat-hf/' +LLAMA_TOK_DIR = '/mnt/workdisk/nikhil/llama-70b-chat-hf/' +DBRX_TOK_DIR = '/mnt/workdisk/nikhil/dbrx/03_23_hf_ckpt/' + LLAMA_7B_DIR = '7B-chat-quality-eval' LLAMA_70B_DIR = '70B-chat-quality-eval' +# LLama URLs +# fp8_engine_dir = '/mnt/workdisk/nikhil/engines-quality-eval/llama-2-70b-chat-tp8-fp8' +# llama7b_int8_config = get_llama_config(engine_dir_str('llama', LLAMA_7B_DIR, 'int8_kv_cache_weight_only', 1), LLAMA_TOK_DIR) +#llama70b_fp8_config = get_llama_config(fp8_engine_dir, LLAMA_TOK_DIR) +# llama70b_int8_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'int8_kv_cache_weight_only'), LLAMA_TOK_DIR) +# llama70b_smoothquant_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'sq0.8'), LLAMA_TOK_DIR) +# llama70b_fp16_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'fp16'), LLAMA_TOK_DIR) -llama7b_int8_config = get_llama_config(engine_dir_str('llama', LLAMA_7B_DIR, 'int8_kv_cache_weight_only', 1), LLAMA_TOK_DIR) -llama70b_fp8_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'fp8'), LLAMA_TOK_DIR) -llama70b_int8_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'int8_kv_cache_weight_only'), LLAMA_TOK_DIR) -llama70b_smoothquant_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'sq0.8'), LLAMA_TOK_DIR) -llama70b_fp16_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'fp16'), LLAMA_TOK_DIR) - +dbrx_bf16_engine_dir = '/mnt/workdisk/nikhil/dbrx/03_23_tllm_engine_bf16' +dbrx_bf16_config = get_dbrx_config(dbrx_bf16_engine_dir, DBRX_TOK_DIR) def run_eval(config): print("RUNNING EVAL") 
@@ -77,9 +121,9 @@ def run_eval(config): run_evaluation(om_dict_config) # run_eval(llama7b_int8_config) -run_eval(llama70b_int8_config) +# run_eval(llama70b_int8_config) # run_eval(llama70b_fp16_config) -#run_eval(llama70b_fp8_config) +# run_eval(llama70b_fp8_config) # run_eval(llama70b_smoothquant_config) - +run_eval(dbrx_bf16_config) From 1f3eeb621e898795a39d1184839c9154b54fa0a8 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Sun, 24 Mar 2024 21:45:15 -0700 Subject: [PATCH 17/21] update script --- scripts/eval/run_trtllm_eval.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index bd4d4c2a30..b3f1fca6e9 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -5,8 +5,8 @@ # All this can be written in YAML form. -#from eval import main as run_evaluation -from eval_trt_multigpu import main as run_evaluation +from eval import main as run_evaluation +#from eval_trt_multigpu import main as run_evaluation from omegaconf import OmegaConf as om from omegaconf import DictConfig @@ -17,7 +17,7 @@ QA_MC_TASKS = './eval/yamls/qa_mc_tasks_v0.3.yaml' ALL_TASKS = './eval/yamls/tasks_v0.3.yaml' LM_TASKS = './eval/yamls/lm_tasks_v0.3.yaml' -GAUNTLET = './eval/yamls/eval_gauntlet_v0.3.yaml' +EVAL_GAUNTLET = './eval/yamls/eval_gauntlet_v0.3.yaml' def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS): return { @@ -43,6 +43,10 @@ def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS): 'tokenizer': { 'name': tokenizer_name, + 'kwargs': + { + 'trust_remote_code': 'True' + } } } ], From 8b3a4b10fc5b2988a0dbf3d18d487b426ae321aa Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Sun, 24 Mar 2024 21:56:13 -0700 Subject: [PATCH 18/21] Remove prints --- scripts/eval/run_trtllm_eval.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index b3f1fca6e9..12d25a3668 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -102,7 +102,7 @@ def engine_dir_str(model_type, model_dir, variant, ngpus=8): LLAMA_TOK_DIR = '/mnt/workdisk/nikhil/llama-70b-chat-hf/' -DBRX_TOK_DIR = '/mnt/workdisk/nikhil/dbrx/03_23_hf_ckpt/' +DBRX_TOK_DIR = '/workspace/dbrx/03_23_hf_ckpt/' LLAMA_7B_DIR = '7B-chat-quality-eval' LLAMA_70B_DIR = '70B-chat-quality-eval' @@ -116,7 +116,9 @@ def engine_dir_str(model_type, model_dir, variant, ngpus=8): # llama70b_fp16_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'fp16'), LLAMA_TOK_DIR) dbrx_bf16_engine_dir = '/mnt/workdisk/nikhil/dbrx/03_23_tllm_engine_bf16' +dbrx_int8_engine_dir = '/workspace/dbrx/03_23_tllm_engine_int8' dbrx_bf16_config = get_dbrx_config(dbrx_bf16_engine_dir, DBRX_TOK_DIR) +dbrx_int8_config = get_dbrx_config(dbrx_int8_engine_dir, DBRX_TOK_DIR) def run_eval(config): print("RUNNING EVAL") @@ -129,5 +131,6 @@ def run_eval(config): # run_eval(llama70b_fp16_config) # run_eval(llama70b_fp8_config) # run_eval(llama70b_smoothquant_config) -run_eval(dbrx_bf16_config) +# run_eval(dbrx_bf16_config) +run_eval(dbrx_int8_config) From 8382411d4615d761cf943125ba6a4ae65ac9cd06 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Sun, 24 Mar 2024 22:51:51 -0700 Subject: [PATCH 19/21] update wrappers --- llmfoundry/models/inference_api_wrapper/trtllm.py | 9 +++++---- scripts/eval/run_trtllm_eval.py | 14 +++++++------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git 
a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index efe62fa85e..2ec26ae192 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -217,10 +217,10 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): max_prompt_len = 0 for tokens in batch['input_ids']: prompt = tokens.tolist() - eos_occurence = (tokens == self.END_ID).nonzero(as_tuple=True)[0] + pad_start = (tokens == self.PAD_ID).nonzero(as_tuple=True)[0] end_prompt_idx = len(prompt) - if eos_occurence.shape[0] >= 1: - end_prompt_idx = eos_occurence[0] + if pad_start.shape[0] >= 1: + end_prompt_idx = pad_start[0] prompt_lens.append(end_prompt_idx) if end_prompt_idx > max_prompt_len: max_prompt_len = end_prompt_idx @@ -228,7 +228,8 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): input_ids = torch.narrow(batch['input_ids'], 1, 0, max_prompt_len).to(dtype=torch.int, device=self.device) input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) - print("Prompt:", input_ids) + torch.set_printoptions(threshold=10_000) + print("Prompt0:", input_ids[0]) #print("Input shape:", input_ids.shape) #print("Input lengths:", input_lengths) max_generation_length = 256 diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index 12d25a3668..26d829dd20 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -19,12 +19,12 @@ LM_TASKS = './eval/yamls/lm_tasks_v0.3.yaml' EVAL_GAUNTLET = './eval/yamls/eval_gauntlet_v0.3.yaml' -def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS): +def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): return { 'run_name': 'trtllm-eval', 'seed': 0, 'max_seq_len': 2048, - 'device_eval_batch_size': 8, + 'device_eval_batch_size': 64, 'precision': 'amp_bf16', 'dist_timeout': 6000, 'models': @@ -37,8 +37,8 @@ def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS): 'version': 'dbrx', 'engine_dir': engine_dir, 'log_level': 'error', - 'eos_token_id': 2, - 'pad_token_id': 2 + 'eos_token_id': 100257, + 'pad_token_id': 100277, }, 'tokenizer': { @@ -102,7 +102,7 @@ def engine_dir_str(model_type, model_dir, variant, ngpus=8): LLAMA_TOK_DIR = '/mnt/workdisk/nikhil/llama-70b-chat-hf/' -DBRX_TOK_DIR = '/workspace/dbrx/03_23_hf_ckpt/' +DBRX_TOK_DIR = '/mnt/workdisk/nikhil/dbrx/03_23_hf_ckpt/' LLAMA_7B_DIR = '7B-chat-quality-eval' LLAMA_70B_DIR = '70B-chat-quality-eval' @@ -131,6 +131,6 @@ def run_eval(config): # run_eval(llama70b_fp16_config) # run_eval(llama70b_fp8_config) # run_eval(llama70b_smoothquant_config) -# run_eval(dbrx_bf16_config) -run_eval(dbrx_int8_config) +run_eval(dbrx_bf16_config) +# run_eval(dbrx_int8_config) From a92f7ccd82a9085e208995e2b772325ef74b7296 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Tue, 26 Mar 2024 02:11:21 -0700 Subject: [PATCH 20/21] update wrapper to properly support MC tasks --- .../models/inference_api_wrapper/trtllm.py | 344 +++++++++--------- scripts/eval/run_trtllm_eval.py | 12 +- 2 files changed, 187 insertions(+), 169 deletions(-) diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index 2ec26ae192..ed765c2572 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -211,7 +211,10 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): # Question-answering tasks if 'continuation_indices' not in 
batch: - # Batched version + # Batched version. For some reason + # GSM-8k gives bad outputs when we batch on BF16 version. + # So, we will not batch. + """ batch_size = len(batch['input_ids']) prompt_lens = [] max_prompt_len = 0 @@ -229,10 +232,11 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) torch.set_printoptions(threshold=10_000) - print("Prompt0:", input_ids[0]) + #print("Prompt0:", input_ids[0]) #print("Input shape:", input_ids.shape) #print("Input lengths:", input_lengths) max_generation_length = 256 + torch.cuda.synchronize() with torch.no_grad(): self.decoder.setup(batch_size, input_lengths.max().item(), @@ -242,49 +246,47 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): # input_lengths[:1].max().item(), # batch.get('generation_length', max_generation_length)) #output_dict2 = self.decoder.decode(input_ids[:1,:], input_lengths[:1], self.sampling_config, return_dict=True) + torch.cuda.synchronize() #answer1 = output_dict['output_ids'][0].squeeze()[prompt_lens[0]:prompt_lens[0]+max_generation_length] #answer2 = output_dict2['output_ids'][0].squeeze()[prompt_lens[0]:prompt_lens[0]+max_generation_length] #all_equal = torch.equal(answer1, answer2) - """ - if not all_equal: - print("Prompt:", input_ids[0]) - print("Answer 1:", self.tokenizer.decode(answer1)) - print("Answer 2:", self.tokenizer.decode(answer2)) - print("Shape 1:", answer1.shape) - print("Shape 2", answer2.shape) - difference = answer1 - answer2 - nonzero_indices = difference.nonzero(as_tuple=True) - nonzero = difference[difference.nonzero(as_tuple=True)] - print("EQUAL?", all_equal) - print("Difference:", difference) - print("nonzero indices:", nonzero_indices) - print("Nonzero Elements", nonzero) - quit() - """ + # if not all_equal: + # print("Prompt:", input_ids[0]) + # print("Answer 1:", self.tokenizer.decode(answer1)) + # print("Answer 2:", self.tokenizer.decode(answer2)) + # print("Shape 1:", answer1.shape) + # print("Shape 2", answer2.shape) + # difference = answer1 - answer2 + # nonzero_indices = difference.nonzero(as_tuple=True) + # nonzero = difference[difference.nonzero(as_tuple=True)] + # print("EQUAL?", all_equal) + # print("Difference:", difference) + # print("nonzero indices:", nonzero_indices) + # print("Nonzero Elements", nonzero) + # quit() output_ids = [output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch.get('generation_length', max_generation_length)] for i in range(batch_size)] - #print("Output:", output_ids) + print("Output:", output_ids) decoded_strs = [self.tokenizer.decode(out) for out in output_ids] print("decoded strs:", decoded_strs) return decoded_strs - - # Non-batched version """ + # Non-batched version output_strs = [] for tokens in batch['input_ids']: #print("RAW Tokens:", tokens) seqlen = tokens.shape[0] prompt = tokens.tolist() - eos_occurence = (tokens == 2).nonzero(as_tuple=True)[0] + pad_start = (tokens == self.PAD_ID).nonzero(as_tuple=True)[0] end_prompt_idx = len(prompt) - if eos_occurence.shape[0] >= 1: - end_prompt_idx = eos_occurence[0] + if pad_start.shape[0] >= 1: + end_prompt_idx = pad_start[0] prompt = prompt[:end_prompt_idx] input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device=self.device) - print("prompt:", self.tokenizer.decode(prompt)) + #print("prompt:", self.tokenizer.decode(prompt)) #print("promp tokens:", prompt) #print("Input lengths:", 
input_lengths) #print("Generation Length:", batch['generation_length']) @@ -304,156 +306,172 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): #print("Shape:", output_dict['output_ids'].shape) decoded_str = self.tokenizer.decode(output_dict['output_ids'][0][0].tolist()[len(prompt):]) output_strs.append(decoded_str) - print("Decoded OUTPUT:", decoded_str) + #print("Decoded OUTPUT:", decoded_str) #print("-------------") #print("Output ids:", output_dict['output_ids'][0][0].tolist()) return output_strs - """ - ################# - # Batched version of language modeling/multiple choice tasks - batch_size = len(batch['input_ids']) - seqlen = batch['input_ids'].shape[1] - #print("Seq len:", seqlen) - prompt_lens = [] - continuation_lens = [] - for tokens, cont_idxs in zip(batch['input_ids'], - batch['continuation_indices']): - tokens = tokens.tolist() - cont_idxs = cont_idxs.tolist() - expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] - prompt = tokens[:cont_idxs[0]] - prompt_lens.append(cont_idxs[0]) - continuation_lens.append(len(expected_cont_tokens)) - - input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) - input_ids = torch.full((batch_size, max(prompt_lens)), fill_value=self.PAD_ID, device=self.device, dtype=torch.int) - for i in range(batch_size): - input_ids[i][:prompt_lens[i]] = batch['input_ids'][i][:prompt_lens[i]] - - #print("New batch shape", input_ids.shape) - #print("Continuation lengths:", continuation_lens) - #print("Prompt:", input_ids) - #print("Input shape:", input_ids.shape) - #print("Input lengths:", input_lengths) - with torch.no_grad(): - self.decoder.setup(batch_size, - input_lengths.max().item(), - max(continuation_lens)) - output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) - torch.cuda.synchronize() - output_logits_list = output_dict['generation_logits'] - #print("Output logits list", output_logits_list) - # print("Output ids:", output_dict['output_ids'][0][0][cont_idxs[0]:].tolist()) - - # output_logits_list length is == max(continuation_lens) - # Output logits_list[i] is of shape (batch_size, vocab_size) - if len(output_logits_list) > 0: - #print("Shape:", output_logits_list[0].shape) - output_logits_tensor = torch.stack(output_logits_list, dim=1) + elif 'gold_indices' in batch: + # Multiple choice tasks + batch_size = len(batch['input_ids']) + prompt_lens = [] + for tokens, cont_idxs in zip(batch['input_ids'], + batch['continuation_indices']): + tokens = tokens.tolist() + cont_idxs = cont_idxs.tolist() + prompt = tokens[:cont_idxs[-1] + 1] + prompt_lens.append(cont_idxs[-1] + 1) + + input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) + input_ids = torch.full((batch_size, max(prompt_lens)), fill_value=self.PAD_ID, device=self.device, dtype=torch.int) + for i in range(batch_size): + #print("Prompt:", self.tokenizer.decode(batch['input_ids'][i][:prompt_lens[i]])) + input_ids[i][:prompt_lens[i]] = batch['input_ids'][i][:prompt_lens[i]] + with torch.no_grad(): + self.decoder.setup(batch_size, + input_lengths.max().item(), + 1) + output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) + torch.cuda.synchronize() + #print(output_dict.keys()) + logits = output_dict['context_logits'] + return logits else: - output_logits_tensor = None + ################# + # Batched version of language modeling tasks + batch_size = len(batch['input_ids']) + seqlen = batch['input_ids'].shape[1] + #print("Seq len:", seqlen) 
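# Illustrative sketch of the length bookkeeping performed in the batched path below:
# for each sequence, the prompt is everything before the first continuation index, the
# expected continuation runs through the last continuation index (inclusive), and prompts
# are right-padded with the pad id out to the longest prompt in the batch. Toy values;
# PAD_ID stands in for the tokenizer's real pad token id.
import torch

PAD_ID = 0
toy_input_ids = torch.tensor([
    [11, 12, 13, 14, 15, 16, 0, 0],
    [21, 22, 23, 24, 25, 26, 27, 28],
])
toy_continuation_indices = [torch.tensor([4, 5]), torch.tensor([5, 6, 7])]

prompt_lens, continuation_lens = [], []
for tokens, cont_idxs in zip(toy_input_ids, toy_continuation_indices):
    cont_idxs = cont_idxs.tolist()
    prompt_lens.append(cont_idxs[0])                            # prompt length
    continuation_lens.append(cont_idxs[-1] + 1 - cont_idxs[0])  # continuation length

padded = torch.full((len(prompt_lens), max(prompt_lens)), PAD_ID, dtype=torch.int32)
for i, plen in enumerate(prompt_lens):
    padded[i, :plen] = toy_input_ids[i, :plen]

print(prompt_lens, continuation_lens)  # [4, 5] [2, 3]
print(padded)                          # rows right-padded with PAD_ID to length 5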
+ prompt_lens = [] + continuation_lens = [] + for tokens, cont_idxs in zip(batch['input_ids'], + batch['continuation_indices']): + tokens = tokens.tolist() + cont_idxs = cont_idxs.tolist() + expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] + prompt = tokens[:cont_idxs[0]] + prompt_lens.append(cont_idxs[0]) + continuation_lens.append(len(expected_cont_tokens)) - #if output_logits_tensor is not None: - #print("Output logits tensor shape:", output_logits_tensor.shape) - - # Put together logits - # We loop through batch_size dimension rather than deal with NestedTensor - output_logits_batch = [] - for i in range(batch_size): - # First create context "logits" (one-hot vector with 1 at token position) - tokens = input_ids[i].tolist() - context_psuedologits = torch.nn.functional.one_hot( - torch.tensor(tokens[1:prompt_lens[i]], device=self.device), - num_classes=self.vocab_size) - # Then add generation logits (up to continuation_length) - if output_logits_tensor is not None: - output_logits_trimmed = output_logits_tensor[i][:continuation_lens[i]] - # print("Output logits trimmed shape:", output_logits_trimmed.shape) - combined_logits = torch.cat([context_psuedologits, output_logits_trimmed]) - else: - combined_logits = context_psuedologits - # Then pad with Padding token "logits" to end of sequence length - padding = torch.nn.functional.one_hot( - torch.full( - (seqlen - combined_logits.shape[0],), - self.PAD_ID, - device=self.device - ), - num_classes=self.vocab_size) - padded_combined_logits = torch.cat([combined_logits, padding]) - output_logits_batch.append(padded_combined_logits) - - return torch.stack(output_logits_batch).to(self.device) - ############################################### - # NON BATCHED VERSION - # Language modeling and multiple choice tasks - """ - for tokens, cont_idxs in zip(batch['input_ids'], - batch['continuation_indices']): - # print("******************************") - seqlen = tokens.shape[0] - tokens = tokens.tolist() - # print("Tokens:", tokens) - # print("Continuation indices:", cont_idxs) - cont_idxs = cont_idxs.tolist() - expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] - # print("Expected continuation tokens:", expected_cont_tokens) - prompt = tokens[:cont_idxs[0]] + input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) + input_ids = torch.full((batch_size, max(prompt_lens)), fill_value=self.PAD_ID, device=self.device, dtype=torch.int) + for i in range(batch_size): + input_ids[i][:prompt_lens[i]] = batch['input_ids'][i][:prompt_lens[i]] - - input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) - input_lengths = torch.tensor([input_ids.size(1)], - dtype=torch.int, - device=self.device) - # print("*** PROMPT:", self.tokenizer.decode(prompt)) - # print("Input device:", input_ids.get_device()) - # print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) - # print("Input lengths:", input_lengths) - #print("Expected continuation tokens:", len(expected_cont_tokens)) with torch.no_grad(): - self.decoder.setup(input_lengths.size(0), - torch.max(input_lengths).item(), - len(expected_cont_tokens)) - - output_dict = self.decoder.decode( - input_ids, input_lengths, self.sampling_config, return_dict=True) - + self.decoder.setup(batch_size, + input_lengths.max().item(), + max(continuation_lens)) + output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) torch.cuda.synchronize() - context_psuedologits = torch.nn.functional.one_hot( - 
torch.tensor(tokens[1:cont_idxs[0]], device=self.device), - num_classes=self.vocab_size) output_logits_list = output_dict['generation_logits'] - # print("Output ids:", output_dict['output_ids'][0][0][cont_idxs[0]:].tolist()) - for i in range(len(output_logits_list)): - output_logits_list[i] = output_logits_list[i].squeeze() - # print("*** Output string:", self.tokenizer.decode(output_dict['output_ids'][0][0][cont_idxs[0]:].tolist())) - #print("Context logits:", context_psuedologits.shape) + #print("Output ids:", self.tokenizer.decode(output_dict['output_ids'][0][0][cont_idxs[0]:].tolist())) + + # output_logits_list length is == max(continuation_lens) + # Output logits_list[i] is of shape (batch_size, vocab_size) if len(output_logits_list) > 0: - output_logits_tensor = torch.stack(output_logits_list) - # print("Output logits stacked:", output_logits_tensor.shape) - combined_logits = torch.cat([context_psuedologits, output_logits_tensor]) + output_logits_tensor = torch.stack(output_logits_list, dim=1) else: - combined_logits = context_psuedologits - #print("Seqlen", seqlen) - # print("Combined logits shape:", combined_logits.shape) - - padding = torch.nn.functional.one_hot( - torch.full( - (seqlen - combined_logits.shape[0],), - self.PAD_ID, - device=combined_logits.device - ), - num_classes=self.vocab_size) - padded_combined_logits = torch.cat([combined_logits, padding]) - - # print("Padded combined logits shape:", padded_combined_logits.shape) - - output_logits_batch.append(padded_combined_logits) - - return torch.stack(output_logits_batch).to(self.device) #(batch['input_ids'].device) + output_logits_tensor = None + + # Put together logits + # We loop through batch_size dimension rather than deal with NestedTensor + output_logits_batch = [] + for i in range(batch_size): + # First create context "logits" (one-hot vector with 1 at token position) + tokens = input_ids[i].tolist() + context_psuedologits = torch.nn.functional.one_hot( + torch.tensor(tokens[1:prompt_lens[i]], device=self.device), + num_classes=self.vocab_size) + # Then add generation logits (up to continuation_length) + if output_logits_tensor is not None: + output_logits_trimmed = output_logits_tensor[i][:continuation_lens[i]] + # print("Output logits trimmed shape:", output_logits_trimmed.shape) + combined_logits = torch.cat([context_psuedologits, output_logits_trimmed]) + else: + combined_logits = context_psuedologits + # Then pad with Padding token "logits" to end of sequence length + padding = torch.nn.functional.one_hot( + torch.full( + (seqlen - combined_logits.shape[0],), + self.PAD_ID, + device=self.device + ), + num_classes=self.vocab_size) + padded_combined_logits = torch.cat([combined_logits, padding]) + output_logits_batch.append(padded_combined_logits) + + return torch.stack(output_logits_batch).to(self.device) + ############################################### + # NON BATCHED VERSION + # Language modeling and multiple choice tasks + """ + for tokens, cont_idxs in zip(batch['input_ids'], + batch['continuation_indices']): + # print("******************************") + seqlen = tokens.shape[0] + tokens = tokens.tolist() + # print("Tokens:", tokens) + # print("Continuation indices:", cont_idxs) + cont_idxs = cont_idxs.tolist() + expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] + # print("Expected continuation tokens:", expected_cont_tokens) + prompt = tokens[:cont_idxs[0]] - #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) - """ \ No newline at end of file + + 
input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) + input_lengths = torch.tensor([input_ids.size(1)], + dtype=torch.int, + device=self.device) + # print("*** PROMPT:", self.tokenizer.decode(prompt)) + # print("Input device:", input_ids.get_device()) + # print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) + # print("Input lengths:", input_lengths) + #print("Expected continuation tokens:", len(expected_cont_tokens)) + with torch.no_grad(): + self.decoder.setup(input_lengths.size(0), + torch.max(input_lengths).item(), + len(expected_cont_tokens)) + + output_dict = self.decoder.decode( + input_ids, input_lengths, self.sampling_config, return_dict=True) + + torch.cuda.synchronize() + + context_psuedologits = torch.nn.functional.one_hot( + torch.tensor(tokens[1:cont_idxs[0]], device=self.device), + num_classes=self.vocab_size) + output_logits_list = output_dict['generation_logits'] + # print("Output ids:", output_dict['output_ids'][0][0][cont_idxs[0]:].tolist()) + for i in range(len(output_logits_list)): + output_logits_list[i] = output_logits_list[i].squeeze() + # print("*** Output string:", self.tokenizer.decode(output_dict['output_ids'][0][0][cont_idxs[0]:].tolist())) + #print("Context logits:", context_psuedologits.shape) + if len(output_logits_list) > 0: + output_logits_tensor = torch.stack(output_logits_list) + # print("Output logits stacked:", output_logits_tensor.shape) + combined_logits = torch.cat([context_psuedologits, output_logits_tensor]) + else: + combined_logits = context_psuedologits + #print("Seqlen", seqlen) + # print("Combined logits shape:", combined_logits.shape) + + padding = torch.nn.functional.one_hot( + torch.full( + (seqlen - combined_logits.shape[0],), + self.PAD_ID, + device=combined_logits.device + ), + num_classes=self.vocab_size) + padded_combined_logits = torch.cat([combined_logits, padding]) + + # print("Padded combined logits shape:", padded_combined_logits.shape) + + output_logits_batch.append(padded_combined_logits) + + return torch.stack(output_logits_batch).to(self.device) #(batch['input_ids'].device) + + #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) + """ diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index 26d829dd20..aa91e1b941 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -24,7 +24,7 @@ def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): 'run_name': 'trtllm-eval', 'seed': 0, 'max_seq_len': 2048, - 'device_eval_batch_size': 64, + 'device_eval_batch_size': 8, 'precision': 'amp_bf16', 'dist_timeout': 6000, 'models': @@ -102,7 +102,7 @@ def engine_dir_str(model_type, model_dir, variant, ngpus=8): LLAMA_TOK_DIR = '/mnt/workdisk/nikhil/llama-70b-chat-hf/' -DBRX_TOK_DIR = '/mnt/workdisk/nikhil/dbrx/03_23_hf_ckpt/' +DBRX_TOK_DIR = '/mnt/workdisk/nikhil/dbrx/03_25_hf_ckpt/' LLAMA_7B_DIR = '7B-chat-quality-eval' LLAMA_70B_DIR = '70B-chat-quality-eval' @@ -115,8 +115,8 @@ def engine_dir_str(model_type, model_dir, variant, ngpus=8): # llama70b_smoothquant_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'sq0.8'), LLAMA_TOK_DIR) # llama70b_fp16_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'fp16'), LLAMA_TOK_DIR) -dbrx_bf16_engine_dir = '/mnt/workdisk/nikhil/dbrx/03_23_tllm_engine_bf16' -dbrx_int8_engine_dir = '/workspace/dbrx/03_23_tllm_engine_int8' +dbrx_bf16_engine_dir = '/workspace/dbrx/03_25_tllm_engine_bf16_all_logits' +dbrx_int8_engine_dir = 
'/mnt/workdisk/nikhil/dbrx/03_25_tllm_engine_int8_all_logits' dbrx_bf16_config = get_dbrx_config(dbrx_bf16_engine_dir, DBRX_TOK_DIR) dbrx_int8_config = get_dbrx_config(dbrx_int8_engine_dir, DBRX_TOK_DIR) @@ -131,6 +131,6 @@ def run_eval(config): # run_eval(llama70b_fp16_config) # run_eval(llama70b_fp8_config) # run_eval(llama70b_smoothquant_config) -run_eval(dbrx_bf16_config) -# run_eval(dbrx_int8_config) +# run_eval(dbrx_bf16_config) +run_eval(dbrx_int8_config) From db3afef2fb3f9ccd5bd23e770737655ed7df9ce7 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Mon, 15 Jul 2024 21:14:35 -0700 Subject: [PATCH 21/21] Update TRT wrapper and imports --- .../models/inference_api_wrapper/trtllm.py | 349 +++++++----------- scripts/eval/run_trtllm_eval.py | 58 ++- setup.py | 2 +- 3 files changed, 155 insertions(+), 254 deletions(-) diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index ed765c2572..87a81600c9 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -48,7 +48,7 @@ def __init__( tokenizer: PreTrainedTokenizer, ): check_if_trt_llm_installed() - + # Only print on rank 0 if tensorrt_llm.mpi_rank() != 0: f = open(os.devnull, 'w') sys.stdout = f @@ -85,11 +85,6 @@ def __init__( use_gpt_attention_plugin = bool(plugin_config['gpt_attention_plugin']) remove_input_padding = plugin_config['remove_input_padding'] - if remove_input_padding: - raise ValueError( - 'TRT-LLM Evaluation Wrapper does not support remove_input_padding.' - ) - num_kv_heads = build_config.get('num_key_value_heads', num_heads) paged_kv_cache = plugin_config['paged_kv_cache'] tokens_per_block = plugin_config['tokens_per_block'] @@ -165,10 +160,9 @@ def __init__( self.decoder = tensorrt_llm.runtime.GenerationSession( model_config, engine_buffer, runtime_mapping, debug_mode=False) - print("!!! Initialized generation session for rank:", runtime_rank) + print("!!! Initialized generation session for rank:", runtime_rank) torch.cuda.synchronize() - def rebatch(self, batch): """ Move tensors in batch to the correct GPU. @@ -182,213 +176,200 @@ def rebatch(self, batch): elif isinstance(batch, list): return [self.rebatch(b) for b in batch] return batch - - # Remove potential additional dim, cast to int32 - """ - batch_input_ids = [ - x.flatten().type(torch.int32) for x in batch_input_ids - ] - input_lengths = [x.size(0) for x in batch_input_ids] - max_length = max(input_lengths) - # Right padding for trt-llm - paddings = [ - torch.ones(max_length - l, dtype=torch.int32, device=self.device) * pad_id - for l in input_lengths - ] - batch_input_ids = [ - torch.cat([x, pad]) for x, pad in zip(batch_input_ids, paddings) - ] - batch_input_ids = torch.stack(batch_input_ids).to(device=self.device) - input_lengths = torch.tensor(input_lengths, dtype=torch.int32, device=self.device) - return batch_input_ids, input_lengths - """ def eval_forward(self, batch, outputs: Optional[Any] = None): - # Run TRTLLM forward pass + # Run TRT-LLM Forward Pass without any input padding output_logits_batch = [] batch = self.rebatch(batch) + batch_size = len(batch['input_ids']) + prompt_lens = [] + unpadded_input_ids_list = [] - # Question-answering tasks if 'continuation_indices' not in batch: - # Batched version. For some reason - # GSM-8k gives bad outputs when we batch on BF16 version. - # So, we will not batch. 
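# Illustrative sketch of the padding-free input layout the reworked eval_forward below
# builds once the engine is compiled with remove_input_padding: prompts are stripped of
# trailing pad/eos tokens, concatenated end to end into one flat int32 tensor, and paired
# with a per-sequence length tensor. Token ids are toy values; the exact GenerationSession
# call signatures follow the surrounding code, not this sketch.
import torch

toy_prompts = [
    [101, 7592, 2088],            # length 3
    [101, 2129, 2024, 2017, 29],  # length 5
]

flat_ids = torch.tensor([tok for prompt in toy_prompts for tok in prompt], dtype=torch.int32)
input_lengths = torch.tensor([len(p) for p in toy_prompts], dtype=torch.int32)

print(flat_ids)       # 8 token ids back to back, no pad tokens materialized
print(input_lengths)  # tensor([3, 5], dtype=torch.int32)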
- """ - batch_size = len(batch['input_ids']) - prompt_lens = [] + # Question-answering tasks max_prompt_len = 0 for tokens in batch['input_ids']: - prompt = tokens.tolist() pad_start = (tokens == self.PAD_ID).nonzero(as_tuple=True)[0] - end_prompt_idx = len(prompt) + eos_start = (tokens == self.END_ID).nonzero(as_tuple=True)[0] + end_prompt_idx = len(tokens.tolist()) if pad_start.shape[0] >= 1: end_prompt_idx = pad_start[0] + if eos_start.shape[0] >= 1 and eos_start[0] < end_prompt_idx: + end_prompt_idx = eos_start[0] prompt_lens.append(end_prompt_idx) if end_prompt_idx > max_prompt_len: max_prompt_len = end_prompt_idx + + for i in range(batch_size): + #print("Prompt:\n", self.tokenizer.decode(batch['input_ids'][i][:prompt_lens[i]].tolist())) + unpadded_input_ids_list += batch['input_ids'][i][:prompt_lens[i]].tolist() - input_ids = torch.narrow(batch['input_ids'], 1, 0, max_prompt_len).to(dtype=torch.int, device=self.device) + unpadded_input_ids = torch.tensor(unpadded_input_ids_list, dtype=torch.int, device=self.device) input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) - - torch.set_printoptions(threshold=10_000) - #print("Prompt0:", input_ids[0]) - #print("Input shape:", input_ids.shape) - #print("Input lengths:", input_lengths) - max_generation_length = 256 + + MAX_GEN_LEN = 256 + max_generation_length = batch.get('generation_length', MAX_GEN_LEN) torch.cuda.synchronize() with torch.no_grad(): self.decoder.setup(batch_size, input_lengths.max().item(), - batch.get('generation_length', max_generation_length)) - output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) - #self.decoder.setup(1, - # input_lengths[:1].max().item(), - # batch.get('generation_length', max_generation_length)) - #output_dict2 = self.decoder.decode(input_ids[:1,:], input_lengths[:1], self.sampling_config, return_dict=True) + max_generation_length) + output_dict = self.decoder.decode(unpadded_input_ids, input_lengths, self.sampling_config, return_dict=True) torch.cuda.synchronize() - #answer1 = output_dict['output_ids'][0].squeeze()[prompt_lens[0]:prompt_lens[0]+max_generation_length] - #answer2 = output_dict2['output_ids'][0].squeeze()[prompt_lens[0]:prompt_lens[0]+max_generation_length] - #all_equal = torch.equal(answer1, answer2) - # if not all_equal: - # print("Prompt:", input_ids[0]) - # print("Answer 1:", self.tokenizer.decode(answer1)) - # print("Answer 2:", self.tokenizer.decode(answer2)) - # print("Shape 1:", answer1.shape) - # print("Shape 2", answer2.shape) - # difference = answer1 - answer2 - # nonzero_indices = difference.nonzero(as_tuple=True) - # nonzero = difference[difference.nonzero(as_tuple=True)] - # print("EQUAL?", all_equal) - # print("Difference:", difference) - # print("nonzero indices:", nonzero_indices) - # print("Nonzero Elements", nonzero) - # quit() - output_ids = [output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch.get('generation_length', max_generation_length)] for i in range(batch_size)] - - print("Output:", output_ids) - + output_ids = [output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+max_generation_length] for i in range(batch_size)] decoded_strs = [self.tokenizer.decode(out) for out in output_ids] - print("decoded strs:", decoded_strs) + # print("Output string:", decoded_strs) return decoded_strs + elif 'gold_indices' in batch: + # Multiple choice tasks + seqlen = batch['input_ids'].shape[1] + """" + Generate one-step at a time """ - # Non-batched version - 
output_strs = [] - for tokens in batch['input_ids']: - #print("RAW Tokens:", tokens) - seqlen = tokens.shape[0] - prompt = tokens.tolist() - pad_start = (tokens == self.PAD_ID).nonzero(as_tuple=True)[0] - end_prompt_idx = len(prompt) - if pad_start.shape[0] >= 1: - end_prompt_idx = pad_start[0] - prompt = prompt[:end_prompt_idx] - input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) - input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device=self.device) - #print("prompt:", self.tokenizer.decode(prompt)) - #print("promp tokens:", prompt) - #print("Input lengths:", input_lengths) - #print("Generation Length:", batch['generation_length']) - #print("Batch keys:", batch.keys()) - torch.cuda.synchronize() - with torch.no_grad(): - self.decoder.setup(batch_size=input_lengths.size(0), - max_context_length=torch.max(input_lengths).item(), - max_new_tokens=batch.get('generation_length', 200)) - output_dict = self.decoder.decode( - input_ids, - input_lengths, - self.sampling_config, - #stopping_criteria=batch['stopping_criteria'], - return_dict=True) + prompt_lens = [cont_idxs.tolist()[-1] + 1 for cont_idxs in batch['continuation_indices']] + logits_list = [] + with torch.no_grad(): + for tokens, cont_idxs in zip(batch['input_ids'], batch['continuation_indices']): + cont_idxs = cont_idxs.tolist() + #print("Continuation Indices:", cont_idxs) + #print("Continuation tokens:", self.tokenizer.decode(tokens.tolist()[cont_idxs[0]:cont_idxs[-1] + 1])) + cont_length = cont_idxs[-1] + 1 - cont_idxs[0] + logits = torch.nn.functional.one_hot( + tokens[1:cont_idxs[0]], + num_classes=self.vocab_size, + ).to(device=self.device) + for i in range(cont_length): + # decode one token at a time + self.decoder.setup(1, cont_idxs[0]+i, 1) + output_dict = self.decoder.decode(tokens[:cont_idxs[0]+i].to(dtype=torch.int32, device=self.device), + torch.tensor([cont_idxs[0]+i], dtype=torch.int, device=self.device), + self.sampling_config, + return_dict=True) + next_logit_tensor = torch.squeeze(output_dict['generation_logits'][0]) + #print("Decoded output:\n", self.tokenizer.decode(output_dict['output_ids'][0].squeeze())) + # append next logit to logits tensor + logits = torch.cat([logits, next_logit_tensor.reshape(1, -1)]) + + padding = torch.nn.functional.one_hot( + torch.full((max(prompt_lens) - logits.shape[0],), self.PAD_ID), + num_classes=self.vocab_size, + ).to(device=next_logit_tensor.device) + logits = torch.cat([logits, padding]) + logits_list.append(logits) - #print("Shape:", output_dict['output_ids'].shape) - decoded_str = self.tokenizer.decode(output_dict['output_ids'][0][0].tolist()[len(prompt):]) - output_strs.append(decoded_str) - #print("Decoded OUTPUT:", decoded_str) - #print("-------------") - #print("Output ids:", output_dict['output_ids'][0][0].tolist()) - return output_strs - + return torch.stack(logits_list).to(device=self.device, dtype=torch.float) + """ + Normal (context logits) version + """ + torch.cuda.synchronize() + continuation_starts = [cont_idxs.tolist()[0] for cont_idxs in batch['continuation_indices']] + prompt_lens = [cont_idxs.tolist()[-1] + 1 for cont_idxs in batch['continuation_indices']] - elif 'gold_indices' in batch: - # Multiple choice tasks - batch_size = len(batch['input_ids']) - prompt_lens = [] - for tokens, cont_idxs in zip(batch['input_ids'], - batch['continuation_indices']): - tokens = tokens.tolist() - cont_idxs = cont_idxs.tolist() - prompt = tokens[:cont_idxs[-1] + 1] - prompt_lens.append(cont_idxs[-1] + 1) - - input_lengths = 
torch.tensor(prompt_lens, dtype=torch.int, device=self.device) - input_ids = torch.full((batch_size, max(prompt_lens)), fill_value=self.PAD_ID, device=self.device, dtype=torch.int) - for i in range(batch_size): - #print("Prompt:", self.tokenizer.decode(batch['input_ids'][i][:prompt_lens[i]])) - input_ids[i][:prompt_lens[i]] = batch['input_ids'][i][:prompt_lens[i]] + logits_list = [] with torch.no_grad(): + # Batched version: + # Doesn't work because TRT-LLM has bug with batched context logits. + """ + for i in range(batch_size): + #print("Prompt:", self.tokenizer.decode(batch['input_ids'][i][:prompt_lens[i]].tolist())) + unpadded_input_ids_list += batch['input_ids'][i][:prompt_lens[i]].tolist() + + unpadded_input_ids = torch.tensor(unpadded_input_ids_list, dtype=torch.int, device=self.device) + input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) self.decoder.setup(batch_size, input_lengths.max().item(), 1) - output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) - torch.cuda.synchronize() - #print(output_dict.keys()) - logits = output_dict['context_logits'] + output_dict = self.decoder.decode(unpadded_input_ids, input_lengths, self.sampling_config, return_dict=True) + logits = output_dict['context_logits'] + """ + # Unbatched version + for tokens, cont_idxs in zip(batch['input_ids'], batch['continuation_indices']): + # Tensorrt-LLM Input must be int32 tensor, not int64 tensor! + prompt_len = cont_idxs.tolist()[-1] + 1 + self.decoder.setup(1, prompt_len, 1) + output_dict = self.decoder.decode(tokens[:prompt_len].to(dtype=torch.int32, device=self.device), + torch.tensor([prompt_len], dtype=torch.int, device=self.device), + self.sampling_config, + return_dict=True) + context_logits = torch.squeeze(output_dict['context_logits']) + prompt_psuedologits = torch.nn.functional.one_hot( + tokens[1:cont_idxs[0]], + num_classes=self.vocab_size) + context_logits = context_logits[cont_idxs[0]:] + context_logits = torch.cat([prompt_psuedologits, context_logits]) + + pad_len = max(prompt_lens) - context_logits.shape[0] + if pad_len != 0: + padding = torch.nn.functional.one_hot( + torch.full((pad_len,), self.PAD_ID), + num_classes=self.vocab_size, + ).to(device=context_logits.device) + context_logits = torch.cat([context_logits, padding]) + + logits_list.append(context_logits) + torch.cuda.synchronize() + + return torch.stack(logits_list).to(device=self.device, dtype=torch.float) + """ + # Batched version + # Context Logits beyond input lengths should be one-hot vectors + # with a one in the padding token position. 
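# Illustrative sketch of the one-hot "pseudologit" trick used throughout this section:
# context positions get a one-hot spike at the token that actually appears there, real
# generation logits cover the continuation, and the tail is filled with one-hot spikes at
# the pad id so every sequence stacks to (seqlen, vocab_size). Vocab size, ids, and
# lengths are toy values.
import torch

vocab_size, seqlen, PAD_ID = 16, 10, 0
context_tokens = torch.tensor([3, 5, 7, 2])     # prompt tokens
generation_logits = torch.randn(3, vocab_size)  # real logits for 3 generated steps

# Positions 1..len(prompt)-1 are "predicted" from the previous token, so the
# pseudologits skip the very first prompt token.
context_pseudologits = torch.nn.functional.one_hot(
    context_tokens[1:], num_classes=vocab_size).float()

combined = torch.cat([context_pseudologits, generation_logits])

padding = torch.nn.functional.one_hot(
    torch.full((seqlen - combined.shape[0],), PAD_ID),
    num_classes=vocab_size).float()
combined = torch.cat([combined, padding])

print(combined.shape)  # torch.Size([10, 16])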
+ """ + for i in range(batch_size): + pad_len = logits.shape[1] - prompt_lens[i] + if pad_len == 0: + continue + padding = torch.nn.functional.one_hot( + torch.full((pad_len,), self.PAD_ID), + num_classes=logits.shape[2], + ) + logits[i,prompt_lens[i]:,:] = padding return logits else: - ################# - # Batched version of language modeling tasks - batch_size = len(batch['input_ids']) + # Language Modeling Tasks seqlen = batch['input_ids'].shape[1] - #print("Seq len:", seqlen) - prompt_lens = [] continuation_lens = [] for tokens, cont_idxs in zip(batch['input_ids'], batch['continuation_indices']): tokens = tokens.tolist() cont_idxs = cont_idxs.tolist() expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] - prompt = tokens[:cont_idxs[0]] prompt_lens.append(cont_idxs[0]) continuation_lens.append(len(expected_cont_tokens)) - input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) - input_ids = torch.full((batch_size, max(prompt_lens)), fill_value=self.PAD_ID, device=self.device, dtype=torch.int) for i in range(batch_size): - input_ids[i][:prompt_lens[i]] = batch['input_ids'][i][:prompt_lens[i]] - + unpadded_input_ids_list += batch['input_ids'][i][:prompt_lens[i]].tolist() + + unpadded_input_ids = torch.tensor(unpadded_input_ids_list, dtype=torch.int, device=self.device) + input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) + + torch.cuda.synchronize() with torch.no_grad(): self.decoder.setup(batch_size, input_lengths.max().item(), max(continuation_lens)) - output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) + output_dict = self.decoder.decode(unpadded_input_ids, input_lengths, self.sampling_config, return_dict=True) torch.cuda.synchronize() output_logits_list = output_dict['generation_logits'] - #print("Output ids:", self.tokenizer.decode(output_dict['output_ids'][0][0][cont_idxs[0]:].tolist())) - - # output_logits_list length is == max(continuation_lens) - # Output logits_list[i] is of shape (batch_size, vocab_size) if len(output_logits_list) > 0: output_logits_tensor = torch.stack(output_logits_list, dim=1) else: output_logits_tensor = None # Put together logits - # We loop through batch_size dimension rather than deal with NestedTensor output_logits_batch = [] for i in range(batch_size): + prior_data = 0 if i == 0 else sum(prompt_lens[:i]) # First create context "logits" (one-hot vector with 1 at token position) - tokens = input_ids[i].tolist() context_psuedologits = torch.nn.functional.one_hot( - torch.tensor(tokens[1:prompt_lens[i]], device=self.device), + torch.tensor(unpadded_input_ids_list[prior_data+1:prior_data+prompt_lens[i]], device=self.device), num_classes=self.vocab_size) # Then add generation logits (up to continuation_length) if output_logits_tensor is not None: output_logits_trimmed = output_logits_tensor[i][:continuation_lens[i]] - # print("Output logits trimmed shape:", output_logits_trimmed.shape) combined_logits = torch.cat([context_psuedologits, output_logits_trimmed]) else: combined_logits = context_psuedologits @@ -404,74 +385,4 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): output_logits_batch.append(padded_combined_logits) return torch.stack(output_logits_batch).to(self.device) - ############################################### - # NON BATCHED VERSION - # Language modeling and multiple choice tasks - """ - for tokens, cont_idxs in zip(batch['input_ids'], - batch['continuation_indices']): - # print("******************************") - seqlen 
= tokens.shape[0] - tokens = tokens.tolist() - # print("Tokens:", tokens) - # print("Continuation indices:", cont_idxs) - cont_idxs = cont_idxs.tolist() - expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] - # print("Expected continuation tokens:", expected_cont_tokens) - prompt = tokens[:cont_idxs[0]] - - input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) - input_lengths = torch.tensor([input_ids.size(1)], - dtype=torch.int, - device=self.device) - # print("*** PROMPT:", self.tokenizer.decode(prompt)) - # print("Input device:", input_ids.get_device()) - # print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) - # print("Input lengths:", input_lengths) - #print("Expected continuation tokens:", len(expected_cont_tokens)) - with torch.no_grad(): - self.decoder.setup(input_lengths.size(0), - torch.max(input_lengths).item(), - len(expected_cont_tokens)) - - output_dict = self.decoder.decode( - input_ids, input_lengths, self.sampling_config, return_dict=True) - - torch.cuda.synchronize() - - context_psuedologits = torch.nn.functional.one_hot( - torch.tensor(tokens[1:cont_idxs[0]], device=self.device), - num_classes=self.vocab_size) - output_logits_list = output_dict['generation_logits'] - # print("Output ids:", output_dict['output_ids'][0][0][cont_idxs[0]:].tolist()) - for i in range(len(output_logits_list)): - output_logits_list[i] = output_logits_list[i].squeeze() - # print("*** Output string:", self.tokenizer.decode(output_dict['output_ids'][0][0][cont_idxs[0]:].tolist())) - #print("Context logits:", context_psuedologits.shape) - if len(output_logits_list) > 0: - output_logits_tensor = torch.stack(output_logits_list) - # print("Output logits stacked:", output_logits_tensor.shape) - combined_logits = torch.cat([context_psuedologits, output_logits_tensor]) - else: - combined_logits = context_psuedologits - #print("Seqlen", seqlen) - # print("Combined logits shape:", combined_logits.shape) - - padding = torch.nn.functional.one_hot( - torch.full( - (seqlen - combined_logits.shape[0],), - self.PAD_ID, - device=combined_logits.device - ), - num_classes=self.vocab_size) - padded_combined_logits = torch.cat([combined_logits, padding]) - - # print("Padded combined logits shape:", padded_combined_logits.shape) - - output_logits_batch.append(padded_combined_logits) - - return torch.stack(output_logits_batch).to(self.device) #(batch['input_ids'].device) - - #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) - """ diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index aa91e1b941..39fc9eb29e 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -17,6 +17,7 @@ QA_MC_TASKS = './eval/yamls/qa_mc_tasks_v0.3.yaml' ALL_TASKS = './eval/yamls/tasks_v0.3.yaml' LM_TASKS = './eval/yamls/lm_tasks_v0.3.yaml' +BROKEN_TASKS = './eval/yamls/broken_tasks.yaml' EVAL_GAUNTLET = './eval/yamls/eval_gauntlet_v0.3.yaml' def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): @@ -24,7 +25,7 @@ def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): 'run_name': 'trtllm-eval', 'seed': 0, 'max_seq_len': 2048, - 'device_eval_batch_size': 8, + 'device_eval_batch_size': 4, 'precision': 'amp_bf16', 'dist_timeout': 6000, 'models': @@ -37,8 +38,6 @@ def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): 'version': 'dbrx', 'engine_dir': engine_dir, 'log_level': 'error', - 'eos_token_id': 100257, - 'pad_token_id': 100277, }, 'tokenizer': { @@ 
@@ -61,13 +60,13 @@ def get_llama_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS):
 
-def get_llama_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS):
+def get_llama_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS):
     return {
         'run_name': 'trtllm-eval',
         'seed': 0,
-        'max_seq_len': 2048,
-        'device_eval_batch_size': 8, # Llama-7B should be batch size 32
-        'precision': 'amp_fp16',
+        'max_seq_len': 1024,
+        'device_eval_batch_size': 4, # Llama-7B should be batch size 32
+        'precision': 'amp_bf16',
         'dist_timeout': 6000,
         'models':
         [
@@ -79,20 +78,21 @@ def get_llama_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS):
                 'version': 'llama',
                 'engine_dir': engine_dir,
                 'log_level': 'error',
-                'eos_token_id': 2,
-                'pad_token_id': 2
+                'end_token_id': 128009,
+                'pad_token_id': 128001,
+
             },
             'tokenizer': {
                 'name': tokenizer_name,
-            }
+            }
         }
     ],
     'icl_tasks': icl_tasks,
     'eval_gauntlet': EVAL_GAUNTLET,
     'loggers': {
         'wandb': {
-            'project': 'nik-quant-eval'
+            'project': 'nik-llama3-eval'
         }
     }
 }
@@ -100,25 +100,20 @@ def get_llama_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS):
 def engine_dir_str(model_type, model_dir, variant, ngpus=8):
     return f"{trt_folder_path}examples/{model_type}/tmp/{model_type}/{model_dir}/trt_engines/{variant}/{ngpus}-gpu"
 
+LLAMA_TOK_DIR = '/mnt/workdisk/nikhil/models/llama3-70b-instruct-hf/'
+DBRX_TOK_DIR = '/mnt/workdisk/nikhil/models/dbrx-hf/03_25_hf_ckpt/'
 
-LLAMA_TOK_DIR = '/mnt/workdisk/nikhil/llama-70b-chat-hf/'
-DBRX_TOK_DIR = '/mnt/workdisk/nikhil/dbrx/03_25_hf_ckpt/'
+# LLama URLs
+#llama_bf16_engine_dir = '/mnt/workdisk/nikhil/engines/llama3_70b_bf16_logits_v0.10'
+#llama_fp8_engine_dir = '/mnt/workdisk/nikhil/engines/llama3_70b_fp8_logits_0521/'
+llama_fp8_engine_dir = '/mnt/workdisk/nikhil/engines/llama3_70b_fp8_logits_v2_v0.10'
+llama70b_config = get_llama_config(llama_fp8_engine_dir, LLAMA_TOK_DIR)
 
-LLAMA_7B_DIR = '7B-chat-quality-eval'
-LLAMA_70B_DIR = '70B-chat-quality-eval'
+#dbrx_bf16_engine_dir = '/mnt/workdisk/nikhil/engines/dbrx_bf16_logits_0521/'
+#dbrx_bf16_config = get_dbrx_config(dbrx_bf16_engine_dir, DBRX_TOK_DIR)
 
-# LLama URLs
-# fp8_engine_dir = '/mnt/workdisk/nikhil/engines-quality-eval/llama-2-70b-chat-tp8-fp8'
-# llama7b_int8_config = get_llama_config(engine_dir_str('llama', LLAMA_7B_DIR, 'int8_kv_cache_weight_only', 1), LLAMA_TOK_DIR)
-#llama70b_fp8_config = get_llama_config(fp8_engine_dir, LLAMA_TOK_DIR)
-# llama70b_int8_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'int8_kv_cache_weight_only'), LLAMA_TOK_DIR)
-# llama70b_smoothquant_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'sq0.8'), LLAMA_TOK_DIR)
-# llama70b_fp16_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'fp16'), LLAMA_TOK_DIR)
-
-dbrx_bf16_engine_dir = '/workspace/dbrx/03_25_tllm_engine_bf16_all_logits'
-dbrx_int8_engine_dir = '/mnt/workdisk/nikhil/dbrx/03_25_tllm_engine_int8_all_logits'
-dbrx_bf16_config = get_dbrx_config(dbrx_bf16_engine_dir, DBRX_TOK_DIR)
-dbrx_int8_config = get_dbrx_config(dbrx_int8_engine_dir, DBRX_TOK_DIR)
+dbrx_fp8_engine_dir = '/mnt/workdisk/nikhil/engines/dbrx_fp8_logits_v2_v0.10'
+dbrx_fp8_config = get_dbrx_config(dbrx_fp8_engine_dir, DBRX_TOK_DIR)
 
 def run_eval(config):
     print("RUNNING EVAL")
@@ -126,11 +121,6 @@ def run_eval(config):
     print("OmegaConfig dictionary", om.to_yaml(om_dict_config))
     run_evaluation(om_dict_config)
 
-# run_eval(llama7b_int8_config)
-# run_eval(llama70b_int8_config)
-# run_eval(llama70b_fp16_config)
-# run_eval(llama70b_fp8_config)
-# run_eval(llama70b_smoothquant_config)
-# run_eval(dbrx_bf16_config)
-run_eval(dbrx_int8_config)
+# run_eval(dbrx_fp8_config)
+run_eval(llama70b_config)
diff --git a/setup.py b/setup.py
index 7534d24503..d8dfddb058 100644
--- a/setup.py
+++ b/setup.py
@@ -55,7 +55,7 @@
     'accelerate>=0.25,<0.26',  # for HF inference `device_map`
     'transformers>=4.38.2,<4.39',
     'mosaicml-streaming>=0.7.4,<0.8',
-    'torch>=2.2.1,<2.3',
+    #'torch>=2.2.1,<2.3',
     'datasets>=2.16,<2.17',
     'fsspec==2023.6.0',  # newer version results in a bug in datasets that duplicates data
     'sentencepiece==0.1.97',
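Note (editorial sketch, not part of the patch): the batched language-modeling path in eval_forward above assembles each sample's logits from three pieces: one-hot "pseudologits" for the prompt tokens (shifted by one position), the decoder's real generation logits for the continuation, and one-hot PAD rows to fill the batch's sequence length. The standalone Python sketch below only illustrates that assembly in isolation; the helper name, arguments, and toy sizes are invented for the example and are not taken from the TRT-LLM API or from this patch.

# Illustrative sketch of the per-sample logits assembly (hypothetical helper, toy shapes).
import torch

def assemble_logits(prompt_tokens, gen_logits, seqlen, vocab_size, pad_id):
    # One-hot rows for the prompt, dropping the first token (mirrors tokens[1:prompt_len] above).
    context = torch.nn.functional.one_hot(prompt_tokens[1:],
                                          num_classes=vocab_size).float()
    # Append the real generation logits for the continuation steps, if any.
    combined = torch.cat([context, gen_logits]) if gen_logits is not None else context
    # Pad with one-hot PAD rows so every sample reaches the same sequence length.
    pad_rows = seqlen - combined.shape[0]
    if pad_rows > 0:
        padding = torch.nn.functional.one_hot(
            torch.full((pad_rows,), pad_id), num_classes=vocab_size).float()
        combined = torch.cat([combined, padding])
    return combined

# Toy usage: 4 prompt tokens, 3 continuation steps of real logits, padded to seqlen=10.
prompt = torch.tensor([5, 3, 7, 2])
gen = torch.randn(3, 16)
print(assemble_logits(prompt, gen, seqlen=10, vocab_size=16, pad_id=0).shape)
# -> torch.Size([10, 16])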