From 6dbfb271e3f4e7162b8fb45580272a2ecf9a4a3c Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Thu, 24 Aug 2023 16:13:21 -0700 Subject: [PATCH 01/21] Add TRT ComposerModel inference wrapper --- .../models/inference_api_wrapper/__init__.py | 6 + .../models/inference_api_wrapper/trtllm.py | 166 ++++++++++++++++++ llmfoundry/models/model_registry.py | 4 +- 3 files changed, 175 insertions(+), 1 deletion(-) create mode 100644 llmfoundry/models/inference_api_wrapper/__init__.py create mode 100644 llmfoundry/models/inference_api_wrapper/trtllm.py diff --git a/llmfoundry/models/inference_api_wrapper/__init__.py b/llmfoundry/models/inference_api_wrapper/__init__.py new file mode 100644 index 0000000000..dbb5530f11 --- /dev/null +++ b/llmfoundry/models/inference_api_wrapper/__init__.py @@ -0,0 +1,6 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from llmfoundry.models.inference_api_wrapper.trtllm import TRTLLMEvalWrapper + +__all__ = ['TRTLLMEvalWrapper'] diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py new file mode 100644 index 0000000000..2a51d590cc --- /dev/null +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -0,0 +1,166 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +"""Implements a TRT-LLM evaluation model wrapped around a +:class:`.ComposerModel`.""" + +import json +from pathlib import Path +from typing import Any, Optional + +import tensorrt_llm +import torch +from omegaconf import DictConfig +from tensorrt_llm.runtime import ModelConfig, SamplingConfig +from transformers import PreTrainedTokenizer + +from llmfoundry.models.inference_api_wrapper.interface import \ + InferenceAPIEvalWrapper + +__all__ = ['TRTLLMEvalWrapper'] + + +# From tensorrt_llm/examples/{model_name}/build.py +def get_engine_name(model: str, dtype: str, tp_size: int, rank: int): + return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) + + +class TRTLLMEvalWrapper(InferenceAPIEvalWrapper): + + def __init__( + self, + model_cfg: DictConfig, + tokenizer: PreTrainedTokenizer, + ): + + super().__init__(model_cfg, tokenizer) + + tensorrt_llm.logger.set_level(model_cfg['log_level']) + + # Load TRT config from file + engine_dir = Path(model_cfg['engine_dir']) + config_path = engine_dir / 'config.json' + with open(config_path, 'r') as f: + config = json.load(f) + + # Set vars from config + use_gpt_attention_plugin = config['plugin_config'][ + 'gpt_attention_plugin'] + inflight_batching_gpt_attention_plugin = config['plugin_config'][ + 'inflight_batching_gpt_attention_plugin'] + remove_input_padding = config['plugin_config']['remove_input_padding'] + if remove_input_padding: + raise ValueError( + 'TRT-LLM Evaluation Wrapper does not support remove_input_padding.' 
+ ) + dtype = config['builder_config']['precision'] + world_size = config['builder_config']['tensor_parallel'] + assert world_size == tensorrt_llm.mpi_world_size(), \ + f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' + num_heads = config['builder_config']['num_heads'] // world_size + hidden_size = config['builder_config']['hidden_size'] // world_size + vocab_size = config['builder_config']['vocab_size'] + num_layers = config['builder_config']['num_layers'] + multi_query_mode = config['builder_config']['multi_query_mode'] + paged_kv_cache = config['builder_config'].get('paged_kv_cache', False) + tokens_per_block = config['builder_config'].get('tokens_per_block', 64) + use_prompt_tuning = config['builder_config'].get( + 'use_prompt_tuning', False) + + self.hidden_size = hidden_size + self.vocab_size = vocab_size + + # Device and rank + runtime_rank = tensorrt_llm.mpi_rank() + runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank) + torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) + + # Tokenization and sampling + self.END_ID = model_cfg.get('eos_token_id', self.tokenizer.eos_token_id) + self.PAD_ID = model_cfg.get('pad_token_id', self.tokenizer.pad_token_id) + if self.PAD_ID == None: + self.tokenizer.pad_token = self.tokenizer.eos_token + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + + print('EOS TOKEN:', self.END_ID) + print('Pad token:', self.PAD_ID) + + self.sampling_config = SamplingConfig(end_id=self.END_ID, + pad_id=self.PAD_ID, + num_beams=1) + + # Load TRT engine + engine_name = get_engine_name(model_cfg['version'], dtype, world_size, + runtime_rank) + serialize_path = engine_dir / engine_name + with open(serialize_path, 'rb') as f: + engine_buffer = f.read() + + # Initialize generation session for model + trt_model_config = ModelConfig( + num_heads=num_heads, + hidden_size=self.hidden_size, + vocab_size=self.vocab_size, + num_layers=num_layers, + gpt_attention_plugin=use_gpt_attention_plugin, + inflight_batching_gpt_attention_plugin= + inflight_batching_gpt_attention_plugin, + multi_query_mode=multi_query_mode, + remove_input_padding=remove_input_padding, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + use_prompt_tuning=use_prompt_tuning) + self.decoder = tensorrt_llm.runtime.GenerationSession( + trt_model_config, engine_buffer, runtime_mapping) + + def eval_forward(self, batch, outputs: Optional[Any] = None): + # If the batch mode is generate, we will generate a requested number of tokens using the underlying + # model's generate function. 
Strings will be returned from eval_forward + output_logits_batch = [] + batch = self.rebatch(batch) + for tokens, cont_idxs in zip(batch['input_ids'], + batch['continuation_indices']): + + seqlen = tokens.shape[0] + tokens = tokens.tolist() + cont_idxs = cont_idxs.tolist() + expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] + + prompt = tokens[:cont_idxs[0]] + input_ids = torch.tensor([prompt], dtype=torch.int, device='cuda') + input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device='cuda') + #print("prompt:", self.tokenizer.decode(prompt)) + #print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) + #print("Input lengths:", input_lengths) + #print(cont_idxs[0]) + #print("Expected continuation tokens:", len(expected_cont_tokens)) + self.decoder.setup(input_lengths.size(0), + torch.max(input_lengths).item(), + len(expected_cont_tokens)) + + output_idsg, output_logits_list = self.decoder.decode( + input_ids, input_lengths, self.sampling_config) + + #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) + + output_logits = torch.nn.functional.one_hot( + torch.tensor(tokens[1:cont_idxs[0]], device='cuda'), + num_classes=self.vocab_size) + + for i in range(len(output_logits_list)): + output_logits_list[i] = output_logits_list[i].squeeze() + + next_logit_tensor = torch.stack(output_logits_list) + output_logits = torch.cat([output_logits, next_logit_tensor]) + #print(output_logits.shape) + #print(output_ids[0][0][cont_idxs[0]:].tolist()) + padding = torch.nn.functional.one_hot(torch.full( + (seqlen - output_logits.shape[0],), + self.PAD_ID, + device=output_logits.device), + num_classes=self.vocab_size) + output_logits = torch.cat([output_logits, padding]) + #print("Output logits shape:", output_logits.shape) + output_logits_batch.append(output_logits) + + return torch.stack(output_logits_batch).to(batch['input_ids'].device) diff --git a/llmfoundry/models/model_registry.py b/llmfoundry/models/model_registry.py index 02a709740e..557ce17cc3 100644 --- a/llmfoundry/models/model_registry.py +++ b/llmfoundry/models/model_registry.py @@ -4,10 +4,12 @@ from llmfoundry.models.hf import (ComposerHFCausalLM, ComposerHFPrefixLM, ComposerHFT5) from llmfoundry.models.mpt import ComposerMPTCausalLM +from llmfoundry.models.inference_api_wrapper import TRTLLMEvalWrapper COMPOSER_MODEL_REGISTRY = { - 'mpt_causal_lm': ComposerMPTCausalLM, + l 'mpt_causal_lm': ComposerMPTCausalLM, 'hf_causal_lm': ComposerHFCausalLM, 'hf_prefix_lm': ComposerHFPrefixLM, 'hf_t5': ComposerHFT5, + 'trtllm': TRTLLMEvalWrapper } From c312d21b9ef3bb5bb4643ababf16cd7ee5063cee Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Thu, 24 Aug 2023 16:32:29 -0700 Subject: [PATCH 02/21] Fix precommit --- .../models/inference_api_wrapper/trtllm.py | 24 +++++++------------ llmfoundry/models/model_registry.py | 4 ++-- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index 2a51d590cc..34a2103fc2 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -6,7 +6,7 @@ import json from pathlib import Path -from typing import Any, Optional +from typing import Any, Optional, Dict import tensorrt_llm import torch @@ -113,7 +113,7 @@ def __init__( self.decoder = tensorrt_llm.runtime.GenerationSession( trt_model_config, engine_buffer, runtime_mapping) - def eval_forward(self, batch, outputs: Optional[Any] 
= None): + def eval_forward(self, batch: Dict, outputs: Optional[Any] = None): # If the batch mode is generate, we will generate a requested number of tokens using the underlying # model's generate function. Strings will be returned from eval_forward output_logits_batch = [] @@ -128,20 +128,17 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): prompt = tokens[:cont_idxs[0]] input_ids = torch.tensor([prompt], dtype=torch.int, device='cuda') - input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device='cuda') - #print("prompt:", self.tokenizer.decode(prompt)) - #print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) - #print("Input lengths:", input_lengths) - #print(cont_idxs[0]) - #print("Expected continuation tokens:", len(expected_cont_tokens)) + input_lengths = torch.tensor([input_ids.size(1)], + dtype=torch.int, + device='cuda') + self.decoder.setup(input_lengths.size(0), torch.max(input_lengths).item(), len(expected_cont_tokens)) - output_idsg, output_logits_list = self.decoder.decode( - input_ids, input_lengths, self.sampling_config) - - #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) + _, output_logits_list = self.decoder.decode(input_ids, + input_lengths, + self.sampling_config) output_logits = torch.nn.functional.one_hot( torch.tensor(tokens[1:cont_idxs[0]], device='cuda'), @@ -152,15 +149,12 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): next_logit_tensor = torch.stack(output_logits_list) output_logits = torch.cat([output_logits, next_logit_tensor]) - #print(output_logits.shape) - #print(output_ids[0][0][cont_idxs[0]:].tolist()) padding = torch.nn.functional.one_hot(torch.full( (seqlen - output_logits.shape[0],), self.PAD_ID, device=output_logits.device), num_classes=self.vocab_size) output_logits = torch.cat([output_logits, padding]) - #print("Output logits shape:", output_logits.shape) output_logits_batch.append(output_logits) return torch.stack(output_logits_batch).to(batch['input_ids'].device) diff --git a/llmfoundry/models/model_registry.py b/llmfoundry/models/model_registry.py index 557ce17cc3..1b0f4210ac 100644 --- a/llmfoundry/models/model_registry.py +++ b/llmfoundry/models/model_registry.py @@ -3,11 +3,11 @@ from llmfoundry.models.hf import (ComposerHFCausalLM, ComposerHFPrefixLM, ComposerHFT5) -from llmfoundry.models.mpt import ComposerMPTCausalLM from llmfoundry.models.inference_api_wrapper import TRTLLMEvalWrapper +from llmfoundry.models.mpt import ComposerMPTCausalLM COMPOSER_MODEL_REGISTRY = { - l 'mpt_causal_lm': ComposerMPTCausalLM, + 'mpt_causal_lm': ComposerMPTCausalLM, 'hf_causal_lm': ComposerHFCausalLM, 'hf_prefix_lm': ComposerHFPrefixLM, 'hf_t5': ComposerHFT5, From 170359147faf360149f9e0c3c6a45e8a17f4abd4 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Thu, 24 Aug 2023 17:30:01 -0700 Subject: [PATCH 03/21] Add base wrapper --- .../models/inference_api_wrapper/__init__.py | 4 +- .../models/inference_api_wrapper/interface.py | 110 ++++++++++++++++++ .../models/inference_api_wrapper/trtllm.py | 37 ++++-- 3 files changed, 142 insertions(+), 9 deletions(-) create mode 100644 llmfoundry/models/inference_api_wrapper/interface.py diff --git a/llmfoundry/models/inference_api_wrapper/__init__.py b/llmfoundry/models/inference_api_wrapper/__init__.py index dbb5530f11..24b0af9ca2 100644 --- a/llmfoundry/models/inference_api_wrapper/__init__.py +++ b/llmfoundry/models/inference_api_wrapper/__init__.py @@ -1,6 +1,8 @@ # Copyright 2022 MosaicML LLM Foundry authors # 
SPDX-License-Identifier: Apache-2.0 +from llmfoundry.models.inference_api_wrapper.interface import \ + InferenceAPIEvalWrapper from llmfoundry.models.inference_api_wrapper.trtllm import TRTLLMEvalWrapper -__all__ = ['TRTLLMEvalWrapper'] +__all__ = ['InferenceAPIEvalWrapper', 'TRTLLMEvalWrapper'] diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py new file mode 100644 index 0000000000..3ee896648a --- /dev/null +++ b/llmfoundry/models/inference_api_wrapper/interface.py @@ -0,0 +1,110 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Dict, Optional + +import torch +from composer.core.types import Batch +from composer.metrics import InContextLearningMetric +# required for loading a python model into composer +from composer.metrics.nlp import (InContextLearningLMAccuracy, + InContextLearningLMExpectedCalibrationError, + InContextLearningMCExpectedCalibrationError, + InContextLearningMultipleChoiceAccuracy, + InContextLearningQAAccuracy, + LanguageCrossEntropy, LanguagePerplexity) +from composer.models import ComposerModel +from torchmetrics import Metric +from transformers import AutoTokenizer + + +class InferenceAPIEvalWrapper(ComposerModel): + + def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer): + self.model_name = model_cfg['version'] + self.tokenizer = tokenizer + self.labels = None + # set up training and eval metrics + eval_metrics = [ + LanguageCrossEntropy(), + LanguagePerplexity(), + InContextLearningLMAccuracy(), + InContextLearningMultipleChoiceAccuracy(), + InContextLearningQAAccuracy(), + InContextLearningLMExpectedCalibrationError(), + InContextLearningMCExpectedCalibrationError() + ] + self.eval_metrics = { + metric.__class__.__name__: metric for metric in eval_metrics + } + super(InferenceAPIEvalWrapper, self).__init__() + self.mocked_layer = torch.nn.Linear(2, 3) + + def get_metrics(self, is_train: bool = False): + if is_train: + metrics = [] + else: + metrics = self.eval_metrics + + return metrics if metrics else {} + + def get_next_token_logit_tensor(self, prompt: str): + raise NotImplementedError + + def rebatch(self, batch: Batch): + # default is a no-op, but Chat API modifies these + return batch + + def eval_forward(self, batch: Batch, outputs: Optional[Any] = None): + # If the batch mode is generate, we will generate a requested number of tokens using the underlying + # model's generate function. Extra generation kwargs can be passed in via the batch. 
Strings will + # be returned from eval_forward + output_logits_batch = [] + for tokens, cont_idxs in zip(batch['input_ids'], + batch['continuation_indices']): + + seqlen = tokens.shape[0] + tokens = tokens.tolist() + cont_idxs = cont_idxs.tolist() + expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] + output_logits = torch.nn.functional.one_hot( + torch.tensor(tokens[1:cont_idxs[0]]), + num_classes=self.tokenizer.pad_token_id + 1) + for i in range(len(expected_cont_tokens)): + # decode one token at a time + prompt = self.tokenizer.decode(tokens[:cont_idxs[0]] + + expected_cont_tokens[0:i]) + next_logit_tensor = self.get_next_token_logit_tensor(prompt) + if next_logit_tensor is None: + continue + output_logits = torch.cat( + [output_logits, + next_logit_tensor.reshape(1, -1)]) + padding = torch.nn.functional.one_hot( + torch.full((seqlen - output_logits.shape[0],), + self.tokenizer.pad_token_id), + num_classes=self.tokenizer.pad_token_id + 1) + output_logits = torch.cat([output_logits, padding]) + output_logits_batch.append(output_logits) + + return torch.stack(output_logits_batch).to(batch['input_ids'].device) + + def update_metric(self, batch: Any, outputs: Any, metric: Metric) -> None: + batch = self.rebatch(batch) + self.labels = batch.pop('labels') + self.labels[:, :-1] = self.labels[:, 1:].clone() + self.labels[:, -1] = -100 + if isinstance(metric, InContextLearningMetric) and batch.get( + 'mode', None) == 'icl_task': + assert self.labels is not None + metric.update(batch, outputs, self.labels) + else: + metric.update( + outputs, + self.labels) # pyright: ignore [reportGeneralTypeIssues] + + def forward(self): + pass + + def loss(self): + pass diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index 34a2103fc2..7f6fee9c7a 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -6,12 +6,10 @@ import json from pathlib import Path -from typing import Any, Optional, Dict +from typing import Any, Optional -import tensorrt_llm import torch from omegaconf import DictConfig -from tensorrt_llm.runtime import ModelConfig, SamplingConfig from transformers import PreTrainedTokenizer from llmfoundry.models.inference_api_wrapper.interface import \ @@ -19,6 +17,20 @@ __all__ = ['TRTLLMEvalWrapper'] +try: + import tensorrt_llm + from tensorrt_llm.runtime import ModelConfig, SamplingConfig + TRT_LLM_INSTALLED = True +except ImportError: + TRT_LLM_INSTALLED = False + + +def check_if_trt_llm_installed(): + if not TRT_LLM_INSTALLED: + raise ImportError( + 'TRT-LLM is not installed. It must be installed to use the TRTLLMEValWrapper.' + ) + # From tensorrt_llm/examples/{model_name}/build.py def get_engine_name(model: str, dtype: str, tp_size: int, rank: int): @@ -32,6 +44,7 @@ def __init__( model_cfg: DictConfig, tokenizer: PreTrainedTokenizer, ): + check_if_trt_llm_installed() super().__init__(model_cfg, tokenizer) @@ -113,7 +126,7 @@ def __init__( self.decoder = tensorrt_llm.runtime.GenerationSession( trt_model_config, engine_buffer, runtime_mapping) - def eval_forward(self, batch: Dict, outputs: Optional[Any] = None): + def eval_forward(self, batch, outputs: Optional[Any] = None): # If the batch mode is generate, we will generate a requested number of tokens using the underlying # model's generate function. 
Strings will be returned from eval_forward output_logits_batch = [] @@ -131,14 +144,19 @@ def eval_forward(self, batch: Dict, outputs: Optional[Any] = None): input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device='cuda') - + #print("prompt:", self.tokenizer.decode(prompt)) + #print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) + #print("Input lengths:", input_lengths) + #print(cont_idxs[0]) + #print("Expected continuation tokens:", len(expected_cont_tokens)) self.decoder.setup(input_lengths.size(0), torch.max(input_lengths).item(), len(expected_cont_tokens)) - _, output_logits_list = self.decoder.decode(input_ids, - input_lengths, - self.sampling_config) + output_ids, output_logits_list = self.decoder.decode( + input_ids, input_lengths, self.sampling_config) + + #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) output_logits = torch.nn.functional.one_hot( torch.tensor(tokens[1:cont_idxs[0]], device='cuda'), @@ -149,12 +167,15 @@ def eval_forward(self, batch: Dict, outputs: Optional[Any] = None): next_logit_tensor = torch.stack(output_logits_list) output_logits = torch.cat([output_logits, next_logit_tensor]) + #print(output_logits.shape) + #print(output_ids[0][0][cont_idxs[0]:].tolist()) padding = torch.nn.functional.one_hot(torch.full( (seqlen - output_logits.shape[0],), self.PAD_ID, device=output_logits.device), num_classes=self.vocab_size) output_logits = torch.cat([output_logits, padding]) + #print("Output logits shape:", output_logits.shape) output_logits_batch.append(output_logits) return torch.stack(output_logits_batch).to(batch['input_ids'].device) From 33e52897e868c5e139a9133a4d41bc2812b1bc99 Mon Sep 17 00:00:00 2001 From: nik-mosaic <101217697+nik-mosaic@users.noreply.github.com> Date: Tue, 12 Dec 2023 10:47:07 -0600 Subject: [PATCH 04/21] Update model_registry.py --- llmfoundry/models/model_registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/models/model_registry.py b/llmfoundry/models/model_registry.py index 332804a44d..25ca5442f3 100644 --- a/llmfoundry/models/model_registry.py +++ b/llmfoundry/models/model_registry.py @@ -13,7 +13,7 @@ 'hf_causal_lm': ComposerHFCausalLM, 'hf_prefix_lm': ComposerHFPrefixLM, 'hf_t5': ComposerHFT5, - 'trtllm': TRTLLMEvalWrapper + 'trtllm': TRTLLMEvalWrapper, 'openai_causal_lm': OpenAICausalLMEvalWrapper, 'openai_chat': OpenAIChatAPIEvalWrapper } From da7b235833fbae49cc017fc95e1e5cf070cd96c9 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Wed, 13 Dec 2023 06:39:38 -0800 Subject: [PATCH 05/21] add changes to make llmfoundry install and test trtllm --- llmfoundry/__init__.py | 7 +- .../inference_api_wrapper/.trtllm.py.swo | Bin 0 -> 16384 bytes .../inference_api_wrapper/.trtllm.py.swp | Bin 0 -> 20480 bytes .../models/inference_api_wrapper/__init__.py | 1 + .../models/inference_api_wrapper/trtllm.py | 31 +++---- llmfoundry/models/mpt/modeling_mpt.py | 3 +- scripts/eval/eval.py | 2 +- scripts/eval/evaluate_trtllm_test.py | 78 ++++++++++++++++++ setup.py | 2 +- 9 files changed, 103 insertions(+), 21 deletions(-) create mode 100644 llmfoundry/models/inference_api_wrapper/.trtllm.py.swo create mode 100644 llmfoundry/models/inference_api_wrapper/.trtllm.py.swp create mode 100644 scripts/eval/evaluate_trtllm_test.py diff --git a/llmfoundry/__init__.py b/llmfoundry/__init__.py index 85f96aadb9..023bfa2372 100644 --- a/llmfoundry/__init__.py +++ b/llmfoundry/__init__.py @@ -39,9 +39,10 @@ is_cuda_available = False extras = '.[gpu]' if 
is_cuda_available else '.' - raise ImportError( - f'Please make sure to pip install {extras} to get the requirements for the LLM example.' - ) from e + print("ImportError:", e) + # raise ImportError( + # f'Please make sure to pip install {extras} to get the requirements for the LLM example.' + #) from e __all__ = [ 'build_text_denoising_dataloader', diff --git a/llmfoundry/models/inference_api_wrapper/.trtllm.py.swo b/llmfoundry/models/inference_api_wrapper/.trtllm.py.swo new file mode 100644 index 0000000000000000000000000000000000000000..65e810630861a82ffa1e760d1eee34e247eae123 GIT binary patch literal 16384 zcmeHO&yO5O6)s{YI1n2|;1q<)J3>8+W_s2e$8rYtu=1|OhF$L}y9tQ3snm4W%rx86 z-L9@)du*-X!i66PB#^jpK}g}(f#eT}xFjM-K>_6g9FVvnt{{aA@Kts9bkFQ~_6S1N z^0(7n_3G8DuU@^Xt}gdhUf$ZF*WFcu>!XBRdF#FiE(&HzAtY_ zu`j~36GVL>MdS&_lYrfmJV}J?D5)UoCgZd0vH6w(%fLJaE|OK0pA3MzzaYJc<(7fegnJ>{2KT^unRm6oCnSUzxyyD?*eZCzXZMu1V9g113m{_ z1fBwZ`yoQ!0e%V`1Dn7zz(1cP0u__;LBHM(y0%7d#gXVv@@tKRI&%_*dHsPq z5TK)OZ%N4f)>H%;g*h3g71o=0T46o6QrMsex<`TsTm^ANdF0cwuPd@uEWM%YWj=RP zp_ng@15Y$%7O7w)nB>tB{WL8fM>tsnPjLrHmNJ~NVG$1CR29yMp%qF*X)L8;`XtUy zUb=~6J)zH1bZl#)Syyygk*R)0%Lqq&bb7NAZ_o{6wrQfytD0v>J&rxzGZn(5>S4Mo zhk@^lsFbQbRmvK4-S_F;j7Q3JI342H(L@Wy zKJ*TqUKWHtc+H=kJI0$PH5d=diSLE6cXU=kXSM*0xpuYUE#U*!Et!(B{Lc(vxL+d5yl957}vr zPF1S8e6s9J?B!`Hq^h+<2fT%)TlDs%bftA6ak)O0aLL0^gf;4YHBJ`$OCp6Aq=;j+ zQVq>c<<90&CpiZ%9<;Z&cj)FZ53@Y_eK9{3Nc~u(G>R2Xvm}YpWGP1Rv0y=zWD2JF zz7B8|F-EE6K`Q9WP$o8|jAfJPYV_5>G%-&@HBZdTt+AAZ8TxiI;18X7^0hYWN}U11 z3s<012X>aLsLALEGf!kewM6UHSA*^gMUOcKUmtO-h)h)s)H0KW!dKHca>JPW&3@Gl z_X0JfaU!B-E(cQ8utgo|EK?4x6*aYdm?n&qGe3~DL!CU&)iqR|Q4?9Cdr;e)%)5t9 zmFe86tZC3~*hk?JrP-!g{U8)oqoy1^j0XcOIrMB8ienMh>p)(fsH-5F3w^V{dmRoQwT%%6PGrxQc5fDbYBH9Z(Fdbkwhq`6g|sa@aOs-?rS zyxmf$Oh(jsc*5{cCHSLf-iIe{-Vcu3kL8G~oQoDUBOG)j<-~6$d~FOCOswd;h>t{G zpi5IFi&DjXQM603;KafK<;^WDYS5e7RqMUj5%={XR-~PWIWt=INyp5u`H|p~xXM*@ z$|Q6Id6by+j7zsL>1U~;Jpl`f{zCUsj5ew;eHI)dN)#N{qDUSzP-J<(IuLgMQ&1d!>OVeNM^l3#C=aBiq`7PooZf*1DWOP7&A8NJi5i2 zjt0hdx}#5A7tS^Fo*$%skBX>G&50y;4HQtx}H;Z+Q7NgRuLGdjku{7!$E!`4x9*X^8+^+L1>H_9^As zSIpEVs;SKoMe}3tKIKxMlcyYKcIp1!#+TaLSkOnQXm4S8tAc(YWS6evRQ*u2SKSpz z-H4O139S^!xv-`u!Zx@BetEHP^53UjKFA z1z-*MTOspDEoU#wfMvikU>UFsSOzQumI2FvWxz6E8L$le?=YaZFUm{5d6)P4x7h8< zKbB$d$2)4qnbJGP*bxj=%6eRR!(~^+O>=kXHWh&j?sx;ZWmo{?`~!H03Ft*XS!yl-Jb5g z?wZ`A+>-(TP_rUT=cGOxhaDB)yzWve;f7-b2)*pG-7;zY;^!T7_ z{_cC+x$fP(*}bdVZ62BDPP5xT%A3y7e$VZ3=dOi2k2L2DV|F`?H zaN}M2o&Ow#F87mJFX;1_2R&{(k!NqmP89KYHjPshwW8gF^r_xg3{(tEFfdO0&7+6T z+_^B1nP?a@hi_*$-*jf8P<2}|P%%(3P%%(3P%%(3P%%(3P%-fT%s?7lZF~(Kd{5R% zC;NNV#NTVO`xDt;cjEok+5Pp|-{&XZXSda#ih+uOih+uOih+uOih+uOih+uOih+uO zih+uO{{aIIKIFfFcJH7A0L1_E@Bd$a&@g@gJO`Ww&H(Gc7l2vdM&J-&0DrjAFkS$D z2y}oi11;bl;0W+h;H^W3@iOo%@Hp@-;G4h@xDS{G-nhXqeh&N$_$lxta2mK4xCXcy z_|*pt<8|O=;CbLXfCqGeHt-qX5O4$V%l8|`FM#g@Tfh=w&-G0OJYZG2k?CBk=lrp(ntAH?A{`r-6roTY%TDHHG4R{WC z6u1+38;2x+27U~j1sGreuYmh=z%H-~m;jB(KX3^1Y<>`f;@K6&UXa$zRmWv%xXFW- z*h@!sNFLTJ+ZFVXQc&m6{A{gRCo{HV;53B*lP?Yt} z2V;6~)Zhq#l8n6q%U#?nu>4jDoMG!^4c<8?^}>KTfy)wS6!~5-P_~nXac^k##;#)} zJhffE<@Ihrz;wSO6o=* z%Ax1FJjhFx?vyZw%N!@%j^c0>rFJ?Fzy^lnUdK;LJnSigW_w}K_XZuaJNA4R%*lIm zL)c59iLen#ZPyQbn+H`iCo4diyN@VlB4@x|dvnX~IlUo2P?}zRZ^uSsKlSXhV;=9? 
zWbyk5?3j~<&5fd)!FXi*&MuFWeI=-3rMhbT_N}XmmFkv8b>GURa^?n`on^{owLT2F z;~v}vQR+3jt1znqkMA#H}y+)%Y@^$T^g(AP;4 z9;cZgt{E%@(sVA|UfOOZdu*cKDMV@@ky>N7QJ4&Yv#6m*j7wxx(t z8S*gGitN*@L2~JuVQV~PTMi~)AAT(JOuCno-U-sanuLMnhmKq8t6|vo(jf~Y9@Jzx zkeUt~%#6)Cb0AewQ+f|!To}3SdNG@2rhI2n3u-V*McQl?Vr#|9+A-Bjb35THUf?MP6##szyU@ zxoDSCDi3Gd+qAvDjTBW{`%o=OX-c@+vQ<|c75jn~TaEcTEHU72wsP~hQRq+*(PE2U zFFh_66!n@`I(g^vJ}VS8jGm9FPxN5E&~(Y>No_gLsXey3TzU@8WWQscj^lut=P%g! zCynqgT3gRwu;e(HRzHkKPAa)*FfqeHN0cw^Vo4`uuqCpyu?w6LmlawpN#m?m_B*TE zV%W}NYk>OVCp4O2_mZiWM{$<#&}N1wv*#rKXvSYKsIa6Wj-ROm_JIy~tmijlF{ za>_>IBxPL=4YKjV@G{IcmSXwLIE0tTI4pS)=|Gl8(np3t40q>Ss# zxY$~@UzGPkKvt|Q!bsy?sB7D>aNCA{9U@sbH#jdtw5^i6EirD1H8r^#Pg5TEks6*W zb_sjU#HKKpgbb246`77xrjyhm9Pxoe{wFV~CX|u)kM_m6>|NUR=4FosyA5_aBFF6b zQXcs~-|OZj$ZzFx$i3m?S4_gzBi3->u`fH9y#IK4!1W_;S7k6Zkvg^%sH10S8zBt^(dfZ2l?WA>cva!vMwTzXTiw zZUNpxEdD9rNgx7F01WsI4k+FPo&`F}o$I(;bx%Tc=GR^Pi|iw{pBj&Q48Jyl#0ZAJrztM$-+^wsEA_vMA79o13VW55!WY zAVu|3FUX?2Qc)U~%j`rZu53~|p?Q1DG!i70MuHm9duWu|(CfeknCqmDs)Q#}K`8^b z^<17bm}y=l7b>$$E^T@8{lR0U6xJt@))Q4|$}bVpYs)dGn4DHqqo<-%U3nahEkH!oZn!dV)H78 zh3G2EqI&VWSW7eorQH_IOfQtJjKm1_LK+DcC|!SQdFqvIkx7$$z98_7T{XvUxfF3G z-!5BdD7DGgpYx(A`)oCEq5uyQ93cPwIu6cV$c{k9XkMoz&DTkebStkjNTWel<1TzG?RBd>@>*LWJNP!Cs2>QMeaDHXi%CxD{17_ik*7v<$9?>du2foFmT;%oExS}QrgDB=7Ra?eUwlBV z@q`j;^c~Rv(McqrQLI*UIs2x8Gzt={rikA?nQZMnPn2~~1xcdF4HOA%n7(Am>VaNd zPdIDqw|dcOSu^cv5A|x3S!nIa2YT^oNj0L+(_ZLRRkm7N{nU%7q_oK$w_dK4e=0.20,<0.21', # for HF inference `device_map` 'transformers>=4.34.1,<4.35', 'mosaicml-streaming>=0.7.1,<0.8', - 'torch>=2.1,<2.1.1', + # 'torch>=2.1,<2.1.1', # Already installed in TRT-LLM container 'datasets>=2.14.5,<2.15', 'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data 'sentencepiece==0.1.97', From a872c3d4febd698344738f8b1ef171881d7c189c Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Wed, 13 Dec 2023 08:25:57 -0800 Subject: [PATCH 06/21] add new yamls, fix trt bugs --- .../models/inference_api_wrapper/interface.py | 5 +- .../models/inference_api_wrapper/trtllm.py | 100 +++++++++++------- scripts/eval/evaluate_trtllm_test.py | 4 +- .../eval/yamls/mini_eval_gauntlet_v0.2.yaml | 13 +++ scripts/eval/yamls/mini_tasks_v0.2.yaml | 8 ++ 5 files changed, 88 insertions(+), 42 deletions(-) create mode 100644 scripts/eval/yamls/mini_eval_gauntlet_v0.2.yaml create mode 100644 scripts/eval/yamls/mini_tasks_v0.2.yaml diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py index 6a6fc14888..1d30b57b60 100644 --- a/llmfoundry/models/inference_api_wrapper/interface.py +++ b/llmfoundry/models/inference_api_wrapper/interface.py @@ -41,8 +41,9 @@ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer): def get_metrics(self, is_train: bool = False): if is_train: - raise NotImplementedError( - 'You cannot use inference wrappers for training') + metrics = {} # Cannot use inference wrappers for training + # raise NotImplementedError( + # 'You cannot use inference wrappers for training') else: metrics = self.eval_metrics diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index b1226ca6cd..b4490a6fb4 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -19,6 +19,7 @@ try: import tensorrt_llm from tensorrt_llm.runtime import ModelConfig, SamplingConfig + from tensorrt_llm.quantization import QuantMode TRT_LLM_INSTALLED = True except ImportError: TRT_LLM_INSTALLED = False @@ -32,8 
+33,11 @@ def check_if_trt_llm_installed(): # From tensorrt_llm/examples/{model_name}/build.py -def get_engine_name(model: str, dtype: str, tp_size: int, rank: int): - return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) +def get_engine_name(model: str, dtype: str, tp_size: int, pp_size: int, rank: int): + if pp_size == 1: + return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) + return '{}_{}_tp{}_pp{}_rank{}.engine'.format(model, dtype, tp_size, + pp_size, rank) class TRTLLMEvalWrapper(InferenceAPIEvalWrapper): @@ -54,37 +58,67 @@ def __init__( config_path = engine_dir / 'config.json' with open(config_path, 'r') as f: config = json.load(f) + + dtype = config['builder_config']['precision'] + tp_size = config['builder_config']['tensor_parallel'] + pp_size = config['builder_config'].get('pipeline_parallel', 1) + world_size = tp_size * pp_size - # Set vars from config - use_gpt_attention_plugin = config['plugin_config'][ - 'gpt_attention_plugin'] + assert world_size == tensorrt_llm.mpi_world_size(), \ + f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' + + num_heads = config['builder_config']['num_heads'] // tp_size + hidden_size = config['builder_config']['hidden_size'] // tp_size + vocab_size = config['builder_config']['vocab_size'] + num_layers = config['builder_config']['num_layers'] + use_gpt_attention_plugin = bool( + config['plugin_config']['gpt_attention_plugin']) remove_input_padding = config['plugin_config']['remove_input_padding'] #if remove_input_padding: # raise ValueError( # 'TRT-LLM Evaluation Wrapper does not support remove_input_padding.' # ) - dtype = config['builder_config']['precision'] - world_size = config['builder_config']['tensor_parallel'] - assert world_size == tensorrt_llm.mpi_world_size(), \ - f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - num_heads = config['builder_config']['num_heads'] // world_size - hidden_size = config['builder_config']['hidden_size'] // world_size - vocab_size = config['builder_config']['vocab_size'] - num_layers = config['builder_config']['num_layers'] - multi_query_mode = config['builder_config']['multi_query_mode'] - paged_kv_cache = config['builder_config'].get('paged_kv_cache', False) - tokens_per_block = config['builder_config'].get('tokens_per_block', 64) - use_prompt_tuning = config['builder_config'].get( - 'use_prompt_tuning', False) - # Add quant mode here + + num_kv_heads = config['builder_config'].get('num_kv_heads', num_heads) + paged_kv_cache = config['plugin_config']['paged_kv_cache'] + tokens_per_block = config['plugin_config']['tokens_per_block'] + use_custom_all_reduce = config['plugin_config'].get('use_custom_all_reduce', + False) + + quant_mode = QuantMode(config['builder_config']['quant_mode']) + if config['builder_config'].get('multi_query_mode', False): + tensorrt_llm.logger.warning( + "`multi_query_mode` config is deprecated. Please rebuild the engine." 
+ ) + num_kv_heads = 1 + num_kv_heads = (num_kv_heads + tp_size - 1) // tp_size + + model_config = tensorrt_llm.runtime.ModelConfig( + vocab_size=vocab_size, + num_layers=num_layers, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + hidden_size=hidden_size, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + gpt_attention_plugin=use_gpt_attention_plugin, + remove_input_padding=remove_input_padding, + use_custom_all_reduce=use_custom_all_reduce, + dtype=dtype, + quant_mode=quant_mode, + gather_all_token_logits=True) + self.hidden_size = hidden_size self.vocab_size = vocab_size # Device and rank runtime_rank = tensorrt_llm.mpi_rank() - runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank) - torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) + runtime_mapping = tensorrt_llm.Mapping(world_size, + runtime_rank, + tp_size=tp_size, + pp_size=pp_size) + torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) # Tokenization and sampling self.END_ID = model_cfg.get('eos_token_id', self.tokenizer.eos_token_id) @@ -92,36 +126,26 @@ def __init__( if self.PAD_ID == None: self.tokenizer.pad_token = self.tokenizer.eos_token self.tokenizer.pad_token_id = self.tokenizer.eos_token_id - + self.PAD_ID = self.tokenizer.eos_token_id + print('EOS TOKEN:', self.END_ID) print('Pad token:', self.PAD_ID) self.sampling_config = SamplingConfig(end_id=self.END_ID, pad_id=self.PAD_ID, - num_beams=1) + num_beams=1, + return_dict=True) # Load TRT engine - engine_name = get_engine_name(model_cfg['version'], dtype, world_size, + engine_name = get_engine_name(model_cfg['version'], dtype, tp_size, pp_size, runtime_rank) serialize_path = engine_dir / engine_name with open(serialize_path, 'rb') as f: engine_buffer = f.read() - # Initialize generation session for model - trt_model_config = ModelConfig( - num_heads=num_heads, - hidden_size=self.hidden_size, - vocab_size=self.vocab_size, - num_layers=num_layers, - gpt_attention_plugin=use_gpt_attention_plugin, - multi_query_mode=multi_query_mode, - remove_input_padding=remove_input_padding, - paged_kv_cache=paged_kv_cache, - tokens_per_block=tokens_per_block, - use_prompt_tuning=use_prompt_tuning, - gather_all_token_logits = True) self.decoder = tensorrt_llm.runtime.GenerationSession( - trt_model_config, engine_buffer, runtime_mapping) + model_config, engine_buffer, runtime_mapping) + def eval_forward(self, batch, outputs: Optional[Any] = None): # If the batch mode is generate, we will generate a requested number of tokens using the underlying diff --git a/scripts/eval/evaluate_trtllm_test.py b/scripts/eval/evaluate_trtllm_test.py index 832f50f0e4..a0cfc7ac7d 100644 --- a/scripts/eval/evaluate_trtllm_test.py +++ b/scripts/eval/evaluate_trtllm_test.py @@ -33,8 +33,8 @@ } } ], - 'icl_tasks': './eval/yamls/tasks_v0.2.yaml', - 'eval_gauntlet': './eval/yamls/eval_gauntlet_v0.2.yaml', + 'icl_tasks': './eval/yamls/mini_tasks_v0.2.yaml', + 'eval_gauntlet': './eval/yamls/mini_eval_gauntlet_v0.2.yaml', } trt_llama_config = { diff --git a/scripts/eval/yamls/mini_eval_gauntlet_v0.2.yaml b/scripts/eval/yamls/mini_eval_gauntlet_v0.2.yaml new file mode 100644 index 0000000000..b35c0f2873 --- /dev/null +++ b/scripts/eval/yamls/mini_eval_gauntlet_v0.2.yaml @@ -0,0 +1,13 @@ +eval_gauntlet: + weighting: EQUAL + subtract_random_baseline: true + rescale_accuracy: true + averages: + core_average: + - world_knowledge + categories: + - name: world_knowledge + benchmarks: + - name: jeopardy + num_fewshot: 3 + random_baseline: 0 diff --git 
a/scripts/eval/yamls/mini_tasks_v0.2.yaml b/scripts/eval/yamls/mini_tasks_v0.2.yaml new file mode 100644 index 0000000000..2366ccfb8d --- /dev/null +++ b/scripts/eval/yamls/mini_tasks_v0.2.yaml @@ -0,0 +1,8 @@ +icl_tasks: +- + label: jeopardy + dataset_uri: eval/local_data/world_knowledge/jeopardy_all.jsonl + num_fewshot: [3] + icl_task_type: language_modeling + continuation_delimiter: "\nAnswer: " # this separates questions from answers + has_categories: true From f01be4271c24b65070b2f73841cdd61d94892a4e Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Wed, 13 Dec 2023 10:25:34 -0800 Subject: [PATCH 07/21] update trt wrapper for new logit format --- .../models/inference_api_wrapper/trtllm.py | 28 +++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index b4490a6fb4..a862b6b026 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -175,9 +175,33 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): len(expected_cont_tokens)) output_dict = self.decoder.decode( - input_ids, input_lengths, self.sampling_config) + input_ids, input_lengths, self.sampling_config, return_dict=True) + + context_logits = output_dict['context_logits'] + context_logits = context_logits.squeeze() + output_logits_list = output_dict['generation_logits'] + for i in range(len(output_logits_list)): + output_logits_list[i] = output_logits_list[i].squeeze() + print("Context logits:", context_logits.shape) + print("Output logits list:", output_logits_list) + print("Output logits 0 shape:", output_logits_list[0].shape) + output_logits_tensor = torch.stack(output_logits_list) + print("Output logits stacked:", output_logits_tensor.shape) + combined_logits = torch.cat([context_logits, output_logits_tensor]) + print("Combined logits shape:", combined_logits.shape) + + padding = torch.nn.functional.one_hot( + torch.full( + (seqlen - combined_logits.shape[0],), + self.PAD_ID, + device=combined_logits.device + ), + num_classes=self.vocab_size) + padded_combined_logits = torch.cat([combined_logits, padding]) + + output_logits_batch.append(padded_combined_logits) - return output_dict['generation_logits'] + return torch.stack(output_logits_batch).to(batch['input_ids'].device) #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) """ From c5a79da7f7bea0c8239c38c570cce2efc6b923dc Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Wed, 13 Dec 2023 11:10:37 -0800 Subject: [PATCH 08/21] more padding and shape fixes --- .../models/inference_api_wrapper/trtllm.py | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index a862b6b026..103b3e862f 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -182,13 +182,17 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): output_logits_list = output_dict['generation_logits'] for i in range(len(output_logits_list)): output_logits_list[i] = output_logits_list[i].squeeze() - print("Context logits:", context_logits.shape) - print("Output logits list:", output_logits_list) - print("Output logits 0 shape:", output_logits_list[0].shape) - output_logits_tensor = torch.stack(output_logits_list) - print("Output logits stacked:", output_logits_tensor.shape) - 
combined_logits = torch.cat([context_logits, output_logits_tensor]) - print("Combined logits shape:", combined_logits.shape) + # print("Context logits:", context_logits.shape) + # print("Output logits list:", output_logits_list) + if len(output_logits_list) > 0: + # print("Output logits 0 shape:", output_logits_list[0].shape) + output_logits_tensor = torch.stack(output_logits_list) + # print("Output logits stacked:", output_logits_tensor.shape) + combined_logits = torch.cat([context_logits, output_logits_tensor]) + else: + combined_logits = context_logits + + # print("Combined logits shape:", combined_logits.shape) padding = torch.nn.functional.one_hot( torch.full( @@ -199,12 +203,14 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): num_classes=self.vocab_size) padded_combined_logits = torch.cat([combined_logits, padding]) - output_logits_batch.append(padded_combined_logits) + # print("Padded combined logits shape:", padded_combined_logits.shape) - return torch.stack(output_logits_batch).to(batch['input_ids'].device) + output_logits_batch.append(padded_combined_logits) + + return torch.stack(output_logits_batch).to(batch['input_ids'].device) - #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) - """ + #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) + """ # Old logits logic, back before TRT-LLM natively returned logits output_logits = torch.nn.functional.one_hot( torch.tensor(tokens[1:cont_idxs[0]], device='cuda'), @@ -226,5 +232,5 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): #print("Output logits shape:", output_logits.shape) output_logits_batch.append(output_logits) - return torch.stack(output_logits_batch).to(batch['input_ids'].device) - """ + return torch.stack(output_logits_batch).to(batch['input_ids'].device) + """ From 19abfe2fc44542f0f240e07f1a674c9518227165 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Wed, 13 Dec 2023 15:01:42 -0800 Subject: [PATCH 09/21] update run script --- scripts/eval/evaluate_trtllm_test.py | 46 +++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/scripts/eval/evaluate_trtllm_test.py b/scripts/eval/evaluate_trtllm_test.py index a0cfc7ac7d..682fb5295b 100644 --- a/scripts/eval/evaluate_trtllm_test.py +++ b/scripts/eval/evaluate_trtllm_test.py @@ -9,6 +9,7 @@ from omegaconf import OmegaConf as om from omegaconf import DictConfig +# GPT config is just for quick initial testing purposes trt_gpt_config = { 'run_name': 'trtllm-eval', 'seed': 0, @@ -52,7 +53,43 @@ { 'name': 'trtllm', 'version': 'llama', - 'engine_dir': '/workspace/tensorrt-llm-private/examples/llama/tmp/trt-models/llama-2-7b-chat/bf16/1-gpu', + 'engine_dir': '/workspace/tensorrt-llm-private/examples/llama/tmp/llama/7B-chat-quality-eval/trt_engines/int8_kv_cache_weight_only/1-gpu', + 'log_level': 'error', + 'eos_token_id': 2, + 'pad_token_id': 2 + }, + 'tokenizer': + { + 'name': '/workspace/llama-7b-chat-hf/' + } + } + ], + 'icl_tasks': './eval/yamls/mini_tasks_v0.2.yaml', + 'eval_gauntlet': './eval/yamls/mini_eval_gauntlet_v0.2.yaml', + 'loggers': { + 'wandb': { + 'project': 'nik-quant-eval' + } + } +} + + +trt_llama70b_config = { + 'run_name': 'trtllm-eval', + 'seed': 0, + 'max_seq_len': 2048, + 'device_eval_batch_size': 4, + 'precision': 'amp_bf16', + 'dist_timeout': 6000, + 'models': + [ + { + 'model_name': 'trtllm/llama', + 'model': + { + 'name': 'trtllm', + 'version': 'llama', + 'engine_dir': 
'/workspace/tensorrt-llm-private/examples/llama/tmp/llama/70B-chat-quality-eval/trt_engines/int8_kv_cache_weight_only/8-gpu', 'log_level': 'error', 'eos_token_id': 2, 'pad_token_id': 2 @@ -63,8 +100,8 @@ } } ], - 'icl_tasks': './eval/yamls/tasks_v0.2.yaml', - 'eval_gauntlet': './eval/yamls/eval_gauntlet_v0.2.yaml', + 'icl_tasks': './eval/yamls/mini_tasks_v0.2.yaml', + 'eval_gauntlet': './eval/yamls/mini_eval_gauntlet_v0.2.yaml', 'loggers': { 'wandb': { 'project': 'nik-quant-eval' @@ -72,7 +109,8 @@ } } -om_dict_config: DictConfig = om.create(trt_gpt_config) + +om_dict_config: DictConfig = om.create(trt_llama70b_config) print("OmegaConfig dictionary", om.to_yaml(om_dict_config)) run_evaluation(om_dict_config) From 3a3b334445389c038ac9a53dfcead48b67e33c4c Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Mon, 18 Dec 2023 23:00:53 -0800 Subject: [PATCH 10/21] update utils for multigpu trt models --- .../inference_api_wrapper/.trtllm.py.swo | Bin 16384 -> 0 bytes .../models/inference_api_wrapper/interface.py | 7 +- .../models/inference_api_wrapper/trtllm.py | 28 +- llmfoundry/utils/builders.py | 15 +- scripts/eval/eval_trt_multigpu.py | 407 ++++++++++++++++++ scripts/eval/evaluate_trtllm_test.py | 12 +- 6 files changed, 451 insertions(+), 18 deletions(-) delete mode 100644 llmfoundry/models/inference_api_wrapper/.trtllm.py.swo create mode 100644 scripts/eval/eval_trt_multigpu.py diff --git a/llmfoundry/models/inference_api_wrapper/.trtllm.py.swo b/llmfoundry/models/inference_api_wrapper/.trtllm.py.swo deleted file mode 100644 index 65e810630861a82ffa1e760d1eee34e247eae123..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeHO&yO5O6)s{YI1n2|;1q<)J3>8+W_s2e$8rYtu=1|OhF$L}y9tQ3snm4W%rx86 z-L9@)du*-X!i66PB#^jpK}g}(f#eT}xFjM-K>_6g9FVvnt{{aA@Kts9bkFQ~_6S1N z^0(7n_3G8DuU@^Xt}gdhUf$ZF*WFcu>!XBRdF#FiE(&HzAtY_ zu`j~36GVL>MdS&_lYrfmJV}J?D5)UoCgZd0vH6w(%fLJaE|OK0pA3MzzaYJc<(7fegnJ>{2KT^unRm6oCnSUzxyyD?*eZCzXZMu1V9g113m{_ z1fBwZ`yoQ!0e%V`1Dn7zz(1cP0u__;LBHM(y0%7d#gXVv@@tKRI&%_*dHsPq z5TK)OZ%N4f)>H%;g*h3g71o=0T46o6QrMsex<`TsTm^ANdF0cwuPd@uEWM%YWj=RP zp_ng@15Y$%7O7w)nB>tB{WL8fM>tsnPjLrHmNJ~NVG$1CR29yMp%qF*X)L8;`XtUy zUb=~6J)zH1bZl#)Syyygk*R)0%Lqq&bb7NAZ_o{6wrQfytD0v>J&rxzGZn(5>S4Mo zhk@^lsFbQbRmvK4-S_F;j7Q3JI342H(L@Wy zKJ*TqUKWHtc+H=kJI0$PH5d=diSLE6cXU=kXSM*0xpuYUE#U*!Et!(B{Lc(vxL+d5yl957}vr zPF1S8e6s9J?B!`Hq^h+<2fT%)TlDs%bftA6ak)O0aLL0^gf;4YHBJ`$OCp6Aq=;j+ zQVq>c<<90&CpiZ%9<;Z&cj)FZ53@Y_eK9{3Nc~u(G>R2Xvm}YpWGP1Rv0y=zWD2JF zz7B8|F-EE6K`Q9WP$o8|jAfJPYV_5>G%-&@HBZdTt+AAZ8TxiI;18X7^0hYWN}U11 z3s<012X>aLsLALEGf!kewM6UHSA*^gMUOcKUmtO-h)h)s)H0KW!dKHca>JPW&3@Gl z_X0JfaU!B-E(cQ8utgo|EK?4x6*aYdm?n&qGe3~DL!CU&)iqR|Q4?9Cdr;e)%)5t9 zmFe86tZC3~*hk?JrP-!g{U8)oqoy1^j0XcOIrMB8ienMh>p)(fsH-5F3w^V{dmRoQwT%%6PGrxQc5fDbYBH9Z(Fdbkwhq`6g|sa@aOs-?rS zyxmf$Oh(jsc*5{cCHSLf-iIe{-Vcu3kL8G~oQoDUBOG)j<-~6$d~FOCOswd;h>t{G zpi5IFi&DjXQM603;KafK<;^WDYS5e7RqMUj5%={XR-~PWIWt=INyp5u`H|p~xXM*@ z$|Q6Id6by+j7zsL>1U~;Jpl`f{zCUsj5ew;eHI)dN)#N{qDUSzP-J<(IuLgMQ&1d!>OVeNM^l3#C=aBiq`7PooZf*1DWOP7&A8NJi5i2 zjt0hdx}#5A7tS^Fo*$%skBX>G&50y;4HQtx}H;Z+Q7NgRuLGdjku{7!$E!`4x9*X^8+^+L1>H_9^As zSIpEVs;SKoMe}3tKIKxMlcyYKcIp1!#+TaLSkOnQXm4S8tAc(YWS6evRQ*u2SKSpz z-H4O139S^!xv-`u!Zx@BetEHP^53UjKFA z1z-*MTOspDEoU#wfMvikU>UFsSOzQumI2FvWxz6E8L$le?=YaZFUm{5d6)P4x7h8< zKbB$d$2)4qnbJGP*bxj=%6eRR!(~^+O>=kX None: 'mode', None) == 'icl_task': assert self.labels is not None metric.update(batch, outputs, self.labels) + for batch_idx, cont_idx in enumerate(batch['continuation_indices']): + cont_tok_pred = 
outputs[batch_idx].index_select(dim=0, index=cont_idx - 1).argmax(dim=-1) + cont_tok_targ = self.labels[batch_idx].index_select(dim=0, index=cont_idx - 1) + print("Ground Truth Label:", self.tokenizer.decode(self.labels[batch_idx].tolist()[:-1])) + print("Model output:", self.tokenizer.decode(cont_tok_pred)) else: raise NotImplementedError( 'Inference API wrapper only supports InContextLearningMetrics and mode=icl_task' diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index 103b3e862f..ccf5f728e6 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -118,7 +118,8 @@ def __init__( runtime_rank, tp_size=tp_size, pp_size=pp_size) - torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) + self.device_num = runtime_rank % runtime_mapping.gpus_per_node + torch.cuda.set_device(self.device_num) # Tokenization and sampling self.END_ID = model_cfg.get('eos_token_id', self.tokenizer.eos_token_id) @@ -144,7 +145,10 @@ def __init__( engine_buffer = f.read() self.decoder = tensorrt_llm.runtime.GenerationSession( - model_config, engine_buffer, runtime_mapping) + model_config, engine_buffer, runtime_mapping, debug_mode=False) + + print("!!! Initialized generation session for rank:", runtime_rank) + torch.cuda.synchronize() def eval_forward(self, batch, outputs: Optional[Any] = None): @@ -161,27 +165,35 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] prompt = tokens[:cont_idxs[0]] - input_ids = torch.tensor([prompt], dtype=torch.int, device='cuda') + + + input_ids = torch.tensor([prompt], dtype=torch.int, device="cuda:" + str(self.device_num)) input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, - device='cuda') - #print("prompt:", self.tokenizer.decode(prompt)) + device="cuda:"+ str(self.device_num)) + # print("prompt:", self.tokenizer.decode(prompt)) + # print("Input device:", input_ids.get_device()) #print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) #print("Input lengths:", input_lengths) #print(cont_idxs[0]) #print("Expected continuation tokens:", len(expected_cont_tokens)) - self.decoder.setup(input_lengths.size(0), + with torch.no_grad(): + self.decoder.setup(input_lengths.size(0), torch.max(input_lengths).item(), len(expected_cont_tokens)) - output_dict = self.decoder.decode( - input_ids, input_lengths, self.sampling_config, return_dict=True) + output_dict = self.decoder.decode( + input_ids, input_lengths, self.sampling_config, return_dict=True) + torch.cuda.synchronize() + context_logits = output_dict['context_logits'] context_logits = context_logits.squeeze() output_logits_list = output_dict['generation_logits'] + # print("Output ids:", output_dict['output_ids'][0][0][cont_idxs[0]:].tolist()) for i in range(len(output_logits_list)): output_logits_list[i] = output_logits_list[i].squeeze() + print("Output ids:", self.tokenizer.decode(output_dict['output_ids'][0][0].tolist())) # print("Context logits:", context_logits.shape) # print("Output logits list:", output_logits_list) if len(output_logits_list) > 0: diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index a672fbee55..58987fb71d 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -43,6 +43,11 @@ log = logging.getLogger(__name__) +try: + import tensorrt_llm + TENSORRT_LLM = True +except: + TENSORRT_LLM = False def build_evaluators( 
eval_loader_config: Optional[Union[DictConfig, ListConfig]], @@ -491,9 +496,13 @@ def _validate_cfg(icl_cfg: DictConfig): metric_names = list(icl_cfg.metric_names) # TODO: fix Composer bug when copying local paths and destination exists destination_path = f'{destination_dir}/{icl_cfg.label}-{num_fewshot}.jsonl' - if dist.get_local_rank() == 0 and os.path.exists(destination_path): - os.remove(destination_path) - dist.barrier() + if TENSORRT_LLM: + if tensorrt_llm.mpi_rank() == 0 and os.path.exists(destination_path): + os.remove(destination_path) + else: + if dist.get_global_rank() == 0 and os.path.exists(destination_path): + os.remove(destination_path) + dist.barrier() dataloaders = get_icl_task_dataloader( icl_cfg.icl_task_type, diff --git a/scripts/eval/eval_trt_multigpu.py b/scripts/eval/eval_trt_multigpu.py new file mode 100644 index 0000000000..809b1a53e3 --- /dev/null +++ b/scripts/eval/eval_trt_multigpu.py @@ -0,0 +1,407 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import logging +import os +import sys +import time +import warnings +from typing import Any, Dict, List, Optional, Tuple, Union + +import pandas as pd +import torch +from composer.loggers.logger_destination import LoggerDestination +from composer.models.base import ComposerModel +from composer.trainer import Trainer +from composer.utils import dist, get_device, reproducibility +from omegaconf import DictConfig, ListConfig +from omegaconf import OmegaConf as om +from transformers import (AutoModelForCausalLM, PreTrainedTokenizerBase, + T5ForConditionalGeneration) + +# from llmfoundry.models import MPTForCausalLM +from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY +from llmfoundry.utils.builders import (add_metrics_to_eval_loaders, + build_evaluators, build_logger, + build_tokenizer) +from llmfoundry.utils.config_utils import pop_config, process_init_device + + + +def load_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, + fsdp_config: Optional[Dict], num_retries: int) -> ComposerModel: + init_context = process_init_device(model_cfg, fsdp_config) + + retries = 0 + composer_model = None + with init_context: + while retries < num_retries and composer_model is None: + try: + composer_model = COMPOSER_MODEL_REGISTRY[model_cfg.name]( + model_cfg, tokenizer) + except Exception as e: + retries += 1 + if retries >= num_retries: + raise e + else: + print( + f'Got exception {str(e)} while loading model {model_cfg.name}. 
{num_retries-retries} retries remaining' + ) + + assert composer_model is not None + return composer_model + + +def evaluate_model( + model_cfg: DictConfig, + dist_timeout: Union[float, int], + run_name: str, + seed: int, + icl_tasks: Union[str, ListConfig], + max_seq_len: int, + device_eval_batch_size: int, + eval_gauntlet_config: Optional[Union[str, DictConfig]], + eval_loader_config: Optional[Union[DictConfig, ListConfig]], + fsdp_config: Optional[Dict], + num_retries: int, + loggers_cfg: Dict[str, Any], + python_log_level: Optional[str], + precision: str, + eval_gauntlet_df: Optional[pd.DataFrame], + icl_subset_num_batches: Optional[int], +): + + print(f'Evaluating model: {model_cfg.model_name}', flush=True) + # Build tokenizer and model + tokenizer_cfg: Dict[str, + Any] = om.to_container(model_cfg.tokenizer, + resolve=True) # type: ignore + tokenizer_name = tokenizer_cfg['name'] + tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) + + evaluators, logger_keys, eval_gauntlet_callback = build_evaluators( + eval_loader_config, + icl_tasks, + eval_gauntlet_config, + tokenizer=tokenizer, + device_eval_batch_size=device_eval_batch_size, + icl_seq_len=max_seq_len, + icl_subset_num_batches=icl_subset_num_batches, + ) + + callbacks = [] + if eval_gauntlet_callback is not None: + callbacks.append(eval_gauntlet_callback) + + loggers: List[LoggerDestination] = [ + build_logger(name, logger_cfg) + for name, logger_cfg in loggers_cfg.items() + ] + + if fsdp_config and model_cfg.model.get('load_in_8bit', False): + raise ValueError( + 'The FSDP config block is not supported when loading ' + + 'Hugging Face models in 8bit.') + + if hasattr(model_cfg.model, 'pretrained_lora_id_or_path'): + composer_model = load_peft_model(model_cfg.model, tokenizer, + num_retries) + else: + composer_model = load_model(model_cfg.model, tokenizer, fsdp_config, + num_retries) + + # Now add the eval metrics + if eval_loader_config is not None: + train_metrics = composer_model.get_metrics(is_train=True) + evaluators = add_metrics_to_eval_loaders(evaluators, train_metrics) + + if eval_gauntlet_df is None and eval_gauntlet_callback is not None: + eval_gauntlet_df = pd.DataFrame( + columns=['model_name'] + + [avg for avg in eval_gauntlet_callback.averages] + + [t.name for t in eval_gauntlet_callback.categories]) + + load_path = model_cfg.get('load_path', None) + if model_cfg.model.name == 'mpt_causal_lm' and load_path is None: + raise ValueError( + 'MPT causal LMs require a load_path to the checkpoint for model evaluation.' + + + ' Please check your yaml and the model_cfg to ensure that load_path is set.' 
+ ) + + assert composer_model is not None + + trainer = Trainer( + run_name=run_name, + seed=seed, + model=composer_model, + callbacks=callbacks, + loggers=loggers, + precision=precision, + fsdp_config=fsdp_config, + load_path=load_path, + load_weights_only=True, + progress_bar=False, + log_to_console=True, + dist_timeout=dist_timeout, + python_log_level=python_log_level, + ) + + if torch.cuda.is_available(): + torch.cuda.synchronize() + a = time.time() + trainer.eval(eval_dataloader=evaluators) + if torch.cuda.is_available(): + torch.cuda.synchronize() + b = time.time() + print(f'Ran {model_cfg.model_name} eval in: {b-a} seconds') + return (trainer, logger_keys, eval_gauntlet_callback, eval_gauntlet_df) + + +def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: + om.resolve(cfg) + model_configs: ListConfig = pop_config(cfg, 'models', must_exist=True) + eval_gauntlet_config: Optional[Union[str, DictConfig]] = pop_config( + cfg, 'eval_gauntlet', must_exist=False, default_value=None) + if eval_gauntlet_config is None: + eval_gauntlet_config = pop_config(cfg, + 'model_gauntlet', + must_exist=False, + default_value=None) + if eval_gauntlet_config: + print( + 'Use of the key `model_gauntlet` is deprecated, please use the key `eval_gauntlet`' + ) + + fsdp_dict_cfg: Optional[DictConfig] = pop_config(cfg, + 'fsdp_config', + must_exist=False, + default_value=None) + fsdp_config: Optional[Dict] = om.to_container( + fsdp_dict_cfg, + resolve=True) if fsdp_dict_cfg is not None else None # type: ignore + assert isinstance(fsdp_config, Dict) or fsdp_config is None + + # Mandatory Evaluation Parameters + icl_tasks: Union[str, ListConfig] = pop_config(cfg, + 'icl_tasks', + must_exist=True) + max_seq_len: int = pop_config(cfg, 'max_seq_len', must_exist=True) + device_eval_batch_size: int = pop_config(cfg, + 'device_eval_batch_size', + must_exist=True) + precision: str = pop_config(cfg, + 'precision', + must_exist=False, + default_value=None) + python_log_level: Optional[str] = pop_config(cfg, + 'python_log_level', + must_exist=False, + default_value='debug') + + # Optional Evaluation Parameters with default values + eval_loader_config: Optional[Union[DictConfig, ListConfig]] = pop_config( + cfg, 'eval_loader', must_exist=False, default_value=None) + seed: int = pop_config(cfg, 'seed', must_exist=False, default_value=17) + dist_timeout: Union[float, int] = pop_config(cfg, + 'dist_timeout', + must_exist=False, + default_value=600.0) + default_run_name: str = os.environ.get('RUN_NAME', 'llm') + run_name: str = pop_config(cfg, + 'run_name', + must_exist=False, + default_value=default_run_name) + num_retries: int = pop_config(cfg, + 'num_retries', + must_exist=False, + default_value=3) + loggers_cfg: Dict[str, Any] = pop_config(cfg, + 'loggers', + must_exist=False, + default_value={}) + icl_subset_num_batches: int = pop_config(cfg, + 'icl_subset_num_batches', + must_exist=False, + default_value=None) + # Pop out interpolation variables. + pop_config(cfg, 'model_name_or_path', must_exist=False, default_value=None) + + # Warn for unused parameters + for key in cfg: + warnings.warn( + f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary.' 
+ ) + + reproducibility.seed_all(seed) + # dist.initialize_dist(get_device(None), timeout=dist_timeout) + + if python_log_level is not None: + logging.basicConfig( + # Example of format string + # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here + format= + f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' + ) + logging.getLogger('llmfoundry').setLevel(python_log_level.upper()) + + eval_gauntlet_df = None + models_df = None + composite_scores = None + trainers = [] + for model_cfg in model_configs: + (trainer, logger_keys, eval_gauntlet_callback, + eval_gauntlet_df) = evaluate_model( + model_cfg=model_cfg, + dist_timeout=dist_timeout, + run_name=run_name, + seed=seed, + icl_tasks=icl_tasks, + max_seq_len=max_seq_len, + device_eval_batch_size=device_eval_batch_size, + eval_gauntlet_config=eval_gauntlet_config, + eval_loader_config=eval_loader_config, + fsdp_config=fsdp_config, + num_retries=num_retries, + loggers_cfg=loggers_cfg, + python_log_level=python_log_level, + precision=precision, + eval_gauntlet_df=eval_gauntlet_df, + icl_subset_num_batches=icl_subset_num_batches) + trainers.append(trainer) + + if eval_gauntlet_callback is not None: + composite_scores = eval_gauntlet_callback.eval_after_all( + trainer.state, trainer.logger) + + benchmark_to_taxonomy = {} + if eval_gauntlet_callback is not None: + for t in eval_gauntlet_callback.categories: + for b in t.benchmarks: + benchmark_to_taxonomy[b.name] = t.name + + model_results = calculate_markdown_results(logger_keys, trainer, + benchmark_to_taxonomy, + model_cfg.model_name) + + if models_df is None: + models_df = model_results + else: + models_df = pd.concat([models_df, model_results], ignore_index=True) + + if eval_gauntlet_df is not None and eval_gauntlet_callback is not None: + assert composite_scores is not None + row = {'model_name': model_cfg['model_name']} + row.update( + {k.split('/')[-1]: v for k, v in composite_scores.items()}) + eval_gauntlet_df = pd.concat( + [eval_gauntlet_df, pd.DataFrame([row])], ignore_index=True) + + print(f'Printing gauntlet results for all models') + + print( + eval_gauntlet_df.sort_values( + list(eval_gauntlet_callback.averages.keys())[0], + ascending=False).to_markdown(index=False)) + print(f'Printing complete results for all models') + assert models_df is not None + print(models_df.to_markdown(index=False)) + + return trainers, eval_gauntlet_df + + +def calculate_markdown_results(logger_keys: List[str], trainer: Trainer, + benchmark_to_taxonomy: Dict[str, str], + model_name: str): + results = {} + + for key in logger_keys: + # dl_name is either 2-tuple (benchmark_name, num_fewshot) + # or 3-tuple (benchmark_name, num_fewshot, subcategory) + dl_name, metric_name = key.split('/')[1:-1], key.split('/')[-1] + if 'Accuracy' not in metric_name: + continue + + metric = trainer.state.eval_metrics.get('/'.join(dl_name), + {}).get(metric_name, None) + + if metric is None: + continue + if dl_name[1] not in results: + results[dl_name[1]] = {} + + if dl_name[0] not in results[dl_name[1]]: + results[dl_name[1]][dl_name[0]] = {} + + if metric_name not in results[dl_name[1]][dl_name[0]]: + results[dl_name[1]][dl_name[0]][metric_name] = [] + + results[dl_name[1]][dl_name[0]][metric_name].append({ + 'val': metric.compute(), + 'subcat': dl_name[-1] if len(dl_name) == 3 else 'no_subcat' + }) + + df = pd.DataFrame(columns=[ + 'Category', 'Benchmark', 'Subtask', 'Accuracy', 'Number few shot', + 'Model' + ]) + + for num_shot in results: 
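    # For orientation, the nested structure walked below, assuming logger keys
    # shaped roughly like 'metrics/<benchmark>/<num_fewshot>/[<subcategory>/]<MetricName>'
    # (benchmark names and values here are illustrative, not from a real run):
    #
    #   results = {
    #       '3': {                                    # num_fewshot segment
    #           'jeopardy': {
    #               'InContextLearningLMAccuracy': [
    #                   {'val': 0.41, 'subcat': 'science'},
    #                   {'val': 0.38, 'subcat': 'history'},
    #               ],
    #           },
    #       },
    #   }
    #
    # A single-entry list becomes one plain row; a longer list produces an
    # 'Average' row plus one row per subcategory, as handled below.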
+ for benchmark in results[num_shot]: + for metric in results[num_shot][benchmark]: + subscores = results[num_shot][benchmark][metric] + if len(subscores) == 1: + row = { + 'Category': benchmark_to_taxonomy.get(benchmark, ''), + 'Benchmark': benchmark, + 'Subtask': None, + 'Accuracy': subscores[0]['val'], + 'Number few shot': num_shot, + 'Model': model_name + } + df = pd.concat([df, pd.DataFrame([row])], ignore_index=True) + else: + row = { + 'Category': + benchmark_to_taxonomy.get(benchmark, ''), + 'Benchmark': + benchmark, + 'Subtask': + 'Average', + 'Accuracy': + sum(s['val'] for s in subscores) / len(subscores), + 'Number few shot': + num_shot, + 'Model': + model_name + } + df = pd.concat([df, pd.DataFrame([row])], ignore_index=True) + for sub in subscores: + row = { + 'Category': + benchmark_to_taxonomy.get(benchmark, ''), + 'Benchmark': + None, + 'Subtask': + sub['subcat'], + 'Accuracy': + sub['val'], + 'Number few shot': + num_shot, + 'Model': + model_name + } + df = pd.concat([df, pd.DataFrame([row])], + ignore_index=True) + return df + + +if __name__ == '__main__': + yaml_path, args_list = sys.argv[1], sys.argv[2:] + with open(yaml_path) as f: + yaml_cfg = om.load(f) + cli_cfg = om.from_cli(args_list) + cfg = om.merge(yaml_cfg, cli_cfg) + assert isinstance(cfg, DictConfig) + main(cfg) diff --git a/scripts/eval/evaluate_trtllm_test.py b/scripts/eval/evaluate_trtllm_test.py index 682fb5295b..8b1ab7b8ef 100644 --- a/scripts/eval/evaluate_trtllm_test.py +++ b/scripts/eval/evaluate_trtllm_test.py @@ -5,7 +5,7 @@ # All this can be written in YAML form. -from eval import main as run_evaluation +from eval_trt_multigpu import main as run_evaluation from omegaconf import OmegaConf as om from omegaconf import DictConfig @@ -38,12 +38,12 @@ 'eval_gauntlet': './eval/yamls/mini_eval_gauntlet_v0.2.yaml', } -trt_llama_config = { +trt_llama7b_config = { 'run_name': 'trtllm-eval', 'seed': 0, 'max_seq_len': 2048, - 'device_eval_batch_size': 4, - 'precision': 'amp_bf16', + 'device_eval_batch_size': 32, + 'precision': 'amp_fp16', 'dist_timeout': 6000, 'models': [ @@ -79,7 +79,7 @@ 'seed': 0, 'max_seq_len': 2048, 'device_eval_batch_size': 4, - 'precision': 'amp_bf16', + 'precision': 'amp_fp16', 'dist_timeout': 6000, 'models': [ @@ -89,7 +89,7 @@ { 'name': 'trtllm', 'version': 'llama', - 'engine_dir': '/workspace/tensorrt-llm-private/examples/llama/tmp/llama/70B-chat-quality-eval/trt_engines/int8_kv_cache_weight_only/8-gpu', + 'engine_dir': '/workspace/tensorrt-llm-private/examples/llama/tmp/llama/70B-chat-quality-eval/trt_engines/fp8/8-gpu', 'log_level': 'error', 'eos_token_id': 2, 'pad_token_id': 2 From f646eddf762333cdfb91baedac15b72a5cbe4425 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Fri, 22 Dec 2023 02:45:05 -0800 Subject: [PATCH 11/21] Metric device updates --- .../models/inference_api_wrapper/interface.py | 9 ++---- .../models/inference_api_wrapper/trtllm.py | 29 ++++++++++++++++--- ...uate_trtllm_test.py => run_trtllm_eval.py} | 16 ++++++---- 3 files changed, 38 insertions(+), 16 deletions(-) rename scripts/eval/{evaluate_trtllm_test.py => run_trtllm_eval.py} (84%) diff --git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py index 6ebc472f91..78a63d202a 100644 --- a/llmfoundry/models/inference_api_wrapper/interface.py +++ b/llmfoundry/models/inference_api_wrapper/interface.py @@ -28,7 +28,7 @@ def __init__(self, model_cfg: Dict, tokenizer: AutoTokenizer): eval_metrics = [ LanguageCrossEntropy(), LanguagePerplexity(), - 
InContextLearningLMAccuracy() + InContextLearningLMAccuracy(), InContextLearningMultipleChoiceAccuracy(), InContextLearningQAAccuracy(), InContextLearningLMExpectedCalibrationError(), @@ -94,18 +94,15 @@ def eval_forward(self, batch: Batch, outputs: Optional[Any] = None): def update_metric(self, batch: Any, outputs: Any, metric: Metric) -> None: batch = self.rebatch(batch) + metric = metric.to(device=outputs.device) self.labels = batch.pop('labels') self.labels[:, :-1] = self.labels[:, 1:].clone() self.labels[:, -1] = -100 + # print("Devices:", outputs.get_device(), self.labels.get_device(), metric.device) if isinstance(metric, InContextLearningMetric) and batch.get( 'mode', None) == 'icl_task': assert self.labels is not None metric.update(batch, outputs, self.labels) - for batch_idx, cont_idx in enumerate(batch['continuation_indices']): - cont_tok_pred = outputs[batch_idx].index_select(dim=0, index=cont_idx - 1).argmax(dim=-1) - cont_tok_targ = self.labels[batch_idx].index_select(dim=0, index=cont_idx - 1) - print("Ground Truth Label:", self.tokenizer.decode(self.labels[batch_idx].tolist()[:-1])) - print("Model output:", self.tokenizer.decode(cont_tok_pred)) else: raise NotImplementedError( 'Inference API wrapper only supports InContextLearningMetrics and mode=icl_task' diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index ccf5f728e6..4e77184a24 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -119,6 +119,7 @@ def __init__( tp_size=tp_size, pp_size=pp_size) self.device_num = runtime_rank % runtime_mapping.gpus_per_node + self.device = torch.device('cuda:' + str(self.device_num)) torch.cuda.set_device(self.device_num) # Tokenization and sampling @@ -149,6 +150,26 @@ def __init__( print("!!! Initialized generation session for rank:", runtime_rank) torch.cuda.synchronize() + + # Move metrics to proper device (doesn't help, have to do this in update_metric()) + # for key, value in self.eval_metrics.items(): + # self.eval_metrics[key] = value.to(device=self.device) + # print("Eval metric now at:", self.eval_metrics[key].device) + + def rebatch(self, batch): + """ + Move tensors in batch to the correct GPU. 
+ """ + if isinstance(batch, dict): + for key, value in batch.items(): + batch[key] = self.rebatch(value) + return batch + elif isinstance(batch, torch.Tensor): + return batch.to(device=self.device) + elif isinstance(batch, list): + return [self.rebatch(b) for b in batch] + + return batch def eval_forward(self, batch, outputs: Optional[Any] = None): @@ -167,10 +188,10 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): prompt = tokens[:cont_idxs[0]] - input_ids = torch.tensor([prompt], dtype=torch.int, device="cuda:" + str(self.device_num)) + input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, - device="cuda:"+ str(self.device_num)) + device=self.device) # print("prompt:", self.tokenizer.decode(prompt)) # print("Input device:", input_ids.get_device()) #print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) @@ -193,7 +214,7 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): # print("Output ids:", output_dict['output_ids'][0][0][cont_idxs[0]:].tolist()) for i in range(len(output_logits_list)): output_logits_list[i] = output_logits_list[i].squeeze() - print("Output ids:", self.tokenizer.decode(output_dict['output_ids'][0][0].tolist())) + # print("Output ids:", self.tokenizer.decode(output_dict['output_ids'][0][0].tolist())) # print("Context logits:", context_logits.shape) # print("Output logits list:", output_logits_list) if len(output_logits_list) > 0: @@ -219,7 +240,7 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): output_logits_batch.append(padded_combined_logits) - return torch.stack(output_logits_batch).to(batch['input_ids'].device) + return torch.stack(output_logits_batch).to(self.device) #(batch['input_ids'].device) #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) """ diff --git a/scripts/eval/evaluate_trtllm_test.py b/scripts/eval/run_trtllm_eval.py similarity index 84% rename from scripts/eval/evaluate_trtllm_test.py rename to scripts/eval/run_trtllm_eval.py index 8b1ab7b8ef..4bce4d4170 100644 --- a/scripts/eval/evaluate_trtllm_test.py +++ b/scripts/eval/run_trtllm_eval.py @@ -9,6 +9,10 @@ from omegaconf import OmegaConf as om from omegaconf import DictConfig + +trt_folder_path = '/workspace/TensorRT-LLM/' + + # GPT config is just for quick initial testing purposes trt_gpt_config = { 'run_name': 'trtllm-eval', @@ -25,7 +29,7 @@ { 'name': 'trtllm', 'version': 'gpt', - 'engine_dir': '/workspace/tensorrt-llm-private/examples/gpt/engine_outputs', + 'engine_dir': trt_folder_path + 'examples/gpt/engine_outputs', 'log_level': 'error' }, 'tokenizer': @@ -53,7 +57,7 @@ { 'name': 'trtllm', 'version': 'llama', - 'engine_dir': '/workspace/tensorrt-llm-private/examples/llama/tmp/llama/7B-chat-quality-eval/trt_engines/int8_kv_cache_weight_only/1-gpu', + 'engine_dir': trt_folder_path + 'examples/llama/tmp/llama/7B-chat-quality-eval/trt_engines/int8_kv_cache_weight_only/1-gpu', 'log_level': 'error', 'eos_token_id': 2, 'pad_token_id': 2 @@ -78,7 +82,7 @@ 'run_name': 'trtllm-eval', 'seed': 0, 'max_seq_len': 2048, - 'device_eval_batch_size': 4, + 'device_eval_batch_size': 8, 'precision': 'amp_fp16', 'dist_timeout': 6000, 'models': @@ -89,7 +93,7 @@ { 'name': 'trtllm', 'version': 'llama', - 'engine_dir': '/workspace/tensorrt-llm-private/examples/llama/tmp/llama/70B-chat-quality-eval/trt_engines/fp8/8-gpu', + 'engine_dir': trt_folder_path + 'examples/llama/tmp/llama/70B-chat-quality-eval/trt_engines/fp8/8-gpu', 
'log_level': 'error', 'eos_token_id': 2, 'pad_token_id': 2 @@ -100,8 +104,8 @@ } } ], - 'icl_tasks': './eval/yamls/mini_tasks_v0.2.yaml', - 'eval_gauntlet': './eval/yamls/mini_eval_gauntlet_v0.2.yaml', + 'icl_tasks': './eval/yamls/tasks_v0.2.yaml', + 'eval_gauntlet': './eval/yamls/eval_gauntlet_v0.2.yaml', 'loggers': { 'wandb': { 'project': 'nik-quant-eval' From dfa30b813d3e4b2884147358d1f1df0bfc2f7ae2 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Wed, 3 Jan 2024 23:38:32 -0800 Subject: [PATCH 12/21] Update interface to support QA tasks --- .../inference_api_wrapper/.trtllm.py.swp | Bin 20480 -> 0 bytes .../models/inference_api_wrapper/interface.py | 10 +- .../models/inference_api_wrapper/trtllm.py | 95 ++++++++---- scripts/eval/run_trtllm_eval.py | 128 ++++++++-------- scripts/eval/yamls/lm_tasks_v0.2.yaml | 59 ++++++++ scripts/eval/yamls/mini_tasks_v0.2.yaml | 23 ++- scripts/eval/yamls/qa_mc_tasks_v0.2.yaml | 138 ++++++++++++++++++ scripts/eval/yamls/tasks_v0.2.yaml | 61 ++++---- 8 files changed, 383 insertions(+), 131 deletions(-) delete mode 100644 llmfoundry/models/inference_api_wrapper/.trtllm.py.swp create mode 100644 scripts/eval/yamls/lm_tasks_v0.2.yaml create mode 100644 scripts/eval/yamls/qa_mc_tasks_v0.2.yaml diff --git a/llmfoundry/models/inference_api_wrapper/.trtllm.py.swp b/llmfoundry/models/inference_api_wrapper/.trtllm.py.swp deleted file mode 100644 index f7844d9b9842f3444bea914762c6b151c28895b9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20480 zcmeHPUyLM08E@o|s2m!=2ZNa6R_Pg?>HWh&j?sx;ZWmo{?`~!H03Ft*XS!yl-Jb5g z?wZ`A+>-(TP_rUT=cGOxhaDB)yzWve;f7-b2)*pG-7;zY;^!T7_ z{_cC+x$fP(*}bdVZ62BDPP5xT%A3y7e$VZ3=dOi2k2L2DV|F`?H zaN}M2o&Ow#F87mJFX;1_2R&{(k!NqmP89KYHjPshwW8gF^r_xg3{(tEFfdO0&7+6T z+_^B1nP?a@hi_*$-*jf8P<2}|P%%(3P%%(3P%%(3P%%(3P%-fT%s?7lZF~(Kd{5R% zC;NNV#NTVO`xDt;cjEok+5Pp|-{&XZXSda#ih+uOih+uOih+uOih+uOih+uOih+uO zih+uO{{aIIKIFfFcJH7A0L1_E@Bd$a&@g@gJO`Ww&H(Gc7l2vdM&J-&0DrjAFkS$D z2y}oi11;bl;0W+h;H^W3@iOo%@Hp@-;G4h@xDS{G-nhXqeh&N$_$lxta2mK4xCXcy z_|*pt<8|O=;CbLXfCqGeHt-qX5O4$V%l8|`FM#g@Tfh=w&-G0OJYZG2k?CBk=lrp(ntAH?A{`r-6roTY%TDHHG4R{WC z6u1+38;2x+27U~j1sGreuYmh=z%H-~m;jB(KX3^1Y<>`f;@K6&UXa$zRmWv%xXFW- z*h@!sNFLTJ+ZFVXQc&m6{A{gRCo{HV;53B*lP?Yt} z2V;6~)Zhq#l8n6q%U#?nu>4jDoMG!^4c<8?^}>KTfy)wS6!~5-P_~nXac^k##;#)} zJhffE<@Ihrz;wSO6o=* z%Ax1FJjhFx?vyZw%N!@%j^c0>rFJ?Fzy^lnUdK;LJnSigW_w}K_XZuaJNA4R%*lIm zL)c59iLen#ZPyQbn+H`iCo4diyN@VlB4@x|dvnX~IlUo2P?}zRZ^uSsKlSXhV;=9? zWbyk5?3j~<&5fd)!FXi*&MuFWeI=-3rMhbT_N}XmmFkv8b>GURa^?n`on^{owLT2F z;~v}vQR+3jt1znqkMA#H}y+)%Y@^$T^g(AP;4 z9;cZgt{E%@(sVA|UfOOZdu*cKDMV@@ky>N7QJ4&Yv#6m*j7wxx(t z8S*gGitN*@L2~JuVQV~PTMi~)AAT(JOuCno-U-sanuLMnhmKq8t6|vo(jf~Y9@Jzx zkeUt~%#6)Cb0AewQ+f|!To}3SdNG@2rhI2n3u-V*McQl?Vr#|9+A-Bjb35THUf?MP6##szyU@ zxoDSCDi3Gd+qAvDjTBW{`%o=OX-c@+vQ<|c75jn~TaEcTEHU72wsP~hQRq+*(PE2U zFFh_66!n@`I(g^vJ}VS8jGm9FPxN5E&~(Y>No_gLsXey3TzU@8WWQscj^lut=P%g! 
zCynqgT3gRwu;e(HRzHkKPAa)*FfqeHN0cw^Vo4`uuqCpyu?w6LmlawpN#m?m_B*TE zV%W}NYk>OVCp4O2_mZiWM{$<#&}N1wv*#rKXvSYKsIa6Wj-ROm_JIy~tmijlF{ za>_>IBxPL=4YKjV@G{IcmSXwLIE0tTI4pS)=|Gl8(np3t40q>Ss# zxY$~@UzGPkKvt|Q!bsy?sB7D>aNCA{9U@sbH#jdtw5^i6EirD1H8r^#Pg5TEks6*W zb_sjU#HKKpgbb246`77xrjyhm9Pxoe{wFV~CX|u)kM_m6>|NUR=4FosyA5_aBFF6b zQXcs~-|OZj$ZzFx$i3m?S4_gzBi3->u`fH9y#IK4!1W_;S7k6Zkvg^%sH10S8zBt^(dfZ2l?WA>cva!vMwTzXTiw zZUNpxEdD9rNgx7F01WsI4k+FPo&`F}o$I(;bx%Tc=GR^Pi|iw{pBj&Q48Jyl#0ZAJrztM$-+^wsEA_vMA79o13VW55!WY zAVu|3FUX?2Qc)U~%j`rZu53~|p?Q1DG!i70MuHm9duWu|(CfeknCqmDs)Q#}K`8^b z^<17bm}y=l7b>$$E^T@8{lR0U6xJt@))Q4|$}bVpYs)dGn4DHqqo<-%U3nahEkH!oZn!dV)H78 zh3G2EqI&VWSW7eorQH_IOfQtJjKm1_LK+DcC|!SQdFqvIkx7$$z98_7T{XvUxfF3G z-!5BdD7DGgpYx(A`)oCEq5uyQ93cPwIu6cV$c{k9XkMoz&DTkebStkjNTWel<1TzG?RBd>@>*LWJNP!Cs2>QMeaDHXi%CxD{17_ik*7v<$9?>du2foFmT;%oExS}QrgDB=7Ra?eUwlBV z@q`j;^c~Rv(McqrQLI*UIs2x8Gzt={rikA?nQZMnPn2~~1xcdF4HOA%n7(Am>VaNd zPdIDqw|dcOSu^cv5A|x3S!nIa2YT^oNj0L+(_ZLRRkm7N{nU%7q_oK$w_dK4e None: batch = self.rebatch(batch) - metric = metric.to(device=outputs.device) + metric = metric.to(device=self.device) + self.labels = batch.pop('labels') + if isinstance(metric, InContextLearningQAAccuracy): + # Labels are strings, not tokens, for QA tasks. + metric.update(outputs, self.labels, batch) + return + self.labels[:, :-1] = self.labels[:, 1:].clone() self.labels[:, -1] = -100 - # print("Devices:", outputs.get_device(), self.labels.get_device(), metric.device) if isinstance(metric, InContextLearningMetric) and batch.get( 'mode', None) == 'icl_task': assert self.labels is not None diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index 4e77184a24..bcd158f731 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -111,6 +111,7 @@ def __init__( self.hidden_size = hidden_size self.vocab_size = vocab_size + self.max_seq_len = 2048 #TODO: Do Not hardcode # Device and rank runtime_rank = tensorrt_llm.mpi_rank() @@ -177,16 +178,80 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): # model's generate function. 
Strings will be returned from eval_forward output_logits_batch = [] batch = self.rebatch(batch) + + # Question-answering tasks + if 'continuation_indices' not in batch: + # batch['continuation_indices'] = torch.tensor([], dtype=torch.int, device=self.device) + # print("Batch:", batch) + output_strs = [] + + for tokens in batch['input_ids']: + seqlen = tokens.shape[0] + prompt = tokens.tolist() + eos_occurence = (tokens == 2).nonzero(as_tuple=True)[0] + end_prompt_idx = len(prompt) + if eos_occurence.shape[0] >= 1: + end_prompt_idx = eos_occurence[0] + prompt = prompt[:end_prompt_idx] + input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) + input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device=self.device) + #print("prompt:", self.tokenizer.decode(prompt)) + #print("promp tokens:", prompt) + #print("Input lengths:", input_lengths) + #print("Generation Length:", batch['generation_length']) + + with torch.no_grad(): + self.decoder.setup(input_lengths.size(0), + torch.max(input_lengths).item(), + batch['generation_length']) + + output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) + output_logits_list = output_dict['generation_logits'] + + for i in range(len(output_logits_list)): + output_logits_list[i] = output_logits_list[i].squeeze() + + #print("Shape:", output_dict['output_ids'].shape) + decoded_str = self.tokenizer.decode(output_dict['output_ids'][0][0].tolist()[len(prompt):]) + output_strs.append(decoded_str) + #print("Decoded OUTPUT:", decoded_str) + #print("-------------") + # print("Output ids:", output_dict['output_ids'][0][0].tolist()) + """ + context_logits = output_dict['context_logits'].squeeze() + output_logits_tensor = torch.stack(output_logits_list) + print("Context logits shape:", context_logits.shape) + print("Output logits shape:", output_logits_tensor.shape) + combined_logits = torch.cat([context_logits, output_logits_tensor]) + + padding = torch.nn.functional.one_hot( + torch.full( + (self.max_seq_len - combined_logits.shape[0],), + self.PAD_ID, + device=self.device + ), + num_classes=self.vocab_size) + padded_combined_logits = torch.cat([combined_logits, padding]) + + output_logits_batch.append(padded_combined_logits) + """ + return output_strs + + # Language modeling and multiple choice tasks for tokens, cont_idxs in zip(batch['input_ids'], batch['continuation_indices']): seqlen = tokens.shape[0] tokens = tokens.tolist() + # print("Continuation indices:", cont_idxs) cont_idxs = cont_idxs.tolist() - expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] + if len(cont_idxs) > 1: + expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] + prompt = tokens[:cont_idxs[0]] + else: + prompt = tokens + expected_cont_tokens = [tokens[-1]] - prompt = tokens[:cont_idxs[0]] - input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) input_lengths = torch.tensor([input_ids.size(1)], @@ -243,27 +308,3 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): return torch.stack(output_logits_batch).to(self.device) #(batch['input_ids'].device) #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) - """ - # Old logits logic, back before TRT-LLM natively returned logits - output_logits = torch.nn.functional.one_hot( - torch.tensor(tokens[1:cont_idxs[0]], device='cuda'), - num_classes=self.vocab_size) - - for i in range(len(output_logits_list)): - output_logits_list[i] = output_logits_list[i].squeeze() - - next_logit_tensor = 
torch.stack(output_logits_list) - output_logits = torch.cat([output_logits, next_logit_tensor]) - #print(output_logits.shape) - #print(output_ids[0][0][cont_idxs[0]:].tolist()) - padding = torch.nn.functional.one_hot(torch.full( - (seqlen - output_logits.shape[0],), - self.PAD_ID, - device=output_logits.device), - num_classes=self.vocab_size) - output_logits = torch.cat([output_logits, padding]) - #print("Output logits shape:", output_logits.shape) - output_logits_batch.append(output_logits) - - return torch.stack(output_logits_batch).to(batch['input_ids'].device) - """ diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index 4bce4d4170..5ed4e88f57 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -12,6 +12,10 @@ trt_folder_path = '/workspace/TensorRT-LLM/' +MINI_TASKS = './eval/yamls/mini_tasks_v0.2.yaml' +QA_MC_TASKS = './eval/yamls/qa_mc_tasks_v0.2.yaml' +ALL_TASKS = './eval/yamls/tasks_v0.2.yaml' +LM_TASKS = './eval/yamls/lm_tasks_v0.2.yaml' # GPT config is just for quick initial testing purposes trt_gpt_config = { @@ -42,79 +46,69 @@ 'eval_gauntlet': './eval/yamls/mini_eval_gauntlet_v0.2.yaml', } -trt_llama7b_config = { - 'run_name': 'trtllm-eval', - 'seed': 0, - 'max_seq_len': 2048, - 'device_eval_batch_size': 32, - 'precision': 'amp_fp16', - 'dist_timeout': 6000, - 'models': - [ - { - 'model_name': 'trtllm/llama', - 'model': - { - 'name': 'trtllm', - 'version': 'llama', - 'engine_dir': trt_folder_path + 'examples/llama/tmp/llama/7B-chat-quality-eval/trt_engines/int8_kv_cache_weight_only/1-gpu', - 'log_level': 'error', - 'eos_token_id': 2, - 'pad_token_id': 2 - }, - 'tokenizer': + +def get_llama_config(engine_dir, tokenizer_name, icl_tasks=QA_MC_TASKS): + return { + 'run_name': 'trtllm-eval', + 'seed': 0, + 'max_seq_len': 2048, + 'device_eval_batch_size': 8, # Llama-7B should be batch size 32 + 'precision': 'amp_fp16', + 'dist_timeout': 6000, + 'models': + [ { - 'name': '/workspace/llama-7b-chat-hf/' + 'model_name': 'trtllm/llama', + 'model': + { + 'name': 'trtllm', + 'version': 'llama', + 'engine_dir': engine_dir, + 'log_level': 'error', + 'eos_token_id': 2, + 'pad_token_id': 2 + }, + 'tokenizer': + { + 'name': tokenizer_name, + } + } + ], + 'icl_tasks': icl_tasks, + 'eval_gauntlet': './eval/yamls/eval_gauntlet_v0.2.yaml', + 'loggers': { + 'wandb': { + 'project': 'nik-quant-eval' } - } - ], - 'icl_tasks': './eval/yamls/mini_tasks_v0.2.yaml', - 'eval_gauntlet': './eval/yamls/mini_eval_gauntlet_v0.2.yaml', - 'loggers': { - 'wandb': { - 'project': 'nik-quant-eval' } } -} +def engine_dir_str(model_type, model_dir, variant, ngpus=8): + return f"{trt_folder_path}examples/{model_type}/tmp/{model_type}/{model_dir}/trt_engines/{variant}/{ngpus}-gpu" + + +LLAMA_TOK_DIR = '/workspace/llama-70b-chat-hf/' +LLAMA_7B_DIR = '7B-chat-quality-eval' +LLAMA_70B_DIR = '70B-chat-quality-eval' + + +llama7b_int8_config = get_llama_config(engine_dir_str('llama', LLAMA_7B_DIR, 'int8_kv_cache_weight_only', 1), LLAMA_TOK_DIR) +llama70b_fp8_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'fp8'), LLAMA_TOK_DIR) +llama70b_int8_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'int8_kv_cache_weight_only'), LLAMA_TOK_DIR) +llama70b_smoothquant_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'sq0.8'), LLAMA_TOK_DIR) +llama70b_fp16_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'fp16'), LLAMA_TOK_DIR) + + +def run_eval(config): + print("RUNNING EVAL") + om_dict_config: DictConfig = 
om.create(config) + print("OmegaConfig dictionary", om.to_yaml(om_dict_config)) + run_evaluation(om_dict_config) -trt_llama70b_config = { - 'run_name': 'trtllm-eval', - 'seed': 0, - 'max_seq_len': 2048, - 'device_eval_batch_size': 8, - 'precision': 'amp_fp16', - 'dist_timeout': 6000, - 'models': - [ - { - 'model_name': 'trtllm/llama', - 'model': - { - 'name': 'trtllm', - 'version': 'llama', - 'engine_dir': trt_folder_path + 'examples/llama/tmp/llama/70B-chat-quality-eval/trt_engines/fp8/8-gpu', - 'log_level': 'error', - 'eos_token_id': 2, - 'pad_token_id': 2 - }, - 'tokenizer': - { - 'name': '/workspace/llama-70b-chat-hf/' - } - } - ], - 'icl_tasks': './eval/yamls/tasks_v0.2.yaml', - 'eval_gauntlet': './eval/yamls/eval_gauntlet_v0.2.yaml', - 'loggers': { - 'wandb': { - 'project': 'nik-quant-eval' - } - } -} +run_eval(llama70b_int8_config) +run_eval(llama70b_fp16_config) +run_eval(llama70b_fp8_config) +run_eval(llama70b_smoothquant_config) -om_dict_config: DictConfig = om.create(trt_llama70b_config) -print("OmegaConfig dictionary", om.to_yaml(om_dict_config)) -run_evaluation(om_dict_config) diff --git a/scripts/eval/yamls/lm_tasks_v0.2.yaml b/scripts/eval/yamls/lm_tasks_v0.2.yaml new file mode 100644 index 0000000000..1f550aba6e --- /dev/null +++ b/scripts/eval/yamls/lm_tasks_v0.2.yaml @@ -0,0 +1,59 @@ +icl_tasks: +- + label: jeopardy + dataset_uri: eval/local_data/world_knowledge/jeopardy_all.jsonl + num_fewshot: [3] + icl_task_type: language_modeling + continuation_delimiter: "\nAnswer: " # this separates questions from answers + has_categories: true +- + label: bigbench_qa_wikidata + dataset_uri: eval/local_data/world_knowledge/bigbench_qa_wikidata.jsonl + num_fewshot: [3] + icl_task_type: language_modeling +- + label: bigbench_dyck_languages + dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_dyck_languages.jsonl + num_fewshot: [5] + icl_task_type: language_modeling +- + label: lambada_openai + dataset_uri: eval/local_data/language_understanding/lambada_openai.jsonl + num_fewshot: [0] + icl_task_type: language_modeling +- + label: bigbench_cs_algorithms + dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_cs_algorithms.jsonl + num_fewshot: [10] + icl_task_type: language_modeling +- + label: bigbench_operators + dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_operators.jsonl + num_fewshot: [3] + icl_task_type: language_modeling +- + label: simple_arithmetic_nospaces + dataset_uri: eval/local_data/symbolic_problem_solving/simple_arithmetic_nospaces.jsonl + num_fewshot: [5] + icl_task_type: language_modeling +- + label: simple_arithmetic_withspaces + dataset_uri: eval/local_data/symbolic_problem_solving/simple_arithmetic_withspaces.jsonl + num_fewshot: [5] + icl_task_type: language_modeling +- + label: pubmed_qa_labeled + dataset_uri: eval/local_data/reading_comprehension/pubmed_qa_labeled.jsonl + num_fewshot: [10] + icl_task_type: language_modeling +- + label: squad + dataset_uri: eval/local_data/reading_comprehension/squad.jsonl + num_fewshot: [3] + icl_task_type: language_modeling +- + label: coqa + dataset_uri: eval/local_data/reading_comprehension/coqa.jsonl + num_fewshot: [0] + icl_task_type: language_modeling + diff --git a/scripts/eval/yamls/mini_tasks_v0.2.yaml b/scripts/eval/yamls/mini_tasks_v0.2.yaml index 2366ccfb8d..e5e7c459d9 100644 --- a/scripts/eval/yamls/mini_tasks_v0.2.yaml +++ b/scripts/eval/yamls/mini_tasks_v0.2.yaml @@ -1,8 +1,21 @@ icl_tasks: +#- +# label: jeopardy +# dataset_uri: 
eval/local_data/world_knowledge/jeopardy_all.jsonl +# num_fewshot: [3] +# icl_task_type: language_modeling +# continuation_delimiter: "\nAnswer: " # this separates questions from answers +# has_categories: true - - label: jeopardy - dataset_uri: eval/local_data/world_knowledge/jeopardy_all.jsonl + label: triviaqa_sm_sub + dataset_uri: eval/local_data/world_knowledge/triviaqa_sm_sub.jsonl num_fewshot: [3] - icl_task_type: language_modeling - continuation_delimiter: "\nAnswer: " # this separates questions from answers - has_categories: true + icl_task_type: question_answering +#- +# label: gsm8k +# dataset_uri: eval/local_data/symbolic_problem_solving/gsm8k.jsonl +# num_fewshot: [8, 5] +# icl_task_type: question_answering +# cot_delimiter: ' #### ' +# continuation_delimiter: "\nA: Let's think step by step. " +# question_prelimiter: "Q: " diff --git a/scripts/eval/yamls/qa_mc_tasks_v0.2.yaml b/scripts/eval/yamls/qa_mc_tasks_v0.2.yaml new file mode 100644 index 0000000000..f566e593d0 --- /dev/null +++ b/scripts/eval/yamls/qa_mc_tasks_v0.2.yaml @@ -0,0 +1,138 @@ +icl_tasks: +- + label: triviaqa_sm_sub + dataset_uri: eval/local_data/world_knowledge/triviaqa_sm_sub.jsonl + num_fewshot: [3] + icl_task_type: question_answering +#BROKEN: https://github.com/mosaicml/llm-foundry/pull/824 +#- +# label: gsm8k +# dataset_uri: eval/local_data/symbolic_problem_solving/gsm8k.jsonl +# num_fewshot: [8, 5] +# icl_task_type: question_answering +# cot_delimiter: ' #### ' +# continuation_delimiter: "\nA: Let's think step by step. " +# question_prelimiter: "Q: " +#- +# label: agi_eval_sat_math +# dataset_uri: eval/local_data/symbolic_problem_solving/agi_eval_sat_math.jsonl +# num_fewshot: [3] +# icl_task_type: question_answering +# cot_delimiter: ' #### ' +# continuation_delimiter: "\nA: Let's think step by step. " +#- +# label: aqua +# dataset_uri: eval/local_data/symbolic_problem_solving/aqua.jsonl +# num_fewshot: [3] +# icl_task_type: question_answering +# cot_delimiter: ' #### ' +# continuation_delimiter: "\nA: Let's think step by step. 
" +#- +# label: svamp +# dataset_uri: eval/local_data/symbolic_problem_solving/svamp.jsonl +# num_fewshot: [5] +# icl_task_type: question_answering +# continuation_delimiter: "\nUsing the formula below:\n" +# cot_delimiter: ' #### ' +# question_prelimiter: "Q: " +- + label: arc_easy + dataset_uri: eval/local_data/world_knowledge/arc_easy.jsonl + num_fewshot: [3] + icl_task_type: multiple_choice + continuation_delimiter: "\nAnswer: " # this separates questions from answers +- + label: arc_challenge + dataset_uri: eval/local_data/world_knowledge/arc_challenge.jsonl + num_fewshot: [3, 25] + icl_task_type: multiple_choice + continuation_delimiter: "\nAnswer: " # this separates questions from answers +- + label: mmlu + dataset_uri: eval/local_data/world_knowledge/mmlu.jsonl + num_fewshot: [5] + icl_task_type: multiple_choice + continuation_delimiter: "\nAnswer: " # this separates questions from answers + has_categories: true +- + label: copa + dataset_uri: eval/local_data/commonsense_reasoning/copa.jsonl + num_fewshot: [0] + icl_task_type: multiple_choice +- + label: siqa + dataset_uri: eval/local_data/commonsense_reasoning/siqa.jsonl + num_fewshot: [3] + icl_task_type: multiple_choice +- + label: commonsense_qa + dataset_uri: eval/local_data/commonsense_reasoning/commonsense_qa.jsonl + num_fewshot: [0] + icl_task_type: multiple_choice +- + label: piqa + dataset_uri: eval/local_data/commonsense_reasoning/piqa.jsonl + num_fewshot: [0] + icl_task_type: multiple_choice + continuation_delimiter: "\nAnswer: " # this separates questions from answers +- + label: openbook_qa + dataset_uri: eval/local_data/commonsense_reasoning/openbook_qa.jsonl + num_fewshot: [10] + icl_task_type: multiple_choice +- + label: bigbench_strange_stories + dataset_uri: eval/local_data/commonsense_reasoning/bigbench_strange_stories.jsonl + num_fewshot: [0] + icl_task_type: multiple_choice +- + label: bigbench_strategy_qa + dataset_uri: eval/local_data/commonsense_reasoning/bigbench_strategy_qa.jsonl + num_fewshot: [0] + icl_task_type: multiple_choice +- + label: hellaswag + dataset_uri: eval/local_data/language_understanding/hellaswag.jsonl + num_fewshot: [0, 10] + icl_task_type: multiple_choice +- + label: winograd + dataset_uri: eval/local_data/language_understanding/winograd_wsc.jsonl + num_fewshot: [3] + icl_task_type: schema +- + label: winogrande + dataset_uri: eval/local_data/language_understanding/winogrande.jsonl + num_fewshot: [5] + icl_task_type: schema +- + label: bigbench_elementary_math_qa + dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_elementary_math_qa.jsonl + num_fewshot: [1] + icl_task_type: multiple_choice +- + label: agi_eval_lsat_ar + dataset_uri: eval/local_data/symbolic_problem_solving/agi_eval_lsat_ar.jsonl + num_fewshot: [5] + icl_task_type: multiple_choice +- + label: agi_eval_lsat_rc + dataset_uri: eval/local_data/reading_comprehension/agi_eval_lsat_rc.jsonl + num_fewshot: [5] + icl_task_type: multiple_choice +- + label: agi_eval_lsat_lr + dataset_uri: eval/local_data/reading_comprehension/agi_eval_lsat_lr.jsonl + num_fewshot: [5] + icl_task_type: multiple_choice +- + label: boolq + dataset_uri: eval/local_data/reading_comprehension/boolq.jsonl + num_fewshot: [0] + icl_task_type: multiple_choice + continuation_delimiter: "\nAnswer: " # this separates questions from answers +- + label: agi_eval_sat_en + dataset_uri: eval/local_data/reading_comprehension/agi_eval_sat_en.jsonl + num_fewshot: [5] + icl_task_type: multiple_choice diff --git a/scripts/eval/yamls/tasks_v0.2.yaml 
b/scripts/eval/yamls/tasks_v0.2.yaml index f5c9f20880..ea74f84a2f 100644 --- a/scripts/eval/yamls/tasks_v0.2.yaml +++ b/scripts/eval/yamls/tasks_v0.2.yaml @@ -11,36 +11,37 @@ icl_tasks: dataset_uri: eval/local_data/world_knowledge/triviaqa_sm_sub.jsonl num_fewshot: [3] icl_task_type: question_answering -- - label: gsm8k - dataset_uri: eval/local_data/symbolic_problem_solving/gsm8k.jsonl - num_fewshot: [8, 5] - icl_task_type: question_answering - cot_delimiter: ' #### ' - continuation_delimiter: "\nA: Let's think step by step. " - question_prelimiter: "Q: " -- - label: agi_eval_sat_math - dataset_uri: eval/local_data/symbolic_problem_solving/agi_eval_sat_math.jsonl - num_fewshot: [3] - icl_task_type: question_answering - cot_delimiter: ' #### ' - continuation_delimiter: "\nA: Let's think step by step. " -- - label: aqua - dataset_uri: eval/local_data/symbolic_problem_solving/aqua.jsonl - num_fewshot: [3] - icl_task_type: question_answering - cot_delimiter: ' #### ' - continuation_delimiter: "\nA: Let's think step by step. " -- - label: svamp - dataset_uri: eval/local_data/symbolic_problem_solving/svamp.jsonl - num_fewshot: [5] - icl_task_type: question_answering - continuation_delimiter: "\nUsing the formula below:\n" - cot_delimiter: ' #### ' - question_prelimiter: "Q: " +#BROKEN +#- +# label: gsm8k +# dataset_uri: eval/local_data/symbolic_problem_solving/gsm8k.jsonl +# num_fewshot: [8, 5] +# icl_task_type: question_answering +# cot_delimiter: ' #### ' +# continuation_delimiter: "\nA: Let's think step by step. " +# question_prelimiter: "Q: " +#- +# label: agi_eval_sat_math +# dataset_uri: eval/local_data/symbolic_problem_solving/agi_eval_sat_math.jsonl +# num_fewshot: [3] +# icl_task_type: question_answering +# cot_delimiter: ' #### ' +# continuation_delimiter: "\nA: Let's think step by step. " +#- +# label: aqua +# dataset_uri: eval/local_data/symbolic_problem_solving/aqua.jsonl +# num_fewshot: [3] +# icl_task_type: question_answering +# cot_delimiter: ' #### ' +# continuation_delimiter: "\nA: Let's think step by step. 
" +#- +# label: svamp +# dataset_uri: eval/local_data/symbolic_problem_solving/svamp.jsonl +# num_fewshot: [5] +# icl_task_type: question_answering +# continuation_delimiter: "\nUsing the formula below:\n" +# cot_delimiter: ' #### ' +# question_prelimiter: "Q: " - label: bigbench_qa_wikidata dataset_uri: eval/local_data/world_knowledge/bigbench_qa_wikidata.jsonl From 1c6037cb15c932f926ce473b893cd7ec23570e77 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Sat, 6 Jan 2024 02:01:42 -0800 Subject: [PATCH 13/21] Update scripts, fix batching --- .../models/inference_api_wrapper/trtllm.py | 56 ++++++++++++++++--- scripts/eval/eval_trt_multigpu.py | 5 +- scripts/eval/run_trtllm_eval.py | 10 ++-- 3 files changed, 58 insertions(+), 13 deletions(-) diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index bcd158f731..3c3beb9309 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -181,10 +181,56 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): # Question-answering tasks if 'continuation_indices' not in batch: - # batch['continuation_indices'] = torch.tensor([], dtype=torch.int, device=self.device) - # print("Batch:", batch) - output_strs = [] + # Batched version + batch_size = len(batch['input_ids']) + prompt_lens = [] + output_strs = [] # QA tasks return strings, not logits + max_prompt_len = 0 + + for tokens in batch['input_ids']: + prompt = tokens.tolist() + eos_occurence = (tokens == 2).nonzero(as_tuple=True)[0] + end_prompt_idx = len(prompt) + if eos_occurence.shape[0] >= 1: + end_prompt_idx = eos_occurence[0] + prompt_lens.append(end_prompt_idx) + if end_prompt_idx > max_prompt_len: + max_prompt_len = end_prompt_idx + + #if batch_size == 1: + # # Remove pad tokens + # prompt = batch['input_ids'][0].tolist()[:prompt_lens[0]] + # input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) + # input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device=self.device) + # else: + # Keep padding + input_ids = torch.narrow(batch['input_ids'], 1, 0, max_prompt_len).to(dtype=torch.int, device=self.device) + # input_ids = batch['input_ids'].to(dtype=torch.int, device=self.device) + input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) + + #if tensorrt_llm.mpi_rank() == 0: + # print("Prompt:", input_ids[7]) + #print("Input shape:", input_ids.shape) + #print("Input lengths:", input_lengths) + with torch.no_grad(): + self.decoder.setup(batch_size, + input_lengths.max().item(), + batch['generation_length']) + output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) + + #if tensorrt_llm.mpi_rank() == 0: + # print("Output:", [output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch['generation_length']] for i in range(batch_size)]) + # print("Output shape:", output_dict['output_ids'].shape) + #inp_len = input_ids.size(1) + + decoded_strs = [self.tokenizer.decode(output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch['generation_length']]) for i in range(batch_size)] + output_strs += decoded_strs + print("decoded strs:", decoded_strs) + return output_strs + + # Non-batched version + output_strs = [] for tokens in batch['input_ids']: seqlen = tokens.shape[0] prompt = tokens.tolist() @@ -206,10 +252,6 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): batch['generation_length']) output_dict = 
self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) - output_logits_list = output_dict['generation_logits'] - - for i in range(len(output_logits_list)): - output_logits_list[i] = output_logits_list[i].squeeze() #print("Shape:", output_dict['output_ids'].shape) decoded_str = self.tokenizer.decode(output_dict['output_ids'][0][0].tolist()[len(prompt):]) diff --git a/scripts/eval/eval_trt_multigpu.py b/scripts/eval/eval_trt_multigpu.py index 809b1a53e3..1a63fac90c 100644 --- a/scripts/eval/eval_trt_multigpu.py +++ b/scripts/eval/eval_trt_multigpu.py @@ -25,7 +25,7 @@ build_evaluators, build_logger, build_tokenizer) from llmfoundry.utils.config_utils import pop_config, process_init_device - +import tensorrt_llm def load_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, @@ -132,6 +132,9 @@ def evaluate_model( assert composer_model is not None + if tensorrt_llm.mpi_rank() > 0: + loggers = None + trainer = Trainer( run_name=run_name, seed=seed, diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index 5ed4e88f57..d37168e900 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -47,7 +47,7 @@ } -def get_llama_config(engine_dir, tokenizer_name, icl_tasks=QA_MC_TASKS): +def get_llama_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): return { 'run_name': 'trtllm-eval', 'seed': 0, @@ -105,10 +105,10 @@ def run_eval(config): print("OmegaConfig dictionary", om.to_yaml(om_dict_config)) run_evaluation(om_dict_config) - -run_eval(llama70b_int8_config) -run_eval(llama70b_fp16_config) -run_eval(llama70b_fp8_config) +#run_eval(llama7b_int8_config) +#run_eval(llama70b_int8_config) +#run_eval(llama70b_fp16_config) +#run_eval(llama70b_fp8_config) run_eval(llama70b_smoothquant_config) From 3e5b5eedd4a75c401d5df9a8a47f6524fd550cbd Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Thu, 11 Jan 2024 04:55:42 -0800 Subject: [PATCH 14/21] Update foundry: --- .../models/inference_api_wrapper/trtllm.py | 101 ++++++++---------- scripts/eval/run_trtllm_eval.py | 37 +------ 2 files changed, 50 insertions(+), 88 deletions(-) diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index 3c3beb9309..1bc2b8a1eb 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -4,10 +4,13 @@ """Implements a TRT-LLM evaluation model wrapped around a :class:`.ComposerModel`.""" +import os +import sys import json from pathlib import Path -from typing import Any, Optional +from typing import Any, Optional, List, Tuple +import warnings import torch from omegaconf import DictConfig from transformers import PreTrainedTokenizer @@ -49,6 +52,10 @@ def __init__( ): check_if_trt_llm_installed() + if tensorrt_llm.mpi_rank() != 0: + f = open(os.devnull, 'w') + sys.stdout = f + sys.stderr = f super().__init__(model_cfg, tokenizer) tensorrt_llm.logger.set_level(model_cfg['log_level']) @@ -74,10 +81,10 @@ def __init__( use_gpt_attention_plugin = bool( config['plugin_config']['gpt_attention_plugin']) remove_input_padding = config['plugin_config']['remove_input_padding'] - #if remove_input_padding: - # raise ValueError( - # 'TRT-LLM Evaluation Wrapper does not support remove_input_padding.' - # ) + if remove_input_padding: + raise ValueError( + 'TRT-LLM Evaluation Wrapper does not support remove_input_padding.' 
+ ) num_kv_heads = config['builder_config'].get('num_kv_heads', num_heads) paged_kv_cache = config['plugin_config']['paged_kv_cache'] @@ -111,7 +118,6 @@ def __init__( self.hidden_size = hidden_size self.vocab_size = vocab_size - self.max_seq_len = 2048 #TODO: Do Not hardcode # Device and rank runtime_rank = tensorrt_llm.mpi_rank() @@ -151,11 +157,6 @@ def __init__( print("!!! Initialized generation session for rank:", runtime_rank) torch.cuda.synchronize() - - # Move metrics to proper device (doesn't help, have to do this in update_metric()) - # for key, value in self.eval_metrics.items(): - # self.eval_metrics[key] = value.to(device=self.device) - # print("Eval metric now at:", self.eval_metrics[key].device) def rebatch(self, batch): """ @@ -169,24 +170,42 @@ def rebatch(self, batch): return batch.to(device=self.device) elif isinstance(batch, list): return [self.rebatch(b) for b in batch] - return batch + + + # Remove potential additional dim, cast to int32 + batch_input_ids = [ + x.flatten().type(torch.int32) for x in batch_input_ids + ] + input_lengths = [x.size(0) for x in batch_input_ids] + max_length = max(input_lengths) + # Right padding for trt-llm + paddings = [ + torch.ones(max_length - l, dtype=torch.int32, device=self.device) * pad_id + for l in input_lengths + ] + batch_input_ids = [ + torch.cat([x, pad]) for x, pad in zip(batch_input_ids, paddings) + ] + batch_input_ids = torch.stack(batch_input_ids) + input_lengths = torch.tensor(input_lengths, dtype=torch.int32, device=self.device) + return batch_input_ids, input_lengths def eval_forward(self, batch, outputs: Optional[Any] = None): - # If the batch mode is generate, we will generate a requested number of tokens using the underlying - # model's generate function. Strings will be returned from eval_forward + # Run TRTLLM forward pass output_logits_batch = [] batch = self.rebatch(batch) # Question-answering tasks if 'continuation_indices' not in batch: + """ # Batched version batch_size = len(batch['input_ids']) prompt_lens = [] - output_strs = [] # QA tasks return strings, not logits max_prompt_len = 0 - + # prompt_list = [] + for tokens in batch['input_ids']: prompt = tokens.tolist() eos_occurence = (tokens == 2).nonzero(as_tuple=True)[0] @@ -197,21 +216,12 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): if end_prompt_idx > max_prompt_len: max_prompt_len = end_prompt_idx - #if batch_size == 1: - # # Remove pad tokens - # prompt = batch['input_ids'][0].tolist()[:prompt_lens[0]] - # input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) - # input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device=self.device) - # else: - # Keep padding input_ids = torch.narrow(batch['input_ids'], 1, 0, max_prompt_len).to(dtype=torch.int, device=self.device) - # input_ids = batch['input_ids'].to(dtype=torch.int, device=self.device) input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) - #if tensorrt_llm.mpi_rank() == 0: - # print("Prompt:", input_ids[7]) - #print("Input shape:", input_ids.shape) - #print("Input lengths:", input_lengths) + print("Prompt:", input_ids) + print("Input shape:", input_ids.shape) + print("Input lengths:", input_lengths) with torch.no_grad(): self.decoder.setup(batch_size, @@ -219,15 +229,14 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): batch['generation_length']) output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) - #if tensorrt_llm.mpi_rank() == 0: - # print("Output:", 
[output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch['generation_length']] for i in range(batch_size)]) - # print("Output shape:", output_dict['output_ids'].shape) - #inp_len = input_ids.size(1) - - decoded_strs = [self.tokenizer.decode(output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch['generation_length']]) for i in range(batch_size)] - output_strs += decoded_strs - print("decoded strs:", decoded_strs) - return output_strs + output_ids = [output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch['generation_length']] for i in range(batch_size)] + + print("Output:", output_ids) + + decoded_strs = [self.tokenizer.decode(out) for out in output_ids] + # print("decoded strs:", decoded_strs) + return decoded_strs + """ # Non-batched version output_strs = [] @@ -259,24 +268,6 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): #print("Decoded OUTPUT:", decoded_str) #print("-------------") # print("Output ids:", output_dict['output_ids'][0][0].tolist()) - """ - context_logits = output_dict['context_logits'].squeeze() - output_logits_tensor = torch.stack(output_logits_list) - print("Context logits shape:", context_logits.shape) - print("Output logits shape:", output_logits_tensor.shape) - combined_logits = torch.cat([context_logits, output_logits_tensor]) - - padding = torch.nn.functional.one_hot( - torch.full( - (self.max_seq_len - combined_logits.shape[0],), - self.PAD_ID, - device=self.device - ), - num_classes=self.vocab_size) - padded_combined_logits = torch.cat([combined_logits, padding]) - - output_logits_batch.append(padded_combined_logits) - """ return output_strs # Language modeling and multiple choice tasks diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index d37168e900..cd53857179 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -17,35 +17,6 @@ ALL_TASKS = './eval/yamls/tasks_v0.2.yaml' LM_TASKS = './eval/yamls/lm_tasks_v0.2.yaml' -# GPT config is just for quick initial testing purposes -trt_gpt_config = { - 'run_name': 'trtllm-eval', - 'seed': 0, - 'max_seq_len': 1024, - 'device_eval_batch_size': 4, - 'precision': 'amp_fp16', - 'dist_timeout': 6000, - 'models': - [ - { - 'model_name': 'trtllm/gpt', - 'model': - { - 'name': 'trtllm', - 'version': 'gpt', - 'engine_dir': trt_folder_path + 'examples/gpt/engine_outputs', - 'log_level': 'error' - }, - 'tokenizer': - { - 'name': 'gpt2' - } - } - ], - 'icl_tasks': './eval/yamls/mini_tasks_v0.2.yaml', - 'eval_gauntlet': './eval/yamls/mini_eval_gauntlet_v0.2.yaml', -} - def get_llama_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): return { @@ -105,10 +76,10 @@ def run_eval(config): print("OmegaConfig dictionary", om.to_yaml(om_dict_config)) run_evaluation(om_dict_config) -#run_eval(llama7b_int8_config) -#run_eval(llama70b_int8_config) -#run_eval(llama70b_fp16_config) +# run_eval(llama7b_int8_config) +run_eval(llama70b_int8_config) +# run_eval(llama70b_fp16_config) #run_eval(llama70b_fp8_config) -run_eval(llama70b_smoothquant_config) +# run_eval(llama70b_smoothquant_config) From 9cc6deaa12bf793e4ac9b82191b2d4e4fa15b1c7 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Sun, 24 Mar 2024 17:29:04 -0700 Subject: [PATCH 15/21] update wrappers --- .../models/inference_api_wrapper/interface.py | 4 +- .../models/inference_api_wrapper/trtllm.py | 255 +++++++++++++----- scripts/eval/eval.py | 19 +- scripts/eval/eval_trt_multigpu.py | 4 +- 4 files changed, 207 insertions(+), 75 deletions(-) diff 
--git a/llmfoundry/models/inference_api_wrapper/interface.py b/llmfoundry/models/inference_api_wrapper/interface.py index b0e0f4adee..493ee14f02 100644 --- a/llmfoundry/models/inference_api_wrapper/interface.py +++ b/llmfoundry/models/inference_api_wrapper/interface.py @@ -102,6 +102,8 @@ def update_metric(self, batch: Any, outputs: Any, metric: Metric) -> None: return self.labels[:, :-1] = self.labels[:, 1:].clone() + #print("**** Labels:",self.tokenizer.decode(self.labels[0])) + #print("*******") self.labels[:, -1] = -100 if isinstance(metric, InContextLearningMetric) and batch.get( 'mode', None) == 'icl_task': @@ -111,7 +113,7 @@ def update_metric(self, batch: Any, outputs: Any, metric: Metric) -> None: raise NotImplementedError( 'Inference API wrapper only supports InContextLearningMetrics and mode=icl_task' ) - + def forward(self): raise NotImplementedError( "Inference API wrapper doesn't support forward") diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index 1bc2b8a1eb..efe62fa85e 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -37,10 +37,7 @@ def check_if_trt_llm_installed(): # From tensorrt_llm/examples/{model_name}/build.py def get_engine_name(model: str, dtype: str, tp_size: int, pp_size: int, rank: int): - if pp_size == 1: - return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) - return '{}_{}_tp{}_pp{}_rank{}.engine'.format(model, dtype, tp_size, - pp_size, rank) + return 'rank{}.engine'.format(rank) class TRTLLMEvalWrapper(InferenceAPIEvalWrapper): @@ -66,34 +63,43 @@ def __init__( with open(config_path, 'r') as f: config = json.load(f) - dtype = config['builder_config']['precision'] - tp_size = config['builder_config']['tensor_parallel'] - pp_size = config['builder_config'].get('pipeline_parallel', 1) + pretrained_config = config['pretrained_config'] + quantization_config = pretrained_config['quantization'] + build_config = config['build_config'] + plugin_config = build_config['plugin_config'] + + dtype = pretrained_config['dtype'] + tp_size = pretrained_config['mapping']['tp_size'] + pp_size = pretrained_config['mapping'].get('pp_size', 1) world_size = tp_size * pp_size assert world_size == tensorrt_llm.mpi_world_size(), \ f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - num_heads = config['builder_config']['num_heads'] // tp_size - hidden_size = config['builder_config']['hidden_size'] // tp_size - vocab_size = config['builder_config']['vocab_size'] - num_layers = config['builder_config']['num_layers'] - use_gpt_attention_plugin = bool( - config['plugin_config']['gpt_attention_plugin']) - remove_input_padding = config['plugin_config']['remove_input_padding'] + num_heads = pretrained_config['num_attention_heads'] // tp_size + hidden_size = pretrained_config['hidden_size'] // tp_size + + max_batch_size = build_config['max_batch_size'] + vocab_size = pretrained_config['vocab_size'] + num_layers = pretrained_config['num_hidden_layers'] + + use_gpt_attention_plugin = bool(plugin_config['gpt_attention_plugin']) + remove_input_padding = plugin_config['remove_input_padding'] if remove_input_padding: raise ValueError( 'TRT-LLM Evaluation Wrapper does not support remove_input_padding.' 
) - num_kv_heads = config['builder_config'].get('num_kv_heads', num_heads) - paged_kv_cache = config['plugin_config']['paged_kv_cache'] - tokens_per_block = config['plugin_config']['tokens_per_block'] - use_custom_all_reduce = config['plugin_config'].get('use_custom_all_reduce', + num_kv_heads = build_config.get('num_key_value_heads', num_heads) + paged_kv_cache = plugin_config['paged_kv_cache'] + tokens_per_block = plugin_config['tokens_per_block'] + use_custom_all_reduce = plugin_config.get('use_custom_all_reduce', False) - - quant_mode = QuantMode(config['builder_config']['quant_mode']) - if config['builder_config'].get('multi_query_mode', False): + quant_mode = QuantMode.from_quant_algo( + quant_algo=quantization_config['quant_algo'], + kv_cache_quant_algo=quantization_config['kv_cache_quant_algo']) + + if pretrained_config.get('multi_query_mode', False): tensorrt_llm.logger.warning( "`multi_query_mode` config is deprecated. Please rebuild the engine." ) @@ -101,6 +107,8 @@ def __init__( num_kv_heads = (num_kv_heads + tp_size - 1) // tp_size model_config = tensorrt_llm.runtime.ModelConfig( + max_batch_size=max_batch_size, + max_beam_width=1, vocab_size=vocab_size, num_layers=num_layers, num_heads=num_heads, @@ -113,11 +121,14 @@ def __init__( use_custom_all_reduce=use_custom_all_reduce, dtype=dtype, quant_mode=quant_mode, - gather_all_token_logits=True) + gather_context_logits=build_config.get('gather_context_logits', False), + gather_generation_logits=build_config.get('gather_generation_logits', False), + ) self.hidden_size = hidden_size self.vocab_size = vocab_size + self.max_output_len = build_config['max_output_len'] # Device and rank runtime_rank = tensorrt_llm.mpi_rank() @@ -129,6 +140,7 @@ def __init__( self.device = torch.device('cuda:' + str(self.device_num)) torch.cuda.set_device(self.device_num) + print("My rank:", runtime_rank) # Tokenization and sampling self.END_ID = model_cfg.get('eos_token_id', self.tokenizer.eos_token_id) self.PAD_ID = model_cfg.get('pad_token_id', self.tokenizer.pad_token_id) @@ -144,10 +156,8 @@ def __init__( pad_id=self.PAD_ID, num_beams=1, return_dict=True) - # Load TRT engine - engine_name = get_engine_name(model_cfg['version'], dtype, tp_size, pp_size, - runtime_rank) + engine_name = 'rank{}.engine'.format(runtime_rank) serialize_path = engine_dir / engine_name with open(serialize_path, 'rb') as f: engine_buffer = f.read() @@ -158,6 +168,7 @@ def __init__( print("!!! Initialized generation session for rank:", runtime_rank) torch.cuda.synchronize() + def rebatch(self, batch): """ Move tensors in batch to the correct GPU. 
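The hunks above move the wrapper to the newer engine layout, where config.json splits into pretrained_config and build_config and each rank's engine is serialized as rank<N>.engine. The sketch below condenses that bring-up path into one hypothetical helper (load_generation_session is not part of the patch); it reuses only runtime calls that already appear in this diff (Mapping, ModelConfig, GenerationSession), but exact field names and constructor signatures differ across TRT-LLM releases, so treat it as an illustration rather than a drop-in.

import json
from pathlib import Path

import tensorrt_llm
from tensorrt_llm.runtime import GenerationSession, ModelConfig


def load_generation_session(engine_dir: str) -> GenerationSession:
    engine_dir = Path(engine_dir)
    cfg = json.loads((engine_dir / 'config.json').read_text())
    pretrained, build = cfg['pretrained_config'], cfg['build_config']

    # Parallelism layout is recorded with the checkpoint, not the build.
    tp_size = pretrained['mapping']['tp_size']
    pp_size = pretrained['mapping'].get('pp_size', 1)
    world_size = tp_size * pp_size
    rank = tensorrt_llm.mpi_rank()
    assert world_size == tensorrt_llm.mpi_world_size(), \
        'engine tp_size * pp_size must match the launched world size'
    mapping = tensorrt_llm.Mapping(world_size, rank,
                                   tp_size=tp_size, pp_size=pp_size)

    model_config = ModelConfig(
        max_batch_size=build['max_batch_size'],
        max_beam_width=1,
        vocab_size=pretrained['vocab_size'],
        num_layers=pretrained['num_hidden_layers'],
        num_heads=pretrained['num_attention_heads'] // tp_size,
        # GQA engines record KV heads separately; fall back to MHA otherwise.
        num_kv_heads=pretrained.get('num_key_value_heads',
                                    pretrained['num_attention_heads']) // tp_size,
        hidden_size=pretrained['hidden_size'] // tp_size,
        gpt_attention_plugin=bool(build['plugin_config']['gpt_attention_plugin']),
        remove_input_padding=build['plugin_config']['remove_input_padding'],
        dtype=pretrained['dtype'],
    )

    # Newer builders emit one serialized engine per rank.
    engine_buffer = (engine_dir / f'rank{rank}.engine').read_bytes()
    return GenerationSession(model_config, engine_buffer, mapping)

From there, scoring a batch only needs decoder.setup(batch_size, max_context_length, max_new_tokens) followed by decoder.decode(input_ids, input_lengths, sampling_config, return_dict=True), which is what the eval_forward hunks below do.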
@@ -174,6 +185,7 @@ def rebatch(self, batch): # Remove potential additional dim, cast to int32 + """ batch_input_ids = [ x.flatten().type(torch.int32) for x in batch_input_ids ] @@ -187,10 +199,10 @@ def rebatch(self, batch): batch_input_ids = [ torch.cat([x, pad]) for x, pad in zip(batch_input_ids, paddings) ] - batch_input_ids = torch.stack(batch_input_ids) + batch_input_ids = torch.stack(batch_input_ids).to(device=self.device) input_lengths = torch.tensor(input_lengths, dtype=torch.int32, device=self.device) return batch_input_ids, input_lengths - + """ def eval_forward(self, batch, outputs: Optional[Any] = None): # Run TRTLLM forward pass @@ -199,16 +211,13 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): # Question-answering tasks if 'continuation_indices' not in batch: - """ # Batched version batch_size = len(batch['input_ids']) prompt_lens = [] max_prompt_len = 0 - # prompt_list = [] - for tokens in batch['input_ids']: prompt = tokens.tolist() - eos_occurence = (tokens == 2).nonzero(as_tuple=True)[0] + eos_occurence = (tokens == self.END_ID).nonzero(as_tuple=True)[0] end_prompt_idx = len(prompt) if eos_occurence.shape[0] >= 1: end_prompt_idx = eos_occurence[0] @@ -220,27 +229,51 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) print("Prompt:", input_ids) - print("Input shape:", input_ids.shape) - print("Input lengths:", input_lengths) - + #print("Input shape:", input_ids.shape) + #print("Input lengths:", input_lengths) + max_generation_length = 256 with torch.no_grad(): self.decoder.setup(batch_size, input_lengths.max().item(), - batch['generation_length']) + batch.get('generation_length', max_generation_length)) output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) - - output_ids = [output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch['generation_length']] for i in range(batch_size)] + #self.decoder.setup(1, + # input_lengths[:1].max().item(), + # batch.get('generation_length', max_generation_length)) + #output_dict2 = self.decoder.decode(input_ids[:1,:], input_lengths[:1], self.sampling_config, return_dict=True) + + #answer1 = output_dict['output_ids'][0].squeeze()[prompt_lens[0]:prompt_lens[0]+max_generation_length] + #answer2 = output_dict2['output_ids'][0].squeeze()[prompt_lens[0]:prompt_lens[0]+max_generation_length] + #all_equal = torch.equal(answer1, answer2) + """ + if not all_equal: + print("Prompt:", input_ids[0]) + print("Answer 1:", self.tokenizer.decode(answer1)) + print("Answer 2:", self.tokenizer.decode(answer2)) + print("Shape 1:", answer1.shape) + print("Shape 2", answer2.shape) + difference = answer1 - answer2 + nonzero_indices = difference.nonzero(as_tuple=True) + nonzero = difference[difference.nonzero(as_tuple=True)] + print("EQUAL?", all_equal) + print("Difference:", difference) + print("nonzero indices:", nonzero_indices) + print("Nonzero Elements", nonzero) + quit() + """ + output_ids = [output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch.get('generation_length', max_generation_length)] for i in range(batch_size)] - print("Output:", output_ids) + #print("Output:", output_ids) decoded_strs = [self.tokenizer.decode(out) for out in output_ids] - # print("decoded strs:", decoded_strs) + print("decoded strs:", decoded_strs) return decoded_strs - """ - + # Non-batched version + """ output_strs = [] for tokens in batch['input_ids']: + #print("RAW Tokens:", 
tokens) seqlen = tokens.shape[0] prompt = tokens.tolist() eos_occurence = (tokens == 2).nonzero(as_tuple=True)[0] @@ -250,51 +283,133 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): prompt = prompt[:end_prompt_idx] input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device=self.device) - #print("prompt:", self.tokenizer.decode(prompt)) + print("prompt:", self.tokenizer.decode(prompt)) #print("promp tokens:", prompt) #print("Input lengths:", input_lengths) #print("Generation Length:", batch['generation_length']) - + #print("Batch keys:", batch.keys()) + torch.cuda.synchronize() with torch.no_grad(): - self.decoder.setup(input_lengths.size(0), - torch.max(input_lengths).item(), - batch['generation_length']) - - output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) + self.decoder.setup(batch_size=input_lengths.size(0), + max_context_length=torch.max(input_lengths).item(), + max_new_tokens=batch.get('generation_length', 200)) + output_dict = self.decoder.decode( + input_ids, + input_lengths, + self.sampling_config, + #stopping_criteria=batch['stopping_criteria'], + return_dict=True) #print("Shape:", output_dict['output_ids'].shape) decoded_str = self.tokenizer.decode(output_dict['output_ids'][0][0].tolist()[len(prompt):]) output_strs.append(decoded_str) - #print("Decoded OUTPUT:", decoded_str) + print("Decoded OUTPUT:", decoded_str) #print("-------------") - # print("Output ids:", output_dict['output_ids'][0][0].tolist()) + #print("Output ids:", output_dict['output_ids'][0][0].tolist()) return output_strs + """ + ################# + # Batched version of language modeling/multiple choice tasks + batch_size = len(batch['input_ids']) + seqlen = batch['input_ids'].shape[1] + #print("Seq len:", seqlen) + prompt_lens = [] + continuation_lens = [] + for tokens, cont_idxs in zip(batch['input_ids'], + batch['continuation_indices']): + tokens = tokens.tolist() + cont_idxs = cont_idxs.tolist() + expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] + prompt = tokens[:cont_idxs[0]] + prompt_lens.append(cont_idxs[0]) + continuation_lens.append(len(expected_cont_tokens)) + + input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) + input_ids = torch.full((batch_size, max(prompt_lens)), fill_value=self.PAD_ID, device=self.device, dtype=torch.int) + for i in range(batch_size): + input_ids[i][:prompt_lens[i]] = batch['input_ids'][i][:prompt_lens[i]] + + #print("New batch shape", input_ids.shape) + #print("Continuation lengths:", continuation_lens) + #print("Prompt:", input_ids) + #print("Input shape:", input_ids.shape) + #print("Input lengths:", input_lengths) + with torch.no_grad(): + self.decoder.setup(batch_size, + input_lengths.max().item(), + max(continuation_lens)) + output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) + torch.cuda.synchronize() + + output_logits_list = output_dict['generation_logits'] + #print("Output logits list", output_logits_list) + # print("Output ids:", output_dict['output_ids'][0][0][cont_idxs[0]:].tolist()) + + # output_logits_list length is == max(continuation_lens) + # Output logits_list[i] is of shape (batch_size, vocab_size) + if len(output_logits_list) > 0: + #print("Shape:", output_logits_list[0].shape) + output_logits_tensor = torch.stack(output_logits_list, dim=1) + else: + output_logits_tensor = None + + #if output_logits_tensor is not None: + 
#print("Output logits tensor shape:", output_logits_tensor.shape) + + # Put together logits + # We loop through batch_size dimension rather than deal with NestedTensor + output_logits_batch = [] + for i in range(batch_size): + # First create context "logits" (one-hot vector with 1 at token position) + tokens = input_ids[i].tolist() + context_psuedologits = torch.nn.functional.one_hot( + torch.tensor(tokens[1:prompt_lens[i]], device=self.device), + num_classes=self.vocab_size) + # Then add generation logits (up to continuation_length) + if output_logits_tensor is not None: + output_logits_trimmed = output_logits_tensor[i][:continuation_lens[i]] + # print("Output logits trimmed shape:", output_logits_trimmed.shape) + combined_logits = torch.cat([context_psuedologits, output_logits_trimmed]) + else: + combined_logits = context_psuedologits + # Then pad with Padding token "logits" to end of sequence length + padding = torch.nn.functional.one_hot( + torch.full( + (seqlen - combined_logits.shape[0],), + self.PAD_ID, + device=self.device + ), + num_classes=self.vocab_size) + padded_combined_logits = torch.cat([combined_logits, padding]) + output_logits_batch.append(padded_combined_logits) + + return torch.stack(output_logits_batch).to(self.device) + ############################################### + # NON BATCHED VERSION # Language modeling and multiple choice tasks + """ for tokens, cont_idxs in zip(batch['input_ids'], batch['continuation_indices']): - + # print("******************************") seqlen = tokens.shape[0] tokens = tokens.tolist() + # print("Tokens:", tokens) # print("Continuation indices:", cont_idxs) cont_idxs = cont_idxs.tolist() - if len(cont_idxs) > 1: - expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] - prompt = tokens[:cont_idxs[0]] - else: - prompt = tokens - expected_cont_tokens = [tokens[-1]] + expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] + # print("Expected continuation tokens:", expected_cont_tokens) + prompt = tokens[:cont_idxs[0]] input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device=self.device) - # print("prompt:", self.tokenizer.decode(prompt)) + # print("*** PROMPT:", self.tokenizer.decode(prompt)) # print("Input device:", input_ids.get_device()) - #print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) - #print("Input lengths:", input_lengths) - #print(cont_idxs[0]) + # print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) + # print("Input lengths:", input_lengths) #print("Expected continuation tokens:", len(expected_cont_tokens)) with torch.no_grad(): self.decoder.setup(input_lengths.size(0), @@ -306,23 +421,22 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): torch.cuda.synchronize() - context_logits = output_dict['context_logits'] - context_logits = context_logits.squeeze() + context_psuedologits = torch.nn.functional.one_hot( + torch.tensor(tokens[1:cont_idxs[0]], device=self.device), + num_classes=self.vocab_size) output_logits_list = output_dict['generation_logits'] # print("Output ids:", output_dict['output_ids'][0][0][cont_idxs[0]:].tolist()) for i in range(len(output_logits_list)): output_logits_list[i] = output_logits_list[i].squeeze() - # print("Output ids:", self.tokenizer.decode(output_dict['output_ids'][0][0].tolist())) - # print("Context logits:", context_logits.shape) - # print("Output logits list:", output_logits_list) + # print("*** Output string:", 
self.tokenizer.decode(output_dict['output_ids'][0][0][cont_idxs[0]:].tolist())) + #print("Context logits:", context_psuedologits.shape) if len(output_logits_list) > 0: - # print("Output logits 0 shape:", output_logits_list[0].shape) output_logits_tensor = torch.stack(output_logits_list) # print("Output logits stacked:", output_logits_tensor.shape) - combined_logits = torch.cat([context_logits, output_logits_tensor]) + combined_logits = torch.cat([context_psuedologits, output_logits_tensor]) else: - combined_logits = context_logits - + combined_logits = context_psuedologits + #print("Seqlen", seqlen) # print("Combined logits shape:", combined_logits.shape) padding = torch.nn.functional.one_hot( @@ -341,3 +455,4 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): return torch.stack(output_logits_batch).to(self.device) #(batch['input_ids'].device) #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) + """ \ No newline at end of file diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index e36e08575b..12f8c631f6 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -28,6 +28,16 @@ build_tokenizer) from llmfoundry.utils.config_utils import (log_config, pop_config, process_init_device) +try: + import tensorrt_llm + TRTLLM = True + if tensorrt_llm.mpi_world_size() > 1: + TRTLLM_MULTIGPU = True + else: + TRTLLM_MULTIGPU = False +except: + TRTLLM = False + TRTLLM_MULTIGPU = False log = logging.getLogger(__name__) @@ -97,7 +107,7 @@ def evaluate_model( icl_seq_len=max_seq_len, icl_subset_num_batches=icl_subset_num_batches, ) - + callbacks = [] if eval_gauntlet_callback is not None: callbacks.append(eval_gauntlet_callback) @@ -153,6 +163,9 @@ def evaluate_model( log.info(f'Building trainer for {model_cfg.model_name}...') + #if TRTLLM_MULTIGPU and tensorrt_llm.mpi_rank() > 0: + # loggers = None + trainer = Trainer( run_name=run_name, seed=seed, @@ -272,7 +285,9 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: ) reproducibility.seed_all(seed) - dist.initialize_dist(get_device(None), timeout=dist_timeout) + + if not TRTLLM_MULTIGPU: + dist.initialize_dist(get_device(None), timeout=dist_timeout) if python_log_level is not None: logging.basicConfig( diff --git a/scripts/eval/eval_trt_multigpu.py b/scripts/eval/eval_trt_multigpu.py index 1a63fac90c..6d850dcc3c 100644 --- a/scripts/eval/eval_trt_multigpu.py +++ b/scripts/eval/eval_trt_multigpu.py @@ -132,8 +132,8 @@ def evaluate_model( assert composer_model is not None - if tensorrt_llm.mpi_rank() > 0: - loggers = None + # if tensorrt_llm.mpi_rank() > 0: + # loggers = None trainer = Trainer( run_name=run_name, From 86de60198005f05d67e98fe62b7763de92c728f4 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Sun, 24 Mar 2024 21:35:05 -0700 Subject: [PATCH 16/21] update runner --- scripts/eval/run_trtllm_eval.py | 74 ++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 15 deletions(-) diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index cd53857179..bd4d4c2a30 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -5,6 +5,7 @@ # All this can be written in YAML form. 
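# Illustrative sketch of the YAML round-trip alluded to by the comment above: the config
# dicts built below are interchangeable with YAML via standard OmegaConf APIs
# (`OmegaConf.create` / `OmegaConf.to_yaml`). Field values here are a trimmed toy example,
# not the runner's real configuration.
from omegaconf import OmegaConf

example_cfg = {
    'run_name': 'trtllm-eval',
    'seed': 0,
    'max_seq_len': 2048,
    'device_eval_batch_size': 8,
    'precision': 'amp_bf16',
    'icl_tasks': './eval/yamls/mini_tasks_v0.2.yaml',
}

om_cfg = OmegaConf.create(example_cfg)      # dict -> DictConfig
yaml_text = OmegaConf.to_yaml(om_cfg)       # DictConfig -> YAML text
om_cfg_again = OmegaConf.create(yaml_text)  # YAML text -> equivalent DictConfig
print(yaml_text)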
+#from eval import main as run_evaluation from eval_trt_multigpu import main as run_evaluation from omegaconf import OmegaConf as om from omegaconf import DictConfig @@ -13,12 +14,50 @@ trt_folder_path = '/workspace/TensorRT-LLM/' MINI_TASKS = './eval/yamls/mini_tasks_v0.2.yaml' -QA_MC_TASKS = './eval/yamls/qa_mc_tasks_v0.2.yaml' -ALL_TASKS = './eval/yamls/tasks_v0.2.yaml' -LM_TASKS = './eval/yamls/lm_tasks_v0.2.yaml' +QA_MC_TASKS = './eval/yamls/qa_mc_tasks_v0.3.yaml' +ALL_TASKS = './eval/yamls/tasks_v0.3.yaml' +LM_TASKS = './eval/yamls/lm_tasks_v0.3.yaml' +GAUNTLET = './eval/yamls/eval_gauntlet_v0.3.yaml' + +def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS): + return { + 'run_name': 'trtllm-eval', + 'seed': 0, + 'max_seq_len': 2048, + 'device_eval_batch_size': 8, + 'precision': 'amp_bf16', + 'dist_timeout': 6000, + 'models': + [ + { + 'model_name': 'trtllm/dbrx', + 'model': + { + 'name': 'trtllm', + 'version': 'dbrx', + 'engine_dir': engine_dir, + 'log_level': 'error', + 'eos_token_id': 2, + 'pad_token_id': 2 + }, + 'tokenizer': + { + 'name': tokenizer_name, + } + } + ], + 'icl_tasks': icl_tasks, + 'eval_gauntlet': EVAL_GAUNTLET, + 'loggers': { + 'wandb': { + 'project': 'nik-dbrx-quant-eval' + } + } + } -def get_llama_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): + +def get_llama_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS): return { 'run_name': 'trtllm-eval', 'seed': 0, @@ -46,7 +85,7 @@ def get_llama_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): } ], 'icl_tasks': icl_tasks, - 'eval_gauntlet': './eval/yamls/eval_gauntlet_v0.2.yaml', + 'eval_gauntlet': EVAL_GAUNTLET, 'loggers': { 'wandb': { 'project': 'nik-quant-eval' @@ -58,17 +97,22 @@ def engine_dir_str(model_type, model_dir, variant, ngpus=8): return f"{trt_folder_path}examples/{model_type}/tmp/{model_type}/{model_dir}/trt_engines/{variant}/{ngpus}-gpu" -LLAMA_TOK_DIR = '/workspace/llama-70b-chat-hf/' +LLAMA_TOK_DIR = '/mnt/workdisk/nikhil/llama-70b-chat-hf/' +DBRX_TOK_DIR = '/mnt/workdisk/nikhil/dbrx/03_23_hf_ckpt/' + LLAMA_7B_DIR = '7B-chat-quality-eval' LLAMA_70B_DIR = '70B-chat-quality-eval' +# LLama URLs +# fp8_engine_dir = '/mnt/workdisk/nikhil/engines-quality-eval/llama-2-70b-chat-tp8-fp8' +# llama7b_int8_config = get_llama_config(engine_dir_str('llama', LLAMA_7B_DIR, 'int8_kv_cache_weight_only', 1), LLAMA_TOK_DIR) +#llama70b_fp8_config = get_llama_config(fp8_engine_dir, LLAMA_TOK_DIR) +# llama70b_int8_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'int8_kv_cache_weight_only'), LLAMA_TOK_DIR) +# llama70b_smoothquant_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'sq0.8'), LLAMA_TOK_DIR) +# llama70b_fp16_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'fp16'), LLAMA_TOK_DIR) -llama7b_int8_config = get_llama_config(engine_dir_str('llama', LLAMA_7B_DIR, 'int8_kv_cache_weight_only', 1), LLAMA_TOK_DIR) -llama70b_fp8_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'fp8'), LLAMA_TOK_DIR) -llama70b_int8_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'int8_kv_cache_weight_only'), LLAMA_TOK_DIR) -llama70b_smoothquant_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'sq0.8'), LLAMA_TOK_DIR) -llama70b_fp16_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'fp16'), LLAMA_TOK_DIR) - +dbrx_bf16_engine_dir = '/mnt/workdisk/nikhil/dbrx/03_23_tllm_engine_bf16' +dbrx_bf16_config = get_dbrx_config(dbrx_bf16_engine_dir, DBRX_TOK_DIR) def run_eval(config): print("RUNNING EVAL") 
@@ -77,9 +121,9 @@ def run_eval(config): run_evaluation(om_dict_config) # run_eval(llama7b_int8_config) -run_eval(llama70b_int8_config) +# run_eval(llama70b_int8_config) # run_eval(llama70b_fp16_config) -#run_eval(llama70b_fp8_config) +# run_eval(llama70b_fp8_config) # run_eval(llama70b_smoothquant_config) - +run_eval(dbrx_bf16_config) From 1f3eeb621e898795a39d1184839c9154b54fa0a8 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Sun, 24 Mar 2024 21:45:15 -0700 Subject: [PATCH 17/21] update script --- scripts/eval/run_trtllm_eval.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index bd4d4c2a30..b3f1fca6e9 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -5,8 +5,8 @@ # All this can be written in YAML form. -#from eval import main as run_evaluation -from eval_trt_multigpu import main as run_evaluation +from eval import main as run_evaluation +#from eval_trt_multigpu import main as run_evaluation from omegaconf import OmegaConf as om from omegaconf import DictConfig @@ -17,7 +17,7 @@ QA_MC_TASKS = './eval/yamls/qa_mc_tasks_v0.3.yaml' ALL_TASKS = './eval/yamls/tasks_v0.3.yaml' LM_TASKS = './eval/yamls/lm_tasks_v0.3.yaml' -GAUNTLET = './eval/yamls/eval_gauntlet_v0.3.yaml' +EVAL_GAUNTLET = './eval/yamls/eval_gauntlet_v0.3.yaml' def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS): return { @@ -43,6 +43,10 @@ def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS): 'tokenizer': { 'name': tokenizer_name, + 'kwargs': + { + 'trust_remote_code': 'True' + } } } ], From 8b3a4b10fc5b2988a0dbf3d18d487b426ae321aa Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Sun, 24 Mar 2024 21:56:13 -0700 Subject: [PATCH 18/21] Remove prints --- scripts/eval/run_trtllm_eval.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index b3f1fca6e9..12d25a3668 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -102,7 +102,7 @@ def engine_dir_str(model_type, model_dir, variant, ngpus=8): LLAMA_TOK_DIR = '/mnt/workdisk/nikhil/llama-70b-chat-hf/' -DBRX_TOK_DIR = '/mnt/workdisk/nikhil/dbrx/03_23_hf_ckpt/' +DBRX_TOK_DIR = '/workspace/dbrx/03_23_hf_ckpt/' LLAMA_7B_DIR = '7B-chat-quality-eval' LLAMA_70B_DIR = '70B-chat-quality-eval' @@ -116,7 +116,9 @@ def engine_dir_str(model_type, model_dir, variant, ngpus=8): # llama70b_fp16_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'fp16'), LLAMA_TOK_DIR) dbrx_bf16_engine_dir = '/mnt/workdisk/nikhil/dbrx/03_23_tllm_engine_bf16' +dbrx_int8_engine_dir = '/workspace/dbrx/03_23_tllm_engine_int8' dbrx_bf16_config = get_dbrx_config(dbrx_bf16_engine_dir, DBRX_TOK_DIR) +dbrx_int8_config = get_dbrx_config(dbrx_int8_engine_dir, DBRX_TOK_DIR) def run_eval(config): print("RUNNING EVAL") @@ -129,5 +131,6 @@ def run_eval(config): # run_eval(llama70b_fp16_config) # run_eval(llama70b_fp8_config) # run_eval(llama70b_smoothquant_config) -run_eval(dbrx_bf16_config) +# run_eval(dbrx_bf16_config) +run_eval(dbrx_int8_config) From 8382411d4615d761cf943125ba6a4ae65ac9cd06 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Sun, 24 Mar 2024 22:51:51 -0700 Subject: [PATCH 19/21] update wrappers --- llmfoundry/models/inference_api_wrapper/trtllm.py | 9 +++++---- scripts/eval/run_trtllm_eval.py | 14 +++++++------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git 
a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index efe62fa85e..2ec26ae192 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -217,10 +217,10 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): max_prompt_len = 0 for tokens in batch['input_ids']: prompt = tokens.tolist() - eos_occurence = (tokens == self.END_ID).nonzero(as_tuple=True)[0] + pad_start = (tokens == self.PAD_ID).nonzero(as_tuple=True)[0] end_prompt_idx = len(prompt) - if eos_occurence.shape[0] >= 1: - end_prompt_idx = eos_occurence[0] + if pad_start.shape[0] >= 1: + end_prompt_idx = pad_start[0] prompt_lens.append(end_prompt_idx) if end_prompt_idx > max_prompt_len: max_prompt_len = end_prompt_idx @@ -228,7 +228,8 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): input_ids = torch.narrow(batch['input_ids'], 1, 0, max_prompt_len).to(dtype=torch.int, device=self.device) input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) - print("Prompt:", input_ids) + torch.set_printoptions(threshold=10_000) + print("Prompt0:", input_ids[0]) #print("Input shape:", input_ids.shape) #print("Input lengths:", input_lengths) max_generation_length = 256 diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index 12d25a3668..26d829dd20 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -19,12 +19,12 @@ LM_TASKS = './eval/yamls/lm_tasks_v0.3.yaml' EVAL_GAUNTLET = './eval/yamls/eval_gauntlet_v0.3.yaml' -def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS): +def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): return { 'run_name': 'trtllm-eval', 'seed': 0, 'max_seq_len': 2048, - 'device_eval_batch_size': 8, + 'device_eval_batch_size': 64, 'precision': 'amp_bf16', 'dist_timeout': 6000, 'models': @@ -37,8 +37,8 @@ def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS): 'version': 'dbrx', 'engine_dir': engine_dir, 'log_level': 'error', - 'eos_token_id': 2, - 'pad_token_id': 2 + 'eos_token_id': 100257, + 'pad_token_id': 100277, }, 'tokenizer': { @@ -102,7 +102,7 @@ def engine_dir_str(model_type, model_dir, variant, ngpus=8): LLAMA_TOK_DIR = '/mnt/workdisk/nikhil/llama-70b-chat-hf/' -DBRX_TOK_DIR = '/workspace/dbrx/03_23_hf_ckpt/' +DBRX_TOK_DIR = '/mnt/workdisk/nikhil/dbrx/03_23_hf_ckpt/' LLAMA_7B_DIR = '7B-chat-quality-eval' LLAMA_70B_DIR = '70B-chat-quality-eval' @@ -131,6 +131,6 @@ def run_eval(config): # run_eval(llama70b_fp16_config) # run_eval(llama70b_fp8_config) # run_eval(llama70b_smoothquant_config) -# run_eval(dbrx_bf16_config) -run_eval(dbrx_int8_config) +run_eval(dbrx_bf16_config) +# run_eval(dbrx_int8_config) From a92f7ccd82a9085e208995e2b772325ef74b7296 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Tue, 26 Mar 2024 02:11:21 -0700 Subject: [PATCH 20/21] update wrapper to properly support MC tasks --- .../models/inference_api_wrapper/trtllm.py | 344 +++++++++--------- scripts/eval/run_trtllm_eval.py | 12 +- 2 files changed, 187 insertions(+), 169 deletions(-) diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index 2ec26ae192..ed765c2572 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -211,7 +211,10 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): # Question-answering tasks if 'continuation_indices' not in 
batch: - # Batched version + # Batched version. For some reason + # GSM-8k gives bad outputs when we batch on BF16 version. + # So, we will not batch. + """ batch_size = len(batch['input_ids']) prompt_lens = [] max_prompt_len = 0 @@ -229,10 +232,11 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) torch.set_printoptions(threshold=10_000) - print("Prompt0:", input_ids[0]) + #print("Prompt0:", input_ids[0]) #print("Input shape:", input_ids.shape) #print("Input lengths:", input_lengths) max_generation_length = 256 + torch.cuda.synchronize() with torch.no_grad(): self.decoder.setup(batch_size, input_lengths.max().item(), @@ -242,49 +246,47 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): # input_lengths[:1].max().item(), # batch.get('generation_length', max_generation_length)) #output_dict2 = self.decoder.decode(input_ids[:1,:], input_lengths[:1], self.sampling_config, return_dict=True) + torch.cuda.synchronize() #answer1 = output_dict['output_ids'][0].squeeze()[prompt_lens[0]:prompt_lens[0]+max_generation_length] #answer2 = output_dict2['output_ids'][0].squeeze()[prompt_lens[0]:prompt_lens[0]+max_generation_length] #all_equal = torch.equal(answer1, answer2) - """ - if not all_equal: - print("Prompt:", input_ids[0]) - print("Answer 1:", self.tokenizer.decode(answer1)) - print("Answer 2:", self.tokenizer.decode(answer2)) - print("Shape 1:", answer1.shape) - print("Shape 2", answer2.shape) - difference = answer1 - answer2 - nonzero_indices = difference.nonzero(as_tuple=True) - nonzero = difference[difference.nonzero(as_tuple=True)] - print("EQUAL?", all_equal) - print("Difference:", difference) - print("nonzero indices:", nonzero_indices) - print("Nonzero Elements", nonzero) - quit() - """ + # if not all_equal: + # print("Prompt:", input_ids[0]) + # print("Answer 1:", self.tokenizer.decode(answer1)) + # print("Answer 2:", self.tokenizer.decode(answer2)) + # print("Shape 1:", answer1.shape) + # print("Shape 2", answer2.shape) + # difference = answer1 - answer2 + # nonzero_indices = difference.nonzero(as_tuple=True) + # nonzero = difference[difference.nonzero(as_tuple=True)] + # print("EQUAL?", all_equal) + # print("Difference:", difference) + # print("nonzero indices:", nonzero_indices) + # print("Nonzero Elements", nonzero) + # quit() output_ids = [output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch.get('generation_length', max_generation_length)] for i in range(batch_size)] - #print("Output:", output_ids) + print("Output:", output_ids) decoded_strs = [self.tokenizer.decode(out) for out in output_ids] print("decoded strs:", decoded_strs) return decoded_strs - - # Non-batched version """ + # Non-batched version output_strs = [] for tokens in batch['input_ids']: #print("RAW Tokens:", tokens) seqlen = tokens.shape[0] prompt = tokens.tolist() - eos_occurence = (tokens == 2).nonzero(as_tuple=True)[0] + pad_start = (tokens == self.PAD_ID).nonzero(as_tuple=True)[0] end_prompt_idx = len(prompt) - if eos_occurence.shape[0] >= 1: - end_prompt_idx = eos_occurence[0] + if pad_start.shape[0] >= 1: + end_prompt_idx = pad_start[0] prompt = prompt[:end_prompt_idx] input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device=self.device) - print("prompt:", self.tokenizer.decode(prompt)) + #print("prompt:", self.tokenizer.decode(prompt)) #print("promp tokens:", prompt) #print("Input lengths:", 
input_lengths) #print("Generation Length:", batch['generation_length']) @@ -304,156 +306,172 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): #print("Shape:", output_dict['output_ids'].shape) decoded_str = self.tokenizer.decode(output_dict['output_ids'][0][0].tolist()[len(prompt):]) output_strs.append(decoded_str) - print("Decoded OUTPUT:", decoded_str) + #print("Decoded OUTPUT:", decoded_str) #print("-------------") #print("Output ids:", output_dict['output_ids'][0][0].tolist()) return output_strs - """ - ################# - # Batched version of language modeling/multiple choice tasks - batch_size = len(batch['input_ids']) - seqlen = batch['input_ids'].shape[1] - #print("Seq len:", seqlen) - prompt_lens = [] - continuation_lens = [] - for tokens, cont_idxs in zip(batch['input_ids'], - batch['continuation_indices']): - tokens = tokens.tolist() - cont_idxs = cont_idxs.tolist() - expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] - prompt = tokens[:cont_idxs[0]] - prompt_lens.append(cont_idxs[0]) - continuation_lens.append(len(expected_cont_tokens)) - - input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) - input_ids = torch.full((batch_size, max(prompt_lens)), fill_value=self.PAD_ID, device=self.device, dtype=torch.int) - for i in range(batch_size): - input_ids[i][:prompt_lens[i]] = batch['input_ids'][i][:prompt_lens[i]] - - #print("New batch shape", input_ids.shape) - #print("Continuation lengths:", continuation_lens) - #print("Prompt:", input_ids) - #print("Input shape:", input_ids.shape) - #print("Input lengths:", input_lengths) - with torch.no_grad(): - self.decoder.setup(batch_size, - input_lengths.max().item(), - max(continuation_lens)) - output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) - torch.cuda.synchronize() - output_logits_list = output_dict['generation_logits'] - #print("Output logits list", output_logits_list) - # print("Output ids:", output_dict['output_ids'][0][0][cont_idxs[0]:].tolist()) - - # output_logits_list length is == max(continuation_lens) - # Output logits_list[i] is of shape (batch_size, vocab_size) - if len(output_logits_list) > 0: - #print("Shape:", output_logits_list[0].shape) - output_logits_tensor = torch.stack(output_logits_list, dim=1) + elif 'gold_indices' in batch: + # Multiple choice tasks + batch_size = len(batch['input_ids']) + prompt_lens = [] + for tokens, cont_idxs in zip(batch['input_ids'], + batch['continuation_indices']): + tokens = tokens.tolist() + cont_idxs = cont_idxs.tolist() + prompt = tokens[:cont_idxs[-1] + 1] + prompt_lens.append(cont_idxs[-1] + 1) + + input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) + input_ids = torch.full((batch_size, max(prompt_lens)), fill_value=self.PAD_ID, device=self.device, dtype=torch.int) + for i in range(batch_size): + #print("Prompt:", self.tokenizer.decode(batch['input_ids'][i][:prompt_lens[i]])) + input_ids[i][:prompt_lens[i]] = batch['input_ids'][i][:prompt_lens[i]] + with torch.no_grad(): + self.decoder.setup(batch_size, + input_lengths.max().item(), + 1) + output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) + torch.cuda.synchronize() + #print(output_dict.keys()) + logits = output_dict['context_logits'] + return logits else: - output_logits_tensor = None + ################# + # Batched version of language modeling tasks + batch_size = len(batch['input_ids']) + seqlen = batch['input_ids'].shape[1] + #print("Seq len:", seqlen) 
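# Illustrative sketch of the length bookkeeping performed in the batched path below:
# for each sequence, the prompt is everything before the first continuation index, the
# expected continuation runs through the last continuation index (inclusive), and prompts
# are right-padded with the pad id out to the longest prompt in the batch. Toy values;
# PAD_ID stands in for the tokenizer's real pad token id.
import torch

PAD_ID = 0
toy_input_ids = torch.tensor([
    [11, 12, 13, 14, 15, 16, 0, 0],
    [21, 22, 23, 24, 25, 26, 27, 28],
])
toy_continuation_indices = [torch.tensor([4, 5]), torch.tensor([5, 6, 7])]

prompt_lens, continuation_lens = [], []
for tokens, cont_idxs in zip(toy_input_ids, toy_continuation_indices):
    cont_idxs = cont_idxs.tolist()
    prompt_lens.append(cont_idxs[0])                            # prompt length
    continuation_lens.append(cont_idxs[-1] + 1 - cont_idxs[0])  # continuation length

padded = torch.full((len(prompt_lens), max(prompt_lens)), PAD_ID, dtype=torch.int32)
for i, plen in enumerate(prompt_lens):
    padded[i, :plen] = toy_input_ids[i, :plen]

print(prompt_lens, continuation_lens)  # [4, 5] [2, 3]
print(padded)                          # rows right-padded with PAD_ID to length 5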
+ prompt_lens = [] + continuation_lens = [] + for tokens, cont_idxs in zip(batch['input_ids'], + batch['continuation_indices']): + tokens = tokens.tolist() + cont_idxs = cont_idxs.tolist() + expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] + prompt = tokens[:cont_idxs[0]] + prompt_lens.append(cont_idxs[0]) + continuation_lens.append(len(expected_cont_tokens)) - #if output_logits_tensor is not None: - #print("Output logits tensor shape:", output_logits_tensor.shape) - - # Put together logits - # We loop through batch_size dimension rather than deal with NestedTensor - output_logits_batch = [] - for i in range(batch_size): - # First create context "logits" (one-hot vector with 1 at token position) - tokens = input_ids[i].tolist() - context_psuedologits = torch.nn.functional.one_hot( - torch.tensor(tokens[1:prompt_lens[i]], device=self.device), - num_classes=self.vocab_size) - # Then add generation logits (up to continuation_length) - if output_logits_tensor is not None: - output_logits_trimmed = output_logits_tensor[i][:continuation_lens[i]] - # print("Output logits trimmed shape:", output_logits_trimmed.shape) - combined_logits = torch.cat([context_psuedologits, output_logits_trimmed]) - else: - combined_logits = context_psuedologits - # Then pad with Padding token "logits" to end of sequence length - padding = torch.nn.functional.one_hot( - torch.full( - (seqlen - combined_logits.shape[0],), - self.PAD_ID, - device=self.device - ), - num_classes=self.vocab_size) - padded_combined_logits = torch.cat([combined_logits, padding]) - output_logits_batch.append(padded_combined_logits) - - return torch.stack(output_logits_batch).to(self.device) - ############################################### - # NON BATCHED VERSION - # Language modeling and multiple choice tasks - """ - for tokens, cont_idxs in zip(batch['input_ids'], - batch['continuation_indices']): - # print("******************************") - seqlen = tokens.shape[0] - tokens = tokens.tolist() - # print("Tokens:", tokens) - # print("Continuation indices:", cont_idxs) - cont_idxs = cont_idxs.tolist() - expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] - # print("Expected continuation tokens:", expected_cont_tokens) - prompt = tokens[:cont_idxs[0]] + input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) + input_ids = torch.full((batch_size, max(prompt_lens)), fill_value=self.PAD_ID, device=self.device, dtype=torch.int) + for i in range(batch_size): + input_ids[i][:prompt_lens[i]] = batch['input_ids'][i][:prompt_lens[i]] - - input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) - input_lengths = torch.tensor([input_ids.size(1)], - dtype=torch.int, - device=self.device) - # print("*** PROMPT:", self.tokenizer.decode(prompt)) - # print("Input device:", input_ids.get_device()) - # print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) - # print("Input lengths:", input_lengths) - #print("Expected continuation tokens:", len(expected_cont_tokens)) with torch.no_grad(): - self.decoder.setup(input_lengths.size(0), - torch.max(input_lengths).item(), - len(expected_cont_tokens)) - - output_dict = self.decoder.decode( - input_ids, input_lengths, self.sampling_config, return_dict=True) - + self.decoder.setup(batch_size, + input_lengths.max().item(), + max(continuation_lens)) + output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) torch.cuda.synchronize() - context_psuedologits = torch.nn.functional.one_hot( - 
torch.tensor(tokens[1:cont_idxs[0]], device=self.device), - num_classes=self.vocab_size) output_logits_list = output_dict['generation_logits'] - # print("Output ids:", output_dict['output_ids'][0][0][cont_idxs[0]:].tolist()) - for i in range(len(output_logits_list)): - output_logits_list[i] = output_logits_list[i].squeeze() - # print("*** Output string:", self.tokenizer.decode(output_dict['output_ids'][0][0][cont_idxs[0]:].tolist())) - #print("Context logits:", context_psuedologits.shape) + #print("Output ids:", self.tokenizer.decode(output_dict['output_ids'][0][0][cont_idxs[0]:].tolist())) + + # output_logits_list length is == max(continuation_lens) + # Output logits_list[i] is of shape (batch_size, vocab_size) if len(output_logits_list) > 0: - output_logits_tensor = torch.stack(output_logits_list) - # print("Output logits stacked:", output_logits_tensor.shape) - combined_logits = torch.cat([context_psuedologits, output_logits_tensor]) + output_logits_tensor = torch.stack(output_logits_list, dim=1) else: - combined_logits = context_psuedologits - #print("Seqlen", seqlen) - # print("Combined logits shape:", combined_logits.shape) - - padding = torch.nn.functional.one_hot( - torch.full( - (seqlen - combined_logits.shape[0],), - self.PAD_ID, - device=combined_logits.device - ), - num_classes=self.vocab_size) - padded_combined_logits = torch.cat([combined_logits, padding]) - - # print("Padded combined logits shape:", padded_combined_logits.shape) - - output_logits_batch.append(padded_combined_logits) - - return torch.stack(output_logits_batch).to(self.device) #(batch['input_ids'].device) + output_logits_tensor = None + + # Put together logits + # We loop through batch_size dimension rather than deal with NestedTensor + output_logits_batch = [] + for i in range(batch_size): + # First create context "logits" (one-hot vector with 1 at token position) + tokens = input_ids[i].tolist() + context_psuedologits = torch.nn.functional.one_hot( + torch.tensor(tokens[1:prompt_lens[i]], device=self.device), + num_classes=self.vocab_size) + # Then add generation logits (up to continuation_length) + if output_logits_tensor is not None: + output_logits_trimmed = output_logits_tensor[i][:continuation_lens[i]] + # print("Output logits trimmed shape:", output_logits_trimmed.shape) + combined_logits = torch.cat([context_psuedologits, output_logits_trimmed]) + else: + combined_logits = context_psuedologits + # Then pad with Padding token "logits" to end of sequence length + padding = torch.nn.functional.one_hot( + torch.full( + (seqlen - combined_logits.shape[0],), + self.PAD_ID, + device=self.device + ), + num_classes=self.vocab_size) + padded_combined_logits = torch.cat([combined_logits, padding]) + output_logits_batch.append(padded_combined_logits) + + return torch.stack(output_logits_batch).to(self.device) + ############################################### + # NON BATCHED VERSION + # Language modeling and multiple choice tasks + """ + for tokens, cont_idxs in zip(batch['input_ids'], + batch['continuation_indices']): + # print("******************************") + seqlen = tokens.shape[0] + tokens = tokens.tolist() + # print("Tokens:", tokens) + # print("Continuation indices:", cont_idxs) + cont_idxs = cont_idxs.tolist() + expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] + # print("Expected continuation tokens:", expected_cont_tokens) + prompt = tokens[:cont_idxs[0]] - #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) - """ \ No newline at end of file + + 
input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) + input_lengths = torch.tensor([input_ids.size(1)], + dtype=torch.int, + device=self.device) + # print("*** PROMPT:", self.tokenizer.decode(prompt)) + # print("Input device:", input_ids.get_device()) + # print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) + # print("Input lengths:", input_lengths) + #print("Expected continuation tokens:", len(expected_cont_tokens)) + with torch.no_grad(): + self.decoder.setup(input_lengths.size(0), + torch.max(input_lengths).item(), + len(expected_cont_tokens)) + + output_dict = self.decoder.decode( + input_ids, input_lengths, self.sampling_config, return_dict=True) + + torch.cuda.synchronize() + + context_psuedologits = torch.nn.functional.one_hot( + torch.tensor(tokens[1:cont_idxs[0]], device=self.device), + num_classes=self.vocab_size) + output_logits_list = output_dict['generation_logits'] + # print("Output ids:", output_dict['output_ids'][0][0][cont_idxs[0]:].tolist()) + for i in range(len(output_logits_list)): + output_logits_list[i] = output_logits_list[i].squeeze() + # print("*** Output string:", self.tokenizer.decode(output_dict['output_ids'][0][0][cont_idxs[0]:].tolist())) + #print("Context logits:", context_psuedologits.shape) + if len(output_logits_list) > 0: + output_logits_tensor = torch.stack(output_logits_list) + # print("Output logits stacked:", output_logits_tensor.shape) + combined_logits = torch.cat([context_psuedologits, output_logits_tensor]) + else: + combined_logits = context_psuedologits + #print("Seqlen", seqlen) + # print("Combined logits shape:", combined_logits.shape) + + padding = torch.nn.functional.one_hot( + torch.full( + (seqlen - combined_logits.shape[0],), + self.PAD_ID, + device=combined_logits.device + ), + num_classes=self.vocab_size) + padded_combined_logits = torch.cat([combined_logits, padding]) + + # print("Padded combined logits shape:", padded_combined_logits.shape) + + output_logits_batch.append(padded_combined_logits) + + return torch.stack(output_logits_batch).to(self.device) #(batch['input_ids'].device) + + #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) + """ diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index 26d829dd20..aa91e1b941 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -24,7 +24,7 @@ def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): 'run_name': 'trtllm-eval', 'seed': 0, 'max_seq_len': 2048, - 'device_eval_batch_size': 64, + 'device_eval_batch_size': 8, 'precision': 'amp_bf16', 'dist_timeout': 6000, 'models': @@ -102,7 +102,7 @@ def engine_dir_str(model_type, model_dir, variant, ngpus=8): LLAMA_TOK_DIR = '/mnt/workdisk/nikhil/llama-70b-chat-hf/' -DBRX_TOK_DIR = '/mnt/workdisk/nikhil/dbrx/03_23_hf_ckpt/' +DBRX_TOK_DIR = '/mnt/workdisk/nikhil/dbrx/03_25_hf_ckpt/' LLAMA_7B_DIR = '7B-chat-quality-eval' LLAMA_70B_DIR = '70B-chat-quality-eval' @@ -115,8 +115,8 @@ def engine_dir_str(model_type, model_dir, variant, ngpus=8): # llama70b_smoothquant_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'sq0.8'), LLAMA_TOK_DIR) # llama70b_fp16_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'fp16'), LLAMA_TOK_DIR) -dbrx_bf16_engine_dir = '/mnt/workdisk/nikhil/dbrx/03_23_tllm_engine_bf16' -dbrx_int8_engine_dir = '/workspace/dbrx/03_23_tllm_engine_int8' +dbrx_bf16_engine_dir = '/workspace/dbrx/03_25_tllm_engine_bf16_all_logits' +dbrx_int8_engine_dir = 
'/mnt/workdisk/nikhil/dbrx/03_25_tllm_engine_int8_all_logits' dbrx_bf16_config = get_dbrx_config(dbrx_bf16_engine_dir, DBRX_TOK_DIR) dbrx_int8_config = get_dbrx_config(dbrx_int8_engine_dir, DBRX_TOK_DIR) @@ -131,6 +131,6 @@ def run_eval(config): # run_eval(llama70b_fp16_config) # run_eval(llama70b_fp8_config) # run_eval(llama70b_smoothquant_config) -run_eval(dbrx_bf16_config) -# run_eval(dbrx_int8_config) +# run_eval(dbrx_bf16_config) +run_eval(dbrx_int8_config) From db3afef2fb3f9ccd5bd23e770737655ed7df9ce7 Mon Sep 17 00:00:00 2001 From: nik-mosaic Date: Mon, 15 Jul 2024 21:14:35 -0700 Subject: [PATCH 21/21] Update TRT wrapper and imports --- .../models/inference_api_wrapper/trtllm.py | 349 +++++++----------- scripts/eval/run_trtllm_eval.py | 58 ++- setup.py | 2 +- 3 files changed, 155 insertions(+), 254 deletions(-) diff --git a/llmfoundry/models/inference_api_wrapper/trtllm.py b/llmfoundry/models/inference_api_wrapper/trtllm.py index ed765c2572..87a81600c9 100644 --- a/llmfoundry/models/inference_api_wrapper/trtllm.py +++ b/llmfoundry/models/inference_api_wrapper/trtllm.py @@ -48,7 +48,7 @@ def __init__( tokenizer: PreTrainedTokenizer, ): check_if_trt_llm_installed() - + # Only print on rank 0 if tensorrt_llm.mpi_rank() != 0: f = open(os.devnull, 'w') sys.stdout = f @@ -85,11 +85,6 @@ def __init__( use_gpt_attention_plugin = bool(plugin_config['gpt_attention_plugin']) remove_input_padding = plugin_config['remove_input_padding'] - if remove_input_padding: - raise ValueError( - 'TRT-LLM Evaluation Wrapper does not support remove_input_padding.' - ) - num_kv_heads = build_config.get('num_key_value_heads', num_heads) paged_kv_cache = plugin_config['paged_kv_cache'] tokens_per_block = plugin_config['tokens_per_block'] @@ -165,10 +160,9 @@ def __init__( self.decoder = tensorrt_llm.runtime.GenerationSession( model_config, engine_buffer, runtime_mapping, debug_mode=False) - print("!!! Initialized generation session for rank:", runtime_rank) + print("!!! Initialized generation session for rank:", runtime_rank) torch.cuda.synchronize() - def rebatch(self, batch): """ Move tensors in batch to the correct GPU. @@ -182,213 +176,200 @@ def rebatch(self, batch): elif isinstance(batch, list): return [self.rebatch(b) for b in batch] return batch - - # Remove potential additional dim, cast to int32 - """ - batch_input_ids = [ - x.flatten().type(torch.int32) for x in batch_input_ids - ] - input_lengths = [x.size(0) for x in batch_input_ids] - max_length = max(input_lengths) - # Right padding for trt-llm - paddings = [ - torch.ones(max_length - l, dtype=torch.int32, device=self.device) * pad_id - for l in input_lengths - ] - batch_input_ids = [ - torch.cat([x, pad]) for x, pad in zip(batch_input_ids, paddings) - ] - batch_input_ids = torch.stack(batch_input_ids).to(device=self.device) - input_lengths = torch.tensor(input_lengths, dtype=torch.int32, device=self.device) - return batch_input_ids, input_lengths - """ def eval_forward(self, batch, outputs: Optional[Any] = None): - # Run TRTLLM forward pass + # Run TRT-LLM Forward Pass without any input padding output_logits_batch = [] batch = self.rebatch(batch) + batch_size = len(batch['input_ids']) + prompt_lens = [] + unpadded_input_ids_list = [] - # Question-answering tasks if 'continuation_indices' not in batch: - # Batched version. For some reason - # GSM-8k gives bad outputs when we batch on BF16 version. - # So, we will not batch. 
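# Illustrative sketch of the padding-free input layout the reworked eval_forward below
# builds once the engine is compiled with remove_input_padding: prompts are stripped of
# trailing pad/eos tokens, concatenated end to end into one flat int32 tensor, and paired
# with a per-sequence length tensor. Token ids are toy values; the exact GenerationSession
# call signatures follow the surrounding code, not this sketch.
import torch

toy_prompts = [
    [101, 7592, 2088],            # length 3
    [101, 2129, 2024, 2017, 29],  # length 5
]

flat_ids = torch.tensor([tok for prompt in toy_prompts for tok in prompt], dtype=torch.int32)
input_lengths = torch.tensor([len(p) for p in toy_prompts], dtype=torch.int32)

print(flat_ids)       # 8 token ids back to back, no pad tokens materialized
print(input_lengths)  # tensor([3, 5], dtype=torch.int32)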
- """ - batch_size = len(batch['input_ids']) - prompt_lens = [] + # Question-answering tasks max_prompt_len = 0 for tokens in batch['input_ids']: - prompt = tokens.tolist() pad_start = (tokens == self.PAD_ID).nonzero(as_tuple=True)[0] - end_prompt_idx = len(prompt) + eos_start = (tokens == self.END_ID).nonzero(as_tuple=True)[0] + end_prompt_idx = len(tokens.tolist()) if pad_start.shape[0] >= 1: end_prompt_idx = pad_start[0] + if eos_start.shape[0] >= 1 and eos_start[0] < end_prompt_idx: + end_prompt_idx = eos_start[0] prompt_lens.append(end_prompt_idx) if end_prompt_idx > max_prompt_len: max_prompt_len = end_prompt_idx + + for i in range(batch_size): + #print("Prompt:\n", self.tokenizer.decode(batch['input_ids'][i][:prompt_lens[i]].tolist())) + unpadded_input_ids_list += batch['input_ids'][i][:prompt_lens[i]].tolist() - input_ids = torch.narrow(batch['input_ids'], 1, 0, max_prompt_len).to(dtype=torch.int, device=self.device) + unpadded_input_ids = torch.tensor(unpadded_input_ids_list, dtype=torch.int, device=self.device) input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) - - torch.set_printoptions(threshold=10_000) - #print("Prompt0:", input_ids[0]) - #print("Input shape:", input_ids.shape) - #print("Input lengths:", input_lengths) - max_generation_length = 256 + + MAX_GEN_LEN = 256 + max_generation_length = batch.get('generation_length', MAX_GEN_LEN) torch.cuda.synchronize() with torch.no_grad(): self.decoder.setup(batch_size, input_lengths.max().item(), - batch.get('generation_length', max_generation_length)) - output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) - #self.decoder.setup(1, - # input_lengths[:1].max().item(), - # batch.get('generation_length', max_generation_length)) - #output_dict2 = self.decoder.decode(input_ids[:1,:], input_lengths[:1], self.sampling_config, return_dict=True) + max_generation_length) + output_dict = self.decoder.decode(unpadded_input_ids, input_lengths, self.sampling_config, return_dict=True) torch.cuda.synchronize() - #answer1 = output_dict['output_ids'][0].squeeze()[prompt_lens[0]:prompt_lens[0]+max_generation_length] - #answer2 = output_dict2['output_ids'][0].squeeze()[prompt_lens[0]:prompt_lens[0]+max_generation_length] - #all_equal = torch.equal(answer1, answer2) - # if not all_equal: - # print("Prompt:", input_ids[0]) - # print("Answer 1:", self.tokenizer.decode(answer1)) - # print("Answer 2:", self.tokenizer.decode(answer2)) - # print("Shape 1:", answer1.shape) - # print("Shape 2", answer2.shape) - # difference = answer1 - answer2 - # nonzero_indices = difference.nonzero(as_tuple=True) - # nonzero = difference[difference.nonzero(as_tuple=True)] - # print("EQUAL?", all_equal) - # print("Difference:", difference) - # print("nonzero indices:", nonzero_indices) - # print("Nonzero Elements", nonzero) - # quit() - output_ids = [output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+batch.get('generation_length', max_generation_length)] for i in range(batch_size)] - - print("Output:", output_ids) - + output_ids = [output_dict['output_ids'][i][0].tolist()[prompt_lens[i]:prompt_lens[i]+max_generation_length] for i in range(batch_size)] decoded_strs = [self.tokenizer.decode(out) for out in output_ids] - print("decoded strs:", decoded_strs) + # print("Output string:", decoded_strs) return decoded_strs + elif 'gold_indices' in batch: + # Multiple choice tasks + seqlen = batch['input_ids'].shape[1] + """" + Generate one-step at a time """ - # Non-batched version - 
output_strs = [] - for tokens in batch['input_ids']: - #print("RAW Tokens:", tokens) - seqlen = tokens.shape[0] - prompt = tokens.tolist() - pad_start = (tokens == self.PAD_ID).nonzero(as_tuple=True)[0] - end_prompt_idx = len(prompt) - if pad_start.shape[0] >= 1: - end_prompt_idx = pad_start[0] - prompt = prompt[:end_prompt_idx] - input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) - input_lengths = torch.tensor([input_ids.size(1)], dtype=torch.int, device=self.device) - #print("prompt:", self.tokenizer.decode(prompt)) - #print("promp tokens:", prompt) - #print("Input lengths:", input_lengths) - #print("Generation Length:", batch['generation_length']) - #print("Batch keys:", batch.keys()) - torch.cuda.synchronize() - with torch.no_grad(): - self.decoder.setup(batch_size=input_lengths.size(0), - max_context_length=torch.max(input_lengths).item(), - max_new_tokens=batch.get('generation_length', 200)) - output_dict = self.decoder.decode( - input_ids, - input_lengths, - self.sampling_config, - #stopping_criteria=batch['stopping_criteria'], - return_dict=True) + prompt_lens = [cont_idxs.tolist()[-1] + 1 for cont_idxs in batch['continuation_indices']] + logits_list = [] + with torch.no_grad(): + for tokens, cont_idxs in zip(batch['input_ids'], batch['continuation_indices']): + cont_idxs = cont_idxs.tolist() + #print("Continuation Indices:", cont_idxs) + #print("Continuation tokens:", self.tokenizer.decode(tokens.tolist()[cont_idxs[0]:cont_idxs[-1] + 1])) + cont_length = cont_idxs[-1] + 1 - cont_idxs[0] + logits = torch.nn.functional.one_hot( + tokens[1:cont_idxs[0]], + num_classes=self.vocab_size, + ).to(device=self.device) + for i in range(cont_length): + # decode one token at a time + self.decoder.setup(1, cont_idxs[0]+i, 1) + output_dict = self.decoder.decode(tokens[:cont_idxs[0]+i].to(dtype=torch.int32, device=self.device), + torch.tensor([cont_idxs[0]+i], dtype=torch.int, device=self.device), + self.sampling_config, + return_dict=True) + next_logit_tensor = torch.squeeze(output_dict['generation_logits'][0]) + #print("Decoded output:\n", self.tokenizer.decode(output_dict['output_ids'][0].squeeze())) + # append next logit to logits tensor + logits = torch.cat([logits, next_logit_tensor.reshape(1, -1)]) + + padding = torch.nn.functional.one_hot( + torch.full((max(prompt_lens) - logits.shape[0],), self.PAD_ID), + num_classes=self.vocab_size, + ).to(device=next_logit_tensor.device) + logits = torch.cat([logits, padding]) + logits_list.append(logits) - #print("Shape:", output_dict['output_ids'].shape) - decoded_str = self.tokenizer.decode(output_dict['output_ids'][0][0].tolist()[len(prompt):]) - output_strs.append(decoded_str) - #print("Decoded OUTPUT:", decoded_str) - #print("-------------") - #print("Output ids:", output_dict['output_ids'][0][0].tolist()) - return output_strs - + return torch.stack(logits_list).to(device=self.device, dtype=torch.float) + """ + Normal (context logits) version + """ + torch.cuda.synchronize() + continuation_starts = [cont_idxs.tolist()[0] for cont_idxs in batch['continuation_indices']] + prompt_lens = [cont_idxs.tolist()[-1] + 1 for cont_idxs in batch['continuation_indices']] - elif 'gold_indices' in batch: - # Multiple choice tasks - batch_size = len(batch['input_ids']) - prompt_lens = [] - for tokens, cont_idxs in zip(batch['input_ids'], - batch['continuation_indices']): - tokens = tokens.tolist() - cont_idxs = cont_idxs.tolist() - prompt = tokens[:cont_idxs[-1] + 1] - prompt_lens.append(cont_idxs[-1] + 1) - - input_lengths = 
torch.tensor(prompt_lens, dtype=torch.int, device=self.device) - input_ids = torch.full((batch_size, max(prompt_lens)), fill_value=self.PAD_ID, device=self.device, dtype=torch.int) - for i in range(batch_size): - #print("Prompt:", self.tokenizer.decode(batch['input_ids'][i][:prompt_lens[i]])) - input_ids[i][:prompt_lens[i]] = batch['input_ids'][i][:prompt_lens[i]] + logits_list = [] with torch.no_grad(): + # Batched version: + # Doesn't work because TRT-LLM has bug with batched context logits. + """ + for i in range(batch_size): + #print("Prompt:", self.tokenizer.decode(batch['input_ids'][i][:prompt_lens[i]].tolist())) + unpadded_input_ids_list += batch['input_ids'][i][:prompt_lens[i]].tolist() + + unpadded_input_ids = torch.tensor(unpadded_input_ids_list, dtype=torch.int, device=self.device) + input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) self.decoder.setup(batch_size, input_lengths.max().item(), 1) - output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) - torch.cuda.synchronize() - #print(output_dict.keys()) - logits = output_dict['context_logits'] + output_dict = self.decoder.decode(unpadded_input_ids, input_lengths, self.sampling_config, return_dict=True) + logits = output_dict['context_logits'] + """ + # Unbatched version + for tokens, cont_idxs in zip(batch['input_ids'], batch['continuation_indices']): + # Tensorrt-LLM Input must be int32 tensor, not int64 tensor! + prompt_len = cont_idxs.tolist()[-1] + 1 + self.decoder.setup(1, prompt_len, 1) + output_dict = self.decoder.decode(tokens[:prompt_len].to(dtype=torch.int32, device=self.device), + torch.tensor([prompt_len], dtype=torch.int, device=self.device), + self.sampling_config, + return_dict=True) + context_logits = torch.squeeze(output_dict['context_logits']) + prompt_psuedologits = torch.nn.functional.one_hot( + tokens[1:cont_idxs[0]], + num_classes=self.vocab_size) + context_logits = context_logits[cont_idxs[0]:] + context_logits = torch.cat([prompt_psuedologits, context_logits]) + + pad_len = max(prompt_lens) - context_logits.shape[0] + if pad_len != 0: + padding = torch.nn.functional.one_hot( + torch.full((pad_len,), self.PAD_ID), + num_classes=self.vocab_size, + ).to(device=context_logits.device) + context_logits = torch.cat([context_logits, padding]) + + logits_list.append(context_logits) + torch.cuda.synchronize() + + return torch.stack(logits_list).to(device=self.device, dtype=torch.float) + """ + # Batched version + # Context Logits beyond input lengths should be one-hot vectors + # with a one in the padding token position. 
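# Illustrative sketch of the one-hot "pseudologit" trick used throughout this section:
# context positions get a one-hot spike at the token that actually appears there, real
# generation logits cover the continuation, and the tail is filled with one-hot spikes at
# the pad id so every sequence stacks to (seqlen, vocab_size). Vocab size, ids, and
# lengths are toy values.
import torch

vocab_size, seqlen, PAD_ID = 16, 10, 0
context_tokens = torch.tensor([3, 5, 7, 2])     # prompt tokens
generation_logits = torch.randn(3, vocab_size)  # real logits for 3 generated steps

# Positions 1..len(prompt)-1 are "predicted" from the previous token, so the
# pseudologits skip the very first prompt token.
context_pseudologits = torch.nn.functional.one_hot(
    context_tokens[1:], num_classes=vocab_size).float()

combined = torch.cat([context_pseudologits, generation_logits])

padding = torch.nn.functional.one_hot(
    torch.full((seqlen - combined.shape[0],), PAD_ID),
    num_classes=vocab_size).float()
combined = torch.cat([combined, padding])

print(combined.shape)  # torch.Size([10, 16])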
+ """ + for i in range(batch_size): + pad_len = logits.shape[1] - prompt_lens[i] + if pad_len == 0: + continue + padding = torch.nn.functional.one_hot( + torch.full((pad_len,), self.PAD_ID), + num_classes=logits.shape[2], + ) + logits[i,prompt_lens[i]:,:] = padding return logits else: - ################# - # Batched version of language modeling tasks - batch_size = len(batch['input_ids']) + # Language Modeling Tasks seqlen = batch['input_ids'].shape[1] - #print("Seq len:", seqlen) - prompt_lens = [] continuation_lens = [] for tokens, cont_idxs in zip(batch['input_ids'], batch['continuation_indices']): tokens = tokens.tolist() cont_idxs = cont_idxs.tolist() expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] - prompt = tokens[:cont_idxs[0]] prompt_lens.append(cont_idxs[0]) continuation_lens.append(len(expected_cont_tokens)) - input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) - input_ids = torch.full((batch_size, max(prompt_lens)), fill_value=self.PAD_ID, device=self.device, dtype=torch.int) for i in range(batch_size): - input_ids[i][:prompt_lens[i]] = batch['input_ids'][i][:prompt_lens[i]] - + unpadded_input_ids_list += batch['input_ids'][i][:prompt_lens[i]].tolist() + + unpadded_input_ids = torch.tensor(unpadded_input_ids_list, dtype=torch.int, device=self.device) + input_lengths = torch.tensor(prompt_lens, dtype=torch.int, device=self.device) + + torch.cuda.synchronize() with torch.no_grad(): self.decoder.setup(batch_size, input_lengths.max().item(), max(continuation_lens)) - output_dict = self.decoder.decode(input_ids, input_lengths, self.sampling_config, return_dict=True) + output_dict = self.decoder.decode(unpadded_input_ids, input_lengths, self.sampling_config, return_dict=True) torch.cuda.synchronize() output_logits_list = output_dict['generation_logits'] - #print("Output ids:", self.tokenizer.decode(output_dict['output_ids'][0][0][cont_idxs[0]:].tolist())) - - # output_logits_list length is == max(continuation_lens) - # Output logits_list[i] is of shape (batch_size, vocab_size) if len(output_logits_list) > 0: output_logits_tensor = torch.stack(output_logits_list, dim=1) else: output_logits_tensor = None # Put together logits - # We loop through batch_size dimension rather than deal with NestedTensor output_logits_batch = [] for i in range(batch_size): + prior_data = 0 if i == 0 else sum(prompt_lens[:i]) # First create context "logits" (one-hot vector with 1 at token position) - tokens = input_ids[i].tolist() context_psuedologits = torch.nn.functional.one_hot( - torch.tensor(tokens[1:prompt_lens[i]], device=self.device), + torch.tensor(unpadded_input_ids_list[prior_data+1:prior_data+prompt_lens[i]], device=self.device), num_classes=self.vocab_size) # Then add generation logits (up to continuation_length) if output_logits_tensor is not None: output_logits_trimmed = output_logits_tensor[i][:continuation_lens[i]] - # print("Output logits trimmed shape:", output_logits_trimmed.shape) combined_logits = torch.cat([context_psuedologits, output_logits_trimmed]) else: combined_logits = context_psuedologits @@ -404,74 +385,4 @@ def eval_forward(self, batch, outputs: Optional[Any] = None): output_logits_batch.append(padded_combined_logits) return torch.stack(output_logits_batch).to(self.device) - ############################################### - # NON BATCHED VERSION - # Language modeling and multiple choice tasks - """ - for tokens, cont_idxs in zip(batch['input_ids'], - batch['continuation_indices']): - # print("******************************") - seqlen 
= tokens.shape[0] - tokens = tokens.tolist() - # print("Tokens:", tokens) - # print("Continuation indices:", cont_idxs) - cont_idxs = cont_idxs.tolist() - expected_cont_tokens = tokens[cont_idxs[0]:cont_idxs[-1] + 1] - # print("Expected continuation tokens:", expected_cont_tokens) - prompt = tokens[:cont_idxs[0]] - - input_ids = torch.tensor([prompt], dtype=torch.int, device=self.device) - input_lengths = torch.tensor([input_ids.size(1)], - dtype=torch.int, - device=self.device) - # print("*** PROMPT:", self.tokenizer.decode(prompt)) - # print("Input device:", input_ids.get_device()) - # print("Input ids data:", input_ids, len(input_ids), input_ids[0].shape) - # print("Input lengths:", input_lengths) - #print("Expected continuation tokens:", len(expected_cont_tokens)) - with torch.no_grad(): - self.decoder.setup(input_lengths.size(0), - torch.max(input_lengths).item(), - len(expected_cont_tokens)) - - output_dict = self.decoder.decode( - input_ids, input_lengths, self.sampling_config, return_dict=True) - - torch.cuda.synchronize() - - context_psuedologits = torch.nn.functional.one_hot( - torch.tensor(tokens[1:cont_idxs[0]], device=self.device), - num_classes=self.vocab_size) - output_logits_list = output_dict['generation_logits'] - # print("Output ids:", output_dict['output_ids'][0][0][cont_idxs[0]:].tolist()) - for i in range(len(output_logits_list)): - output_logits_list[i] = output_logits_list[i].squeeze() - # print("*** Output string:", self.tokenizer.decode(output_dict['output_ids'][0][0][cont_idxs[0]:].tolist())) - #print("Context logits:", context_psuedologits.shape) - if len(output_logits_list) > 0: - output_logits_tensor = torch.stack(output_logits_list) - # print("Output logits stacked:", output_logits_tensor.shape) - combined_logits = torch.cat([context_psuedologits, output_logits_tensor]) - else: - combined_logits = context_psuedologits - #print("Seqlen", seqlen) - # print("Combined logits shape:", combined_logits.shape) - - padding = torch.nn.functional.one_hot( - torch.full( - (seqlen - combined_logits.shape[0],), - self.PAD_ID, - device=combined_logits.device - ), - num_classes=self.vocab_size) - padded_combined_logits = torch.cat([combined_logits, padding]) - - # print("Padded combined logits shape:", padded_combined_logits.shape) - - output_logits_batch.append(padded_combined_logits) - - return torch.stack(output_logits_batch).to(self.device) #(batch['input_ids'].device) - - #print("Decoded output:", self.tokenizer.decode(output_ids[0][0][cont_idxs[0]:].tolist())) - """ diff --git a/scripts/eval/run_trtllm_eval.py b/scripts/eval/run_trtllm_eval.py index aa91e1b941..39fc9eb29e 100644 --- a/scripts/eval/run_trtllm_eval.py +++ b/scripts/eval/run_trtllm_eval.py @@ -17,6 +17,7 @@ QA_MC_TASKS = './eval/yamls/qa_mc_tasks_v0.3.yaml' ALL_TASKS = './eval/yamls/tasks_v0.3.yaml' LM_TASKS = './eval/yamls/lm_tasks_v0.3.yaml' +BROKEN_TASKS = './eval/yamls/broken_tasks.yaml' EVAL_GAUNTLET = './eval/yamls/eval_gauntlet_v0.3.yaml' def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): @@ -24,7 +25,7 @@ def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): 'run_name': 'trtllm-eval', 'seed': 0, 'max_seq_len': 2048, - 'device_eval_batch_size': 8, + 'device_eval_batch_size': 4, 'precision': 'amp_bf16', 'dist_timeout': 6000, 'models': @@ -37,8 +38,6 @@ def get_dbrx_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS): 'version': 'dbrx', 'engine_dir': engine_dir, 'log_level': 'error', - 'eos_token_id': 100257, - 'pad_token_id': 100277, }, 'tokenizer': { @@ 
@@ -61,13 +60,13 @@ def get_llama_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS):
 
-def get_llama_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS):
+def get_llama_config(engine_dir, tokenizer_name, icl_tasks=MINI_TASKS):
     return {
         'run_name': 'trtllm-eval',
         'seed': 0,
-        'max_seq_len': 2048,
-        'device_eval_batch_size': 8, # Llama-7B should be batch size 32
-        'precision': 'amp_fp16',
+        'max_seq_len': 1024,
+        'device_eval_batch_size': 4, # Llama-7B should be batch size 32
+        'precision': 'amp_bf16',
         'dist_timeout': 6000,
         'models':
         [
@@ -79,20 +78,21 @@ def get_llama_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS):
                 'version': 'llama',
                 'engine_dir': engine_dir,
                 'log_level': 'error',
-                'eos_token_id': 2,
-                'pad_token_id': 2
+                'end_token_id': 128009,
+                'pad_token_id': 128001,
+
             },
             'tokenizer': {
                 'name': tokenizer_name,
-            }
+            }
         }
     ],
     'icl_tasks': icl_tasks,
     'eval_gauntlet': EVAL_GAUNTLET,
     'loggers': {
         'wandb': {
-            'project': 'nik-quant-eval'
+            'project': 'nik-llama3-eval'
         }
     }
 }
@@ -100,25 +100,20 @@ def get_llama_config(engine_dir, tokenizer_name, icl_tasks=ALL_TASKS):
 def engine_dir_str(model_type, model_dir, variant, ngpus=8):
     return f"{trt_folder_path}examples/{model_type}/tmp/{model_type}/{model_dir}/trt_engines/{variant}/{ngpus}-gpu"
 
+LLAMA_TOK_DIR = '/mnt/workdisk/nikhil/models/llama3-70b-instruct-hf/'
+DBRX_TOK_DIR = '/mnt/workdisk/nikhil/models/dbrx-hf/03_25_hf_ckpt/'
 
-LLAMA_TOK_DIR = '/mnt/workdisk/nikhil/llama-70b-chat-hf/'
-DBRX_TOK_DIR = '/mnt/workdisk/nikhil/dbrx/03_25_hf_ckpt/'
+# LLama URLs
+#llama_bf16_engine_dir = '/mnt/workdisk/nikhil/engines/llama3_70b_bf16_logits_v0.10'
+#llama_fp8_engine_dir = '/mnt/workdisk/nikhil/engines/llama3_70b_fp8_logits_0521/'
+llama_fp8_engine_dir = '/mnt/workdisk/nikhil/engines/llama3_70b_fp8_logits_v2_v0.10'
+llama70b_config = get_llama_config(llama_fp8_engine_dir, LLAMA_TOK_DIR)
 
-LLAMA_7B_DIR = '7B-chat-quality-eval'
-LLAMA_70B_DIR = '70B-chat-quality-eval'
+#dbrx_bf16_engine_dir = '/mnt/workdisk/nikhil/engines/dbrx_bf16_logits_0521/'
+#dbrx_bf16_config = get_dbrx_config(dbrx_bf16_engine_dir, DBRX_TOK_DIR)
 
-# LLama URLs
-# fp8_engine_dir = '/mnt/workdisk/nikhil/engines-quality-eval/llama-2-70b-chat-tp8-fp8'
-# llama7b_int8_config = get_llama_config(engine_dir_str('llama', LLAMA_7B_DIR, 'int8_kv_cache_weight_only', 1), LLAMA_TOK_DIR)
-#llama70b_fp8_config = get_llama_config(fp8_engine_dir, LLAMA_TOK_DIR)
-# llama70b_int8_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'int8_kv_cache_weight_only'), LLAMA_TOK_DIR)
-# llama70b_smoothquant_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'sq0.8'), LLAMA_TOK_DIR)
-# llama70b_fp16_config = get_llama_config(engine_dir_str('llama', LLAMA_70B_DIR, 'fp16'), LLAMA_TOK_DIR)
-
-dbrx_bf16_engine_dir = '/workspace/dbrx/03_25_tllm_engine_bf16_all_logits'
-dbrx_int8_engine_dir = '/mnt/workdisk/nikhil/dbrx/03_25_tllm_engine_int8_all_logits'
-dbrx_bf16_config = get_dbrx_config(dbrx_bf16_engine_dir, DBRX_TOK_DIR)
-dbrx_int8_config = get_dbrx_config(dbrx_int8_engine_dir, DBRX_TOK_DIR)
+dbrx_fp8_engine_dir = '/mnt/workdisk/nikhil/engines/dbrx_fp8_logits_v2_v0.10'
+dbrx_fp8_config = get_dbrx_config(dbrx_fp8_engine_dir, DBRX_TOK_DIR)
 
 def run_eval(config):
     print("RUNNING EVAL")
@@ -126,11 +121,6 @@ def run_eval(config):
     print("OmegaConfig dictionary", om.to_yaml(om_dict_config))
     run_evaluation(om_dict_config)
 
-# run_eval(llama7b_int8_config)
-# run_eval(llama70b_int8_config)
-# run_eval(llama70b_fp16_config)
-# run_eval(llama70b_fp8_config)
-# run_eval(llama70b_smoothquant_config)
-# run_eval(dbrx_bf16_config)
-run_eval(dbrx_int8_config)
+# run_eval(dbrx_fp8_config)
+run_eval(llama70b_config)
diff --git a/setup.py b/setup.py
index 7534d24503..d8dfddb058 100644
--- a/setup.py
+++ b/setup.py
@@ -55,7 +55,7 @@
     'accelerate>=0.25,<0.26',  # for HF inference `device_map`
     'transformers>=4.38.2,<4.39',
     'mosaicml-streaming>=0.7.4,<0.8',
-    'torch>=2.2.1,<2.3',
+    #'torch>=2.2.1,<2.3',
     'datasets>=2.16,<2.17',
     'fsspec==2023.6.0',  # newer version results in a bug in datasets that duplicates data
     'sentencepiece==0.1.97',
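Note (editorial sketch, not part of the patch): the batched language-modeling path in eval_forward above assembles each sample's logits from three pieces: one-hot "pseudologits" for the prompt tokens (shifted by one position), the decoder's real generation logits for the continuation, and one-hot PAD rows to fill the batch's sequence length. The standalone Python sketch below only illustrates that assembly in isolation; the helper name, arguments, and toy sizes are invented for the example and are not taken from the TRT-LLM API or from this patch.

# Illustrative sketch of the per-sample logits assembly (hypothetical helper, toy shapes).
import torch

def assemble_logits(prompt_tokens, gen_logits, seqlen, vocab_size, pad_id):
    # One-hot rows for the prompt, dropping the first token (mirrors tokens[1:prompt_len] above).
    context = torch.nn.functional.one_hot(prompt_tokens[1:],
                                          num_classes=vocab_size).float()
    # Append the real generation logits for the continuation steps, if any.
    combined = torch.cat([context, gen_logits]) if gen_logits is not None else context
    # Pad with one-hot PAD rows so every sample reaches the same sequence length.
    pad_rows = seqlen - combined.shape[0]
    if pad_rows > 0:
        padding = torch.nn.functional.one_hot(
            torch.full((pad_rows,), pad_id), num_classes=vocab_size).float()
        combined = torch.cat([combined, padding])
    return combined

# Toy usage: 4 prompt tokens, 3 continuation steps of real logits, padded to seqlen=10.
prompt = torch.tensor([5, 3, 7, 2])
gen = torch.randn(3, 16)
print(assemble_logits(prompt, gen, seqlen=10, vocab_size=16, pad_id=0).shape)
# -> torch.Size([10, 16])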