From 521aff74e0dad4cf86918082664b654a2e4490d3 Mon Sep 17 00:00:00 2001
From: Michal Moskal
Date: Thu, 21 Nov 2024 21:52:14 +0000
Subject: [PATCH] allow overriding n_vocab; support phi 3/3.5

---
 README.md                   | 16 +++++++++++++++-
 llgtrt/src/config_info.json |  6 ++++++
 llgtrt/src/tokenizer.rs     | 10 +++++++++-
 scripts/trtbld.sh           |  3 ++-
 4 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 2c6bb62..1c5cfcf 100644
--- a/README.md
+++ b/README.md
@@ -121,6 +121,20 @@ The paths to `llgtrt.json5` and `chat_template.j2` are controlled by command lin
 
 You can specify multiple JSON5 config files, and they will be merged in the order specified (with later ones overriding earlier ones). This way, you can separate configuration for the tokenizer, runtime, and guidance parser.
 
+### Running phi-3
+
+The phi-3 tokenizer, while based on llama2, is slightly different.
+Drop the following `llgtrt.json5` file in the engine folder:
+
+```json5
+{
+  "tokenizer": {
+    "bos_token": null,
+    "n_vocab_override": 32064
+  }
+}
+```
+
 ## Development
 
 First, build the Docker container to be used in the dev container. If you have already followed the steps above, you can skip this. Otherwise, run `./docker/build.sh`.
@@ -134,7 +148,7 @@ The basic structure of the server borrows inspiration from [npuichigo/openai_trt
 ## TODO
 
 - [ ] multi-LoRA?
-- [ ] test phi-3.5
+- [x] test phi-3.5
 - [ ] multi-modal input
 - [ ] when streaming, and stop is set, we need to buffer the output so as not to return the stop sequence itself
 - [ ] logprobs (currently only work with TP>1; TRTLLM bug?)
diff --git a/llgtrt/src/config_info.json b/llgtrt/src/config_info.json
index 22a41f5..c822397 100644
--- a/llgtrt/src/config_info.json
+++ b/llgtrt/src/config_info.json
@@ -31,6 +31,9 @@
     "#": "Tokenizer configuration (defaults to tokenizer_config.json contents)\nTypically no changes are needed here, except for chat_template\nwhich is best overridden with --chat-template filename.j2 option.",
     "json_start_token": {
       "#": "This is <|python_tag|> for Llama 3 models."
+    },
+    "n_vocab_override": {
+      "#": "Use to override tokenizer vocabulary size.\nUse 32064 for phi3."
     }
   },
   "llguidance": {
@@ -46,6 +49,9 @@
     "step_lexer_fuel": {
       "#": "Maximum lexer fuel for computation of the whole token mask.\nDefault: 500_000 (~10ms)"
     },
+    "step_max_items": {
+      "#": "Number of Earley items created for the whole token mask.\nDefault: 100_000 (~3ms)"
+    },
     "max_lexer_states": {
       "#": "Maximum number of lexer states.\nDefault: 10_000"
     },
diff --git a/llgtrt/src/tokenizer.rs b/llgtrt/src/tokenizer.rs
index 1d5ae8c..3a0b9a3 100644
--- a/llgtrt/src/tokenizer.rs
+++ b/llgtrt/src/tokenizer.rs
@@ -33,6 +33,10 @@ pub struct TokenizerConfig {
 
     /// This is <|python_tag|> for Llama 3 models.
     pub json_start_token: Option<String>,
+
+    /// Use to override tokenizer vocabulary size.
+    /// Use 32064 for phi3.
+    pub n_vocab_override: Option<usize>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -64,6 +68,7 @@ impl Default for TokenizerConfig {
             pad_token: None,
             cls_token: None,
             mask_token: None,
+            n_vocab_override: None,
         }
     }
 }
@@ -76,7 +81,10 @@ pub fn setup_tokenizer(
 
     let tokenizer = format!("{}/tokenizer.json", tokenizer_folder);
     log::info!("Loading tokenizer from {:?}", tokenizer);
-    let tok_env = toktrie_hf_tokenizers::ByteTokenizerEnv::from_name(&tokenizer, None)?;
+    let tok_env = toktrie_hf_tokenizers::ByteTokenizerEnv::from_name(
+        &tokenizer,
+        config.tokenizer.n_vocab_override,
+    )?;
     let tok_env: TokEnv = Arc::new(tok_env);
     let trie = tok_env.tok_trie();
     let mut info = trie.info().clone();
diff --git a/scripts/trtbld.sh b/scripts/trtbld.sh
index c5317a0..b3a6bc3 100755
--- a/scripts/trtbld.sh
+++ b/scripts/trtbld.sh
@@ -6,7 +6,8 @@ SELF=./scripts/trtbld.sh
 CACHE=${CACHE:-/root/trt-cache}
 MODEL=${MODEL:-Meta-Llama-3.1-8B-Instruct}
 
-LLAMA_EXAMPLE=$(pwd)/TensorRT-LLM/examples/llama
+MODEL_TYPE=${MODEL_TYPE:-llama}
+LLAMA_EXAMPLE=$(pwd)/TensorRT-LLM/examples/$MODEL_TYPE
 MODEL_SRC=$CACHE/$MODEL-hf
 CKPT=$CACHE/$MODEL-ckpt
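
Usage sketch (not part of the patch): with the `MODEL_TYPE` override added to `scripts/trtbld.sh`, a phi-3.5 build might be driven roughly as below. The checkpoint directory name, the `examples/phi` directory in TensorRT-LLM, the engine output path, and the argument-less invocation of `trtbld.sh` are assumptions here, not facts taken from this diff; adjust them to your checkout.

```bash
# Assumed checkpoint layout: /root/trt-cache/Phi-3.5-mini-instruct-hf
# (to match MODEL_SRC=$CACHE/$MODEL-hf in the script).
MODEL=Phi-3.5-mini-instruct MODEL_TYPE=phi ./scripts/trtbld.sh

# Then drop the tokenizer override from the README into the engine folder
# (the exact engine path is an assumption; use wherever trtbld.sh wrote the engine):
cat > /root/trt-cache/Phi-3.5-mini-instruct-engine/llgtrt.json5 <<'EOF'
{
  "tokenizer": {
    "bos_token": null,
    "n_vocab_override": 32064
  }
}
EOF
```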