allow overriding n_vocab; support phi 3/3.5
mmoskal committed Nov 21, 2024
1 parent e85be12 commit 521aff7
Showing 4 changed files with 32 additions and 3 deletions.
README.md (16 changes: 15 additions & 1 deletion)
@@ -121,6 +121,20 @@ The paths to `llgtrt.json5` and `chat_template.j2` are controlled by command line

You can specify multiple JSON5 config files, and they will be merged in the order specified (with later ones overriding earlier ones). This way, you can separate configuration for the tokenizer, runtime, and guidance parser.
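As a minimal sketch of that layering (the file names and the split are purely illustrative, and this assumes the merge is applied key by key rather than by whole-file replacement):

```json5
// tok.json5 (hypothetical name), passed first
{
  "tokenizer": { "bos_token": null, "n_vocab_override": 32000 }
}

// overrides.json5 (hypothetical name), passed second: its
// n_vocab_override wins, while bos_token from the first file survives,
// yielding { "tokenizer": { "bos_token": null, "n_vocab_override": 32064 } }
{
  "tokenizer": { "n_vocab_override": 32064 }
}
```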

### Running phi-3

The phi-3 tokenizer, while based on the llama2 tokenizer, is slightly different.
Drop the following `llgtrt.json5` file into the engine folder:

```json5
{
  "tokenizer": {
    "bos_token": null,
    "n_vocab_override": 32064
  }
}
```
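If this reading is correct, 32064 matches phi-3's padded embedding size (its `config.json` reports a larger `vocab_size` than the number of entries in `tokenizer.json`), which is why the explicit override is needed.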

## Development

First, build the Docker container to be used in the dev container. If you have already followed the steps above, you can skip this. Otherwise, run `./docker/build.sh`.
@@ -134,7 +148,7 @@ The basic structure of the server borrows inspiration from [npuichigo/openai_trt
## TODO

- [ ] multi-LoRA?
- [ ] test phi-3.5
- [x] test phi-3.5
- [ ] multi-modal input
- [ ] when streaming, and stop is set, we need to buffer the output so as not to return the stop sequence itself
- [ ] logprobs (currently only work with TP>1; TRTLLM bug?)
llgtrt/src/config_info.json (6 changes: 6 additions & 0 deletions)
@@ -31,6 +31,9 @@
"#": "Tokenizer configuration (defaults to tokenizer_config.json contents)\nTypically no changes are needed here, except for chat_template\nwhich is best overridden with --chat-template filename.j2 option.",
"json_start_token": {
"#": "This is <|python_tag|> for Llama 3 models."
},
"n_vocab_override": {
"#": "Use to override tokenizer vocabulary size.\nUse 32064 for phi3."
}
},
"llguidance": {
@@ -46,6 +49,9 @@
"step_lexer_fuel": {
"#": "Maximum lexer fuel for computation of the whole token mask.\nDefault: 500_000 (~10ms)"
},
"step_max_items": {
"#": "Number of Earley items created for the whole token mask.\nDefault: 100_000 (~3ms)"
},
"max_lexer_states": {
"#": "Maximum number of lexer states.\nDefault: 10_000"
},
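As an illustration, these limits could be lowered with a `llgtrt.json5` fragment like the sketch below (the values are arbitrary, and the exact nesting of the limit keys should be double-checked against the full `config_info.json`):

```json5
{
  "llguidance": {
    "step_lexer_fuel": 250000, // default 500_000 (~10ms)
    "step_max_items": 50000,   // default 100_000 (~3ms)
    "max_lexer_states": 5000   // default 10_000
  }
}
```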
llgtrt/src/tokenizer.rs (10 changes: 9 additions & 1 deletion)
@@ -33,6 +33,10 @@ pub struct TokenizerConfig {

/// This is <|python_tag|> for Llama 3 models.
pub json_start_token: Option<String>,

/// Use to override tokenizer vocabulary size.
/// Use 32064 for phi3.
pub n_vocab_override: Option<usize>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -64,6 +68,7 @@ impl Default for TokenizerConfig {
pad_token: None,
cls_token: None,
mask_token: None,
n_vocab_override: None,
}
}
}
@@ -76,7 +81,10 @@ pub fn setup_tokenizer(

let tokenizer = format!("{}/tokenizer.json", tokenizer_folder);
log::info!("Loading tokenizer from {:?}", tokenizer);
let tok_env = toktrie_hf_tokenizers::ByteTokenizerEnv::from_name(&tokenizer, None)?;
let tok_env = toktrie_hf_tokenizers::ByteTokenizerEnv::from_name(
&tokenizer,
config.tokenizer.n_vocab_override,
)?;
let tok_env: TokEnv = Arc::new(tok_env);
let trie = tok_env.tok_trie();
let mut info = trie.info().clone();
scripts/trtbld.sh (3 changes: 2 additions & 1 deletion)
@@ -6,7 +6,8 @@ SELF=./scripts/trtbld.sh

CACHE=${CACHE:-/root/trt-cache}
MODEL=${MODEL:-Meta-Llama-3.1-8B-Instruct}
LLAMA_EXAMPLE=$(pwd)/TensorRT-LLM/examples/llama
MODEL_TYPE=${MODEL_TYPE:-llama}
LLAMA_EXAMPLE=$(pwd)/TensorRT-LLM/examples/$MODEL_TYPE
MODEL_SRC=$CACHE/$MODEL-hf

CKPT=$CACHE/$MODEL-ckpt
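With `MODEL_TYPE` in place, a phi build might be launched as below; the model name and the `examples/phi` directory are assumptions for illustration, not taken from this commit:

```bash
# Hypothetical invocation: point the script at a phi-3.5 checkout in the cache
# (expects $CACHE/Phi-3.5-mini-instruct-hf) and use TensorRT-LLM's phi example.
CACHE=/root/trt-cache MODEL=Phi-3.5-mini-instruct MODEL_TYPE=phi ./scripts/trtbld.sh
```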