allow passing in a parsed dictionary to the tokenizer
I would like to compress the tiktoken dictionary in Copilot Chat, as the
dictionaries are pretty big (especially with 200k). This change lets us parse
the file ourselves and pass the resulting dictionary in directly, instead of
having to supply the file in its standard format.
connor4312 committed Jul 3, 2024
1 parent 9cad244 commit e874685
Showing 2 changed files with 6 additions and 6 deletions.
6 changes: 3 additions & 3 deletions tokenizer_ts/src/tikTokenizer.ts
@@ -73,19 +73,19 @@ export class TikTokenizer {
    * Take the encoder tokens mapping from OpenAI tiktoken dump to build the encoder
    * For gpt-3.5-turbo/gpt4, you can download the BPE tokens mapping from:
    * https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
-   * @param tikTokenBpeFile BPE rank file path
+   * @param tikTokenBpeFileOrDict BPE rank file path or parsed dictionary
    * @param specialTokensEncoder special tokens encoder
    * @param regexPattern regex pattern to split the input text
    * @param cacheSize cache size
    */
   constructor(
-    tikTokenBpeFile: string,
+    tikTokenBpeFileOrDict: string | Map<Uint8Array, number>,
     specialTokensEncoder: ReadonlyMap<string, number>,
     regexPattern: string,
     cacheSize: number = 8192
   ) {
     this.cache = new LRUCache(cacheSize);
-    const bpeDict = loadTikTokenBpe(tikTokenBpeFile);
+    const bpeDict = typeof tikTokenBpeFileOrDict === 'string' ? loadTikTokenBpe(tikTokenBpeFileOrDict) : tikTokenBpeFileOrDict;
     this.init(bpeDict, specialTokensEncoder, regexPattern);
   }

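To illustrate the new constructor overload above, here is a minimal sketch (not part of the commit) of parsing a standard .tiktoken dump ourselves and handing the resulting dictionary straight to `TikTokenizer`. The special-token map and split pattern below are simplified placeholders, not the real cl100k_base values:

```ts
import * as fs from "fs";
import { TikTokenizer } from "./tikTokenizer";

// Parse a tiktoken-format dump ("<base64 token> <rank>" per line) into the
// Map<Uint8Array, number> shape the constructor now accepts directly.
function parseBpeDump(contents: string): Map<Uint8Array, number> {
  const bpeDict = new Map<Uint8Array, number>();
  for (const line of contents.split("\n")) {
    if (line.trim().length === 0) {
      continue;
    }
    const [token, rank] = line.split(" ");
    bpeDict.set(new Uint8Array(Buffer.from(token, "base64")), Number(rank));
  }
  return bpeDict;
}

// Placeholder special tokens and split pattern, for illustration only; the
// real cl100k_base values are much larger and are unchanged by this commit.
const specialTokens = new Map<string, number>([["<|endoftext|>", 100257]]);
const pattern = "'(?:[sdmt]|ll|ve|re)| ?\\w+| ?[^\\s\\w]+|\\s+";

const bpeDict = parseBpeDump(fs.readFileSync("cl100k_base.tiktoken", "utf8"));
const tokenizer = new TikTokenizer(bpeDict, specialTokens, pattern, 8192);
```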
6 changes: 3 additions & 3 deletions tokenizer_ts/src/tokenizerBuilder.ts
@@ -285,20 +285,20 @@ export async function createByEncoderName(

 /**
  * Create a tokenizer from a file
- * @param tikTokenBpeFile BPE rank file in tiktoken format
+ * @param tikTokenBpeFileOrDict BPE rank file in tiktoken format or parsed dictionary
  * @param specialTokensEncoder special tokens mapping
  * @param regexPattern regex pattern
  * @param cacheSize cache size
  * @returns TikTokenizer tokenizer
  */
 export function createTokenizer(
-  tikTokenBpeFile: string,
+  tikTokenBpeFileOrDict: string | Map<Uint8Array, number>,
   specialTokensEncoder: ReadonlyMap<string, number>,
   regexPattern: string,
   cacheSize: number = 8192
 ): TikTokenizer {
   const tikTokenizer = new TikTokenizer(
-    tikTokenBpeFile,
+    tikTokenBpeFileOrDict,
     specialTokensEncoder,
     regexPattern,
     cacheSize
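The same idea works through the `createTokenizer` factory changed above. In this hedged sketch the dictionary arrives in a compact custom representation (the pair-array format and the placeholder special tokens/regex are hypothetical, not something this commit defines) and is expanded into the expected `Map<Uint8Array, number>` before being passed in:

```ts
import { createTokenizer } from "./tokenizerBuilder";

// Hypothetical compact shipping format: [base64Token, rank] pairs, e.g. the
// output of whatever compression Copilot Chat applies to the dump. Only the
// resulting Map<Uint8Array, number> matters to createTokenizer.
const compactDump: Array<[string, number]> = [
  ["IQ==", 0], // "!"
  ["Ig==", 1], // "\""
  // ...remaining ranks elided
];

const bpeDict = new Map<Uint8Array, number>();
for (const [b64, rank] of compactDump) {
  bpeDict.set(new Uint8Array(Buffer.from(b64, "base64")), rank);
}

// Placeholder special tokens and split pattern, as in the file-based sketch.
const specialTokens = new Map<string, number>([["<|endoftext|>", 100257]]);
const pattern = "'(?:[sdmt]|ll|ve|re)| ?\\w+| ?[^\\s\\w]+|\\s+";

const tokenizer = createTokenizer(bpeDict, specialTokens, pattern, 8192);
```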
