
Commit

allow passing in a parsed dictionary to the tokenizer (#49)
* allow passing in a parsed dictionary to the tokenizer

I would like to compress the tiktoken dictionary in Copilot Chat, as
these files are pretty big (especially the 200k one). This change allows
us to parse the file ourselves and pass the result in, instead of having
to provide the file in its standard format for the tokenizer to read.

* v1.0.8
connor4312 authored Jul 11, 2024
1 parent 9cad244 commit d0bcf05
Showing 4 changed files with 9 additions and 9 deletions.
tokenizer_ts/package-lock.json (2 additions & 2 deletions)

Some generated files are not rendered by default.

tokenizer_ts/package.json (1 addition & 1 deletion)
@@ -2,7 +2,7 @@
   "name": "@microsoft/tiktokenizer",
   "displayName": "tiktokenizer",
   "description": "Tokenizer for OpenAI large language models.",
-  "version": "1.0.7",
+  "version": "1.0.8",
   "author": {
     "name": "Microsoft Corporation"
   },
tokenizer_ts/src/tikTokenizer.ts (3 additions & 3 deletions)
@@ -73,19 +73,19 @@ export class TikTokenizer {
    * Take the encoder tokens mapping from OpenAI tiktoken dump to build the encoder
    * For gpt-3.5-turbo/gpt4, you can download the BPE tokens mapping from:
    * https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
-   * @param tikTokenBpeFile BPE rank file path
+   * @param tikTokenBpeFileOrDict BPE rank file path or parsed dictionary
    * @param specialTokensEncoder special tokens encoder
    * @param regexPattern regex pattern to split the input text
    * @param cacheSize cache size
    */
   constructor(
-    tikTokenBpeFile: string,
+    tikTokenBpeFileOrDict: string | Map<Uint8Array, number>,
     specialTokensEncoder: ReadonlyMap<string, number>,
     regexPattern: string,
     cacheSize: number = 8192
   ) {
     this.cache = new LRUCache(cacheSize);
-    const bpeDict = loadTikTokenBpe(tikTokenBpeFile);
+    const bpeDict = typeof tikTokenBpeFileOrDict === 'string' ? loadTikTokenBpe(tikTokenBpeFileOrDict) : tikTokenBpeFileOrDict;
     this.init(bpeDict, specialTokensEncoder, regexPattern);
   }

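For reference, a minimal sketch of what this enables on the consumer side. It assumes a hypothetical gzipped copy of the cl100k dump shipped by the application, a parseTikTokenDump helper written here for illustration (the on-disk format is one "base64-token rank" pair per line), and a split regex supplied by the caller; none of these ship with this commit.

import { gunzipSync } from "node:zlib";
import { readFileSync } from "node:fs";
import { TikTokenizer } from "@microsoft/tiktokenizer";

// Illustrative helper: parse a tiktoken dump ("<base64 token> <rank>" per
// line) into the Map<Uint8Array, number> the constructor now accepts.
function parseTikTokenDump(contents: string): Map<Uint8Array, number> {
  const bpeDict = new Map<Uint8Array, number>();
  for (const line of contents.split("\n")) {
    if (!line.trim()) continue;
    const [tokenBase64, rank] = line.split(" ");
    bpeDict.set(Buffer.from(tokenBase64, "base64"), Number(rank)); // Buffer is a Uint8Array
  }
  return bpeDict;
}

// Hypothetical setup: decompress our own gzipped dump, then hand the
// parsed dictionary straight to the constructor instead of a file path.
function tokenizerFromCompressedDump(gzPath: string, regexPattern: string): TikTokenizer {
  const dump = gunzipSync(readFileSync(gzPath)).toString("utf8");
  return new TikTokenizer(
    parseTikTokenDump(dump),                 // parsed dictionary, not a path
    new Map([["<|endoftext|>", 100257]]),    // cl100k_base end-of-text token
    regexPattern,                            // the model's published split pattern
    8192
  );
}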
tokenizer_ts/src/tokenizerBuilder.ts (3 additions & 3 deletions)
@@ -285,20 +285,20 @@ export async function createByEncoderName(

 /**
  * Create a tokenizer from a file
- * @param tikTokenBpeFile BPE rank file in tiktoken format
+ * @param tikTokenBpeFileOrDict BPE rank file in tiktoken format or parsed dictionary
  * @param specialTokensEncoder special tokens mapping
  * @param regexPattern regex pattern
  * @param cacheSize cache size
  * @returns TikTokenizer tokenizer
  */
 export function createTokenizer(
-  tikTokenBpeFile: string,
+  tikTokenBpeFileOrDict: string | Map<Uint8Array, number>,
   specialTokensEncoder: ReadonlyMap<string, number>,
   regexPattern: string,
   cacheSize: number = 8192
 ): TikTokenizer {
   const tikTokenizer = new TikTokenizer(
-    tikTokenBpeFile,
+    tikTokenBpeFileOrDict,
     specialTokensEncoder,
     regexPattern,
     cacheSize
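createTokenizer simply forwards the same union type to the constructor, so the factory path works identically. A toy sketch follows; the two-entry dictionary, empty special-token map, and split pattern are made up for illustration and far too small to tokenize real text, but they show the shapes involved.

import { createTokenizer } from "@microsoft/tiktokenizer";

const encode = (s: string) => new TextEncoder().encode(s);

// Made-up rank table standing in for a real parsed dump.
const bpeDict = new Map<Uint8Array, number>([
  [encode("a"), 0],
  [encode("b"), 1],
]);

const tokenizer = createTokenizer(
  bpeDict,                      // Map<Uint8Array, number> built by the caller
  new Map<string, number>(),    // no special tokens in this toy example
  "[ab]",                       // toy split pattern; real models use their published regex
  8192
);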
