From d0bcf059092f75933e56b80d2258ed5477b2f687 Mon Sep 17 00:00:00 2001
From: Connor Peet
Date: Thu, 11 Jul 2024 11:16:15 -0700
Subject: [PATCH] allow passing in a parsed dictionary to the tokenizer (#49)

* allow passing in a parsed dictionary to the tokenizer

I would like to compress the tiktoken dictionaries in copilot chat, as
they're pretty big (especially with 200k). This allows us to parse the
file ourselves instead of having to pass it in to be read in its
standard format.

* v1.0.8
---
 tokenizer_ts/package-lock.json       | 4 ++--
 tokenizer_ts/package.json            | 2 +-
 tokenizer_ts/src/tikTokenizer.ts     | 6 +++---
 tokenizer_ts/src/tokenizerBuilder.ts | 6 +++---
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tokenizer_ts/package-lock.json b/tokenizer_ts/package-lock.json
index d180ff9..1cf40e9 100644
--- a/tokenizer_ts/package-lock.json
+++ b/tokenizer_ts/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "@microsoft/tiktokenizer",
-  "version": "1.0.6",
+  "version": "1.0.8",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@microsoft/tiktokenizer",
-      "version": "1.0.6",
+      "version": "1.0.8",
       "license": "MIT",
       "devDependencies": {
         "@types/mocha": "^5.2.7",
diff --git a/tokenizer_ts/package.json b/tokenizer_ts/package.json
index 310d6fe..55ddcf0 100644
--- a/tokenizer_ts/package.json
+++ b/tokenizer_ts/package.json
@@ -2,7 +2,7 @@
   "name": "@microsoft/tiktokenizer",
   "displayName": "tiktokenizer",
   "description": "Tokenizer for OpenAI large language models.",
-  "version": "1.0.7",
+  "version": "1.0.8",
   "author": {
     "name": "Microsoft Corporation"
   },
diff --git a/tokenizer_ts/src/tikTokenizer.ts b/tokenizer_ts/src/tikTokenizer.ts
index 0d85fbb..3f49603 100644
--- a/tokenizer_ts/src/tikTokenizer.ts
+++ b/tokenizer_ts/src/tikTokenizer.ts
@@ -73,19 +73,19 @@ export class TikTokenizer {
    * Take the encoder tokens mapping from OpenAI tiktoken dump to build the encoder
    * For gpt-3.5-turbo/gpt4, you can download the BPE tokens mapping from:
    * https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
-   * @param tikTokenBpeFile BPE rank file path
+   * @param tikTokenBpeFileOrDict BPE rank file path or parsed dictionary
    * @param specialTokensEncoder special tokens encoder
    * @param regexPattern regex pattern to split the input text
    * @param cacheSize cache size
    */
   constructor(
-    tikTokenBpeFile: string,
+    tikTokenBpeFileOrDict: string | Map<Uint8Array, number>,
     specialTokensEncoder: ReadonlyMap<string, number>,
     regexPattern: string,
     cacheSize: number = 8192
   ) {
     this.cache = new LRUCache(cacheSize);
-    const bpeDict = loadTikTokenBpe(tikTokenBpeFile);
+    const bpeDict = typeof tikTokenBpeFileOrDict === 'string' ? loadTikTokenBpe(tikTokenBpeFileOrDict) : tikTokenBpeFileOrDict;
     this.init(bpeDict, specialTokensEncoder, regexPattern);
   }
 
diff --git a/tokenizer_ts/src/tokenizerBuilder.ts b/tokenizer_ts/src/tokenizerBuilder.ts
index 42cb4be..5d4b156 100644
--- a/tokenizer_ts/src/tokenizerBuilder.ts
+++ b/tokenizer_ts/src/tokenizerBuilder.ts
@@ -285,20 +285,20 @@ export async function createByEncoderName(
 
 /**
  * Create a tokenizer from a file
- * @param tikTokenBpeFile BPE rank file in tiktoken format
+ * @param tikTokenBpeFileOrDict BPE rank file in tiktoken format or parsed dictionary
  * @param specialTokensEncoder special tokens mapping
  * @param regexPattern regex pattern
  * @param cacheSize cache size
  * @returns TikTokenizer tokenizer
  */
 export function createTokenizer(
-  tikTokenBpeFile: string,
+  tikTokenBpeFileOrDict: string | Map<Uint8Array, number>,
   specialTokensEncoder: ReadonlyMap<string, number>,
   regexPattern: string,
   cacheSize: number = 8192
 ): TikTokenizer {
   const tikTokenizer = new TikTokenizer(
-    tikTokenBpeFile,
+    tikTokenBpeFileOrDict,
     specialTokensEncoder,
     regexPattern,
     cacheSize
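
Usage note: below is a minimal sketch of what this change enables, namely parsing a tiktoken dump by hand (for example after decompressing a compressed copy) and passing the resulting dictionary straight to createTokenizer. It assumes the Map<Uint8Array, number> parameter type shown in the diff and that createTokenizer is exported from the package root; the file path, special-token map, and regex pattern are illustrative placeholders, not values taken from this patch.

```ts
import { readFileSync } from "fs";
import { createTokenizer } from "@microsoft/tiktokenizer";

// Parse a tiktoken dump ourselves instead of handing the library a file
// path. Each non-empty line of the standard format is
// "<base64-encoded token bytes> <rank>".
function parseTikTokenBpe(contents: string): Map<Uint8Array, number> {
  const bpeDict = new Map<Uint8Array, number>();
  for (const line of contents.split("\n")) {
    if (line.trim().length === 0) continue;
    const [tokenBase64, rank] = line.split(" ");
    bpeDict.set(
      new Uint8Array(Buffer.from(tokenBase64, "base64")),
      Number(rank)
    );
  }
  return bpeDict;
}

// Placeholder input: in practice this could be the output of decompressing
// a compressed dictionary, which is the motivating use case in the patch.
const contents = readFileSync("cl100k_base.tiktoken", "utf8");

const tokenizer = createTokenizer(
  parseTikTokenBpe(contents), // a parsed dictionary now works here too
  new Map([["<|endoftext|>", 100257]]), // illustrative special token
  "<model-specific split pattern>", // placeholder regex pattern
  8192
);
```

The same dictionary can equally be passed to the TikTokenizer constructor directly, since both entry points now accept string | Map<Uint8Array, number>.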