diff --git a/tokenizer_ts/package-lock.json b/tokenizer_ts/package-lock.json index d180ff9..1cf40e9 100644 --- a/tokenizer_ts/package-lock.json +++ b/tokenizer_ts/package-lock.json @@ -1,12 +1,12 @@ { "name": "@microsoft/tiktokenizer", - "version": "1.0.6", + "version": "1.0.8", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@microsoft/tiktokenizer", - "version": "1.0.6", + "version": "1.0.8", "license": "MIT", "devDependencies": { "@types/mocha": "^5.2.7", diff --git a/tokenizer_ts/package.json b/tokenizer_ts/package.json index 310d6fe..55ddcf0 100644 --- a/tokenizer_ts/package.json +++ b/tokenizer_ts/package.json @@ -2,7 +2,7 @@ "name": "@microsoft/tiktokenizer", "displayName": "tiktokenizer", "description": "Tokenizer for OpenAI large language models.", - "version": "1.0.7", + "version": "1.0.8", "author": { "name": "Microsoft Corporation" }, diff --git a/tokenizer_ts/src/tikTokenizer.ts b/tokenizer_ts/src/tikTokenizer.ts index 0d85fbb..3f49603 100644 --- a/tokenizer_ts/src/tikTokenizer.ts +++ b/tokenizer_ts/src/tikTokenizer.ts @@ -73,19 +73,19 @@ export class TikTokenizer { * Take the encoder tokens mapping from OpenAI tiktoken dump to build the encoder * For gpt-3.5-turbo/gpt4, you can download the BPE tokens mapping from: * https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken - * @param tikTokenBpeFile BPE rank file path + * @param tikTokenBpeFileOrDict BPE rank file path or parsed dictionary * @param specialTokensEncoder special tokens encoder * @param regexPattern regex pattern to split the input text * @param cacheSize cache size */ constructor( - tikTokenBpeFile: string, + tikTokenBpeFileOrDict: string | Map<Uint8Array, number>, specialTokensEncoder: ReadonlyMap<string, number>, regexPattern: string, cacheSize: number = 8192 ) { this.cache = new LRUCache<string, number[]>(cacheSize); - const bpeDict = loadTikTokenBpe(tikTokenBpeFile); + const bpeDict = typeof tikTokenBpeFileOrDict === 'string' ? 
loadTikTokenBpe(tikTokenBpeFileOrDict) : tikTokenBpeFileOrDict; this.init(bpeDict, specialTokensEncoder, regexPattern); } diff --git a/tokenizer_ts/src/tokenizerBuilder.ts b/tokenizer_ts/src/tokenizerBuilder.ts index 42cb4be..5d4b156 100644 --- a/tokenizer_ts/src/tokenizerBuilder.ts +++ b/tokenizer_ts/src/tokenizerBuilder.ts @@ -285,20 +285,20 @@ export async function createByEncoderName( /** * Create a tokenizer from a file - * @param tikTokenBpeFile BPE rank file in tiktoken format + * @param tikTokenBpeFileOrDict BPE rank file in tiktoken format or parsed dictionary * @param specialTokensEncoder special tokens mapping * @param regexPattern regex pattern * @param cacheSize cache size * @returns TikTokenizer tokenizer */ export function createTokenizer( - tikTokenBpeFile: string, + tikTokenBpeFileOrDict: string | Map<Uint8Array, number>, specialTokensEncoder: ReadonlyMap<string, number>, regexPattern: string, cacheSize: number = 8192 ): TikTokenizer { const tikTokenizer = new TikTokenizer( - tikTokenBpeFile, + tikTokenBpeFileOrDict, specialTokensEncoder, regexPattern, cacheSize