allow passing in a parsed dictionary to the tokenizer
I would like to compress the tiktoken dictionary in Copilot Chat, as the
dictionaries are pretty big (especially with 200k). This change lets us parse
the file ourselves and pass the resulting dictionary in directly, instead of
having to supply the file in its standard format.
connor4312 committed Jul 3, 2024
1 parent 9cad244 commit e874685
Showing 2 changed files with 6 additions and 6 deletions.
6 changes: 3 additions & 3 deletions tokenizer_ts/src/tikTokenizer.ts
@@ -73,19 +73,19 @@ export class TikTokenizer {
    * Take the encoder tokens mapping from OpenAI tiktoken dump to build the encoder
    * For gpt-3.5-turbo/gpt4, you can download the BPE tokens mapping from:
    * https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
-   * @param tikTokenBpeFile BPE rank file path
+   * @param tikTokenBpeFileOrDict BPE rank file path or parsed dictionary
    * @param specialTokensEncoder special tokens encoder
    * @param regexPattern regex pattern to split the input text
    * @param cacheSize cache size
    */
   constructor(
-    tikTokenBpeFile: string,
+    tikTokenBpeFileOrDict: string | Map<Uint8Array, number>,
     specialTokensEncoder: ReadonlyMap<string, number>,
     regexPattern: string,
     cacheSize: number = 8192
   ) {
     this.cache = new LRUCache(cacheSize);
-    const bpeDict = loadTikTokenBpe(tikTokenBpeFile);
+    const bpeDict = typeof tikTokenBpeFileOrDict === 'string' ? loadTikTokenBpe(tikTokenBpeFileOrDict) : tikTokenBpeFileOrDict;
     this.init(bpeDict, specialTokensEncoder, regexPattern);
   }

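To illustrate the new constructor overload above, here is a minimal sketch (not part of the commit) of parsing a standard .tiktoken dump ourselves and handing the resulting dictionary straight to `TikTokenizer`. The special-token map and split pattern below are simplified placeholders, not the real cl100k_base values:

```ts
import * as fs from "fs";
import { TikTokenizer } from "./tikTokenizer";

// Parse a tiktoken-format dump ("<base64 token> <rank>" per line) into the
// Map<Uint8Array, number> shape the constructor now accepts directly.
function parseBpeDump(contents: string): Map<Uint8Array, number> {
  const bpeDict = new Map<Uint8Array, number>();
  for (const line of contents.split("\n")) {
    if (line.trim().length === 0) {
      continue;
    }
    const [token, rank] = line.split(" ");
    bpeDict.set(new Uint8Array(Buffer.from(token, "base64")), Number(rank));
  }
  return bpeDict;
}

// Placeholder special tokens and split pattern, for illustration only; the
// real cl100k_base values are much larger and are unchanged by this commit.
const specialTokens = new Map<string, number>([["<|endoftext|>", 100257]]);
const pattern = "'(?:[sdmt]|ll|ve|re)| ?\\w+| ?[^\\s\\w]+|\\s+";

const bpeDict = parseBpeDump(fs.readFileSync("cl100k_base.tiktoken", "utf8"));
const tokenizer = new TikTokenizer(bpeDict, specialTokens, pattern, 8192);
```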
6 changes: 3 additions & 3 deletions tokenizer_ts/src/tokenizerBuilder.ts
@@ -285,20 +285,20 @@ export async function createByEncoderName(

 /**
  * Create a tokenizer from a file
- * @param tikTokenBpeFile BPE rank file in tiktoken format
+ * @param tikTokenBpeFileOrDict BPE rank file in tiktoken format or parsed dictionary
  * @param specialTokensEncoder special tokens mapping
  * @param regexPattern regex pattern
  * @param cacheSize cache size
  * @returns TikTokenizer tokenizer
  */
 export function createTokenizer(
-  tikTokenBpeFile: string,
+  tikTokenBpeFileOrDict: string | Map<Uint8Array, number>,
   specialTokensEncoder: ReadonlyMap<string, number>,
   regexPattern: string,
   cacheSize: number = 8192
 ): TikTokenizer {
   const tikTokenizer = new TikTokenizer(
-    tikTokenBpeFile,
+    tikTokenBpeFileOrDict,
     specialTokensEncoder,
     regexPattern,
     cacheSize
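The same idea works through the `createTokenizer` factory changed above. In this hedged sketch the dictionary arrives in a compact custom representation (the pair-array format and the placeholder special tokens/regex are hypothetical, not something this commit defines) and is expanded into the expected `Map<Uint8Array, number>` before being passed in:

```ts
import { createTokenizer } from "./tokenizerBuilder";

// Hypothetical compact shipping format: [base64Token, rank] pairs, e.g. the
// output of whatever compression Copilot Chat applies to the dump. Only the
// resulting Map<Uint8Array, number> matters to createTokenizer.
const compactDump: Array<[string, number]> = [
  ["IQ==", 0], // "!"
  ["Ig==", 1], // "\""
  // ...remaining ranks elided
];

const bpeDict = new Map<Uint8Array, number>();
for (const [b64, rank] of compactDump) {
  bpeDict.set(new Uint8Array(Buffer.from(b64, "base64")), rank);
}

// Placeholder special tokens and split pattern, as in the file-based sketch.
const specialTokens = new Map<string, number>([["<|endoftext|>", 100257]]);
const pattern = "'(?:[sdmt]|ll|ve|re)| ?\\w+| ?[^\\s\\w]+|\\s+";

const tokenizer = createTokenizer(bpeDict, specialTokens, pattern, 8192);
```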
