Commit

Add support for gpt-4o (#46)
* Add support for gpt-4o

* Make matching case-insensitive, because JavaScript does not support the (?i: ...) inline modifier for case insensitivity
shengyfu authored May 21, 2024
1 parent 2d96434 commit 9cad244
Showing 4 changed files with 73 additions and 2 deletions.
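The second commit bullet refers to a real ECMAScript limitation: JavaScript regular expressions have no scoped (?i:...) inline modifier, so case insensitivity must either apply to the whole pattern via the "i" flag or be emulated by spelling out the case variants inside the alternation, which is the approach the patterns in this commit take. A minimal sketch of the two options (illustrative identifiers, not code from this commit):

// Whole-pattern case insensitivity: the "i" flag covers everything.
const wholePattern = new RegExp("(?:'s|'t|'re|'ve|'m|'ll|'d)", "iu");
// Scoped case insensitivity, emulated by enumerating the variants by hand,
// as the gpt-4o patterns in this diff do.
const spelledOut = /(?:'s|'S|'t|'T|'d|'D)/u;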
2 changes: 1 addition & 1 deletion tokenizer_ts/package.json
@@ -2,7 +2,7 @@
   "name": "@microsoft/tiktokenizer",
   "displayName": "tiktokenizer",
   "description": "Tokenizer for OpenAI large language models.",
-  "version": "1.0.6",
+  "version": "1.0.7",
   "author": {
     "name": "Microsoft Corporation"
   },
33 changes: 32 additions & 1 deletion tokenizer_ts/src/tokenizerBuilder.ts
@@ -7,12 +7,15 @@ import { TikTokenizer } from "./tikTokenizer";

 const MODEL_PREFIX_TO_ENCODING: ReadonlyMap<string, string> = new Map([
   // chat
+  ["gpt-4o-", "o200k_base"], // e.g., gpt-4o-2024-05-13
   ["gpt-4-", "cl100k_base"], // e.g., gpt-4-0314, etc., plus gpt-4-32k
-  ["gpt-3.5-turbo-", "cl100k_base"] // e.g, gpt-3.5-turbo-0301, -0401, etc.
+  ["gpt-3.5-turbo-", "cl100k_base"], // e.g, gpt-3.5-turbo-0301, -0401, etc.
+  ["gpt-35-turbo-", "cl100k_base"] // Azure deployment name
 ]);
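How the two maps cooperate: a model name is first looked up exactly, and dated variants fall back to a prefix match. A hedged sketch of that resolution (resolveEncoder is a hypothetical helper, not the library's actual lookup, which lives in getEncoderFromModelName below):

function resolveEncoder(modelName: string): string | undefined {
  // Exact names resolve directly, e.g. "gpt-4o" -> "o200k_base".
  const exact = MODEL_TO_ENCODING.get(modelName);
  if (exact !== undefined) return exact;
  // Dated variants resolve by prefix,
  // e.g. "gpt-4o-2024-05-13" -> "o200k_base".
  for (const [prefix, encoding] of MODEL_PREFIX_TO_ENCODING) {
    if (modelName.startsWith(prefix)) return encoding;
  }
  return undefined;
}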

 export const MODEL_TO_ENCODING: ReadonlyMap<string, string> = new Map([
   // chat
+  ["gpt-4o", "o200k_base"],
   ["gpt-4", "cl100k_base"],
   ["gpt-3.5-turbo", "cl100k_base"],
   // text
@@ -71,6 +74,22 @@ const REGEX_PATTERN_1: string =
 const REGEX_PATTERN_2: string =
   "(?:'s|'S|'t|'T|'re|'RE|'Re|'eR|'ve|'VE|'vE|'Ve|'m|'M|'ll|'lL|'Ll|'LL|'d|'D)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
+
+
+/*
+ * regex pattern used for gpt-4o
+ */
+const patterns: string[] = [
+  `[^\r\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'s|'S|'t|'T|'re|'RE|'Re|'eR|'ve|'VE|'vE|'Ve|'m|'M|'ll|'lL|'Ll|'LL|'d|'D)?`,
+  `[^\r\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'s|'S|'t|'T|'re|'RE|'Re|'eR|'ve|'VE|'vE|'Ve|'m|'M|'ll|'lL|'Ll|'LL|'d|'D)?`,
+  `\\p{N}{1,3}`,
+  ` ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*`,
+  `\\s*[\\r\\n]+`,
+  `\\s+(?!\\S)`,
+  `\\s+`,
+];
+
+const REGEX_PATTERN_3: string = patterns.join("|");
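A brief usage sketch (assumed, not part of the diff): the joined alternation is compiled with the Unicode and global flags and applied with matchAll to split text into pre-tokens before byte-pair merging runs.

const pretokenRe = new RegExp(REGEX_PATTERN_3, "gu");
const pieces = Array.from("Hello world 123!".matchAll(pretokenRe), m => m[0]);
// -> ["Hello", " world", " ", "123", "!"]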

 function getEncoderFromModelName(modelName: string): string {
   let encoder = "";
   if (!MODEL_TO_ENCODING.has(modelName)) {
@@ -112,6 +131,12 @@ export function getSpecialTokensByEncoder(
 ): Map<string, number> {
   let specialTokens: Map<string, number> = new Map([[ENDOFTEXT, 50256]]);
   switch (encoder) {
+    case "o200k_base":
+      specialTokens = new Map([
+        [ENDOFTEXT, 199999],
+        [ENDOFPROMPT, 200018]
+      ]);
+      break;
     case "cl100k_base":
       specialTokens = new Map([
         [ENDOFTEXT, 100257],
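A hedged illustration of the new ids (assumed usage, mirroring the test added below): when <|endoftext|> is passed as an allowed special token, it should encode to its reserved o200k_base id rather than being split by the regex.

const tok = await createByModelName("gpt-4o", getSpecialTokensByEncoder("o200k_base"));
const ids = tok.encode("hi<|endoftext|>", ["<|endoftext|>"]);
// The trailing id should be 199999, per the map above.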
@@ -157,6 +182,8 @@ export function getSpecialTokensByModel(
  */
 export function getRegexByEncoder(encoder: string): string {
   switch (encoder) {
+    case "o200k_base":
+      return REGEX_PATTERN_3;
     case "cl100k_base":
       return REGEX_PATTERN_2;
     default:
@@ -208,6 +235,10 @@ export async function createByEncoderName(
   );

   switch (encoderName) {
+    case "o200k_base":
+      regexPattern = REGEX_PATTERN_3;
+      mergeableRanksFileUrl = `https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken`;
+      break;
     case "cl100k_base":
       regexPattern = REGEX_PATTERN_2;
       mergeableRanksFileUrl = `https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken`;
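A usage sketch under stated assumptions (the full parameter list of createByEncoderName is cut off above; a special-token map mirroring createByModelName in the test below is assumed): building a tokenizer by encoder name fetches the o200k_base ranks from the blob URL in this hunk.

const tokenizer = await createByEncoderName(
  "o200k_base",
  getSpecialTokensByEncoder("o200k_base")
);
console.log(tokenizer.encode("hello world", ["<|endoftext|>"]));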
1 change: 1 addition & 0 deletions tokenizer_ts/test/testdata/tokens_gpt_4o.json

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions tokenizer_ts/test/tikTokenizerGpt4o.test.ts
@@ -0,0 +1,39 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+import * as assert from "assert";
+import * as fs from "fs";
+import { suite, before } from "mocha";
+import { createByModelName } from "../src/tokenizerBuilder";
+import { TikTokenizer } from "../src/tikTokenizer";
+const ENDOFTEXT: string = "<|endoftext|>";
+const ENDOFPROMPT: string = "<|endofprompt|>";
+const specialTokens: ReadonlyMap<string, number> = new Map([
+  [ENDOFTEXT, 199999],
+  [ENDOFPROMPT, 200018]
+]);
+
+suite("TikTokenizer gpt-4o Test Suite", function() {
+  let tokenizer_gpt4o: TikTokenizer;
+  before(async () => {
+    tokenizer_gpt4o = await createByModelName("gpt-4o", specialTokens);
+  });
+
+  test("tokenize source code - gpt-4o", done => {
+    const source = fs.readFileSync("test/testdata/lib.rs.txt", "utf8");
+    const filePath = "test/testdata/tokens_gpt_4o.json";
+
+    fs.readFile(filePath, "utf8", (err, data) => {
+      assert.strictEqual(err, null);
+      const jsonArray = JSON.parse(data) as Array<number>;
+      let encoded = tokenizer_gpt4o.encode(
+        source,
+        Array.from(specialTokens.keys())
+      );
+      assert.deepStrictEqual(encoded.length, 5609);
+      assert.deepStrictEqual(encoded, jsonArray);
+      assert.strictEqual(tokenizer_gpt4o.decode(encoded), source);
+      done();
+    });
+  });
+});
