Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[duplicate] Support for o200k_base and gpt-4o (omni) model #43

Merged
merged 8 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/workflows/dotnet-build-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [windows-latest, ubuntu-latest, macos-latest]
os: [windows-latest, ubuntu-latest]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
Expand All @@ -21,6 +21,10 @@ jobs:
3.1.x
6.0.x
8.0.x
architecture: x64

- name: Log .NET SDK versions
run: dotnet --info

- name: Restore dependencies
run: dotnet restore
Expand Down
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ SharpToken currently supports the following models:
* `p50k_base`
* `p50k_edit`
* `cl100k_base`
* `o200k_base`

You can use any of these models when creating an instance of GptEncoding:

Expand All @@ -86,6 +87,7 @@ var r50kBaseEncoding = GptEncoding.GetEncoding("r50k_base");
var p50kBaseEncoding = GptEncoding.GetEncoding("p50k_base");
var p50kEditEncoding = GptEncoding.GetEncoding("p50k_edit");
var cl100kBaseEncoding = GptEncoding.GetEncoding("cl100k_base");
var o200kBaseEncoding = GptEncoding.GetEncoding("o200k_base");
```

### Model Prefix Matching
Expand All @@ -96,11 +98,13 @@ Here are the current supported prefixes and their corresponding encodings:

| Model Prefix | Encoding |
|---------------------|------------|
| `gpt-4o` | `o200k_base` |
| `gpt-4-` | `cl100k_base` |
| `gpt-3.5-turbo-` | `cl100k_base` |
| `gpt-35-turbo` | `cl100k_base` |

Examples of model names that fall under these prefixes include:
- For the prefix `gpt-4o`: `gpt-4o`, `gpt-4o-2024-05-13`, etc.
- For the prefix `gpt-4-`: `gpt-4-0314`, `gpt-4-32k`, etc.
- For the prefix `gpt-3.5-turbo-`: `gpt-3.5-turbo-0301`, `gpt-3.5-turbo-0401`, etc.
- For the prefix `gpt-35-turbo` (the Azure deployment name): `gpt-35-turbo`, `gpt-35-turbo-16k`, etc.
Expand Down Expand Up @@ -256,7 +260,7 @@ public class CompareBenchmark

return sum;
}

[Benchmark]
public int MLTokenizers()
{
Expand Down
13 changes: 11 additions & 2 deletions SharpToken.Benchmark/CompareBenchmark.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ public class CompareBenchmark
private Tokenizer _mlTokenizer;
private string _kLongText;

[GlobalSetup]
public async Task Setup()
[GlobalSetup] // TODO: move this to SetupO200k?
public async Task SetupCL100k()
{
_sharpToken = GptEncoding.GetEncoding("cl100k_base");
_tikToken = await TikToken.GetEncodingAsync("cl100k_base").ConfigureAwait(false);
Expand All @@ -30,6 +30,15 @@ public async Task Setup()
_kLongText = "King Lear, one of Shakespeare's darkest and most savage plays, tells the story of the foolish and Job-like Lear, who divides his kingdom, as he does his affections, according to vanity and whim. Lear’s failure as a father engulfs himself and his world in turmoil and tragedy.";
}

/// <summary>
/// Points every tokenizer under comparison at the <c>o200k_base</c> encoding
/// (model name <c>gpt-4o</c>) and loads the sample text that is tokenized.
/// </summary>
/// <remarks>
/// NOTE(review): unlike <c>SetupCL100k</c>, no <c>[GlobalSetup]</c> attribute is
/// visible on this method — confirm how it is meant to be invoked.
/// </remarks>
/// <returns>A task that completes once all tokenizer fields have been assigned.</returns>
public async Task SetupO200k()
{
    const string encodingName = "o200k_base";
    const string modelName = "gpt-4o";

    // SharpToken resolves its encoding synchronously.
    _sharpToken = GptEncoding.GetEncoding(encodingName);

    // The two async factories are awaited one after the other, in this order.
    var tikTokenEncoding = await TikToken.GetEncodingAsync(encodingName).ConfigureAwait(false);
    _tikToken = tikTokenEncoding;

    var modelTokenizer = await TokenizerBuilder.CreateByModelNameAsync(modelName).ConfigureAwait(false);
    _tokenizer = modelTokenizer;

    _mlTokenizer = Tokenizer.CreateTiktokenForModel(modelName);

    // Sample input shared by all benchmark methods.
    _kLongText = "King Lear, one of Shakespeare's darkest and most savage plays, tells the story of the foolish and Job-like Lear, who divides his kingdom, as he does his affections, according to vanity and whim. Lear’s failure as a father engulfs himself and his world in turmoil and tragedy.";
}

[Benchmark]
public int SharpToken()
{
Expand Down
3 changes: 1 addition & 2 deletions SharpToken.Tests/SharpToken.Tests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ namespace SharpToken.Tests;

public class Tests
{
private static readonly List<string> ModelsList = new() { "p50k_base", "r50k_base", "cl100k_base" };
private static readonly List<string> ModelsList = new() { "p50k_base", "r50k_base", "cl100k_base", "o200k_base" };

private static readonly List<Tuple<string, string, List<int>>> TestData =
TestHelpers.ReadTestPlans("SharpToken.Tests.data.TestPlans.txt");
Expand Down Expand Up @@ -70,7 +70,6 @@ public async Task TestEncodingAndDecodingInParallel()
}
}


[Test]
public void TestEncodingWithCustomAllowedSet()
{
Expand Down
1 change: 1 addition & 0 deletions SharpToken.Tests/data/TestPlanGenerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def save_test_plans(test_plans, filename):
tiktoken.get_encoding("p50k_base"),
tiktoken.get_encoding("p50k_edit"),
tiktoken.get_encoding("cl100k_base"),
tiktoken.get_encoding("o200k_base"),
]

test_samples = read_test_samples(samples_filename)
Expand Down
259 changes: 259 additions & 0 deletions SharpToken.Tests/data/TestPlans.txt

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions SharpToken/Lib/Internals/ModelParamsGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ public static ModelParams GetModelParams(string encodingName)
case "cl100k_base":
return Cl100KBase();

case "o200k_base":
return O200KBase();

default:
throw new ArgumentException($"Unknown encoding name: {encodingName}");
}
Expand Down Expand Up @@ -119,6 +122,24 @@ private static ModelParams Cl100KBase()
specialTokens: specialTokens
);
}

/// <summary>
/// Assembles the <see cref="ModelParams"/> for the <c>o200k_base</c> encoding.
/// </summary>
/// <returns>
/// Parameters combining the o200k tokenizer regex, the mergeable ranks loaded from
/// the embedded <c>SharpToken.data.o200k_base.tiktoken</c> resource, and the two
/// special tokens registered below.
/// </returns>
private static ModelParams O200KBase()
{
    // Load the rank table first; everything else is cheap by comparison.
    var ranks = EmbeddedResourceReader.LoadTokenBytePairEncoding("SharpToken.data.o200k_base.tiktoken");

    // Special tokens and their fixed ids for this encoding.
    var special = new Dictionary<string, int>();
    special.Add(EndOfText, 199999);
    special.Add(EndOfPrompt, 200018);

    var splitter = ModelParamsGeneratorRegex.RegexO200KBase();

    return new ModelParams(
        tokenizerRegex: splitter,
        mergeableRanks: ranks,
        specialTokens: special);
}
}

internal sealed partial class ModelParamsGeneratorRegex
Expand All @@ -129,10 +150,15 @@ internal sealed partial class ModelParamsGeneratorRegex

[GeneratedRegex(@"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+")]
public static partial Regex RegexCl100KBase();

[GeneratedRegex(@"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+")]
public static partial Regex RegexO200KBase();
#else
public static Regex Regex50KBase() => new Regex(@"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+", RegexOptions.Compiled);

public static Regex RegexCl100KBase() => new Regex(@"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+", RegexOptions.Compiled);

public static Regex RegexO200KBase() => new Regex(@"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+", RegexOptions.Compiled);
#endif
}
}
2 changes: 2 additions & 0 deletions SharpToken/Lib/Model.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ public static class Model
private static readonly Dictionary<string, string> ModelToEncodingMapping = new Dictionary<string, string>
{
// chat
{ "gpt-4o", "o200k_base" },
{ "gpt-4", "cl100k_base" },
{ "gpt-3.5-turbo-16k", "cl100k_base" },
{ "gpt-35-turbo-16k", "cl100k_base" }, // Azure deployment name
Expand Down Expand Up @@ -53,6 +54,7 @@ public static class Model

private static readonly Dictionary<string, string> ModelPrefixToEncodingMapping = new Dictionary<string, string>
{
{ "gpt-4o", "o200k_base" }, // (NOTE: no trailing dash, on purpose). E.g., gpt-4o, gpt-4o-2024-05-13, etc.,
{ "gpt-4-", "cl100k_base" }, // e.g., gpt-4-0314, etc., plus gpt-4-32k
{ "gpt-3.5-turbo-", "cl100k_base" }, // e.g, gpt-3.5-turbo-0301, -0401, etc.
{ "gpt-35-turbo", "cl100k_base" }, // Azure deployment name
Expand Down
2 changes: 2 additions & 0 deletions SharpToken/SharpToken.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,13 @@
<None Remove="Lib\" />
<None Remove="data\" />
<None Remove="data\cl100k_base.tiktoken" />
<None Remove="data\o200k_base.tiktoken" />
<None Remove="data\p50k_base.tiktoken" />
<None Remove="data\r50k_base.tiktoken" />
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="data\cl100k_base.tiktoken" />
<EmbeddedResource Include="data\o200k_base.tiktoken" />
<EmbeddedResource Include="data\p50k_base.tiktoken" />
<EmbeddedResource Include="data\r50k_base.tiktoken" />
</ItemGroup>
Expand Down
Loading
Loading