Move the Tokenizer's data into separate packages. (#7248)

* Move the Tokenizer's data into separate packages. * Address the feedback * More feedback addressing * More feedback addressing * Trimming/AoT support * Make data types internal
dotnet · Oct 4, 2024 · 1e91427 · 1e91427
1 parent 189ba24
commit 1e91427
Show file tree

Hide file tree

Showing 29 changed files with 729 additions and 123 deletions.
diff --git a/Microsoft.ML.sln b/Microsoft.ML.sln
@@ -194,6 +194,18 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral"
 EndProject
 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.GenAI.Mistral.Tests", "test\Microsoft.ML.GenAI.Mistral.Tests\Microsoft.ML.GenAI.Mistral.Tests.csproj", "{49264202-C90A-43F6-8C30-BDAEF2F1465A}"
 EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Cl100kBase", "src\Microsoft.ML.Tokenizers.Data.Cl100kBase\Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj", "{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Gpt2", "src\Microsoft.ML.Tokenizers.Data.Gpt2\Microsoft.ML.Tokenizers.Data.Gpt2.csproj", "{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.O200kBase", "src\Microsoft.ML.Tokenizers.Data.O200kBase\Microsoft.ML.Tokenizers.Data.O200kBase.csproj", "{D02DB243-5B96-4652-B172-35F18230434D}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.P50kBase", "src\Microsoft.ML.Tokenizers.Data.P50kBase\Microsoft.ML.Tokenizers.Data.P50kBase.csproj", "{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.R50kBase", "src\Microsoft.ML.Tokenizers.Data.R50kBase\Microsoft.ML.Tokenizers.Data.R50kBase.csproj", "{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.ML.Tokenizers.Data.Tests", "test\Microsoft.ML.Tokenizers.Data.Tests\Microsoft.ML.Tokenizers.Data.Tests.csproj", "{2E6055A1-3FC1-418E-9B3E-9C6255649F42}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -918,6 +930,54 @@ Global
 		{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|Any CPU.Build.0 = Release|Any CPU
 		{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.ActiveCfg = Release|Any CPU
 		{49264202-C90A-43F6-8C30-BDAEF2F1465A}.Release|x64.Build.0 = Release|Any CPU
+		{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.ActiveCfg = Debug|Any CPU
+		{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Debug|x64.Build.0 = Debug|Any CPU
+		{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|Any CPU.Build.0 = Release|Any CPU
+		{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.ActiveCfg = Release|Any CPU
+		{14FB6EA7-A4A5-4491-AFBE-391AA27B8007}.Release|x64.Build.0 = Release|Any CPU
+		{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.ActiveCfg = Debug|Any CPU
+		{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Debug|x64.Build.0 = Debug|Any CPU
+		{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|Any CPU.Build.0 = Release|Any CPU
+		{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.ActiveCfg = Release|Any CPU
+		{BCAD9EEF-01A0-459A-80A2-5C950AF275B8}.Release|x64.Build.0 = Release|Any CPU
+		{D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{D02DB243-5B96-4652-B172-35F18230434D}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.ActiveCfg = Debug|Any CPU
+		{D02DB243-5B96-4652-B172-35F18230434D}.Debug|x64.Build.0 = Debug|Any CPU
+		{D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{D02DB243-5B96-4652-B172-35F18230434D}.Release|Any CPU.Build.0 = Release|Any CPU
+		{D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.ActiveCfg = Release|Any CPU
+		{D02DB243-5B96-4652-B172-35F18230434D}.Release|x64.Build.0 = Release|Any CPU
+		{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.ActiveCfg = Debug|Any CPU
+		{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Debug|x64.Build.0 = Debug|Any CPU
+		{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|Any CPU.Build.0 = Release|Any CPU
+		{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.ActiveCfg = Release|Any CPU
+		{FF2E2A95-E889-45C3-9205-8FDA7CD342BA}.Release|x64.Build.0 = Release|Any CPU
+		{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.ActiveCfg = Debug|Any CPU
+		{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Debug|x64.Build.0 = Debug|Any CPU
+		{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|Any CPU.Build.0 = Release|Any CPU
+		{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.ActiveCfg = Release|Any CPU
+		{E1AE4EF6-9DEE-4267-B37E-94A7B413754D}.Release|x64.Build.0 = Release|Any CPU
+		{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.ActiveCfg = Debug|Any CPU
+		{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Debug|x64.Build.0 = Debug|Any CPU
+		{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|Any CPU.Build.0 = Release|Any CPU
+		{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.ActiveCfg = Release|Any CPU
+		{2E6055A1-3FC1-418E-9B3E-9C6255649F42}.Release|x64.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
@@ -1013,6 +1073,12 @@ Global
 		{D202353D-6FAF-4263-9A01-BDCFBC92391F} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
 		{2729CC66-7743-442B-B3A5-1F4F27F044A5} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
 		{49264202-C90A-43F6-8C30-BDAEF2F1465A} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
+		{14FB6EA7-A4A5-4491-AFBE-391AA27B8007} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
+		{BCAD9EEF-01A0-459A-80A2-5C950AF275B8} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
+		{D02DB243-5B96-4652-B172-35F18230434D} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
+		{FF2E2A95-E889-45C3-9205-8FDA7CD342BA} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
+		{E1AE4EF6-9DEE-4267-B37E-94A7B413754D} = {09EADF06-BE25-4228-AB53-95AE3E15B530}
+		{2E6055A1-3FC1-418E-9B3E-9C6255649F42} = {AED9C836-31E3-4F3F-8ABC-929555D3F3C4}
 	EndGlobalSection
 	GlobalSection(ExtensibilityGlobals) = postSolution
 		SolutionGuid = {41165AF1-35BB-4832-A189-73060F82B01D}

diff --git a/eng/TokenizerData.targets b/eng/TokenizerData.targets
@@ -0,0 +1,88 @@
+<Project>
+  <UsingTask TaskName="CompressFile"
+    TaskFactory="RoslynCodeTaskFactory"
+    AssemblyFile="$(MSBuildToolsPath)\Microsoft.Build.Tasks.Core.dll" >
+    <ParameterGroup>
+      <Files ParameterType="Microsoft.Build.Framework.ITaskItem[]" Required="true" />
+    </ParameterGroup>
+    <Task>
+      <Using Namespace="System.Globalization" />
+      <Using Namespace="System.IO" />
+      <Using Namespace="System.IO.Compression" />
+      <Code Type="Fragment" Language="cs">
+			<![CDATA[
+        // Rewrites each tiktoken data file as a Deflate-compressed text stream with the ranks removed.
+        // The first output line is "Capacity: <n>"; every following line holds the token whose rank
+        // equals that line's zero-based position, with empty lines standing in for unused ranks.
+        foreach (var file in Files)
+        {
+            string fileName = file.GetMetadata("FullPath");
+            string destination = file.GetMetadata("Destination");
+
+            // Capacity is one more than the number of '\n' characters in the source file.
+            string fileContent = File.ReadAllText(fileName);
+            int capacity = 1;
+            foreach (char c in fileContent)
+            {
+                if (c == '\n')
+                {
+                    capacity++;
+                }
+            }
+
+            bool succeeded = true;
+
+            using (var sourceStream = File.OpenRead(fileName))
+            using (var reader = new StreamReader(sourceStream))
+            using (var destStream = new DeflateStream(File.Create(destination), CompressionLevel.Optimal))
+            using (var streamWriter = new StreamWriter(destStream))
+            {
+                streamWriter.WriteLine($"Capacity: {capacity.ToString(CultureInfo.InvariantCulture)}");
+
+                string line;
+                int destLineNumber = 0;
+
+                while ((line = reader.ReadLine()) != null)
+                {
+                    if (line.Length == 0) { continue; }
+
+                    // Each source line is "<token> <rank>"; split at the first space.
+                    int index = line.IndexOf(' ');
+
+                    // The rank is machine-readable data, so parse it with the invariant culture.
+                    // Ranks must not go backwards, otherwise they cannot be mapped to line numbers.
+                    if (index <= 0 ||
+                        index == line.Length - 1 ||
+                        !int.TryParse(line.Substring(index + 1), NumberStyles.Integer, CultureInfo.InvariantCulture, out int id) ||
+                        id < destLineNumber)
+                    {
+                        Log.LogError($"Invalid format in the file {fileName} line {line}");
+                        succeeded = false;
+                        break;
+                    }
+
+                    while (destLineNumber < id)
+                    {
+                        // ensure id always aligns with the line number
+                        streamWriter.WriteLine();
+                        destLineNumber++;
+                    }
+
+                    streamWriter.WriteLine(line.Substring(0, index));
+                    destLineNumber++;
+                }
+            }
+
+            if (!succeeded)
+            {
+                // Do not leave a truncated output behind: it would be newer than the input and could
+                // be treated as up to date (and embedded) by the next incremental build.
+                File.Delete(destination);
+            }
+        }
+      ]]>
+      </Code>
+    </Task>
+  </UsingTask>
+
+  <!--
+    Runs the CompressFile task over every TokenizerDataEmbeddedResource item and then adds each
+    item's Destination file to EmbeddedResource. Hooked in before AssignTargetPaths and depends on
+    _EnsureTokenizerDataEmbeddedResourceDestination. Inputs/Outputs pair each source item with its
+    Destination metadata.
+  -->
+  <Target Name="CompressTiktokenData"
+          BeforeTargets="AssignTargetPaths"
+          DependsOnTargets="_EnsureTokenizerDataEmbeddedResourceDestination"
+          Inputs="@(TokenizerDataEmbeddedResource)"
+          Outputs="@(TokenizerDataEmbeddedResource->'%(Destination)')">
+
+      <CompressFile Files="@(TokenizerDataEmbeddedResource)" />
+
+      <ItemGroup>
+        <!-- LogicalName is assembled from %(FileName)%(Extension) followed by ".deflate". -->
+        <EmbeddedResource Include="@(TokenizerDataEmbeddedResource->'%(Destination)')" LogicalName="%(FileName)%(Extension).deflate" />
+      </ItemGroup>
+  </Target>
+
+  <!--
+    Gives every TokenizerDataEmbeddedResource item whose Destination metadata is empty a default
+    Destination of $(IntermediateOutputPath)%(FileName).deflate.
+  -->
+  <Target Name="_EnsureTokenizerDataEmbeddedResourceDestination" >
+    <ItemGroup>
+      <TokenizerDataEmbeddedResource Condition="'%(TokenizerDataEmbeddedResource.Destination)' == ''" Destination="$(IntermediateOutputPath)%(FileName).deflate" />
+    </ItemGroup>
+  </Target>
+</Project>
diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Cl100kBaseTokenizerData.cs
@@ -0,0 +1,13 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.Tokenizers
+{
+    /// <summary>
+    /// Cl100kBaseTokenizerData is used internally by the Microsoft.ML.Tokenizers library to bind to the cl100k_base.tiktoken data file.
+    /// </summary>
+    /// <remarks>
+    /// The type declares no members. The project it belongs to lists Data\cl100k_base.tiktoken as a
+    /// TokenizerDataEmbeddedResource item, which the imported TokenizerData.targets compresses and
+    /// adds to the assembly as an embedded resource.
+    /// </remarks>
+    internal sealed class Cl100kBaseTokenizerData
+    {
+    }
+}
diff --git a/...t.ML.Tokenizers/Data/cl100k_base.tiktoken → ...Data.Cl100kBase/Data/cl100k_base.tiktoken b/...t.ML.Tokenizers/Data/cl100k_base.tiktoken → ...Data.Cl100kBase/Data/cl100k_base.tiktoken
diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/Microsoft.ML.Tokenizers.Data.Cl100kBase.csproj
@@ -0,0 +1,31 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFramework>netstandard2.0</TargetFramework>
+    <Nullable>enable</Nullable>
+    <IsPackable>true</IsPackable>
+    <PackageDescription>The Microsoft.ML.Tokenizers.Data.Cl100kBase class includes the Tiktoken tokenizer data file cl100k_base.tiktoken, which is utilized by models such as GPT-4.</PackageDescription>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <!--
+      The following file is compressed using DeflateStream and embedded as a resource in the assembly.
+      The file is downloaded from the following source and compressed to the Destination.
+        - cl100k_base.tiktoken: https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
+
+      The file is under the MIT license: https://github.com/openai/tiktoken/blob/main/LICENSE
+
+      The CompressFile task (eng/TokenizerData.targets, imported below) modifies the file's content to eliminate the ranks, thus reducing the file size,
+      since the rank corresponds to the line number in the file. Where ranks are missing (as in the file p50k_base.tiktoken),
+      it introduces empty lines to replace them, ensuring that the rank consistently aligns with the line number.
+      After the ranks are eliminated from the file, the file is compressed using DeflateStream and embedded as a resource in the assembly.
+    -->
+    <TokenizerDataEmbeddedResource Include="Data\cl100k_base.tiktoken" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
+  </ItemGroup>
+
+  <Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
+</Project>
diff --git a/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.Cl100kBase/PACKAGE.md
@@ -0,0 +1,47 @@
+## About
+
+The `Microsoft.ML.Tokenizers.Data.Cl100kBase` package includes the Tiktoken tokenizer data file `cl100k_base.tiktoken`, which is utilized by models such as GPT-4.
+
+## Key Features
+
+* This package mainly contains the `cl100k_base.tiktoken` file, which is used by the Tiktoken tokenizer. This data file is used by the following models:
+  1. gpt-4
+  2. gpt-3.5-turbo
+  3. gpt-3.5-turbo-16k
+  4. gpt-35
+  5. gpt-35-turbo
+  6. gpt-35-turbo-16k
+  7. text-embedding-ada-002
+  8. text-embedding-3-small
+  9. text-embedding-3-large
+
+## How to Use
+
+Reference this package in your project to use the Tiktoken tokenizer with the specified models.
+
+```csharp
+
+// Create a tokenizer for the specified model or any other listed model name
+Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4");
+
+// Create a tokenizer for the specified encoding
+Tokenizer tokenizer = TiktokenTokenizer.CreateForEncoding("cl100k_base");
+
+```
+
+## Main Types
+
+Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
+
+## Additional Documentation
+
+* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
+
+## Related Packages
+
+<!-- The related packages associated with this package -->
+Microsoft.ML.Tokenizers
+
+## Feedback & Contributing
+
+Microsoft.ML.Tokenizers.Data.Cl100kBase is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
diff --git a/...icrosoft.ML.Tokenizers/Data/gpt2.tiktoken → ...L.Tokenizers.Data.Gpt2/Data/gpt2.tiktoken b/...icrosoft.ML.Tokenizers/Data/gpt2.tiktoken → ...L.Tokenizers.Data.Gpt2/Data/gpt2.tiktoken
diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Gpt2TokenizerData.cs
@@ -0,0 +1,13 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.Tokenizers
+{
+    /// <summary>
+    /// Gpt2TokenizerData is used internally by the Microsoft.ML.Tokenizers library to bind to the gpt2.tiktoken data file.
+    /// </summary>
+    /// <remarks>
+    /// The type declares no members. The project it belongs to lists Data\gpt2.tiktoken as a
+    /// TokenizerDataEmbeddedResource item, which the imported TokenizerData.targets compresses and
+    /// adds to the assembly as an embedded resource.
+    /// </remarks>
+    internal sealed class Gpt2TokenizerData
+    {
+    }
+}
diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj b/src/Microsoft.ML.Tokenizers.Data.Gpt2/Microsoft.ML.Tokenizers.Data.Gpt2.csproj
@@ -0,0 +1,31 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFramework>netstandard2.0</TargetFramework>
+    <Nullable>enable</Nullable>
+    <IsPackable>true</IsPackable>
+    <PackageDescription>The Microsoft.ML.Tokenizers.Data.Gpt2 includes the Tiktoken tokenizer data file gpt2.tiktoken, which is utilized by models such as Gpt-2.</PackageDescription>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <!--
+      The following file is compressed using DeflateStream and embedded as a resource in the assembly.
+      The file is downloaded from the following source and compressed to the Destination.
+        - gpt2.tiktoken:        https://fossies.org/linux/misc/whisper-20231117.tar.gz/whisper-20231117/whisper/assets/gpt2.tiktoken?m=b
+
+      The file is under the MIT license: https://github.com/openai/tiktoken/blob/main/LICENSE
+
+      The CompressFile task (eng/TokenizerData.targets, imported below) modifies the file's content to eliminate the ranks, thus reducing the file size,
+      since the rank corresponds to the line number in the file. Where ranks are missing (as in the file p50k_base.tiktoken),
+      it introduces empty lines to replace them, ensuring that the rank consistently aligns with the line number.
+      After the ranks are eliminated from the file, the file is compressed using DeflateStream and embedded as a resource in the assembly.
+    -->
+    <TokenizerDataEmbeddedResource Include="Data\gpt2.tiktoken" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
+  </ItemGroup>
+
+  <Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
+</Project>
diff --git a/src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md b/src/Microsoft.ML.Tokenizers.Data.Gpt2/PACKAGE.md
@@ -0,0 +1,35 @@
+## About
+
+The `Microsoft.ML.Tokenizers.Data.Gpt2` package includes the Tiktoken tokenizer data file `gpt2.tiktoken`, which is utilized by models such as `Gpt-2`.
+
+## Key Features
+
+* This package mainly contains the gpt2.tiktoken file, which is used by the Tiktoken tokenizer. This data file is used by the Gpt-2 model.
+
+## How to Use
+
+Reference this package in your project to use the Tiktoken tokenizer with the specified model.
+
+```csharp
+
+// Create a tokenizer for the specified model
+Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("Gpt-2");
+
+```
+
+## Main Types
+
+Users shouldn't use any types exposed by this package directly. This package is intended to provide tokenizer data files.
+
+## Additional Documentation
+
+* [API documentation](https://learn.microsoft.com/en-us/dotnet/api/microsoft.ml.tokenizers)
+
+## Related Packages
+
+<!-- The related packages associated with this package -->
+Microsoft.ML.Tokenizers
+
+## Feedback & Contributing
+
+Microsoft.ML.Tokenizers.Data.Gpt2 is released as open source under the [MIT license](https://licenses.nuget.org/MIT). Bug reports and contributions are welcome at [the GitHub repository](https://github.com/dotnet/machinelearning).
diff --git a/...ft.ML.Tokenizers/Data/o200k_base.tiktoken → ...s.Data.O200kBase/Data/o200k_base.tiktoken b/...ft.ML.Tokenizers/Data/o200k_base.tiktoken → ...s.Data.O200kBase/Data/o200k_base.tiktoken
diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj b/src/Microsoft.ML.Tokenizers.Data.O200kBase/Microsoft.ML.Tokenizers.Data.O200kBase.csproj
@@ -0,0 +1,31 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFramework>netstandard2.0</TargetFramework>
+    <Nullable>enable</Nullable>
+    <IsPackable>true</IsPackable>
+    <PackageDescription>The Microsoft.ML.Tokenizers.Data.O200kBase includes the Tiktoken tokenizer data file o200k_base.tiktoken, which is utilized by models such as gpt-4o.</PackageDescription>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <!--
+      The following file is compressed using DeflateStream and embedded as a resource in the assembly.
+      The file is downloaded from the following source and compressed to the Destination.
+        - o200k_base.tiktoken   https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken
+
+      The file is under the MIT license: https://github.com/openai/tiktoken/blob/main/LICENSE
+
+      The CompressFile task (eng/TokenizerData.targets, imported below) modifies the file's content to eliminate the ranks, thus reducing the file size,
+      since the rank corresponds to the line number in the file. Where ranks are missing (as in the file p50k_base.tiktoken),
+      it introduces empty lines to replace them, ensuring that the rank consistently aligns with the line number.
+      After the ranks are eliminated from the file, the file is compressed using DeflateStream and embedded as a resource in the assembly.
+    -->
+    <TokenizerDataEmbeddedResource Include="Data\o200k_base.tiktoken" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\Microsoft.ML.Tokenizers\Microsoft.ML.Tokenizers.csproj"/>
+  </ItemGroup>
+
+  <Import Project="$(RepositoryEngineeringDir)TokenizerData.targets" />
+</Project>
diff --git a/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs b/src/Microsoft.ML.Tokenizers.Data.O200kBase/O200kBaseTokenizerData.cs
@@ -0,0 +1,13 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+namespace Microsoft.ML.Tokenizers
+{
+    /// <summary>
+    /// O200kBaseTokenizerData is used internally by the Microsoft.ML.Tokenizers library to bind to the o200k_base.tiktoken data file.
+    /// </summary>
+    /// <remarks>
+    /// The type declares no members. The project it belongs to lists Data\o200k_base.tiktoken as a
+    /// TokenizerDataEmbeddedResource item, which the imported TokenizerData.targets compresses and
+    /// adds to the assembly as an embedded resource.
+    /// </remarks>
+    internal sealed class O200kBaseTokenizerData
+    {
+    }
+}