OpenAI-DotNet 7.7.6 (#268)
- Added support for Audio Transcription and Translation verbose json output
  - Added support for timestamp granularities for segments and words
  - Marked CreateTranscriptionAsync obsolete
  - Added CreateTranscriptionTextAsync
  - Added CreateTranscriptionJsonAsync
  - Marked CreateTranslationAsync obsolete
  - Added CreateTranslationTextAsync
  - Added CreateTranslationJsonAsync
- Updated SpeechResponseFormat to include wav and pcm
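
For context, a minimal usage sketch of the new audio API surface described above (the client construction, file path, and printed fields are illustrative assumptions, not part of this commit):

using System;
using OpenAI;
using OpenAI.Audio;

var api = new OpenAIClient();

// Plain text via the new *TextAsync variant.
using var textRequest = new AudioTranscriptionRequest("audio.mp3", responseFormat: AudioResponseFormat.Text);
var text = await api.AudioEndpoint.CreateTranscriptionTextAsync(textRequest);
Console.WriteLine(text);

// Verbose JSON with word-level timestamps via the new *JsonAsync variant.
using var jsonRequest = new AudioTranscriptionRequest(
    "audio.mp3",
    responseFormat: AudioResponseFormat.Verbose_Json,
    timestampGranularity: TimestampGranularity.Word);
var response = await api.AudioEndpoint.CreateTranscriptionJsonAsync(jsonRequest);
Console.WriteLine($"{response.Language} ({response.Duration}s): {response.Text}");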
StephenHodgson authored Mar 19, 2024
1 parent c17dee4 commit 9124a33
Showing 13 changed files with 425 additions and 55 deletions.
82 changes: 69 additions & 13 deletions OpenAI-DotNet-Tests/TestFixture_07_Audio.cs
@@ -11,25 +11,81 @@ namespace OpenAI.Tests
     internal class TestFixture_07_Audio : AbstractTestFixture
     {
         [Test]
-        public async Task Test_1_Transcription()
+        public async Task Test_01_01_Transcription_Text()
         {
             Assert.IsNotNull(OpenAIClient.AudioEndpoint);
             var transcriptionAudio = Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3");
-            using var request = new AudioTranscriptionRequest(transcriptionAudio, temperature: 0.1f, language: "en");
-            var result = await OpenAIClient.AudioEndpoint.CreateTranscriptionAsync(request);
-            Assert.IsNotNull(result);
-            Console.WriteLine(result);
+            using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Text, temperature: 0.1f, language: "en");
+            var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionTextAsync(request);
+            Assert.IsNotNull(response);
         }
 
         [Test]
-        public async Task Test_2_Translation()
+        public async Task Test_01_02_Transcription_Json()
         {
             Assert.IsNotNull(OpenAIClient.AudioEndpoint);
+            var transcriptionAudio = Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3");
+            using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Json, temperature: 0.1f, language: "en");
+            var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionTextAsync(request);
+            Assert.IsNotNull(response);
+        }
+
+        [Test]
+        public async Task Test_01_03_01_Transcription_VerboseJson()
+        {
+            Assert.IsNotNull(OpenAIClient.AudioEndpoint);
+            var transcriptionAudio = Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3");
+            using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json, temperature: 0.1f, language: "en");
+            var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request);
+            Assert.IsNotNull(response);
+            Assert.IsNotNull(response.Duration);
+            Assert.IsTrue(response.Language == "english");
+            Assert.IsNotNull(response.Segments);
+            Assert.IsNotEmpty(response.Segments);
+        }
+
+        [Test]
+        public async Task Test_01_03_02_Transcription_VerboseJson_WordSimilarities()
+        {
+            Assert.IsNotNull(OpenAIClient.AudioEndpoint);
+            var transcriptionAudio = Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3");
+            using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json, timestampGranularity: TimestampGranularity.Word, temperature: 0.1f, language: "en");
+            var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request);
+            Assert.IsNotNull(response);
+            Assert.IsNotNull(response.Duration);
+            Assert.IsTrue(response.Language == "english");
+            Assert.IsNotNull(response.Words);
+            Assert.IsNotEmpty(response.Words);
+        }
+
+        [Test]
+        public async Task Test_02_01_Translation_Text()
+        {
+            Assert.IsNotNull(OpenAIClient.AudioEndpoint);
+            var translationAudio = Path.GetFullPath("../../../Assets/Ja-botchan_1-1_1-2.mp3");
+            using var request = new AudioTranslationRequest(Path.GetFullPath(translationAudio), responseFormat: AudioResponseFormat.Text);
+            var response = await OpenAIClient.AudioEndpoint.CreateTranslationTextAsync(request);
+            Assert.IsNotNull(response);
+        }
+
+        [Test]
+        public async Task Test_02_02_Translation_Json()
+        {
+            Assert.IsNotNull(OpenAIClient.AudioEndpoint);
+            var translationAudio = Path.GetFullPath("../../../Assets/Ja-botchan_1-1_1-2.mp3");
+            using var request = new AudioTranslationRequest(Path.GetFullPath(translationAudio), responseFormat: AudioResponseFormat.Json);
+            var response = await OpenAIClient.AudioEndpoint.CreateTranslationJsonAsync(request);
+            Assert.IsNotNull(response);
+        }
+
+        [Test]
+        public async Task Test_02_03_Translation_VerboseJson()
+        {
+            Assert.IsNotNull(OpenAIClient.AudioEndpoint);
             var translationAudio = Path.GetFullPath("../../../Assets/Ja-botchan_1-1_1-2.mp3");
-            using var request = new AudioTranslationRequest(Path.GetFullPath(translationAudio));
-            var result = await OpenAIClient.AudioEndpoint.CreateTranslationAsync(request);
-            Assert.IsNotNull(result);
-            Console.WriteLine(result);
+            using var request = new AudioTranslationRequest(Path.GetFullPath(translationAudio), responseFormat: AudioResponseFormat.Verbose_Json);
+            var response = await OpenAIClient.AudioEndpoint.CreateTranslationJsonAsync(request);
+            Assert.IsNotNull(response);
         }
 
         [Test]
@@ -43,9 +99,9 @@ async Task ChunkCallback(ReadOnlyMemory<byte> chunkCallback)
                 await Task.CompletedTask;
             }
 
-            var result = await OpenAIClient.AudioEndpoint.CreateSpeechAsync(request, ChunkCallback);
-            Assert.IsFalse(result.IsEmpty);
-            await File.WriteAllBytesAsync("../../../Assets/HelloWorld.mp3", result.ToArray());
+            var response = await OpenAIClient.AudioEndpoint.CreateSpeechAsync(request, ChunkCallback);
+            Assert.IsFalse(response.IsEmpty);
+            await File.WriteAllBytesAsync("../../../Assets/HelloWorld.mp3", response.ToArray());
         }
     }
 }
100 changes: 74 additions & 26 deletions OpenAI-DotNet/Audio/AudioEndpoint.cs
@@ -5,7 +5,6 @@
 using System.IO;
 using System.Net.Http;
 using System.Text.Json;
-using System.Text.Json.Serialization;
 using System.Threading;
 using System.Threading.Tasks;
 
@@ -17,17 +16,6 @@ namespace OpenAI.Audio
     /// </summary>
     public sealed class AudioEndpoint : OpenAIBaseEndpoint
     {
-        private class AudioResponse
-        {
-            public AudioResponse(string text)
-            {
-                Text = text;
-            }
-
-            [JsonPropertyName("text")]
-            public string Text { get; }
-        }
-
         /// <inheritdoc />
         public AudioEndpoint(OpenAIClient client) : base(client) { }
 
@@ -75,44 +63,96 @@ public async Task<ReadOnlyMemory<byte>> CreateSpeechAsync(SpeechRequest request,
             return new ReadOnlyMemory<byte>(memoryStream.GetBuffer(), 0, totalBytesRead);
         }
 
+        [Obsolete("Use CreateTranscriptionTextAsync or CreateTranscriptionJsonAsync instead.")]
+        public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+            => await CreateTranscriptionTextAsync(request, cancellationToken).ConfigureAwait(false);
+
         /// <summary>
         /// Transcribes audio into the input language.
         /// </summary>
         /// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
         /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
         /// <returns>The transcribed text.</returns>
-        public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+        public async Task<string> CreateTranscriptionTextAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
         {
+            var responseAsString = await Internal_CreateTranscriptionAsync(request, cancellationToken).ConfigureAwait(false);
+            return request.ResponseFormat is AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json
+                ? JsonSerializer.Deserialize<AudioResponse>(responseAsString)?.Text
+                : responseAsString;
+        }
+
+        /// <summary>
+        /// Transcribes audio into the input language.
+        /// </summary>
+        /// <remarks>This method expects the request format to be either <see cref="AudioResponseFormat.Json"/> or <see cref="AudioResponseFormat.Verbose_Json"/>.</remarks>
+        /// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
+        /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
+        /// <returns><see cref="AudioResponse"/>.</returns>
+        public async Task<AudioResponse> CreateTranscriptionJsonAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+        {
+            if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))
+            {
+                throw new ArgumentException("Response format must be Json or Verbose Json.", nameof(request.ResponseFormat));
+            }
+
+            var responseAsString = await Internal_CreateTranscriptionAsync(request, cancellationToken).ConfigureAwait(false);
+            return JsonSerializer.Deserialize<AudioResponse>(responseAsString);
+        }
+
+        private async Task<string> Internal_CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+        {
             using var content = new MultipartFormDataContent();
             using var audioData = new MemoryStream();
             await request.Audio.CopyToAsync(audioData, cancellationToken).ConfigureAwait(false);
             content.Add(new ByteArrayContent(audioData.ToArray()), "file", request.AudioName);
             content.Add(new StringContent(request.Model), "model");
 
+            if (!string.IsNullOrWhiteSpace(request.Language))
+            {
+                content.Add(new StringContent(request.Language), "language");
+            }
+
             if (!string.IsNullOrWhiteSpace(request.Prompt))
            {
                 content.Add(new StringContent(request.Prompt), "prompt");
             }
 
-            var responseFormat = request.ResponseFormat;
-            content.Add(new StringContent(responseFormat.ToString().ToLower()), "response_format");
+            content.Add(new StringContent(request.ResponseFormat.ToString().ToLower()), "response_format");
 
             if (request.Temperature.HasValue)
             {
                 content.Add(new StringContent(request.Temperature.ToString()), "temperature");
             }
 
-            if (!string.IsNullOrWhiteSpace(request.Language))
+            switch (request.TimestampGranularities)
             {
-                content.Add(new StringContent(request.Language), "language");
+                case TimestampGranularity.Segment:
+                case TimestampGranularity.Word:
+                    content.Add(new StringContent(request.TimestampGranularities.ToString().ToLower()), "timestamp_granularities[]");
+                    break;
             }
 
             request.Dispose();
 
             using var response = await client.Client.PostAsync(GetUrl("/transcriptions"), content, cancellationToken).ConfigureAwait(false);
             var responseAsString = await response.ReadAsStringAsync(EnableDebug, content, null, cancellationToken).ConfigureAwait(false);
+            return responseAsString;
+        }
 
-            return responseFormat == AudioResponseFormat.Json
+        [Obsolete("Use CreateTranslationTextAsync or CreateTranslationJsonAsync instead.")]
+        public async Task<string> CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+            => await CreateTranslationTextAsync(request, cancellationToken).ConfigureAwait(false);
+
+        /// <summary>
+        /// Translates audio into English.
+        /// </summary>
+        /// <param name="request"></param>
+        /// <param name="cancellationToken"></param>
+        /// <returns>The translated text.</returns>
+        public async Task<string> CreateTranslationTextAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+        {
+            var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken).ConfigureAwait(false);
+            return request.ResponseFormat is AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json
                 ? JsonSerializer.Deserialize<AudioResponse>(responseAsString)?.Text
                 : responseAsString;
         }
@@ -122,8 +162,20 @@ public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest req
         /// </summary>
         /// <param name="request"></param>
         /// <param name="cancellationToken"></param>
-        /// <returns>The translated text.</returns>
-        public async Task<string> CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+        /// <returns></returns>
+        /// <exception cref="ArgumentException"></exception>
+        public async Task<AudioResponse> CreateTranslationJsonAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
         {
+            if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))
+            {
+                throw new ArgumentException("Response format must be Json or Verbose Json.", nameof(request.ResponseFormat));
+            }
+
+            var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken).ConfigureAwait(false);
+            return JsonSerializer.Deserialize<AudioResponse>(responseAsString);
+        }
+
+        private async Task<string> Internal_CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+        {
             using var content = new MultipartFormDataContent();
             using var audioData = new MemoryStream();
@@ -136,8 +188,7 @@ public async Task<string> CreateTranslationAsync(AudioTranslationRequest request
                 content.Add(new StringContent(request.Prompt), "prompt");
             }
 
-            var responseFormat = request.ResponseFormat;
-            content.Add(new StringContent(responseFormat.ToString().ToLower()), "response_format");
+            content.Add(new StringContent(request.ResponseFormat.ToString().ToLower()), "response_format");
 
             if (request.Temperature.HasValue)
             {
@@ -148,10 +199,7 @@ public async Task<string> CreateTranslationAsync(AudioTranslationRequest request
 
             using var response = await client.Client.PostAsync(GetUrl("/translations"), content, cancellationToken).ConfigureAwait(false);
             var responseAsString = await response.ReadAsStringAsync(EnableDebug, content, null, cancellationToken).ConfigureAwait(false);
-
-            return responseFormat == AudioResponseFormat.Json
-                ? JsonSerializer.Deserialize<AudioResponse>(responseAsString)?.Text
-                : responseAsString;
+            return responseAsString;
         }
     }
 }
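
To make the new split concrete: the *JsonAsync variants now validate the request format before sending, as sketched below (the client setup and file path are assumptions for the example, not part of the commit):

using System;
using OpenAI;
using OpenAI.Audio;

var api = new OpenAIClient();

// Text is not a JSON format, so the Json variant rejects it before any network call.
using var request = new AudioTranslationRequest("speech.mp3", responseFormat: AudioResponseFormat.Text);

try
{
    var response = await api.AudioEndpoint.CreateTranslationJsonAsync(request);
}
catch (ArgumentException e)
{
    // Response format must be Json or Verbose Json. (Parameter 'ResponseFormat')
    Console.WriteLine(e.Message);
}

The text variants remain lenient: CreateTranslationTextAsync accepts any response format and simply unwraps the "text" field when the format is Json or Verbose_Json.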
47 changes: 47 additions & 0 deletions OpenAI-DotNet/Audio/AudioResponse.cs
@@ -0,0 +1,47 @@
// Licensed under the MIT License. See LICENSE in the project root for license information.

using System.Text.Json.Serialization;

namespace OpenAI.Audio
{
public sealed class AudioResponse
{
/// <summary>
/// The language of the input audio.
/// </summary>
[JsonInclude]
[JsonPropertyName("language")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
public string Language { get; private set; }

/// <summary>
/// The duration of the input audio.
/// </summary>
[JsonInclude]
[JsonPropertyName("duration")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
public double? Duration { get; private set; }

/// <summary>
/// The transcribed text.
/// </summary>
[JsonInclude]
[JsonPropertyName("text")]
public string Text { get; private set; }

/// <summary>
/// Extracted words and their corresponding timestamps.
/// </summary>
[JsonInclude]
[JsonPropertyName("words")]
public TranscriptionWord[] Words { get; private set; }

/// <summary>
/// Segments of the transcribed text and their corresponding details.
/// </summary>
[JsonInclude]
[JsonPropertyName("segments")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
public TranscriptionSegment[] Segments { get; private set; }
}
}
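
Because every property is marked [JsonInclude] with a private setter, System.Text.Json can hydrate AudioResponse directly from the wire payload; a small round-trip sketch against an abbreviated, invented verbose_json body:

using System;
using System.Text.Json;
using OpenAI.Audio;

const string json = @"{""language"":""english"",""duration"":1.1,""text"":""Hello world.""}";
var response = JsonSerializer.Deserialize<AudioResponse>(json);
Console.WriteLine($"{response.Language}: {response.Text} ({response.Duration}s)"); // english: Hello world. (1.1s)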
