From 9c50d8e0d716cebdfc5140be2ccb6d21f6e464cd Mon Sep 17 00:00:00 2001 From: Stephen Hodgson Date: Mon, 18 Mar 2024 14:27:49 -0400 Subject: [PATCH 1/5] OpenAI-DotNet 7.7.6 - Added support for Audio Transcription and Translation verbose json output - Added support for timestamp granularities for segments and words --- OpenAI-DotNet-Tests/TestFixture_07_Audio.cs | 72 +++++++++++-- OpenAI-DotNet/Audio/AudioEndpoint.cs | 102 +++++++++++++----- .../Audio/AudioTranscriptionRequest.cs | 35 +++++- .../Audio/AudioTranscriptionResponse.cs | 47 ++++++++ .../Audio/AudioTranslationRequest.cs | 4 +- OpenAI-DotNet/Audio/TimestampGranularity.cs | 11 ++ OpenAI-DotNet/Audio/TranscriptionSegment.cs | 85 +++++++++++++++ OpenAI-DotNet/Audio/TranscriptionWord.cs | 33 ++++++ .../Extensions/ResponseExtensions.cs | 38 ++++++- OpenAI-DotNet/OpenAI-DotNet.csproj | 5 +- 10 files changed, 389 insertions(+), 43 deletions(-) create mode 100644 OpenAI-DotNet/Audio/AudioTranscriptionResponse.cs create mode 100644 OpenAI-DotNet/Audio/TimestampGranularity.cs create mode 100644 OpenAI-DotNet/Audio/TranscriptionSegment.cs create mode 100644 OpenAI-DotNet/Audio/TranscriptionWord.cs diff --git a/OpenAI-DotNet-Tests/TestFixture_07_Audio.cs b/OpenAI-DotNet-Tests/TestFixture_07_Audio.cs index 0673677a..022e5061 100644 --- a/OpenAI-DotNet-Tests/TestFixture_07_Audio.cs +++ b/OpenAI-DotNet-Tests/TestFixture_07_Audio.cs @@ -11,25 +11,81 @@ namespace OpenAI.Tests internal class TestFixture_07_Audio : AbstractTestFixture { [Test] - public async Task Test_1_Transcription() + public async Task Test_01_01_Transcription_Text() { Assert.IsNotNull(OpenAIClient.AudioEndpoint); var transcriptionAudio = Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3"); - using var request = new AudioTranscriptionRequest(transcriptionAudio, temperature: 0.1f, language: "en"); - var result = await OpenAIClient.AudioEndpoint.CreateTranscriptionAsync(request); + using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Text, temperature: 0.1f, language: "en"); + var result = await OpenAIClient.AudioEndpoint.CreateTranscriptionTextAsync(request); Assert.IsNotNull(result); - Console.WriteLine(result); } [Test] - public async Task Test_2_Translation() + public async Task Test_01_02_Transcription_Json() + { + Assert.IsNotNull(OpenAIClient.AudioEndpoint); + var transcriptionAudio = Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3"); + using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Json, temperature: 0.1f, language: "en"); + var result = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request); + Assert.IsNotNull(result); + } + + [Test] + public async Task Test_01_03_01_Transcription_VerboseJson() + { + Assert.IsNotNull(OpenAIClient.AudioEndpoint); + var transcriptionAudio = Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3"); + using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json, temperature: 0.1f, language: "en"); + var result = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request); + Assert.IsNotNull(result); + Assert.IsNotNull(result.Duration); + Assert.IsTrue(result.Language == "english"); + Assert.IsNotNull(result.Segments); + Assert.IsNotEmpty(result.Segments); + } + + [Test] + public async Task Test_01_03_02_Transcription_VerboseJson_WordSimilarities() + { + Assert.IsNotNull(OpenAIClient.AudioEndpoint); + var transcriptionAudio = 
Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3");
+            using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json, timestampGranularity: TimestampGranularity.Word, temperature: 0.1f, language: "en");
+            var result = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request);
+            Assert.IsNotNull(result);
+            Assert.IsNotNull(result.Duration);
+            Assert.IsTrue(result.Language == "english");
+            Assert.IsNotNull(result.Words);
+            Assert.IsNotEmpty(result.Words);
+        }
+
+        [Test]
+        public async Task Test_02_01_Translation_Text()
+        {
+            Assert.IsNotNull(OpenAIClient.AudioEndpoint);
+            var translationAudio = Path.GetFullPath("../../../Assets/Ja-botchan_1-1_1-2.mp3");
+            using var request = new AudioTranslationRequest(Path.GetFullPath(translationAudio), responseFormat: AudioResponseFormat.Text);
+            var result = await OpenAIClient.AudioEndpoint.CreateTranslationTextAsync(request);
+            Assert.IsNotNull(result);
+        }
+
+        [Test]
+        public async Task Test_02_02_Translation_Json()
+        {
+            Assert.IsNotNull(OpenAIClient.AudioEndpoint);
+            var translationAudio = Path.GetFullPath("../../../Assets/Ja-botchan_1-1_1-2.mp3");
+            using var request = new AudioTranslationRequest(Path.GetFullPath(translationAudio), responseFormat: AudioResponseFormat.Json);
+            var result = await OpenAIClient.AudioEndpoint.CreateTranslationJsonAsync(request);
+            Assert.IsNotNull(result);
+        }
+
+        [Test]
+        public async Task Test_02_03_Translation_VerboseJson()
         {
             Assert.IsNotNull(OpenAIClient.AudioEndpoint);
             var translationAudio = Path.GetFullPath("../../../Assets/Ja-botchan_1-1_1-2.mp3");
-            using var request = new AudioTranslationRequest(Path.GetFullPath(translationAudio));
-            var result = await OpenAIClient.AudioEndpoint.CreateTranslationAsync(request);
+            using var request = new AudioTranslationRequest(Path.GetFullPath(translationAudio), responseFormat: AudioResponseFormat.Verbose_Json);
+            var result = await OpenAIClient.AudioEndpoint.CreateTranslationJsonAsync(request);
             Assert.IsNotNull(result);
-            Console.WriteLine(result);
         }
 
         [Test]
diff --git a/OpenAI-DotNet/Audio/AudioEndpoint.cs b/OpenAI-DotNet/Audio/AudioEndpoint.cs
index 7bb9997a..2798cdc9 100644
--- a/OpenAI-DotNet/Audio/AudioEndpoint.cs
+++ b/OpenAI-DotNet/Audio/AudioEndpoint.cs
@@ -5,7 +5,6 @@
 using System.IO;
 using System.Net.Http;
 using System.Text.Json;
-using System.Text.Json.Serialization;
 using System.Threading;
 using System.Threading.Tasks;
 
@@ -17,17 +16,6 @@ namespace OpenAI.Audio
     /// </summary>
     public sealed class AudioEndpoint : OpenAIBaseEndpoint
     {
-        private class AudioResponse
-        {
-            public AudioResponse(string text)
-            {
-                Text = text;
-            }
-
-            [JsonPropertyName("text")]
-            public string Text { get; }
-        }
-
         /// <inheritdoc />
        public AudioEndpoint(OpenAIClient client) : base(client) { }
 
@@ -75,13 +63,43 @@ public async Task<ReadOnlyMemory<byte>> CreateSpeechAsync(SpeechRequest request,
             return new ReadOnlyMemory<byte>(memoryStream.GetBuffer(), 0, totalBytesRead);
         }
 
+        [Obsolete("Use CreateTranscriptionTextAsync or CreateTranscriptionJsonAsync instead.")]
+        public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+            => await CreateTranscriptionTextAsync(request, cancellationToken).ConfigureAwait(false);
+
         /// <summary>
         /// Transcribes audio into the input language.
         /// </summary>
         /// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
         /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
         /// <returns>The transcribed text.</returns>
-        public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+        public async Task<string> CreateTranscriptionTextAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+        {
+            var responseAsString = await Internal_CreateTranscriptionAsync(request, cancellationToken).ConfigureAwait(false);
+            return request.ResponseFormat is AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json
+                ? JsonSerializer.Deserialize<AudioTranscriptionResponse>(responseAsString)?.Text
+                : responseAsString;
+        }
+
+        /// <summary>
+        /// Transcribes audio into the input language.
+        /// </summary>
+        /// <remarks>This method expects the request format to be either <see cref="AudioResponseFormat.Json"/> or <see cref="AudioResponseFormat.Verbose_Json"/>.</remarks>
+        /// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
+        /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
+        /// <returns><see cref="AudioTranscriptionResponse"/>.</returns>
+        public async Task<AudioTranscriptionResponse> CreateTranscriptionJsonAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+        {
+            if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))
+            {
+                throw new ArgumentException("Response format must be Json or Verbose Json.", nameof(request.ResponseFormat));
+            }
+
+            var responseAsString = await Internal_CreateTranscriptionAsync(request, cancellationToken).ConfigureAwait(false);
+            return JsonSerializer.Deserialize<AudioTranscriptionResponse>(responseAsString);
+        }
+
+        private async Task<string> Internal_CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
         {
             using var content = new MultipartFormDataContent();
             using var audioData = new MemoryStream();
@@ -89,31 +107,53 @@ public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest req
             content.Add(new ByteArrayContent(audioData.ToArray()), "file", request.AudioName);
             content.Add(new StringContent(request.Model), "model");
 
+            if (!string.IsNullOrWhiteSpace(request.Language))
+            {
+                content.Add(new StringContent(request.Language), "language");
+            }
+
             if (!string.IsNullOrWhiteSpace(request.Prompt))
             {
                 content.Add(new StringContent(request.Prompt), "prompt");
             }
 
-            var responseFormat = request.ResponseFormat;
-            content.Add(new StringContent(responseFormat.ToString().ToLower()), "response_format");
+            content.Add(new StringContent(request.ResponseFormat.ToString().ToLower()), "response_format");
 
             if (request.Temperature.HasValue)
             {
                 content.Add(new StringContent(request.Temperature.ToString()), "temperature");
             }
 
-            if (!string.IsNullOrWhiteSpace(request.Language))
+            switch (request.TimestampGranularities)
             {
-                content.Add(new StringContent(request.Language), "language");
+                case TimestampGranularity.Segment:
+                case TimestampGranularity.Word:
+                    content.Add(new StringContent(request.TimestampGranularities.ToString().ToLower()), "timestamp_granularities[]");
+                    break;
             }
 
             request.Dispose();
 
             using var response = await client.Client.PostAsync(GetUrl("/transcriptions"), content, cancellationToken).ConfigureAwait(false);
             var responseAsString = await response.ReadAsStringAsync(EnableDebug, content, null, cancellationToken).ConfigureAwait(false);
+            return responseAsString;
+        }
 
-            return responseFormat == AudioResponseFormat.Json
-                ? JsonSerializer.Deserialize<AudioResponse>(responseAsString)?.Text
+        [Obsolete("Use CreateTranslationTextAsync or CreateTranslationJsonAsync instead.")]
+        public async Task<string> CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+            => await CreateTranslationTextAsync(request, cancellationToken).ConfigureAwait(false);
+
+        /// <summary>
+        /// Translates audio into English.
+        /// </summary>
+        /// <param name="request"><see cref="AudioTranslationRequest"/>.</param>
+        /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
+        /// <returns>The translated text.</returns>
+        public async Task<string> CreateTranslationTextAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+        {
+            var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken).ConfigureAwait(false);
+            return request.ResponseFormat is AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json
+                ? JsonSerializer.Deserialize<AudioTranscriptionResponse>(responseAsString)?.Text
             : responseAsString;
         }
 
@@ -122,8 +162,20 @@ public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest req
         /// </summary>
         /// <param name="request"><see cref="AudioTranslationRequest"/>.</param>
         /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
-        /// <returns>The translated text.</returns>
-        public async Task<string> CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+        /// <remarks>This method expects the request format to be either <see cref="AudioResponseFormat.Json"/> or <see cref="AudioResponseFormat.Verbose_Json"/>.</remarks>
+        /// <returns>The translation response object.</returns>
+        public async Task<AudioTranscriptionResponse> CreateTranslationJsonAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+        {
+            if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))
+            {
+                throw new ArgumentException("Response format must be Json or Verbose Json.", nameof(request.ResponseFormat));
+            }
+
+            var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken).ConfigureAwait(false);
+            return JsonSerializer.Deserialize<AudioTranscriptionResponse>(responseAsString);
+        }
+
+        private async Task<string> Internal_CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
         {
             using var content = new MultipartFormDataContent();
             using var audioData = new MemoryStream();
@@ -136,8 +188,7 @@ public async Task<string> CreateTranslationAsync(AudioTranslationRequest request
                 content.Add(new StringContent(request.Prompt), "prompt");
             }
 
-            var responseFormat = request.ResponseFormat;
-            content.Add(new StringContent(responseFormat.ToString().ToLower()), "response_format");
+            content.Add(new StringContent(request.ResponseFormat.ToString().ToLower()), "response_format");
 
             if (request.Temperature.HasValue)
             {
@@ -148,10 +199,7 @@ public async Task<string> CreateTranslationAsync(AudioTranslationRequest request
             using var response = await client.Client.PostAsync(GetUrl("/translations"), content, cancellationToken).ConfigureAwait(false);
             var responseAsString = await response.ReadAsStringAsync(EnableDebug, content, null, cancellationToken).ConfigureAwait(false);
-
-            return responseFormat == AudioResponseFormat.Json
-                ? JsonSerializer.Deserialize<AudioResponse>(responseAsString)?.Text
-                : responseAsString;
+            return responseAsString;
         }
     }
 }
\ No newline at end of file
diff --git a/OpenAI-DotNet/Audio/AudioTranscriptionRequest.cs b/OpenAI-DotNet/Audio/AudioTranscriptionRequest.cs
index f50d8486..0ce6578a 100644
--- a/OpenAI-DotNet/Audio/AudioTranscriptionRequest.cs
+++ b/OpenAI-DotNet/Audio/AudioTranscriptionRequest.cs
@@ -39,14 +39,21 @@ public sealed class AudioTranscriptionRequest : IDisposable
         /// Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian,
         /// Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh.
         /// </param>
+        /// <param name="timestampGranularity">
+        /// The timestamp granularities to populate for this transcription.
+        /// response_format must be set to verbose_json to use timestamp granularities.
+        /// Either or both of these options are supported: <see cref="TimestampGranularity.Word"/>, or <see cref="TimestampGranularity.Segment"/>.
+        /// Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
+        /// </param>
        public AudioTranscriptionRequest(
            string audioPath,
            string model = null,
            string prompt = null,
            AudioResponseFormat responseFormat = AudioResponseFormat.Json,
            float? temperature = null,
-            string language = null)
-            : this(File.OpenRead(audioPath), Path.GetFileName(audioPath), model, prompt, responseFormat, temperature, language)
+            string language = null,
+            TimestampGranularity timestampGranularity = TimestampGranularity.None)
+            : this(File.OpenRead(audioPath), Path.GetFileName(audioPath), model, prompt, responseFormat, temperature, language, timestampGranularity)
        {
        }
 
@@ -85,6 +92,12 @@ public AudioTranscriptionRequest(
         /// Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian,
         /// Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh.
         /// </param>
+        /// <param name="timestampGranularity">
+        /// The timestamp granularities to populate for this transcription.
+        /// response_format must be set to verbose_json to use timestamp granularities.
+        /// Either or both of these options are supported: <see cref="TimestampGranularity.Word"/>, or <see cref="TimestampGranularity.Segment"/>.
+        /// Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
+        /// </param>
        public AudioTranscriptionRequest(
            Stream audio,
            string audioName,
            string model = null,
            string prompt = null,
            AudioResponseFormat responseFormat = AudioResponseFormat.Json,
            float? temperature = null,
-            string language = null)
+            string language = null,
+            TimestampGranularity timestampGranularity = TimestampGranularity.None)
        {
            Audio = audio;
 
@@ -107,6 +121,13 @@ public AudioTranscriptionRequest(
            ResponseFormat = responseFormat;
            Temperature = temperature;
            Language = language;
+
+            if (timestampGranularity != TimestampGranularity.None && responseFormat != AudioResponseFormat.Verbose_Json)
+            {
+                throw new ArgumentException($"{nameof(responseFormat)} must be set to {AudioResponseFormat.Verbose_Json} to use timestamp granularities.");
+            }
+
+            TimestampGranularities = timestampGranularity;
        }
 
        ~AudioTranscriptionRequest() => Dispose(false);
 
@@ -157,6 +178,14 @@ public AudioTranscriptionRequest(
        /// </summary>
        public string Language { get; }
 
+        /// <summary>
+        /// The timestamp granularities to populate for this transcription.
+        /// response_format must be set to verbose_json to use timestamp granularities.
+        /// Either or both of these options are supported: <see cref="TimestampGranularity.Word"/>, or <see cref="TimestampGranularity.Segment"/>.
+        /// Note: There is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
+        /// </summary>
+        public TimestampGranularity TimestampGranularities { get; }
+
        private void Dispose(bool disposing)
        {
            if (disposing)
diff --git a/OpenAI-DotNet/Audio/AudioTranscriptionResponse.cs b/OpenAI-DotNet/Audio/AudioTranscriptionResponse.cs
new file mode 100644
index 00000000..10f2d03f
--- /dev/null
+++ b/OpenAI-DotNet/Audio/AudioTranscriptionResponse.cs
@@ -0,0 +1,47 @@
+// Licensed under the MIT License. See LICENSE in the project root for license information.
+
+using System.Text.Json.Serialization;
+
+namespace OpenAI.Audio
+{
+    public sealed class AudioTranscriptionResponse
+    {
+        /// <summary>
+        /// The language of the input audio.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("language")]
+        [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
+        public string Language { get; private set; }
+
+        /// <summary>
+        /// The duration of the input audio.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("duration")]
+        [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
+        public double? Duration { get; private set; }
+
+        /// <summary>
+        /// The transcribed text.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("text")]
+        public string Text { get; private set; }
+
+        /// <summary>
+        /// Extracted words and their corresponding timestamps.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("words")]
+        public TranscriptionWord[] Words { get; private set; }
+
+        /// <summary>
+        /// Segments of the transcribed text and their corresponding details.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("segments")]
+        [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
+        public TranscriptionSegment[] Segments { get; private set; }
+    }
+}
\ No newline at end of file
diff --git a/OpenAI-DotNet/Audio/AudioTranslationRequest.cs b/OpenAI-DotNet/Audio/AudioTranslationRequest.cs
index aacb7a58..502ed5de 100644
--- a/OpenAI-DotNet/Audio/AudioTranslationRequest.cs
+++ b/OpenAI-DotNet/Audio/AudioTranslationRequest.cs
@@ -33,7 +33,7 @@ public sealed class AudioTranslationRequest : IDisposable
        public AudioTranslationRequest(
            string audioPath,
            string model = null,
-            string prompt = null,
+            string prompt = "response should be in english.",
            AudioResponseFormat responseFormat = AudioResponseFormat.Json,
            float? temperature = null)
            : this(File.OpenRead(audioPath), Path.GetFileName(audioPath), model, prompt, responseFormat, temperature)
@@ -70,7 +70,7 @@ public AudioTranslationRequest(
            Stream audio,
            string audioName,
            string model = null,
-            string prompt = null,
+            string prompt = "response should be in english.",
            AudioResponseFormat responseFormat = AudioResponseFormat.Json,
            float? temperature = null)
        {
diff --git a/OpenAI-DotNet/Audio/TimestampGranularity.cs b/OpenAI-DotNet/Audio/TimestampGranularity.cs
new file mode 100644
index 00000000..dca4925e
--- /dev/null
+++ b/OpenAI-DotNet/Audio/TimestampGranularity.cs
@@ -0,0 +1,11 @@
+// Licensed under the MIT License. See LICENSE in the project root for license information.
+
+namespace OpenAI.Audio
+{
+    public enum TimestampGranularity
+    {
+        None = 0,
+        Word,
+        Segment
+    }
+}
\ No newline at end of file
diff --git a/OpenAI-DotNet/Audio/TranscriptionSegment.cs b/OpenAI-DotNet/Audio/TranscriptionSegment.cs
new file mode 100644
index 00000000..c29cea43
--- /dev/null
+++ b/OpenAI-DotNet/Audio/TranscriptionSegment.cs
@@ -0,0 +1,85 @@
+// Licensed under the MIT License. See LICENSE in the project root for license information.
+
+using System.Text.Json.Serialization;
+
+namespace OpenAI.Audio
+{
+    /// <summary>
+    /// A segment of the transcribed text and its corresponding details.
+    /// </summary>
+    public sealed class TranscriptionSegment
+    {
+        /// <summary>
+        /// Unique identifier of the segment.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("id")]
+        public int Id { get; private set; }
+
+        /// <summary>
+        /// Seek offset of the segment.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("seek")]
+        public int Seek { get; private set; }
+
+        /// <summary>
+        /// Start time of the segment in seconds.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("start")]
+        public double Start { get; private set; }
+
+        /// <summary>
+        /// End time of the segment in seconds.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("end")]
+        public double End { get; private set; }
+
+        /// <summary>
+        /// Text content of the segment.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("text")]
+        public string Text { get; private set; }
+
+        /// <summary>
+        /// Array of token IDs for the text content.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("tokens")]
+        public int[] Tokens { get; private set; }
+
+        /// <summary>
+        /// Temperature parameter used for generating the segment.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("temperature")]
+        public double Temperature { get; private set; }
+
+        /// <summary>
+        /// Average logprob of the segment.
+        /// If the value is lower than -1, consider the logprobs failed.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("avg_logprob")]
+        public double AverageLogProbability { get; private set; }
+
+        /// <summary>
+        /// Compression ratio of the segment.
+        /// If the value is greater than 2.4, consider the compression failed.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("compression_ratio")]
+        public double CompressionRatio { get; private set; }
+
+        /// <summary>
+        /// Probability of no speech in the segment.
+        /// If the value is higher than 1.0 and the avg_logprob is below -1, consider this segment silent.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("no_speech_prob")]
+        public double NoSpeechProbability { get; private set; }
+    }
+}
\ No newline at end of file
diff --git a/OpenAI-DotNet/Audio/TranscriptionWord.cs b/OpenAI-DotNet/Audio/TranscriptionWord.cs
new file mode 100644
index 00000000..1ede1282
--- /dev/null
+++ b/OpenAI-DotNet/Audio/TranscriptionWord.cs
@@ -0,0 +1,33 @@
+// Licensed under the MIT License. See LICENSE in the project root for license information.
+
+using System.Text.Json.Serialization;
+
+namespace OpenAI.Audio
+{
+    /// <summary>
+    /// An extracted word and its corresponding timestamps.
+    /// </summary>
+    public sealed class TranscriptionWord
+    {
+        /// <summary>
+        /// The text content of the word.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("word")]
+        public string Word { get; private set; }
+
+        /// <summary>
+        /// Start time of the word in seconds.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("start")]
+        public double Start { get; private set; }
+
+        /// <summary>
+        /// End time of the word in seconds.
+        /// </summary>
+        [JsonInclude]
+        [JsonPropertyName("end")]
+        public double End { get; private set; }
+    }
+}
\ No newline at end of file
diff --git a/OpenAI-DotNet/Extensions/ResponseExtensions.cs b/OpenAI-DotNet/Extensions/ResponseExtensions.cs
index a32ece1e..ad92252f 100644
--- a/OpenAI-DotNet/Extensions/ResponseExtensions.cs
+++ b/OpenAI-DotNet/Extensions/ResponseExtensions.cs
@@ -7,6 +7,7 @@
 using System.Linq;
 using System.Net.Http;
 using System.Net.Http.Headers;
+using System.Net.Http.Json;
 using System.Runtime.CompilerServices;
 using System.Text;
 using System.Text.Json;
@@ -124,7 +125,7 @@ internal static async Task<string> ReadAsStringAsync(this HttpResponseMessage re
            {
                debugMessage.Append($"[{response.RequestMessage.Method}:{(int)response.StatusCode}] {response.RequestMessage.RequestUri}\n");
 
-                debugMessageObject["Request"] = new()
+                debugMessageObject["Request"] = new Dictionary<string, object>
                {
                    ["Headers"] = response.RequestMessage.Headers.ToDictionary(pair => pair.Key, pair => pair.Value),
                };
@@ -132,7 +133,40 @@
 
                if (requestContent != null)
                {
-                    var requestAsString = await requestContent.ReadAsStringAsync(cancellationToken).ConfigureAwait(false);
+                    debugMessageObject["Request"]["Body-Headers"] = requestContent.Headers.ToDictionary(pair => pair.Key, pair => pair.Value);
+                    string requestAsString;
+
+                    if (requestContent is MultipartFormDataContent multipartFormData)
+                    {
+                        var stringContents = multipartFormData.Select<HttpContent, object>(content =>
+                        {
+                            var headers = content.Headers.ToDictionary(pair => pair.Key, pair => pair.Value);
+                            switch (content)
+                            {
+                                case StringContent stringContent:
+                                    var valueAsString = stringContent.ReadAsStringAsync(cancellationToken).Result;
+                                    object value;
+
+                                    try
+                                    {
+                                        value = JsonNode.Parse(valueAsString);
+                                    }
+                                    catch
+                                    {
+                                        value = valueAsString;
+                                    }
+
+                                    return new { headers, value };
+                                default:
+                                    return new { headers };
+                            }
+                        });
+                        requestAsString = JsonSerializer.Serialize(stringContents);
+                    }
+                    else
+                    {
+                        requestAsString = await requestContent.ReadAsStringAsync(cancellationToken).ConfigureAwait(false);
+                    }
 
                    if (!string.IsNullOrWhiteSpace(requestAsString))
                    {
diff --git a/OpenAI-DotNet/OpenAI-DotNet.csproj b/OpenAI-DotNet/OpenAI-DotNet.csproj
index 15d4c26f..4d2e69c8 100644
--- a/OpenAI-DotNet/OpenAI-DotNet.csproj
+++ b/OpenAI-DotNet/OpenAI-DotNet.csproj
@@ -28,8 +28,11 @@ More context [on Roger Pincombe's blog](https://rogerpincombe.com/openai-dotnet-
    OpenAI-DotNet.pfx
    True
    True
-    <Version>7.7.5</Version>
+    <Version>7.7.6</Version>
    <PackageReleaseNotes>
+Version 7.7.6
+- Added support for Audio Transcription and Translation verbose json output
+- Added support for timestamp granularities for segments and words
 Version 7.7.5
 - Allow FunctionPropertyAttribute to be assignable to fields
 - Updated Function schema generation

From 4a2293223ba3f7c7487acd9a6de29128eb89b277 Mon Sep 17 00:00:00 2001
From: Stephen Hodgson
Date: Mon, 18 Mar 2024 14:46:02 -0400
Subject: [PATCH 2/5] updated docs

---
 OpenAI-DotNet-Tests/TestFixture_07_Audio.cs | 50 ++++++++++-----------
 OpenAI-DotNet/OpenAI-DotNet.csproj          |  6 +++
 README.md                                   | 23 +++++++---
 3 files changed, 49 insertions(+), 30 deletions(-)

diff --git a/OpenAI-DotNet-Tests/TestFixture_07_Audio.cs b/OpenAI-DotNet-Tests/TestFixture_07_Audio.cs
index 022e5061..b389b58c 100644
--- a/OpenAI-DotNet-Tests/TestFixture_07_Audio.cs
+++ b/OpenAI-DotNet-Tests/TestFixture_07_Audio.cs
@@ -16,8 +16,8 @@ public async Task Test_01_01_Transcription_Text()
        Assert.IsNotNull(OpenAIClient.AudioEndpoint);
        var transcriptionAudio = 
Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3"); using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Text, temperature: 0.1f, language: "en"); - var result = await OpenAIClient.AudioEndpoint.CreateTranscriptionTextAsync(request); - Assert.IsNotNull(result); + var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionTextAsync(request); + Assert.IsNotNull(response); } [Test] @@ -26,8 +26,8 @@ public async Task Test_01_02_Transcription_Json() Assert.IsNotNull(OpenAIClient.AudioEndpoint); var transcriptionAudio = Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3"); using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Json, temperature: 0.1f, language: "en"); - var result = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request); - Assert.IsNotNull(result); + var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request); + Assert.IsNotNull(response); } [Test] @@ -36,12 +36,12 @@ public async Task Test_01_03_01_Transcription_VerboseJson() Assert.IsNotNull(OpenAIClient.AudioEndpoint); var transcriptionAudio = Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3"); using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json, temperature: 0.1f, language: "en"); - var result = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request); - Assert.IsNotNull(result); - Assert.IsNotNull(result.Duration); - Assert.IsTrue(result.Language == "english"); - Assert.IsNotNull(result.Segments); - Assert.IsNotEmpty(result.Segments); + var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request); + Assert.IsNotNull(response); + Assert.IsNotNull(response.Duration); + Assert.IsTrue(response.Language == "english"); + Assert.IsNotNull(response.Segments); + Assert.IsNotEmpty(response.Segments); } [Test] @@ -50,12 +50,12 @@ public async Task Test_01_03_02_Transcription_VerboseJson_WordSimilarities() Assert.IsNotNull(OpenAIClient.AudioEndpoint); var transcriptionAudio = Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3"); using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json, timestampGranularity: TimestampGranularity.Word, temperature: 0.1f, language: "en"); - var result = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request); - Assert.IsNotNull(result); - Assert.IsNotNull(result.Duration); - Assert.IsTrue(result.Language == "english"); - Assert.IsNotNull(result.Words); - Assert.IsNotEmpty(result.Words); + var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request); + Assert.IsNotNull(response); + Assert.IsNotNull(response.Duration); + Assert.IsTrue(response.Language == "english"); + Assert.IsNotNull(response.Words); + Assert.IsNotEmpty(response.Words); } [Test] @@ -64,8 +64,8 @@ public async Task Test_02_01_Translation_Text() Assert.IsNotNull(OpenAIClient.AudioEndpoint); var translationAudio = Path.GetFullPath("../../../Assets/Ja-botchan_1-1_1-2.mp3"); using var request = new AudioTranslationRequest(Path.GetFullPath(translationAudio), responseFormat: AudioResponseFormat.Text); - var result = await OpenAIClient.AudioEndpoint.CreateTranslationTextAsync(request); - Assert.IsNotNull(result); + var response = await OpenAIClient.AudioEndpoint.CreateTranslationTextAsync(request); + Assert.IsNotNull(response); } [Test] @@ -74,8 +74,8 @@ 
public async Task Test_02_02_Translation_Json()
        Assert.IsNotNull(OpenAIClient.AudioEndpoint);
        var translationAudio = Path.GetFullPath("../../../Assets/Ja-botchan_1-1_1-2.mp3");
        using var request = new AudioTranslationRequest(Path.GetFullPath(translationAudio), responseFormat: AudioResponseFormat.Json);
-        var result = await OpenAIClient.AudioEndpoint.CreateTranslationJsonAsync(request);
-        Assert.IsNotNull(result);
+        var response = await OpenAIClient.AudioEndpoint.CreateTranslationJsonAsync(request);
+        Assert.IsNotNull(response);
    }
 
    [Test]
@@ -84,8 +84,8 @@ public async Task Test_02_03_Translation_VerboseJson()
        Assert.IsNotNull(OpenAIClient.AudioEndpoint);
        var translationAudio = Path.GetFullPath("../../../Assets/Ja-botchan_1-1_1-2.mp3");
        using var request = new AudioTranslationRequest(Path.GetFullPath(translationAudio), responseFormat: AudioResponseFormat.Verbose_Json);
-        var result = await OpenAIClient.AudioEndpoint.CreateTranslationJsonAsync(request);
-        Assert.IsNotNull(result);
+        var response = await OpenAIClient.AudioEndpoint.CreateTranslationJsonAsync(request);
+        Assert.IsNotNull(response);
    }
 
    [Test]
@@ -99,9 +99,9 @@ async Task ChunkCallback(ReadOnlyMemory<byte> chunkCallback)
            await Task.CompletedTask;
        }
 
-        var result = await OpenAIClient.AudioEndpoint.CreateSpeechAsync(request, ChunkCallback);
-        Assert.IsFalse(result.IsEmpty);
-        await File.WriteAllBytesAsync("../../../Assets/HelloWorld.mp3", result.ToArray());
+        var response = await OpenAIClient.AudioEndpoint.CreateSpeechAsync(request, ChunkCallback);
+        Assert.IsFalse(response.IsEmpty);
+        await File.WriteAllBytesAsync("../../../Assets/HelloWorld.mp3", response.ToArray());
    }
 }
diff --git a/OpenAI-DotNet/OpenAI-DotNet.csproj b/OpenAI-DotNet/OpenAI-DotNet.csproj
index 4d2e69c8..add211fb 100644
--- a/OpenAI-DotNet/OpenAI-DotNet.csproj
+++ b/OpenAI-DotNet/OpenAI-DotNet.csproj
@@ -33,6 +33,12 @@ More context [on Roger Pincombe's blog](https://rogerpincombe.com/openai-dotnet-
 Version 7.7.6
 - Added support for Audio Transcription and Translation verbose json output
 - Added support for timestamp granularities for segments and words
+- Marked CreateTranscriptionAsync obsolete
+- Added CreateTranscriptionTextAsync
+- Added CreateTranscriptionJsonAsync
+- Marked CreateTranslationAsync obsolete
+- Added CreateTranslationTextAsync
+- Added CreateTranslationJsonAsync
 Version 7.7.5
 - Allow FunctionPropertyAttribute to be assignable to fields
 - Updated Function schema generation
diff --git a/README.md b/README.md
index 13d67f9c..2de84068 100644
--- a/README.md
+++ b/README.md
@@ -1068,19 +1068,32 @@ Transcribes audio into the input language.
 
 ```csharp
 using var api = new OpenAIClient();
-var request = new AudioTranscriptionRequest(Path.GetFullPath(audioAssetPath), language: "en");
-var response = await api.AudioEndpoint.CreateTranscriptionAsync(request);
+using var request = new AudioTranscriptionRequest(Path.GetFullPath(audioAssetPath), language: "en");
+var response = await api.AudioEndpoint.CreateTranscriptionTextAsync(request);
 Console.WriteLine(response);
 ```
+
+You can also get detailed information, including timestamp granularities, by requesting `verbose_json` output:
+
+```csharp
+using var api = new OpenAIClient();
+using var request = new AudioTranscriptionRequest(Path.GetFullPath(audioAssetPath), responseFormat: AudioResponseFormat.Verbose_Json, timestampGranularity: TimestampGranularity.Word, temperature: 0.1f, language: "en");
+var response = await api.AudioEndpoint.CreateTranscriptionJsonAsync(request);
+
+foreach (var word in response.Words)
+{
+    Console.WriteLine($"[{word.Start}-{word.End}] \"{word.Word}\"");
+}
+```
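+
+Segment timestamps are requested the same way. The following is a minimal sketch, not part of the original docs change; it assumes the same `audioAssetPath` as above and swaps in the `TimestampGranularity.Segment` option:
+
+```csharp
+using var api = new OpenAIClient();
+using var request = new AudioTranscriptionRequest(Path.GetFullPath(audioAssetPath), responseFormat: AudioResponseFormat.Verbose_Json, timestampGranularity: TimestampGranularity.Segment, language: "en");
+var response = await api.AudioEndpoint.CreateTranscriptionJsonAsync(request);
+
+// Each segment exposes its text plus metadata such as NoSpeechProbability and CompressionRatio.
+foreach (var segment in response.Segments)
+{
+    Console.WriteLine($"[{segment.Start}-{segment.End}] \"{segment.Text}\"");
+}
+```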
 
 #### [Create Translation](https://platform.openai.com/docs/api-reference/audio/createTranslation)
 
 Translates audio into English.
 
 ```csharp
 using var api = new OpenAIClient();
-var request = new AudioTranslationRequest(Path.GetFullPath(audioAssetPath));
-var response = await api.AudioEndpoint.CreateTranslationAsync(request);
+using var request = new AudioTranslationRequest(Path.GetFullPath(audioAssetPath));
+var response = await api.AudioEndpoint.CreateTranslationTextAsync(request);
 Console.WriteLine(response);
 ```
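+
+The JSON variants return the full response object rather than just the text. A minimal sketch of a `verbose_json` translation, again assuming the same `audioAssetPath`:
+
+```csharp
+using var api = new OpenAIClient();
+using var request = new AudioTranslationRequest(Path.GetFullPath(audioAssetPath), responseFormat: AudioResponseFormat.Verbose_Json);
+var response = await api.AudioEndpoint.CreateTranslationJsonAsync(request);
+
+// Duration and the detected input language are only populated for verbose_json responses.
+Console.WriteLine($"{response.Language} ({response.Duration}s)");
+Console.WriteLine(response.Text);
+```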
 
@@ -1186,7 +1199,7 @@ Returns information about a specific file.
 
 ```csharp
 using var api = new OpenAIClient();
-var file = await GetFileInfoAsync(fileId);
+var file = await api.FilesEndpoint.GetFileInfoAsync(fileId);
 Console.WriteLine($"{file.Id} -> {file.Object}: {file.FileName} | {file.Size} bytes");
 ```

From a78d29bfb359f4cbee1da19ec224176f532157b2 Mon Sep 17 00:00:00 2001
From: Stephen Hodgson
Date: Mon, 18 Mar 2024 14:58:09 -0400
Subject: [PATCH 3/5] update test

---
 OpenAI-DotNet-Tests/TestFixture_07_Audio.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/OpenAI-DotNet-Tests/TestFixture_07_Audio.cs b/OpenAI-DotNet-Tests/TestFixture_07_Audio.cs
index b389b58c..6c5d83c3 100644
--- a/OpenAI-DotNet-Tests/TestFixture_07_Audio.cs
+++ b/OpenAI-DotNet-Tests/TestFixture_07_Audio.cs
@@ -26,7 +26,7 @@ public async Task Test_01_02_Transcription_Json()
        Assert.IsNotNull(OpenAIClient.AudioEndpoint);
        var transcriptionAudio = Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3");
        using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Json, temperature: 0.1f, language: "en");
-        var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request);
+        var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionTextAsync(request);
        Assert.IsNotNull(response);
    }
 
From 17bdd78d058a974059c402af7d0825126ac24fda Mon Sep 17 00:00:00 2001
From: Stephen Hodgson
Date: Mon, 18 Mar 2024 15:24:02 -0400
Subject: [PATCH 4/5] renamed response object

---
 OpenAI-DotNet/Audio/AudioEndpoint.cs               | 14 +++++++-------
 ...ioTranscriptionResponse.cs => AudioResponse.cs} |  2 +-
 OpenAI-DotNet/OpenAI-DotNet.csproj                 |  1 +
 3 files changed, 9 insertions(+), 8 deletions(-)
 rename OpenAI-DotNet/Audio/{AudioTranscriptionResponse.cs => AudioResponse.cs} (96%)

diff --git a/OpenAI-DotNet/Audio/AudioEndpoint.cs b/OpenAI-DotNet/Audio/AudioEndpoint.cs
index 2798cdc9..36fed29a 100644
--- a/OpenAI-DotNet/Audio/AudioEndpoint.cs
+++ b/OpenAI-DotNet/Audio/AudioEndpoint.cs
@@ -77,7 +77,7 @@ public async Task<string> CreateTranscriptionTextAsync(AudioTranscriptionRequest
        {
            var responseAsString = await Internal_CreateTranscriptionAsync(request, cancellationToken).ConfigureAwait(false);
            return request.ResponseFormat is AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json
-                ? JsonSerializer.Deserialize<AudioTranscriptionResponse>(responseAsString)?.Text
+                ? JsonSerializer.Deserialize<AudioResponse>(responseAsString)?.Text
                : responseAsString;
        }
 
@@ -87,8 +87,8 @@ public async Task<string> CreateTranscriptionTextAsync(AudioTranscriptionRequest
        /// <remarks>This method expects the request format to be either <see cref="AudioResponseFormat.Json"/> or <see cref="AudioResponseFormat.Verbose_Json"/>.</remarks>
        /// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
        /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
-        /// <returns><see cref="AudioTranscriptionResponse"/>.</returns>
-        public async Task<AudioTranscriptionResponse> CreateTranscriptionJsonAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+        /// <returns><see cref="AudioResponse"/>.</returns>
+        public async Task<AudioResponse> CreateTranscriptionJsonAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
        {
            if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))
            {
@@ -96,7 +96,7 @@ public async Task<AudioTranscriptionResponse> CreateTranscriptionJsonAsync(Audio
            }
 
            var responseAsString = await Internal_CreateTranscriptionAsync(request, cancellationToken).ConfigureAwait(false);
-            return JsonSerializer.Deserialize<AudioTranscriptionResponse>(responseAsString);
+            return JsonSerializer.Deserialize<AudioResponse>(responseAsString);
        }
 
@@ -153,7 +153,7 @@ public async Task<string> CreateTranslationTextAsync(AudioTranslationRequest req
        {
            var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken).ConfigureAwait(false);
            return request.ResponseFormat is AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json
-                ? JsonSerializer.Deserialize<AudioTranscriptionResponse>(responseAsString)?.Text
+                ? JsonSerializer.Deserialize<AudioResponse>(responseAsString)?.Text
                : responseAsString;
        }
 
@@ -164,7 +164,7 @@ public async Task<string> CreateTranslationTextAsync(AudioTranslationRequest req
        /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
        /// <remarks>This method expects the request format to be either <see cref="AudioResponseFormat.Json"/> or <see cref="AudioResponseFormat.Verbose_Json"/>.</remarks>
        /// <returns>The translation response object.</returns>
-        public async Task<AudioTranscriptionResponse> CreateTranslationJsonAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+        public async Task<AudioResponse> CreateTranslationJsonAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
        {
            if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))
            {
@@ -172,7 +172,7 @@ public async Task<AudioTranscriptionResponse> CreateTranslationJsonAsync(AudioTr
            }
 
            var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken).ConfigureAwait(false);
-            return JsonSerializer.Deserialize<AudioTranscriptionResponse>(responseAsString);
+            return JsonSerializer.Deserialize<AudioResponse>(responseAsString);
        }
 
diff --git a/OpenAI-DotNet/Audio/AudioTranscriptionResponse.cs b/OpenAI-DotNet/Audio/AudioResponse.cs
similarity index 96%
rename from OpenAI-DotNet/Audio/AudioTranscriptionResponse.cs
rename to OpenAI-DotNet/Audio/AudioResponse.cs
index 10f2d03f..4b43c86a 100644
--- a/OpenAI-DotNet/Audio/AudioTranscriptionResponse.cs
+++ b/OpenAI-DotNet/Audio/AudioResponse.cs
@@ -4,7 +4,7 @@
 
 namespace OpenAI.Audio
 {
-    public sealed class AudioTranscriptionResponse
+    public sealed class AudioResponse
    {
        /// <summary>
        /// The language of the input audio.
diff --git a/OpenAI-DotNet/OpenAI-DotNet.csproj b/OpenAI-DotNet/OpenAI-DotNet.csproj
index add211fb..6818f371 100644
--- a/OpenAI-DotNet/OpenAI-DotNet.csproj
+++ b/OpenAI-DotNet/OpenAI-DotNet.csproj
@@ -33,6 +33,7 @@ More context [on Roger Pincombe's blog](https://rogerpincombe.com/openai-dotnet-
 Version 7.7.6
 - Added support for Audio Transcription and Translation verbose json output
 - Added support for timestamp granularities for segments and words
+- Added AudioResponse
 - Marked CreateTranscriptionAsync obsolete
 - Added CreateTranscriptionTextAsync
 - Added CreateTranscriptionJsonAsync

From e72c0a701ca78bf49e13a568d8ac3b8d5b8f8d6a Mon Sep 17 00:00:00 2001
From: Stephen Hodgson
Date: Mon, 18 Mar 2024 21:33:48 -0400
Subject: [PATCH 5/5] Updated SpeechResponseFormat to include wav and pcm

---
 OpenAI-DotNet/Audio/SpeechRequest.cs           | 4 ++--
 OpenAI-DotNet/Audio/SpeechResponseFormat.cs    | 6 +++++-
 OpenAI-DotNet/Extensions/ResponseExtensions.cs | 1 -
 OpenAI-DotNet/OpenAI-DotNet.csproj             | 1 +
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/OpenAI-DotNet/Audio/SpeechRequest.cs b/OpenAI-DotNet/Audio/SpeechRequest.cs
index 261fcf9c..12978d23 100644
--- a/OpenAI-DotNet/Audio/SpeechRequest.cs
+++ b/OpenAI-DotNet/Audio/SpeechRequest.cs
@@ -14,7 +14,7 @@ public sealed class SpeechRequest
    /// <param name="input">The text to generate audio for. The maximum length is 4096 characters.</param>
    /// <param name="model">One of the available TTS models. Defaults to tts-1.</param>
    /// <param name="voice">The voice to use when generating the audio.</param>
-    /// <param name="responseFormat">The format to return audio in. Supported formats are mp3, opus, aac, and flac.</param>
+    /// <param name="responseFormat">The format to return audio in. Supported formats are mp3, opus, aac, flac, wav and pcm.</param>
    /// <param name="speed">The speed of the generated audio. Select a value from 0.25 to 4.0. 1.0 is the default.</param>
    public SpeechRequest(string input, Model model = null, SpeechVoice voice = SpeechVoice.Alloy, SpeechResponseFormat responseFormat = SpeechResponseFormat.MP3, float? speed = null)
    {
@@ -44,7 +44,7 @@ public SpeechRequest(string input, Model model = null, SpeechVoice voice = Speec
    public SpeechVoice Voice { get; }
 
    /// <summary>
-    /// The format to return audio in. Supported formats are mp3, opus, aac, and flac.
+    /// The format to return audio in. Supported formats are mp3, opus, aac, flac, wav and pcm.
    /// </summary>
    [JsonPropertyName("response_format")]
    [JsonIgnore(Condition = JsonIgnoreCondition.Never)]
    public SpeechResponseFormat ResponseFormat { get; }
diff --git a/OpenAI-DotNet/Audio/SpeechResponseFormat.cs b/OpenAI-DotNet/Audio/SpeechResponseFormat.cs
index 43fc4205..2b3afe4d 100644
--- a/OpenAI-DotNet/Audio/SpeechResponseFormat.cs
+++ b/OpenAI-DotNet/Audio/SpeechResponseFormat.cs
@@ -13,6 +13,10 @@ public enum SpeechResponseFormat
        [EnumMember(Value = "aac")]
        AAC,
        [EnumMember(Value = "flac")]
-        Flac
+        Flac,
+        [EnumMember(Value = "wav")]
+        WAV,
+        [EnumMember(Value = "pcm")]
+        PCM
    }
 }
diff --git a/OpenAI-DotNet/Extensions/ResponseExtensions.cs b/OpenAI-DotNet/Extensions/ResponseExtensions.cs
index ad92252f..f55f76e0 100644
--- a/OpenAI-DotNet/Extensions/ResponseExtensions.cs
+++ b/OpenAI-DotNet/Extensions/ResponseExtensions.cs
@@ -7,7 +7,6 @@
 using System.Linq;
 using System.Net.Http;
 using System.Net.Http.Headers;
-using System.Net.Http.Json;
 using System.Runtime.CompilerServices;
 using System.Text;
 using System.Text.Json;
diff --git a/OpenAI-DotNet/OpenAI-DotNet.csproj b/OpenAI-DotNet/OpenAI-DotNet.csproj
index 6818f371..4af05b89 100644
--- a/OpenAI-DotNet/OpenAI-DotNet.csproj
+++ b/OpenAI-DotNet/OpenAI-DotNet.csproj
@@ -40,6 +40,7 @@ Version 7.7.6
 - Marked CreateTranslationAsync obsolete
 - Added CreateTranslationTextAsync
 - Added CreateTranslationJsonAsync
+- Updated SpeechResponseFormat to include wav and pcm
 Version 7.7.5
 - Allow FunctionPropertyAttribute to be assignable to fields
 - Updated Function schema generation