OpenAI-DotNet 7.7.6 (#268)
- Added support for Audio Transcription and Translation verbose json output
  - Added support for timestamp granularities for segments and words
  - Marked CreateTranscriptionAsync obsolete
  - Added CreateTranscriptionTextAsync
  - Added CreateTranscriptionJsonAsync
  - Marked CreateTranslationAsync obsolete
  - Added CreateTranslationTextAsync
  - Added CreateTranslationJsonAsync
- Updated SpeechResponseFormat to include wav and pcm
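
For context, a minimal usage sketch of the new audio API surface described above (the client construction, file path, and printed fields are illustrative assumptions, not part of this commit):

using System;
using OpenAI;
using OpenAI.Audio;

var api = new OpenAIClient();

// Plain text via the new *TextAsync variant.
using var textRequest = new AudioTranscriptionRequest("audio.mp3", responseFormat: AudioResponseFormat.Text);
var text = await api.AudioEndpoint.CreateTranscriptionTextAsync(textRequest);
Console.WriteLine(text);

// Verbose JSON with word-level timestamps via the new *JsonAsync variant.
using var jsonRequest = new AudioTranscriptionRequest(
    "audio.mp3",
    responseFormat: AudioResponseFormat.Verbose_Json,
    timestampGranularity: TimestampGranularity.Word);
var response = await api.AudioEndpoint.CreateTranscriptionJsonAsync(jsonRequest);
Console.WriteLine($"{response.Language} ({response.Duration}s): {response.Text}");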
StephenHodgson authored Mar 19, 2024
1 parent c17dee4 commit 9124a33
Showing 13 changed files with 425 additions and 55 deletions.
82 changes: 69 additions & 13 deletions OpenAI-DotNet-Tests/TestFixture_07_Audio.cs
@@ -11,25 +11,81 @@ namespace OpenAI.Tests
     internal class TestFixture_07_Audio : AbstractTestFixture
     {
         [Test]
-        public async Task Test_1_Transcription()
+        public async Task Test_01_01_Transcription_Text()
         {
             Assert.IsNotNull(OpenAIClient.AudioEndpoint);
             var transcriptionAudio = Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3");
-            using var request = new AudioTranscriptionRequest(transcriptionAudio, temperature: 0.1f, language: "en");
-            var result = await OpenAIClient.AudioEndpoint.CreateTranscriptionAsync(request);
-            Assert.IsNotNull(result);
-            Console.WriteLine(result);
+            using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Text, temperature: 0.1f, language: "en");
+            var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionTextAsync(request);
+            Assert.IsNotNull(response);
         }
 
         [Test]
-        public async Task Test_2_Translation()
+        public async Task Test_01_02_Transcription_Json()
         {
             Assert.IsNotNull(OpenAIClient.AudioEndpoint);
+            var transcriptionAudio = Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3");
+            using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Json, temperature: 0.1f, language: "en");
+            var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionTextAsync(request);
+            Assert.IsNotNull(response);
+        }
+
+        [Test]
+        public async Task Test_01_03_01_Transcription_VerboseJson()
+        {
+            Assert.IsNotNull(OpenAIClient.AudioEndpoint);
+            var transcriptionAudio = Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3");
+            using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json, temperature: 0.1f, language: "en");
+            var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request);
+            Assert.IsNotNull(response);
+            Assert.IsNotNull(response.Duration);
+            Assert.IsTrue(response.Language == "english");
+            Assert.IsNotNull(response.Segments);
+            Assert.IsNotEmpty(response.Segments);
+        }
+
+        [Test]
+        public async Task Test_01_03_02_Transcription_VerboseJson_WordSimilarities()
+        {
+            Assert.IsNotNull(OpenAIClient.AudioEndpoint);
+            var transcriptionAudio = Path.GetFullPath("../../../Assets/T3mt39YrlyLoq8laHSdf.mp3");
+            using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json, timestampGranularity: TimestampGranularity.Word, temperature: 0.1f, language: "en");
+            var response = await OpenAIClient.AudioEndpoint.CreateTranscriptionJsonAsync(request);
+            Assert.IsNotNull(response);
+            Assert.IsNotNull(response.Duration);
+            Assert.IsTrue(response.Language == "english");
+            Assert.IsNotNull(response.Words);
+            Assert.IsNotEmpty(response.Words);
+        }
+
+        [Test]
+        public async Task Test_02_01_Translation_Text()
+        {
+            Assert.IsNotNull(OpenAIClient.AudioEndpoint);
+            var translationAudio = Path.GetFullPath("../../../Assets/Ja-botchan_1-1_1-2.mp3");
+            using var request = new AudioTranslationRequest(Path.GetFullPath(translationAudio), responseFormat: AudioResponseFormat.Text);
+            var response = await OpenAIClient.AudioEndpoint.CreateTranslationTextAsync(request);
+            Assert.IsNotNull(response);
+        }
+
+        [Test]
+        public async Task Test_02_02_Translation_Json()
+        {
+            Assert.IsNotNull(OpenAIClient.AudioEndpoint);
+            var translationAudio = Path.GetFullPath("../../../Assets/Ja-botchan_1-1_1-2.mp3");
+            using var request = new AudioTranslationRequest(Path.GetFullPath(translationAudio), responseFormat: AudioResponseFormat.Json);
+            var response = await OpenAIClient.AudioEndpoint.CreateTranslationJsonAsync(request);
+            Assert.IsNotNull(response);
+        }
+
+        [Test]
+        public async Task Test_02_03_Translation_VerboseJson()
+        {
+            Assert.IsNotNull(OpenAIClient.AudioEndpoint);
             var translationAudio = Path.GetFullPath("../../../Assets/Ja-botchan_1-1_1-2.mp3");
-            using var request = new AudioTranslationRequest(Path.GetFullPath(translationAudio));
-            var result = await OpenAIClient.AudioEndpoint.CreateTranslationAsync(request);
-            Assert.IsNotNull(result);
-            Console.WriteLine(result);
+            using var request = new AudioTranslationRequest(Path.GetFullPath(translationAudio), responseFormat: AudioResponseFormat.Verbose_Json);
+            var response = await OpenAIClient.AudioEndpoint.CreateTranslationJsonAsync(request);
+            Assert.IsNotNull(response);
         }
 
         [Test]
@@ -43,9 +99,9 @@ async Task ChunkCallback(ReadOnlyMemory<byte> chunkCallback)
                 await Task.CompletedTask;
             }
 
-            var result = await OpenAIClient.AudioEndpoint.CreateSpeechAsync(request, ChunkCallback);
-            Assert.IsFalse(result.IsEmpty);
-            await File.WriteAllBytesAsync("../../../Assets/HelloWorld.mp3", result.ToArray());
+            var response = await OpenAIClient.AudioEndpoint.CreateSpeechAsync(request, ChunkCallback);
+            Assert.IsFalse(response.IsEmpty);
+            await File.WriteAllBytesAsync("../../../Assets/HelloWorld.mp3", response.ToArray());
         }
     }
 }
100 changes: 74 additions & 26 deletions OpenAI-DotNet/Audio/AudioEndpoint.cs
@@ -5,7 +5,6 @@
 using System.IO;
 using System.Net.Http;
 using System.Text.Json;
-using System.Text.Json.Serialization;
 using System.Threading;
 using System.Threading.Tasks;
 
@@ -17,17 +16,6 @@ namespace OpenAI.Audio
     /// </summary>
     public sealed class AudioEndpoint : OpenAIBaseEndpoint
     {
-        private class AudioResponse
-        {
-            public AudioResponse(string text)
-            {
-                Text = text;
-            }
-
-            [JsonPropertyName("text")]
-            public string Text { get; }
-        }
-
         /// <inheritdoc />
         public AudioEndpoint(OpenAIClient client) : base(client) { }
 
@@ -75,44 +63,96 @@ public async Task<ReadOnlyMemory<byte>> CreateSpeechAsync(SpeechRequest request,
             return new ReadOnlyMemory<byte>(memoryStream.GetBuffer(), 0, totalBytesRead);
         }
 
+        [Obsolete("Use CreateTranscriptionTextAsync or CreateTranscriptionJsonAsync instead.")]
+        public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+            => await CreateTranscriptionTextAsync(request, cancellationToken).ConfigureAwait(false);
+
         /// <summary>
         /// Transcribes audio into the input language.
         /// </summary>
         /// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
         /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
         /// <returns>The transcribed text.</returns>
-        public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+        public async Task<string> CreateTranscriptionTextAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
         {
+            var responseAsString = await Internal_CreateTranscriptionAsync(request, cancellationToken).ConfigureAwait(false);
+            return request.ResponseFormat is AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json
+                ? JsonSerializer.Deserialize<AudioResponse>(responseAsString)?.Text
+                : responseAsString;
+        }
+
+        /// <summary>
+        /// Transcribes audio into the input language.
+        /// </summary>
+        /// <remarks>This method expects the request format to be either <see cref="AudioResponseFormat.Json"/> or <see cref="AudioResponseFormat.Verbose_Json"/>.</remarks>
+        /// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
+        /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
+        /// <returns><see cref="AudioResponse"/>.</returns>
+        public async Task<AudioResponse> CreateTranscriptionJsonAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+        {
+            if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))
+            {
+                throw new ArgumentException("Response format must be Json or Verbose Json.", nameof(request.ResponseFormat));
+            }
+
+            var responseAsString = await Internal_CreateTranscriptionAsync(request, cancellationToken).ConfigureAwait(false);
+            return JsonSerializer.Deserialize<AudioResponse>(responseAsString);
+        }
+
+        private async Task<string> Internal_CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+        {
             using var content = new MultipartFormDataContent();
             using var audioData = new MemoryStream();
             await request.Audio.CopyToAsync(audioData, cancellationToken).ConfigureAwait(false);
             content.Add(new ByteArrayContent(audioData.ToArray()), "file", request.AudioName);
             content.Add(new StringContent(request.Model), "model");
 
+            if (!string.IsNullOrWhiteSpace(request.Language))
+            {
+                content.Add(new StringContent(request.Language), "language");
+            }
+
             if (!string.IsNullOrWhiteSpace(request.Prompt))
            {
                 content.Add(new StringContent(request.Prompt), "prompt");
             }
 
-            var responseFormat = request.ResponseFormat;
-            content.Add(new StringContent(responseFormat.ToString().ToLower()), "response_format");
+            content.Add(new StringContent(request.ResponseFormat.ToString().ToLower()), "response_format");
 
             if (request.Temperature.HasValue)
             {
                 content.Add(new StringContent(request.Temperature.ToString()), "temperature");
             }
 
-            if (!string.IsNullOrWhiteSpace(request.Language))
+            switch (request.TimestampGranularities)
             {
-                content.Add(new StringContent(request.Language), "language");
+                case TimestampGranularity.Segment:
+                case TimestampGranularity.Word:
+                    content.Add(new StringContent(request.TimestampGranularities.ToString().ToLower()), "timestamp_granularities[]");
+                    break;
             }
 
             request.Dispose();
 
             using var response = await client.Client.PostAsync(GetUrl("/transcriptions"), content, cancellationToken).ConfigureAwait(false);
             var responseAsString = await response.ReadAsStringAsync(EnableDebug, content, null, cancellationToken).ConfigureAwait(false);
+            return responseAsString;
+        }
 
-            return responseFormat == AudioResponseFormat.Json
+        [Obsolete("Use CreateTranslationTextAsync or CreateTranslationJsonAsync instead.")]
+        public async Task<string> CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+            => await CreateTranslationTextAsync(request, cancellationToken).ConfigureAwait(false);
+
+        /// <summary>
+        /// Translates audio into English.
+        /// </summary>
+        /// <param name="request"></param>
+        /// <param name="cancellationToken"></param>
+        /// <returns>The translated text.</returns>
+        public async Task<string> CreateTranslationTextAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+        {
+            var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken).ConfigureAwait(false);
+            return request.ResponseFormat is AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json
                 ? JsonSerializer.Deserialize<AudioResponse>(responseAsString)?.Text
                 : responseAsString;
         }
@@ -122,8 +162,20 @@ public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest req
         /// </summary>
         /// <param name="request"></param>
         /// <param name="cancellationToken"></param>
-        /// <returns>The translated text.</returns>
-        public async Task<string> CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+        /// <returns></returns>
+        /// <exception cref="ArgumentException"></exception>
+        public async Task<AudioResponse> CreateTranslationJsonAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
         {
+            if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))
+            {
+                throw new ArgumentException("Response format must be Json or Verbose Json.", nameof(request.ResponseFormat));
+            }
+
+            var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken).ConfigureAwait(false);
+            return JsonSerializer.Deserialize<AudioResponse>(responseAsString);
+        }
+
+        private async Task<string> Internal_CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+        {
             using var content = new MultipartFormDataContent();
             using var audioData = new MemoryStream();
@@ -136,8 +188,7 @@ public async Task<string> CreateTranslationAsync(AudioTranslationRequest request
                 content.Add(new StringContent(request.Prompt), "prompt");
             }
 
-            var responseFormat = request.ResponseFormat;
-            content.Add(new StringContent(responseFormat.ToString().ToLower()), "response_format");
+            content.Add(new StringContent(request.ResponseFormat.ToString().ToLower()), "response_format");
 
             if (request.Temperature.HasValue)
             {
@@ -148,10 +199,7 @@ public async Task<string> CreateTranslationAsync(AudioTranslationRequest request
 
             using var response = await client.Client.PostAsync(GetUrl("/translations"), content, cancellationToken).ConfigureAwait(false);
             var responseAsString = await response.ReadAsStringAsync(EnableDebug, content, null, cancellationToken).ConfigureAwait(false);
-
-            return responseFormat == AudioResponseFormat.Json
-                ? JsonSerializer.Deserialize<AudioResponse>(responseAsString)?.Text
-                : responseAsString;
+            return responseAsString;
         }
     }
 }
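
To make the new split concrete: the *JsonAsync variants now validate the request format before sending, as sketched below (the client setup and file path are assumptions for the example, not part of the commit):

using System;
using OpenAI;
using OpenAI.Audio;

var api = new OpenAIClient();

// Text is not a JSON format, so the Json variant rejects it before any network call.
using var request = new AudioTranslationRequest("speech.mp3", responseFormat: AudioResponseFormat.Text);

try
{
    var response = await api.AudioEndpoint.CreateTranslationJsonAsync(request);
}
catch (ArgumentException e)
{
    // Response format must be Json or Verbose Json. (Parameter 'ResponseFormat')
    Console.WriteLine(e.Message);
}

The text variants remain lenient: CreateTranslationTextAsync accepts any response format and simply unwraps the "text" field when the format is Json or Verbose_Json.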
47 changes: 47 additions & 0 deletions OpenAI-DotNet/Audio/AudioResponse.cs
@@ -0,0 +1,47 @@
// Licensed under the MIT License. See LICENSE in the project root for license information.

using System.Text.Json.Serialization;

namespace OpenAI.Audio
{
public sealed class AudioResponse
{
/// <summary>
/// The language of the input audio.
/// </summary>
[JsonInclude]
[JsonPropertyName("language")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
public string Language { get; private set; }

/// <summary>
/// The duration of the input audio.
/// </summary>
[JsonInclude]
[JsonPropertyName("duration")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
public double? Duration { get; private set; }

/// <summary>
/// The transcribed text.
/// </summary>
[JsonInclude]
[JsonPropertyName("text")]
public string Text { get; private set; }

/// <summary>
/// Extracted words and their corresponding timestamps.
/// </summary>
[JsonInclude]
[JsonPropertyName("words")]
public TranscriptionWord[] Words { get; private set; }

/// <summary>
/// Segments of the transcribed text and their corresponding details.
/// </summary>
[JsonInclude]
[JsonPropertyName("segments")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
public TranscriptionSegment[] Segments { get; private set; }
}
}
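
Because every property is marked [JsonInclude] with a private setter, System.Text.Json can hydrate AudioResponse directly from the wire payload; a small round-trip sketch against an abbreviated, invented verbose_json body:

using System;
using System.Text.Json;
using OpenAI.Audio;

const string json = @"{""language"":""english"",""duration"":1.1,""text"":""Hello world.""}";
var response = JsonSerializer.Deserialize<AudioResponse>(json);
Console.WriteLine($"{response.Language}: {response.Text} ({response.Duration}s)"); // english: Hello world. (1.1s)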
