diff --git a/.github/workflows/dotnet-demos.yml b/.github/workflows/dotnet-demos.yml index 8a3d9b83..cdf6b939 100644 --- a/.github/workflows/dotnet-demos.yml +++ b/.github/workflows/dotnet-demos.yml @@ -38,6 +38,15 @@ jobs: - name: Package restore run: dotnet restore + # *********** REMOVE AFTER RELEASE ********************** + - name: Pack binding for local ref + run: dotnet pack -c Release + working-directory: binding/dotnet + + - name: Add binding to demo + run: dotnet add package -s ../../../binding/dotnet/Leopard/bin/Release Leopard + # ****************************************************** + - name: Dotnet build micdemo run: dotnet build -c MicDemo.Release @@ -60,6 +69,15 @@ jobs: - name: Package restore run: dotnet restore + # *********** REMOVE AFTER RELEASE ********************** + - name: Pack binding for local ref + run: dotnet pack -c Release + working-directory: binding/dotnet + + - name: Add binding to demo + run: dotnet add package -s ../../../binding/dotnet/Leopard/bin/Release Leopard + # ****************************************************** + - name: Dotnet build micdemo run: dotnet build -c MicDemo.Release diff --git a/binding/dotnet/Leopard/Leopard.cs b/binding/dotnet/Leopard/Leopard.cs index beaf8629..75d0877d 100644 --- a/binding/dotnet/Leopard/Leopard.cs +++ b/binding/dotnet/Leopard/Leopard.cs @@ -10,7 +10,6 @@ specific language governing permissions and limitations under the License. */ using System; -using System.Collections.Generic; using System.IO; using System.Reflection; using System.Runtime.InteropServices; @@ -42,9 +41,9 @@ public enum PvStatus public class Leopard : IDisposable { private const string LIBRARY = "libpv_leopard"; - private IntPtr _libraryPointer = IntPtr.Zero; public static readonly string DEFAULT_MODEL_PATH; + private IntPtr _libraryPointer; static Leopard() { @@ -62,7 +61,7 @@ static Leopard() private static IntPtr ImportResolver(string libraryName, Assembly assembly, DllImportSearchPath? searchPath) { - IntPtr libHandle = IntPtr.Zero; + IntPtr libHandle; NativeLibrary.TryLoad(Utils.PvLibraryPath(libraryName), out libHandle); return libHandle; } @@ -73,12 +72,10 @@ private static IntPtr ImportResolver(string libraryName, Assembly assembly, DllI private static extern PvStatus pv_leopard_init( IntPtr accessKey, IntPtr modelPath, - bool enable_automatic_punctuation, + bool enableAutomaticPunctuation, + bool enableDiarization, out IntPtr handle); - [DllImport(LIBRARY, CallingConvention = CallingConvention.Cdecl)] - private static extern Int32 pv_sample_rate(); - [DllImport(LIBRARY, CallingConvention = CallingConvention.Cdecl)] private static extern void pv_leopard_delete(IntPtr handle); @@ -108,16 +105,29 @@ private static extern PvStatus pv_leopard_process_file( [DllImport(LIBRARY, CallingConvention = CallingConvention.Cdecl)] private static extern IntPtr pv_leopard_version(); + [DllImport(LIBRARY, CallingConvention = CallingConvention.Cdecl)] + private static extern Int32 pv_sample_rate(); + + [DllImport(LIBRARY, CallingConvention = CallingConvention.Cdecl)] + private static extern void pv_set_sdk(string sdk); + + [DllImport(LIBRARY, CallingConvention = CallingConvention.Cdecl)] + private static extern PvStatus pv_get_error_stack(out IntPtr messageStack, out int messageStackDepth); + + [DllImport(LIBRARY, CallingConvention = CallingConvention.Cdecl)] + private static extern void pv_free_error_stack(IntPtr messageStack); + /// /// C Struct for storing word metadata /// [StructLayout(LayoutKind.Sequential, CharSet = CharSet.Ansi)] private struct CWord { - public IntPtr wordPtr; - public float startSec; - public float endSec; - public float confidence; + public readonly IntPtr wordPtr; + public readonly float startSec; + public readonly float endSec; + public readonly float confidence; + public readonly Int32 speakerTag; } /// @@ -131,10 +141,22 @@ private struct CWord /// /// Set to `true` to enable automatic punctuation insertion. /// + /// + /// Set to `true` to enable speaker diarization, which allows Leopard to differentiate speakers as + /// part of the transcription process. Word metadata will include a `SpeakerTag` to identify unique speakers. + /// /// An instance of Leopard Speech-to-Text engine. - public static Leopard Create(string accessKey, string modelPath = null, bool enableAutomaticPunctuation = false) + public static Leopard Create( + string accessKey, + string modelPath = null, + bool enableAutomaticPunctuation = false, + bool enableDiarization = false) { - return new Leopard(accessKey, modelPath ?? DEFAULT_MODEL_PATH, enableAutomaticPunctuation); + return new Leopard( + accessKey, + modelPath ?? DEFAULT_MODEL_PATH, + enableAutomaticPunctuation, + enableDiarization); } /// @@ -148,10 +170,15 @@ public static Leopard Create(string accessKey, string modelPath = null, bool ena /// /// Set to `true` to enable automatic punctuation insertion. /// + /// + /// Set to `true` to enable speaker diarization, which allows Leopard to differentiate speakers as + /// part of the transcription process. Word metadata will include a `SpeakerTag` to identify unique speakers. + /// private Leopard( string accessKey, string modelPath, - bool enableAutomaticPunctuation) + bool enableAutomaticPunctuation, + bool enableDiarization) { if (string.IsNullOrEmpty(accessKey)) { @@ -166,10 +193,13 @@ private Leopard( IntPtr accessKeyPtr = Utils.GetPtrFromUtf8String(accessKey); IntPtr modelPathPtr = Utils.GetPtrFromUtf8String(modelPath); + pv_set_sdk("dotnet"); + PvStatus status = pv_leopard_init( accessKeyPtr, modelPathPtr, enableAutomaticPunctuation, + enableDiarization, out _libraryPointer); Marshal.FreeHGlobal(accessKeyPtr); @@ -177,7 +207,8 @@ private Leopard( if (status != PvStatus.SUCCESS) { - throw PvStatusToException(status); + string[] messageStack = GetMessageStack(); + throw PvStatusToException(status, "Leopard init failed", messageStack); } Version = Utils.GetUtf8StringFromPtr(pv_leopard_version()); @@ -188,47 +219,54 @@ private Leopard( /// Processes a given audio data and returns its transcription. /// /// - /// Audio data. The audio needs to have a sample rate equal to `.SampleRate` and be 16-bit linearly-encoded. This function operates on single-channel audio. + /// Audio data. The audio needs to have a sample rate equal to `.SampleRate` and be 16-bit linearly-encoded. This + /// function operates on single-channel audio. /// /// /// LeopardTranscript object which contains the transcription results of the engine. /// public LeopardTranscript Process(Int16[] pcm) { - if (pcm.Length == 0 | pcm == null) + if (pcm == null || pcm.Length == 0) { throw new LeopardInvalidArgumentException("Input audio frame is empty"); } - IntPtr transcriptPtr = IntPtr.Zero; - Int32 numWords = 0; - IntPtr wordsPtr = IntPtr.Zero; + IntPtr transcriptPtr; + Int32 numWords; + IntPtr wordsPtr; PvStatus status = pv_leopard_process (_libraryPointer, pcm, - (Int32)pcm.Length, + pcm.Length, out transcriptPtr, out numWords, out wordsPtr); if (status != PvStatus.SUCCESS) { - throw PvStatusToException(status, "Leopard failed to process the audio frame."); + string[] messageStack = GetMessageStack(); + throw PvStatusToException(status, "Leopard process failed", messageStack); } string transcript = Utils.GetUtf8StringFromPtr(transcriptPtr); pv_leopard_transcript_delete(transcriptPtr); - List wordsList = new List(); + + LeopardWord[] wordsList = new LeopardWord[numWords]; IntPtr orgWordsPtr = wordsPtr; for (int i = 0; i < numWords; i++) { CWord cword = (CWord)Marshal.PtrToStructure(wordsPtr, typeof(CWord)); - string word = Utils.GetUtf8StringFromPtr(cword.wordPtr); - wordsList.Add(new LeopardWord(word, cword.confidence, cword.startSec, cword.endSec)); + wordsList[i] = new LeopardWord( + Utils.GetUtf8StringFromPtr(cword.wordPtr), + cword.confidence, + cword.startSec, + cword.endSec, + cword.speakerTag); wordsPtr += Marshal.SizeOf(typeof(CWord)); } pv_leopard_words_delete(orgWordsPtr); - return new LeopardTranscript(transcript, wordsList.ToArray()); + return new LeopardTranscript(transcript, wordsList); } /// @@ -243,6 +281,11 @@ public LeopardTranscript Process(Int16[] pcm) /// public LeopardTranscript ProcessFile(string audioPath) { + if (String.IsNullOrEmpty(audioPath)) + { + throw new LeopardInvalidArgumentException("Audio file path was empty"); + } + if (!File.Exists(audioPath)) { throw new LeopardIOException($"Couldn't find audio file at '{audioPath}'"); @@ -250,9 +293,9 @@ public LeopardTranscript ProcessFile(string audioPath) IntPtr audioPathPtr = Utils.GetPtrFromUtf8String(audioPath); - IntPtr transcriptPtr = IntPtr.Zero; - Int32 numWords = 0; - IntPtr wordsPtr = IntPtr.Zero; + IntPtr transcriptPtr; + Int32 numWords; + IntPtr wordsPtr; PvStatus status = pv_leopard_process_file( _libraryPointer, audioPathPtr, @@ -264,70 +307,86 @@ public LeopardTranscript ProcessFile(string audioPath) if (status != PvStatus.SUCCESS) { - throw PvStatusToException(status, "Leopard failed to process the audio file."); + string[] messageStack = GetMessageStack(); + throw PvStatusToException(status, "Leopard process file failed", messageStack); } string transcript = Utils.GetUtf8StringFromPtr(transcriptPtr); pv_leopard_transcript_delete(transcriptPtr); + + LeopardWord[] wordsList = new LeopardWord[numWords]; IntPtr orgWordsPtr = wordsPtr; - List wordsList = new List(); for (int i = 0; i < numWords; i++) { CWord cword = (CWord)Marshal.PtrToStructure(wordsPtr, typeof(CWord)); - string word = Utils.GetUtf8StringFromPtr(cword.wordPtr); - wordsList.Add(new LeopardWord(word, cword.confidence, cword.startSec, cword.endSec)); + wordsList[i] = new LeopardWord( + Utils.GetUtf8StringFromPtr(cword.wordPtr), + cword.confidence, + cword.startSec, + cword.endSec, + cword.speakerTag); wordsPtr += Marshal.SizeOf(typeof(CWord)); } pv_leopard_words_delete(orgWordsPtr); - return new LeopardTranscript(transcript, wordsList.ToArray()); + return new LeopardTranscript(transcript, wordsList); } /// /// Gets the version number of the Leopard library. /// /// Version of Leopard - public string Version { get; private set; } + public string Version { get; } /// /// Get the audio sample rate required by Leopard /// /// Required sample rate. - public Int32 SampleRate { get; private set; } + public Int32 SampleRate { get; } /// /// Coverts status codes to relevant .NET exceptions /// /// Picovoice library status code. + /// Default error message. + /// Error stack returned from Picovoice library. /// .NET exception - private static Exception PvStatusToException(PvStatus status, string message = "") + private static Exception PvStatusToException( + PvStatus status, + string message = "", + string[] messageStack = null) { + if (messageStack == null) + { + messageStack = new string[] { }; + } + switch (status) { case PvStatus.OUT_OF_MEMORY: - return new LeopardMemoryException(message); + return new LeopardMemoryException(message, messageStack); case PvStatus.IO_ERROR: - return new LeopardIOException(message); + return new LeopardIOException(message, messageStack); case PvStatus.INVALID_ARGUMENT: - return new LeopardInvalidArgumentException(message); + return new LeopardInvalidArgumentException(message, messageStack); case PvStatus.STOP_ITERATION: - return new LeopardStopIterationException(message); + return new LeopardStopIterationException(message, messageStack); case PvStatus.KEY_ERROR: - return new LeopardKeyException(message); + return new LeopardKeyException(message, messageStack); case PvStatus.INVALID_STATE: - return new LeopardInvalidStateException(message); + return new LeopardInvalidStateException(message, messageStack); case PvStatus.RUNTIME_ERROR: - return new LeopardRuntimeException(message); + return new LeopardRuntimeException(message, messageStack); case PvStatus.ACTIVATION_ERROR: - return new LeopardActivationException(message); + return new LeopardActivationException(message, messageStack); case PvStatus.ACTIVATION_LIMIT_REACHED: - return new LeopardActivationLimitException(message); + return new LeopardActivationLimitException(message, messageStack); case PvStatus.ACTIVATION_THROTTLED: - return new LeopardActivationThrottledException(message); + return new LeopardActivationThrottledException(message, messageStack); case PvStatus.ACTIVATION_REFUSED: - return new LeopardActivationRefusedException(message); + return new LeopardActivationRefusedException(message, messageStack); default: - return new LeopardException("Unmapped error code returned from Leopard."); + return new LeopardException("Unmapped error code returned from Leopard.", messageStack); } } @@ -350,5 +409,29 @@ public void Dispose() { Dispose(); } + + private string[] GetMessageStack() + { + Int32 messageStackDepth; + IntPtr messageStackRef; + + PvStatus status = pv_get_error_stack(out messageStackRef, out messageStackDepth); + if (status != PvStatus.SUCCESS) + { + throw PvStatusToException(status, "Unable to get Leopard error state"); + } + + int elementSize = Marshal.SizeOf(typeof(IntPtr)); + string[] messageStack = new string[messageStackDepth]; + + for (int i = 0; i < messageStackDepth; i++) + { + messageStack[i] = Marshal.PtrToStringAnsi(Marshal.ReadIntPtr(messageStackRef, i * elementSize)); + } + + pv_free_error_stack(messageStackRef); + + return messageStack; + } } } \ No newline at end of file diff --git a/binding/dotnet/Leopard/Leopard.csproj b/binding/dotnet/Leopard/Leopard.csproj index f6158107..b1a6b2f2 100644 --- a/binding/dotnet/Leopard/Leopard.csproj +++ b/binding/dotnet/Leopard/Leopard.csproj @@ -2,16 +2,16 @@ net6.0;netcoreapp3.0;netstandard2.0 - 1.2.1 + 2.0.0 Picovoice - + Leopard Speech-to-Text Engine Apache-2.0 https://github.com/Picovoice/leopard https://github.com/Picovoice/leopard.git git Speech-to-Text, Speech Recognition, Voice Recognition, ASR - See https://github.com/Picovoice/leopard/ + See https://github.com/Picovoice/leopard/ Picovoice 2022-2023 Leopard is an on-device speech-to-text engine. Leopard is: @@ -24,6 +24,8 @@ true pv_circle_512.png + + @@ -33,20 +35,15 @@ - build/netcoreapp3.0; - build/net6.0; - - PreserveNewest - - - - build/netstandard2.0/lib/common/leopard_params.pv; - build/netcoreapp3.0/lib/common/leopard_params.pv; - build/net6.0/lib/common/leopard_params.pv; + build/netcoreapp3.0/Leopard.targets; + build/net6.0/Leopard.targets; - lib\common\leopard_params.pv PreserveNewest + + + + build/netstandard2.0/libpv_leopard.dll; @@ -78,8 +75,10 @@ false + + - + build/netcoreapp3.0/lib/raspberry-pi; build/net6.0/lib/raspberry-pi; @@ -98,6 +97,8 @@ false + + @@ -108,8 +109,22 @@ false + + + + + + build/netstandard2.0/lib/common/leopard_params.pv; + build/netcoreapp3.0/lib/common/leopard_params.pv; + build/net6.0/lib/common/leopard_params.pv; + + lib\common\leopard_params.pv + PreserveNewest + + + - + diff --git a/binding/dotnet/Leopard/LeopardException.cs b/binding/dotnet/Leopard/LeopardException.cs index b8e33f13..4d5ccd70 100644 --- a/binding/dotnet/Leopard/LeopardException.cs +++ b/binding/dotnet/Leopard/LeopardException.cs @@ -1,5 +1,5 @@ /* - Copyright 2022 Picovoice Inc. + Copyright 2022-2023 Picovoice Inc. You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" file accompanying this source. @@ -15,10 +15,35 @@ namespace Pv { public class LeopardException : Exception { + private readonly string[] _messageStack; + public LeopardException() { } public LeopardException(string message) : base(message) { } + public LeopardException(string message, string[] messageStack) : base(ModifyMessages(message, messageStack)) + { + this._messageStack = messageStack; + } + + public string[] MessageStack + { + get => _messageStack; + } + + private static string ModifyMessages(string message, string[] messageStack) + { + string messageString = message; + if (messageStack.Length > 0) + { + messageString += ":"; + for (int i = 0; i < messageStack.Length; i++) + { + messageString += $"\n [{i}] {messageStack[i]}"; + } + } + return messageString; + } } public class LeopardMemoryException : LeopardException @@ -26,6 +51,8 @@ public class LeopardMemoryException : LeopardException public LeopardMemoryException() { } public LeopardMemoryException(string message) : base(message) { } + + public LeopardMemoryException(string message, string[] messageStack) : base(message, messageStack) { } } public class LeopardIOException : LeopardException @@ -33,6 +60,8 @@ public class LeopardIOException : LeopardException public LeopardIOException() { } public LeopardIOException(string message) : base(message) { } + + public LeopardIOException(string message, string[] messageStack) : base(message, messageStack) { } } public class LeopardInvalidArgumentException : LeopardException @@ -40,6 +69,8 @@ public class LeopardInvalidArgumentException : LeopardException public LeopardInvalidArgumentException() { } public LeopardInvalidArgumentException(string message) : base(message) { } + + public LeopardInvalidArgumentException(string message, string[] messageStack) : base(message, messageStack) { } } public class LeopardStopIterationException : LeopardException @@ -47,6 +78,8 @@ public class LeopardStopIterationException : LeopardException public LeopardStopIterationException() { } public LeopardStopIterationException(string message) : base(message) { } + + public LeopardStopIterationException(string message, string[] messageStack) : base(message, messageStack) { } } public class LeopardKeyException : LeopardException @@ -54,6 +87,8 @@ public class LeopardKeyException : LeopardException public LeopardKeyException() { } public LeopardKeyException(string message) : base(message) { } + + public LeopardKeyException(string message, string[] messageStack) : base(message, messageStack) { } } public class LeopardInvalidStateException : LeopardException @@ -61,6 +96,8 @@ public class LeopardInvalidStateException : LeopardException public LeopardInvalidStateException() { } public LeopardInvalidStateException(string message) : base(message) { } + + public LeopardInvalidStateException(string message, string[] messageStack) : base(message, messageStack) { } } public class LeopardRuntimeException : LeopardException @@ -68,6 +105,8 @@ public class LeopardRuntimeException : LeopardException public LeopardRuntimeException() { } public LeopardRuntimeException(string message) : base(message) { } + + public LeopardRuntimeException(string message, string[] messageStack) : base(message, messageStack) { } } public class LeopardActivationException : LeopardException @@ -75,6 +114,8 @@ public class LeopardActivationException : LeopardException public LeopardActivationException() { } public LeopardActivationException(string message) : base(message) { } + + public LeopardActivationException(string message, string[] messageStack) : base(message, messageStack) { } } public class LeopardActivationLimitException : LeopardException @@ -82,6 +123,8 @@ public class LeopardActivationLimitException : LeopardException public LeopardActivationLimitException() { } public LeopardActivationLimitException(string message) : base(message) { } + + public LeopardActivationLimitException(string message, string[] messageStack) : base(message, messageStack) { } } public class LeopardActivationThrottledException : LeopardException @@ -89,6 +132,8 @@ public class LeopardActivationThrottledException : LeopardException public LeopardActivationThrottledException() { } public LeopardActivationThrottledException(string message) : base(message) { } + + public LeopardActivationThrottledException(string message, string[] messageStack) : base(message, messageStack) { } } public class LeopardActivationRefusedException : LeopardException @@ -96,6 +141,8 @@ public class LeopardActivationRefusedException : LeopardException public LeopardActivationRefusedException() { } public LeopardActivationRefusedException(string message) : base(message) { } + + public LeopardActivationRefusedException(string message, string[] messageStack) : base(message, messageStack) { } } } \ No newline at end of file diff --git a/binding/dotnet/Leopard/LeopardTranscript.cs b/binding/dotnet/Leopard/LeopardTranscript.cs index f67623e9..b2d2155a 100644 --- a/binding/dotnet/Leopard/LeopardTranscript.cs +++ b/binding/dotnet/Leopard/LeopardTranscript.cs @@ -1,5 +1,5 @@ /* - Copyright 2022 Picovoice Inc. + Copyright 2022-2023 Picovoice Inc. You may not use this file except in compliance with the license. A copy of the license is located in the "LICENSE" file accompanying this source. @@ -9,6 +9,8 @@ file accompanying this source. specific language governing permissions and limitations under the License. */ +using System; + namespace Pv { /// @@ -19,18 +21,26 @@ public class LeopardWord /// /// Constructor. /// - /// Transcribed word. - /// Transcription confidence. It is a number within [0, 1]. - /// Start of word in seconds. - /// End of word in seconds. - /// - public LeopardWord(string word, float confidence, float startSec, float endSec) + /// Transcribed word. + /// Transcription confidence. It is a number within [0, 1]. + /// Start of word in seconds. + /// End of word in seconds. + /// + /// The speaker tag is `-1` if diarization is not enabled during initialization; otherwise, + /// it's a non-negative integer identifying unique speakers, with `0` reserved for unknown speakers. + /// + public LeopardWord( + string word, + float confidence, + float startSec, + float endSec, + Int32 speakerTag) { Word = word; Confidence = confidence; - StartSec = startSec; EndSec = endSec; + SpeakerTag = speakerTag; } /// @@ -52,6 +62,11 @@ public LeopardWord(string word, float confidence, float startSec, float endSec) /// Getter for endSec. /// public float EndSec { get; } + + /// + /// Getter for speakerTag. + /// + public Int32 SpeakerTag { get; } } /// @@ -63,7 +78,7 @@ public class LeopardTranscript /// Constructor. /// /// - /// transcript String transcript returned from Leopard. + /// Transcript returned from Leopard. /// /// /// Transcribed words and their associated metadata. diff --git a/binding/dotnet/LeopardTest/MainTest.cs b/binding/dotnet/LeopardTest/MainTest.cs index bc39852e..a5b1551f 100644 --- a/binding/dotnet/LeopardTest/MainTest.cs +++ b/binding/dotnet/LeopardTest/MainTest.cs @@ -13,6 +13,7 @@ specific language governing permissions and limitations under the License. using System.Collections.Generic; using System.IO; using System.Reflection; +using System.Runtime.InteropServices; using Fastenshtein; @@ -36,110 +37,170 @@ public static void ClassInitialize(TestContext _) _accessKey = Environment.GetEnvironmentVariable("ACCESS_KEY"); } - private static List GetPcmFromFile(string audioFilePath, int expectedSampleRate) + [Serializable] + private class TestJson { - List data = new List(); - using (BinaryReader reader = new BinaryReader(File.Open(audioFilePath, FileMode.Open))) - { - reader.ReadBytes(24); // skip over part of the header - Assert.AreEqual(reader.ReadInt32(), expectedSampleRate, "Specified sample rate did not match test file."); - reader.ReadBytes(16); // skip over the rest of the header - - while (reader.BaseStream.Position != reader.BaseStream.Length) - { - data.Add(reader.ReadInt16()); - } - } - - return data; + public LanguageTestJson[] language_tests { get; set; } + public DiarizationTestJson[] diarization_tests { get; set; } } - - private static JObject LoadJsonTestData() + [Serializable] + private class LanguageTestJson { - string content = File.ReadAllText(Path.Combine(ROOT_DIR, "resources/.test/test_data.json")); - return JObject.Parse(content); + public string language { get; set; } + public string audio_file { get; set; } + public string transcript { get; set; } + + public string transcript_with_punctuation { get; set; } + public float error_rate { get; set; } + public WordJson[] words { get; set; } } [Serializable] - private class TestParameterJson + private class DiarizationTestJson { public string language { get; set; } public string audio_file { get; set; } - public string transcript { get; set; } + public WordJson[] words { get; set; } + } - public string[] punctuations { get; set; } + [Serializable] + private class WordJson + { + public string word { get; set; } + public float start_sec { get; set; } + public float end_sec { get; set; } + public float confidence { get; set; } + public Int32 speaker_tag { get; set; } + } - public float error_rate { get; set; } + private static TestJson LoadJsonTestData() + { + string content = File.ReadAllText(Path.Combine(ROOT_DIR, "resources/.test/test_data.json")); + return JObject.Parse(content)["tests"].ToObject(); } - public static IEnumerable TestParameters + private static IEnumerable ProcessTestParameters { get { - JObject testDataJson = LoadJsonTestData(); - IList testParametersJson = ((JArray)testDataJson["tests"]["parameters"]).ToObject>(); - List testParameters = new List(); - foreach (TestParameterJson t in testParametersJson) + TestJson testDataJson = LoadJsonTestData(); + object[][] processTestParameters = new object[testDataJson.language_tests.Length][]; + for (int i = 0; i < testDataJson.language_tests.Length; i++) { - testParameters.Add(new object[] - { - t.language, - t.audio_file, - t.transcript, - true, - t.error_rate - }); - - string transcriptWithoutPunctuation = t.transcript; - foreach (string p in t.punctuations) + WordJson[] wordsJson = testDataJson.language_tests[i].words; + LeopardWord[] words = new LeopardWord[wordsJson.Length]; + for (int j = 0; j < wordsJson.Length; j++) { - transcriptWithoutPunctuation = transcriptWithoutPunctuation.Replace(p, ""); + words[j] = new LeopardWord( + wordsJson[j].word, + wordsJson[j].confidence, + wordsJson[j].start_sec, + wordsJson[j].end_sec, + wordsJson[j].speaker_tag); } + processTestParameters[i] = new object[] + { + testDataJson.language_tests[i].language, + testDataJson.language_tests[i].audio_file, + testDataJson.language_tests[i].transcript, + testDataJson.language_tests[i].transcript_with_punctuation, + testDataJson.language_tests[i].error_rate, + words + }; + } + + return processTestParameters; + } + } - testParameters.Add(new object[] + private static IEnumerable DiarizationTestParameters + { + get + { + TestJson testDataJson = LoadJsonTestData(); + object[][] processTestParameters = new object[testDataJson.diarization_tests.Length][]; + for (int i = 0; i < testDataJson.diarization_tests.Length; i++) + { + WordJson[] wordsJson = testDataJson.diarization_tests[i].words; + LeopardWord[] words = new LeopardWord[wordsJson.Length]; + for (int j = 0; j < wordsJson.Length; j++) { - t.language, - t.audio_file, - transcriptWithoutPunctuation, - false, - t.error_rate - }); + words[j] = new LeopardWord( + wordsJson[j].word, + 0, + 0, + 0, + wordsJson[j].speaker_tag); + } + processTestParameters[i] = new object[] + { + testDataJson.diarization_tests[i].language, + testDataJson.diarization_tests[i].audio_file, + words + }; } - return testParameters; + return processTestParameters; } } private static string AppendLanguage(string s, string language) - => language == "en" ? s : $"{s}_{language}"; + { + return language == "en" ? s : $"{s}_{language}"; + } + + private static float GetErrorRate(string transcript, string referenceTranscript) + { + return Levenshtein.Distance(transcript, referenceTranscript) / (float)referenceTranscript.Length; + } private static string GetModelPath(string language) - => Path.Combine( + { + return Path.Combine( ROOT_DIR, "lib/common", $"{AppendLanguage("leopard_params", language)}.pv"); + } - static float GetErrorRate(string transcript, string referenceTranscript) - => Levenshtein.Distance(transcript, referenceTranscript) / (float)referenceTranscript.Length; + private static short[] GetPcmFromFile(string audioFilePath, int expectedSampleRate) + { + List data = new List(); + using (BinaryReader reader = new BinaryReader(File.Open(audioFilePath, FileMode.Open))) + { + reader.ReadBytes(24); // skip over part of the header + Assert.AreEqual(reader.ReadInt32(), expectedSampleRate, "Specified sample rate did not match test file."); + reader.ReadBytes(16); // skip over the rest of the header + + while (reader.BaseStream.Position != reader.BaseStream.Length) + { + data.Add(reader.ReadInt16()); + } + } - private static void ValidateMetadata(LeopardWord[] words, string transcript, float audioLength) + return data.ToArray(); + } + + private static void ValidateMetadata( + LeopardWord[] words, + LeopardWord[] referenceWords, + bool enableDiarization) { - string normTranscript = transcript.ToUpper(); + Assert.AreEqual(words.Length, referenceWords.Length); for (int i = 0; i < words.Length; i++) { - Assert.IsTrue(normTranscript.Contains(words[i].Word.ToUpper())); - Assert.IsTrue(words[i].StartSec > 0); - Assert.IsTrue(words[i].StartSec <= words[i].EndSec); - if (i < words.Length - 1) + Assert.AreEqual(words[i].Word.ToUpper(), referenceWords[i].Word.ToUpper()); + Assert.AreEqual(words[i].StartSec, referenceWords[i].StartSec, 0.1); + Assert.AreEqual(words[i].EndSec, referenceWords[i].EndSec, 0.1); + Assert.AreEqual(words[i].Confidence, referenceWords[i].Confidence, 0.1); + if (enableDiarization) { - Assert.IsTrue(words[i].EndSec <= words[i + 1].StartSec); + Assert.AreEqual(words[i].SpeakerTag, referenceWords[i].SpeakerTag); } else { - Assert.IsTrue(words[i].EndSec <= audioLength); + Assert.AreEqual(words[i].SpeakerTag, -1); } - Assert.IsTrue(words[i].Confidence >= 0.0f && words[i].Confidence <= 1.0f); } } @@ -148,7 +209,9 @@ public void TestVersion() { using (Leopard leopard = Leopard.Create(_accessKey)) { - Assert.IsFalse(string.IsNullOrWhiteSpace(leopard?.Version), "Leopard did not return a valid version number."); + Assert.IsFalse( + string.IsNullOrWhiteSpace(leopard?.Version), + "Leopard did not return a valid version number."); } } @@ -157,74 +220,193 @@ public void TestSampleRate() { using (Leopard leopard = Leopard.Create(_accessKey)) { - Assert.IsTrue(leopard.SampleRate > 0, "Leopard did not return a valid sample rate number."); + Assert.IsTrue( + leopard.SampleRate > 0, + "Leopard did not return a valid sample rate number."); } } [TestMethod] - [DynamicData(nameof(TestParameters))] - public void TestProcessFile( + public void TestMessageStack() + { + Leopard l; + string[] messageList = new string[] { }; + + try + { + l = Leopard.Create("invalid"); + Assert.IsNull(l); + l.Dispose(); + } + catch (LeopardException e) + { + messageList = e.MessageStack; + } + + Assert.IsTrue(0 < messageList.Length); + Assert.IsTrue(messageList.Length < 8); + + try + { + l = Leopard.Create("invalid"); + Assert.IsNull(l); + l.Dispose(); + } + catch (LeopardException e) + { + for (int i = 0; i < messageList.Length; i++) + { + Assert.AreEqual(messageList[i], e.MessageStack[i]); + } + } + } + + [TestMethod] + public void TestProcessMessageStack() + { + Leopard l = Leopard.Create(_accessKey); + short[] testPcm = new short[1024]; + + var obj = typeof(Leopard).GetField("_libraryPointer", BindingFlags.NonPublic | BindingFlags.Instance); + IntPtr address = (IntPtr)obj.GetValue(l); + obj.SetValue(l, IntPtr.Zero); + + try + { + LeopardTranscript res = l.Process(testPcm); + Assert.IsTrue(res == null); + } + catch (LeopardException e) + { + Assert.IsTrue(0 < e.MessageStack.Length); + Assert.IsTrue(e.MessageStack.Length < 8); + } + + obj.SetValue(l, address); + l.Dispose(); + } + + + [TestMethod] + [DynamicData(nameof(ProcessTestParameters))] + public void TestProcess( string language, string testAudioFile, string referenceTranscript, - bool enablePunctuation, - float targetErrorRate) + string _, + float targetErrorRate, + LeopardWord[] referenceWords) { using (Leopard leopard = Leopard.Create( - _accessKey, - modelPath: GetModelPath(language), - enableAutomaticPunctuation: enablePunctuation - )) + _accessKey, + GetModelPath(language))) { + string testAudioPath = Path.Combine(ROOT_DIR, "resources/audio_samples", testAudioFile); + + LeopardTranscript result = leopard.Process(GetPcmFromFile(testAudioPath, leopard.SampleRate)); + float errorRate = GetErrorRate(result.TranscriptString.ToUpper(), referenceTranscript.ToUpper()); + Assert.IsTrue(errorRate < targetErrorRate); + + ValidateMetadata(result.WordArray, referenceWords, false); + } + } + + [TestMethod] + [DynamicData(nameof(ProcessTestParameters))] + public void TestProcessFile( + string language, + string testAudioFile, + string referenceTranscript, + string _, + float targetErrorRate, + LeopardWord[] referenceWords) + { + using (Leopard leopard = Leopard.Create( + _accessKey, + GetModelPath(language))) + { string testAudioPath = Path.Combine(ROOT_DIR, "resources/audio_samples", testAudioFile); LeopardTranscript result = leopard.ProcessFile(testAudioPath); - string transcript = result.TranscriptString; - if (!enablePunctuation) - { - referenceTranscript = referenceTranscript.ToUpper(); - transcript = transcript.ToUpper(); - } + float errorRate = GetErrorRate(result.TranscriptString.ToUpper(), referenceTranscript.ToUpper()); + Assert.IsTrue(errorRate < targetErrorRate); + + ValidateMetadata(result.WordArray, referenceWords, false); + } + } + + [TestMethod] + [DynamicData(nameof(ProcessTestParameters))] + public void TestProcessFileWithPunctuation( + string language, + string testAudioFile, + string _, + string referenceTranscript, + float targetErrorRate, + LeopardWord[] referenceWords) + { + using (Leopard leopard = Leopard.Create( + _accessKey, + GetModelPath(language), + enableAutomaticPunctuation: true)) + { + string testAudioPath = Path.Combine(ROOT_DIR, "resources/audio_samples", testAudioFile); + LeopardTranscript result = leopard.ProcessFile(testAudioPath); - Assert.IsTrue(GetErrorRate(transcript, referenceTranscript) < targetErrorRate); + float errorRate = GetErrorRate(result.TranscriptString.ToUpper(), referenceTranscript.ToUpper()); + Assert.IsTrue(errorRate < targetErrorRate); - float audioLength = GetPcmFromFile(testAudioPath, leopard.SampleRate).Count / (float)leopard.SampleRate; - ValidateMetadata(result.WordArray, referenceTranscript, audioLength); + ValidateMetadata(result.WordArray, referenceWords, false); } } [TestMethod] - [DynamicData(nameof(TestParameters))] - public void TestProcess( + [DynamicData(nameof(ProcessTestParameters))] + public void TestProcessFileWithDiarization( string language, string testAudioFile, string referenceTranscript, - bool enablePunctuation, - float targetErrorRate) + string _, + float targetErrorRate, + LeopardWord[] referenceWords) { using (Leopard leopard = Leopard.Create( - _accessKey, - modelPath: GetModelPath(language), - enableAutomaticPunctuation: enablePunctuation - )) + _accessKey, + GetModelPath(language), + enableDiarization: true)) { string testAudioPath = Path.Combine(ROOT_DIR, "resources/audio_samples", testAudioFile); + LeopardTranscript result = leopard.ProcessFile(testAudioPath); - List pcm = GetPcmFromFile(testAudioPath, leopard.SampleRate); - LeopardTranscript result = leopard.Process(pcm.ToArray()); + float errorRate = GetErrorRate(result.TranscriptString.ToUpper(), referenceTranscript.ToUpper()); + Assert.IsTrue(errorRate < targetErrorRate); - string transcript = result.TranscriptString; - if (!enablePunctuation) - { - referenceTranscript = referenceTranscript.ToUpper(); - transcript = transcript.ToUpper(); - } + ValidateMetadata(result.WordArray, referenceWords, true); + } + } - Assert.IsTrue(GetErrorRate(transcript, referenceTranscript) < targetErrorRate); + [TestMethod] + [DynamicData(nameof(DiarizationTestParameters))] + public void TestDiarization( + string language, + string testAudioFile, + LeopardWord[] referenceWords) + { + using (Leopard leopard = Leopard.Create( + _accessKey, + GetModelPath(language), + enableDiarization: true)) + { + string testAudioPath = Path.Combine(ROOT_DIR, "resources/audio_samples", testAudioFile); + LeopardWord[] words = leopard.ProcessFile(testAudioPath).WordArray; - float audioLength = pcm.Count / (float)leopard.SampleRate; - ValidateMetadata(result.WordArray, referenceTranscript, audioLength); + Assert.AreEqual(words.Length, referenceWords.Length); + for (int i = 0; i < words.Length; i++) + { + Assert.AreEqual(words[i].Word.ToUpper(), referenceWords[i].Word.ToUpper()); + Assert.AreEqual(words[i].SpeakerTag, referenceWords[i].SpeakerTag); + } } } } diff --git a/demo/dotnet/LeopardDemo/FileDemo.cs b/demo/dotnet/LeopardDemo/FileDemo.cs index a729a0eb..f4800d8e 100644 --- a/demo/dotnet/LeopardDemo/FileDemo.cs +++ b/demo/dotnet/LeopardDemo/FileDemo.cs @@ -17,7 +17,8 @@ specific language governing permissions and limitations under the License. namespace LeopardDemo { /// - /// File Demo for Leopard Speech-to-Text engine. The demo takes an input audio file and returns prints the the transcription. + /// File Demo for Leopard Speech-to-Text engine. + /// The demo takes an input audio file and processes it with Leopard. /// public class FileDemo { @@ -26,28 +27,36 @@ public class FileDemo /// Reads through input file and prints the transcription returned by Leopard. /// /// Required argument. Absolute path to input audio file. - /// AccessKey obtained from Picovoice Console (https://console.picovoice.ai/). - /// Absolute path to the file containing model parameters. If not set it will be set to the default location. + /// + /// AccessKey obtained from Picovoice Console (https://console.picovoice.ai/). + /// + /// + /// Absolute path to the file containing model parameters. + /// If not set it will be set to the default location. /// /// Set to `true` to enable automatic punctuation insertion. /// + /// + /// Set to `true` to enable speaker diarization, which allows Leopard to differentiate speakers as + /// part of the transcription process. Word metadata will include a `SpeakerTag` to identify unique speakers. + /// /// /// Enable verbose logging. /// - /// public static void RunDemo( string accessKey, string inputAudioPath, string modelPath, bool enableAutomaticPunctuation, - bool verbose - ) + bool enableDiarization, + bool verbose) { // init Leopard speech-to-text engine using (Leopard leopard = Leopard.Create( accessKey: accessKey, modelPath: modelPath, - enableAutomaticPunctuation: enableAutomaticPunctuation)) + enableAutomaticPunctuation: enableAutomaticPunctuation, + enableDiarization: enableDiarization)) { try @@ -56,11 +65,25 @@ bool verbose Console.WriteLine(result.TranscriptString); if (verbose) { - Console.WriteLine(String.Format("\n|{0,-15}|{1,-10:0.00}|{2,-10:0.00}|{3,-10:0.00}|\n", "word", "Confidence", "StartSec", "EndSec")); + Console.WriteLine( + string.Format( + "\n|{0,-15}|{1,11:0.00}|{2,10:0.00}|{3,10:0.00}|{4,11}|\n", + "Word", + "Confidence", + "StartSec", + "EndSec", + "SpeakerTag")); for (int i = 0; i < result.WordArray.Length; i++) { LeopardWord word = result.WordArray[i]; - Console.WriteLine(String.Format("|{0,-15}|{1,10:0.00}|{2,10:0.00}|{3,10:0.00}|", word.Word, word.Confidence, word.StartSec, word.EndSec)); + Console.WriteLine( + string.Format( + "|{0,-15}|{1,11:0.00}|{2,10:0.00}|{3,10:0.00}|{4,11}|", + word.Word, + word.Confidence, + word.StartSec, + word.EndSec, + word.SpeakerTag)); } } } @@ -85,7 +108,8 @@ public static void Main(string[] args) string accessKey = null; string modelPath = null; bool enableAutomaticPunctuation = true; - bool verbose = true; + bool enableDiarization = true; + bool verbose = false; bool showHelp = false; // parse command line arguments @@ -118,6 +142,11 @@ public static void Main(string[] args) enableAutomaticPunctuation = false; argIndex++; } + else if (args[argIndex] == "--disable_speaker_diarization") + { + enableDiarization = false; + argIndex++; + } else if (args[argIndex] == "--verbose") { verbose = true; @@ -149,7 +178,9 @@ public static void Main(string[] args) } if (!File.Exists(inputAudioPath)) { - throw new ArgumentException($"Audio file at path {inputAudioPath} does not exist", "--input_audio_path"); + throw new ArgumentException( + $"Audio file at path {inputAudioPath} does not exist", + "input_audio_path"); } RunDemo( @@ -157,6 +188,7 @@ public static void Main(string[] args) inputAudioPath, modelPath, enableAutomaticPunctuation, + enableDiarization, verbose); } @@ -172,6 +204,7 @@ private static void OnUnhandledException(object sender, UnhandledExceptionEventA "\t--access_key (required): AccessKey obtained from Picovoice Console (https://console.picovoice.ai/)\n" + "\t--model_path: Absolute path to the file containing model parameters.\n" + "\t--disable_automatic_punctuation: Disable automatic punctuation.\n" + - "\t--verbose: Enable verbose logging"; + "\t--disable_speaker_diarization: Disable speaker diarization.\n" + + "\t--verbose: Enable verbose output. Prints Leopard word metadata."; } } \ No newline at end of file diff --git a/demo/dotnet/LeopardDemo/MicDemo.cs b/demo/dotnet/LeopardDemo/MicDemo.cs index 4e1df02e..6fb481d0 100644 --- a/demo/dotnet/LeopardDemo/MicDemo.cs +++ b/demo/dotnet/LeopardDemo/MicDemo.cs @@ -19,38 +19,49 @@ specific language governing permissions and limitations under the License. namespace LeopardDemo { /// - /// Microphone Demo for Leopard Speech-to-Text engine. It creates an input audio stream from a microphone. + /// Microphone Demo for Leopard Speech-to-Text engine. + /// It creates an input audio stream from a microphone and processes it with Leopard. /// public class MicDemo { - private static readonly int PV_RECORDER_FRAME_LENGTH = 2048; /// /// Creates an input audio stream and instantiates an instance of Leopard object. /// - /// AccessKey obtained from Picovoice Console (https://console.picovoice.ai/). - /// Absolute path to the file containing model parameters. If not set it will be set to the default location. + /// + /// AccessKey obtained from Picovoice Console (https://console.picovoice.ai/). + /// + /// + /// Absolute path to the file containing model parameters. + /// If not set it will be set to the default location. + /// /// /// Set to `true` to enable automatic punctuation insertion. /// - /// - /// Enable verbose logging. + /// + /// Set to `true` to enable speaker diarization, which allows Leopard to differentiate speakers as + /// part of the transcription process. Word metadata will include a `SpeakerTag` to identify unique speakers. + /// + /// Enable verbose logging. + /// + /// Optional argument. If provided, audio is recorded from this input device. + /// Otherwise, the default audio input device is used. /// - /// Optional argument. If provided, audio is recorded from this input device. Otherwise, the default audio input device is used. - public static void RunDemo( + private static void RunDemo( string accessKey, string modelPath, bool enableAutomaticPunctuation, + bool enableDiarization, bool verbose, int audioDeviceIndex) { using (Leopard leopard = Leopard.Create( accessKey: accessKey, modelPath: modelPath, - enableAutomaticPunctuation: enableAutomaticPunctuation)) + enableAutomaticPunctuation: enableAutomaticPunctuation, + enableDiarization: enableDiarization)) { - using (PvRecorder recorder = PvRecorder.Create(PV_RECORDER_FRAME_LENGTH, audioDeviceIndex)) { Console.WriteLine($"Using device: {recorder.SelectedDevice}"); @@ -98,11 +109,25 @@ public static void RunDemo( Console.WriteLine(result.TranscriptString); if (verbose) { - Console.WriteLine(string.Format("\n|{0,-15}|{1,-10:0.00}|{2,-10:0.00}|{3,-10:0.00}|\n", "word", "Confidence", "StartSec", "EndSec")); + Console.WriteLine( + string.Format( + "\n|{0,-15}|{1,11:0.00}|{2,10:0.00}|{3,10:0.00}|{4,11}|\n", + "Word", + "Confidence", + "StartSec", + "EndSec", + "SpeakerTag")); for (int i = 0; i < result.WordArray.Length; i++) { LeopardWord word = result.WordArray[i]; - Console.WriteLine(string.Format("|{0,-15}|{1,10:0.00}|{2,10:0.00}|{3,10:0.00}|", word.Word, word.Confidence, word.StartSec, word.EndSec)); + Console.WriteLine( + string.Format( + "|{0,-15}|{1,11:0.00}|{2,10:0.00}|{3,10:0.00}|{4,11}|", + word.Word, + word.Confidence, + word.StartSec, + word.EndSec, + word.SpeakerTag)); } Console.WriteLine(); } @@ -119,7 +144,7 @@ public static void RunDemo( /// /// Lists available audio input devices. /// - public static void ShowAudioDevices() + private static void ShowAudioDevices() { string[] devices = PvRecorder.GetAvailableDevices(); for (int i = 0; i < devices.Length; i++) @@ -141,7 +166,8 @@ public static void Main(string[] args) string accessKey = null; string modelPath = null; bool enableAutomaticPunctuation = true; - bool verbose = true; + bool enableDiarization = true; + bool verbose = false; int audioDeviceIndex = -1; bool showAudioDevices = false; bool showHelp = false; @@ -169,6 +195,11 @@ public static void Main(string[] args) enableAutomaticPunctuation = false; argIndex++; } + else if (args[argIndex] == "--disable_speaker_diarization") + { + enableDiarization = false; + argIndex++; + } else if (args[argIndex] == "--verbose") { verbose = true; @@ -219,6 +250,7 @@ public static void Main(string[] args) accessKey, modelPath, enableAutomaticPunctuation, + enableDiarization, verbose, audioDeviceIndex); } @@ -236,6 +268,7 @@ private static void OnUnhandledException(object sender, UnhandledExceptionEventA "\t--audio_device_index: Index of input audio device.\n" + "\t--show_audio_devices: Print available recording devices.\n" + "\t--disable_automatic_punctuation: Disable automatic punctuation.\n" + - "\t--verbose: Enable verbose logging"; + "\t--disable_speaker_diarization: Disable speaker diarization.\n" + + "\t--verbose: Enable verbose output. Prints Leopard word metadata."; } } \ No newline at end of file