From 287160622fed08d4e5c9970973f6930fe793df9d Mon Sep 17 00:00:00 2001 From: Nickolay Shmyrev Date: Wed, 10 Nov 2021 22:26:31 +0300 Subject: [PATCH] Update to latest state 0.3.32 --- ios/VoskApiTest.xcodeproj/project.pbxproj | 43 +++-- .../contents.xcworkspacedata | 3 - ios/VoskApiTest/Vosk.swift | 2 +- ios/VoskApiTest/Vosk/vosk_api.h | 149 ++++++++++++++---- 4 files changed, 142 insertions(+), 55 deletions(-) diff --git a/ios/VoskApiTest.xcodeproj/project.pbxproj b/ios/VoskApiTest.xcodeproj/project.pbxproj index f38d4199..7b9e0f27 100644 --- a/ios/VoskApiTest.xcodeproj/project.pbxproj +++ b/ios/VoskApiTest.xcodeproj/project.pbxproj @@ -13,10 +13,10 @@ 92375229240C550B00DD6076 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 92375228240C550B00DD6076 /* Assets.xcassets */; }; 9237522C240C550B00DD6076 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 9237522A240C550B00DD6076 /* LaunchScreen.storyboard */; }; 92375234240C558900DD6076 /* Vosk.swift in Sources */ = {isa = PBXBuildFile; fileRef = 92375233240C558900DD6076 /* Vosk.swift */; }; - 9237523C240C642000DD6076 /* libkaldiwrap.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 9237523A240C642000DD6076 /* libkaldiwrap.a */; }; 92375244240C6DAF00DD6076 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 92375243240C6DAF00DD6076 /* Accelerate.framework */; }; - 92375246240C6DC900DD6076 /* libstdc++.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = 92375245240C6DC900DD6076 /* libstdc++.tbd */; }; 92375274240C6F1E00DD6076 /* 10001-90210-01803.wav in Resources */ = {isa = PBXBuildFile; fileRef = 92375256240C6E3D00DD6076 /* 10001-90210-01803.wav */; }; + 925527A9273C492C00FFD9CC /* libvosk.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 925527A8273C492C00FFD9CC /* libvosk.a */; }; + 92833003273C466E00058B52 /* libc++.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = 92833002273C466E00058B52 /* libc++.tbd */; }; 92BACED125BE125A00B5CC93 /* 
vosk-model-small-en-us-0.15 in Resources */ = {isa = PBXBuildFile; fileRef = 928CC50C25BE124400490481 /* vosk-model-small-en-us-0.15 */; }; 92D6B8D325BDFEAC007FF08D /* VoskModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 92D6B8D225BDFEAC007FF08D /* VoskModel.swift */; }; 92D86BD6253F823F0040D53F /* vosk-model-spk-0.4 in Resources */ = {isa = PBXBuildFile; fileRef = 92D86BD4253F823F0040D53F /* vosk-model-spk-0.4 */; }; @@ -31,10 +31,10 @@ 9237522B240C550B00DD6076 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = ""; }; 9237522D240C550B00DD6076 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; 92375233240C558900DD6076 /* Vosk.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Vosk.swift; sourceTree = ""; }; - 9237523A240C642000DD6076 /* libkaldiwrap.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; path = libkaldiwrap.a; sourceTree = ""; }; 92375243240C6DAF00DD6076 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; }; - 92375245240C6DC900DD6076 /* libstdc++.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = "libstdc++.tbd"; path = "usr/lib/libstdc++.tbd"; sourceTree = SDKROOT; }; 92375256240C6E3D00DD6076 /* 10001-90210-01803.wav */ = {isa = PBXFileReference; lastKnownFileType = audio.wav; path = "10001-90210-01803.wav"; sourceTree = ""; }; + 925527A8273C492C00FFD9CC /* libvosk.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; path = libvosk.a; sourceTree = ""; }; + 92833002273C466E00058B52 /* libc++.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = "libc++.tbd"; path = 
"usr/lib/libc++.tbd"; sourceTree = SDKROOT; }; 928CC50C25BE124400490481 /* vosk-model-small-en-us-0.15 */ = {isa = PBXFileReference; lastKnownFileType = folder; name = "vosk-model-small-en-us-0.15"; path = "/Users/shmyrev/Documents/IOS/VoskApiTest/VoskApiTest/Vosk/vosk-model-small-en-us-0.15"; sourceTree = ""; }; 92AA22AD244CDD1200DA464B /* vosk_api.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vosk_api.h; sourceTree = ""; }; 92AA22AE244CDD5200DA464B /* bridging.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bridging.h; sourceTree = ""; }; @@ -47,9 +47,9 @@ isa = PBXFrameworksBuildPhase; buildActionMask = 2147483647; files = ( - 92375246240C6DC900DD6076 /* libstdc++.tbd in Frameworks */, + 92833003273C466E00058B52 /* libc++.tbd in Frameworks */, 92375244240C6DAF00DD6076 /* Accelerate.framework in Frameworks */, - 9237523C240C642000DD6076 /* libkaldiwrap.a in Frameworks */, + 925527A9273C492C00FFD9CC /* libvosk.a in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -93,11 +93,11 @@ 92375239240C642000DD6076 /* Vosk */ = { isa = PBXGroup; children = ( - 928CC50C25BE124400490481 /* vosk-model-small-en-us-0.15 */, 92D86BD4253F823F0040D53F /* vosk-model-spk-0.4 */, - 92375256240C6E3D00DD6076 /* 10001-90210-01803.wav */, + 928CC50C25BE124400490481 /* vosk-model-small-en-us-0.15 */, + 925527A8273C492C00FFD9CC /* libvosk.a */, 92AA22AD244CDD1200DA464B /* vosk_api.h */, - 9237523A240C642000DD6076 /* libkaldiwrap.a */, + 92375256240C6E3D00DD6076 /* 10001-90210-01803.wav */, ); name = Vosk; path = VoskApiTest/Vosk; @@ -106,7 +106,7 @@ 92375242240C6DAF00DD6076 /* Frameworks */ = { isa = PBXGroup; children = ( - 92375245240C6DC900DD6076 /* libstdc++.tbd */, + 92833002273C466E00058B52 /* libc++.tbd */, 92375243240C6DAF00DD6076 /* Accelerate.framework */, ); name = Frameworks; @@ -145,7 +145,7 @@ 9237521D240C550B00DD6076 = { CreatedOnToolsVersion = 8.3.2; LastSwiftMigration = 
0920; - ProvisioningStyle = Automatic; + ProvisioningStyle = Manual; }; }; }; @@ -223,7 +223,6 @@ ALWAYS_SEARCH_USER_PATHS = NO; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; CLANG_CXX_LIBRARY = "libc++"; CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_OBJC_ARC = YES; @@ -245,7 +244,7 @@ CLANG_WARN_SUSPICIOUS_MOVE = YES; CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; - "CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer"; + "CODE_SIGN_IDENTITY[sdk=iphoneos*]" = ""; COPY_PHASE_STRIP = NO; DEBUG_INFORMATION_FORMAT = dwarf; ENABLE_STRICT_OBJC_MSGSEND = YES; @@ -282,7 +281,6 @@ ALWAYS_SEARCH_USER_PATHS = NO; CLANG_ANALYZER_NONNULL = YES; CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; - CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x"; CLANG_CXX_LIBRARY = "libc++"; CLANG_ENABLE_MODULES = YES; CLANG_ENABLE_OBJC_ARC = YES; @@ -304,7 +302,7 @@ CLANG_WARN_SUSPICIOUS_MOVE = YES; CLANG_WARN_UNREACHABLE_CODE = YES; CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; - "CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer"; + "CODE_SIGN_IDENTITY[sdk=iphoneos*]" = ""; COPY_PHASE_STRIP = NO; DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; ENABLE_NS_ASSERTIONS = NO; @@ -319,6 +317,7 @@ GCC_WARN_UNUSED_VARIABLE = YES; IPHONEOS_DEPLOYMENT_TARGET = 10.3; MTL_ENABLE_DEBUG_INFO = NO; + ONLY_ACTIVE_ARCH = YES; SDKROOT = iphoneos; SWIFT_OBJC_BRIDGING_HEADER = bridging.h; SWIFT_OPTIMIZATION_LEVEL = "-Owholemodule"; @@ -333,7 +332,10 @@ buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; CLANG_ENABLE_MODULES = YES; - ENABLE_BITCODE = YES; + "CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer"; + CODE_SIGN_STYLE = Manual; + DEVELOPMENT_TEAM = ""; + ENABLE_BITCODE = NO; INFOPLIST_FILE = VoskApiTest/Info.plist; LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks"; LIBRARY_SEARCH_PATHS = ( @@ -342,11 +344,13 @@ ); PRODUCT_BUNDLE_IDENTIFIER = 
com.alphacephei.VoskApiTest; PRODUCT_NAME = "$(TARGET_NAME)"; + PROVISIONING_PROFILE_SPECIFIER = ""; SWIFT_INSTALL_OBJC_HEADER = YES; SWIFT_OBJC_BRIDGING_HEADER = VoskApiTest/bridging.h; SWIFT_OPTIMIZATION_LEVEL = "-Onone"; SWIFT_SWIFT3_OBJC_INFERENCE = Default; SWIFT_VERSION = 4.0; + TARGETED_DEVICE_FAMILY = "1,2"; }; name = Debug; }; @@ -355,7 +359,10 @@ buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; CLANG_ENABLE_MODULES = YES; - ENABLE_BITCODE = YES; + "CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer"; + CODE_SIGN_STYLE = Manual; + DEVELOPMENT_TEAM = ""; + ENABLE_BITCODE = NO; INFOPLIST_FILE = VoskApiTest/Info.plist; LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks"; LIBRARY_SEARCH_PATHS = ( @@ -364,10 +371,12 @@ ); PRODUCT_BUNDLE_IDENTIFIER = com.alphacephei.VoskApiTest; PRODUCT_NAME = "$(TARGET_NAME)"; + PROVISIONING_PROFILE_SPECIFIER = ""; SWIFT_INSTALL_OBJC_HEADER = YES; SWIFT_OBJC_BRIDGING_HEADER = VoskApiTest/bridging.h; SWIFT_SWIFT3_OBJC_INFERENCE = Default; SWIFT_VERSION = 4.0; + TARGETED_DEVICE_FAMILY = "1,2"; }; name = Release; }; diff --git a/ios/VoskApiTest.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/ios/VoskApiTest.xcodeproj/project.xcworkspace/contents.xcworkspacedata index 3609e01e..919434a6 100644 --- a/ios/VoskApiTest.xcodeproj/project.xcworkspace/contents.xcworkspacedata +++ b/ios/VoskApiTest.xcodeproj/project.xcworkspace/contents.xcworkspacedata @@ -1,9 +1,6 @@ - - diff --git a/ios/VoskApiTest/Vosk.swift b/ios/VoskApiTest/Vosk.swift index 104b895a..a2097cc5 100644 --- a/ios/VoskApiTest/Vosk.swift +++ b/ios/VoskApiTest/Vosk.swift @@ -14,7 +14,7 @@ public final class Vosk { var recognizer : OpaquePointer! 
init(model: VoskModel, sampleRate: Float) { - recognizer = vosk_recognizer_new_spk(model.model, model.spkModel, sampleRate) + recognizer = vosk_recognizer_new_spk(model.model, sampleRate, model.spkModel) } deinit { diff --git a/ios/VoskApiTest/Vosk/vosk_api.h b/ios/VoskApiTest/Vosk/vosk_api.h index 8af3ee85..7636caa6 100644 --- a/ios/VoskApiTest/Vosk/vosk_api.h +++ b/ios/VoskApiTest/Vosk/vosk_api.h @@ -1,4 +1,4 @@ -// Copyright 2020 Alpha Cephei Inc. +// Copyright 2020-2021 Alpha Cephei Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -43,7 +43,7 @@ typedef struct VoskRecognizer VoskRecognizer; /** Loads model data from the file and returns the model object * * @param model_path: the path of the model on the filesystem - @ @returns model object */ + * @returns model object or NULL if problem occured */ VoskModel *vosk_model_new(const char *model_path); @@ -55,10 +55,18 @@ VoskModel *vosk_model_new(const char *model_path); void vosk_model_free(VoskModel *model); +/** Check if a word can be recognized by the model + * @param word: the word + * @returns the word symbol if @param word exists inside the model + * or -1 otherwise. 
+ * Note that word symbol 0 is for <epsilon> */ +int vosk_model_find_word(VoskModel *model, const char *word); + + /** Loads speaker model data from the file and returns the model object * * @param model_path: the path of the model on the filesystem - * @returns model object */ + * @returns model object or NULL if a problem occurred */ VoskSpkModel *vosk_spk_model_new(const char *model_path); @@ -71,9 +79,13 @@ void vosk_spk_model_free(VoskSpkModel *model); /** Creates the recognizer object * - * The recognizers process the speech and return text using shared model data - * @param sample_rate The sample rate of the audio you going to feed into the recognizer - * @returns recognizer object */ + * The recognizers process the speech and return text using shared model data + * @param model VoskModel containing static data for recognizer. Model can be + * shared across recognizers, even running in different threads. + * @param sample_rate The sample rate of the audio you are going to feed into the recognizer. + * Make sure this rate matches the audio content, it is a common + * issue causing accuracy problems. + * @returns recognizer object or NULL if a problem occurred */ VoskRecognizer *vosk_recognizer_new(VoskModel *model, float sample_rate); @@ -82,10 +94,14 @@ VoskRecognizer *vosk_recognizer_new(VoskModel *model, float sample_rate); * With the speaker recognition mode the recognizer not just recognize * text but also return speaker vectors one can use for speaker identification * + * @param model VoskModel containing static data for recognizer. Model can be + * shared across recognizers, even running in different threads. + * @param sample_rate The sample rate of the audio you are going to feed into the recognizer. + * Make sure this rate matches the audio content, it is a common + * issue causing accuracy problems. 
* @param spk_model speaker model for speaker identification - * @param sample_rate The sample rate of the audio you going to feed into the recognizer - * @returns recognizer object */ -VoskRecognizer *vosk_recognizer_new_spk(VoskModel *model, VoskSpkModel *spk_model, float sample_rate); + * @returns recognizer object or NULL if a problem occurred */ +VoskRecognizer *vosk_recognizer_new_spk(VoskModel *model, float sample_rate, VoskSpkModel *spk_model); /** Creates the recognizer object with the phrase list @@ -98,42 +114,46 @@ VoskRecognizer *vosk_recognizer_new_spk(VoskModel *model, VoskSpkModel *spk_mode * Only recognizers with lookahead models support this type of quick configuration. * Precompiled HCLG graph models are not supported. * - * @param sample_rate The sample rate of the audio you going to feed into the recognizer + * @param model VoskModel containing static data for recognizer. Model can be + * shared across recognizers, even running in different threads. + * @param sample_rate The sample rate of the audio you are going to feed into the recognizer. + * Make sure this rate matches the audio content, it is a common + * issue causing accuracy problems. * @param grammar The string with the list of phrases to recognize as JSON array of strings, * for example "["one two three four five", "[unk]"]". * - * @returns recognizer object */ + * @returns recognizer object or NULL if a problem occurred */ VoskRecognizer *vosk_recognizer_new_grm(VoskModel *model, float sample_rate, const char *grammar); -/** Accept voice data +/** Adds speaker model to already initialized recognizer * - * accept and process new chunk of voice data + * Can add speaker recognition model to already created recognizer. Helps to initialize + * speaker recognition for grammar-based recognizer. 
* - * @param data - audio data in PCM 16-bit mono format - * @param length - length of the audio data - * @returns true if silence is occured and you can retrieve a new utterance with result method */ -int vosk_recognizer_accept_waveform(VoskRecognizer *recognizer, const char *data, int length); + * @param spk_model Speaker recognition model */ +void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk_model); -/** Same as above but the version with the short data for language bindings where you have - * audio as array of shorts */ -int vosk_recognizer_accept_waveform_s(VoskRecognizer *recognizer, const short *data, int length); - - -/** Same as above but the version with the float data for language bindings where you have - * audio as array of floats */ -int vosk_recognizer_accept_waveform_f(VoskRecognizer *recognizer, const float *data, int length); +/** Configures recognizer to output n-best results + * + *
+ *   {
+ *      "alternatives": [
+ *          { "text": "one two three four five", "confidence": 0.97 },
+ *          { "text": "one two three for five", "confidence": 0.03 }
+ *      ]
+ *   }
+ * 
+ * + * @param max_alternatives - maximum alternatives to return from recognition results + */ +void vosk_recognizer_set_max_alternatives(VoskRecognizer *recognizer, int max_alternatives); -/** Returns speech recognition result - * - * @returns the result in JSON format which contains decoded line, decoded - * words, times in seconds and confidences. You can parse this result - * with any json parser +/** Enables words with times in the output * *
- * {
  *   "result" : [{
  *       "conf" : 1.000000,
  *       "end" : 1.110000,
@@ -156,13 +176,54 @@ int vosk_recognizer_accept_waveform_f(VoskRecognizer *recognizer, const float *d
  *       "word" : "zero"
  *     }, {
  *       "conf" : 1.000000,
- *      "end" : 2.610000,
+ *       "end" : 2.610000,
  *       "start" : 2.340000,
  *       "word" : "one"
  *     }],
- *   "text" : "what zero zero zero one"
+ * 
+ * + * @param words - boolean value + */ +void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words); + + +/** Accept voice data + * + * accept and process new chunk of voice data + * + * @param data - audio data in PCM 16-bit mono format + * @param length - length of the audio data + * @returns 1 if silence has occurred and you can retrieve a new utterance with result method + * 0 if decoding continues + * -1 if an exception occurred */ +int vosk_recognizer_accept_waveform(VoskRecognizer *recognizer, const char *data, int length); + + +/** Same as above but the version with the short data for language bindings where you have + * audio as array of shorts */ +int vosk_recognizer_accept_waveform_s(VoskRecognizer *recognizer, const short *data, int length); + + +/** Same as above but the version with the float data for language bindings where you have + * audio as array of floats */ +int vosk_recognizer_accept_waveform_f(VoskRecognizer *recognizer, const float *data, int length); + + +/** Returns speech recognition result + * + * @returns the result in JSON format which contains decoded line, decoded + * words, times in seconds and confidences. You can parse this result + * with any json parser + * + * 
+ *  {
+ *    "text" : "what zero zero zero one"
  *  }
  * 
+ * + * If alternatives are enabled, it returns the result with alternatives; see also vosk_recognizer_set_max_alternatives(). + * + * If word times are enabled, it returns word times; see also vosk_recognizer_set_words(). */ const char *vosk_recognizer_result(VoskRecognizer *recognizer); @@ -174,7 +235,7 @@ const char *vosk_recognizer_result(VoskRecognizer *recognizer); * * 
  * {
- *  "partial" : "cyril one eight zero"
+ *    "partial" : "cyril one eight zero"
  * }
  * 
*/ @@ -190,6 +251,12 @@ const char *vosk_recognizer_partial_result(VoskRecognizer *recognizer); const char *vosk_recognizer_final_result(VoskRecognizer *recognizer); +/** Resets the recognizer + * + * Resets current results so the recognition can continue from scratch */ +void vosk_recognizer_reset(VoskRecognizer *recognizer); + + /** Releases recognizer object * * Underlying model is also unreferenced and if needed released */ @@ -204,6 +271,20 @@ void vosk_recognizer_free(VoskRecognizer *recognizer); */ void vosk_set_log_level(int log_level); +/** + * Init, automatically select a CUDA device and allow multithreading. + * Must be called once from the main thread. + * Has no effect if HAVE_CUDA flag is not set. + */ +void vosk_gpu_init(); + +/** + * Init CUDA device in a multi-threaded environment. + * Must be called for each thread. + * Has no effect if HAVE_CUDA flag is not set. + */ +void vosk_gpu_thread_init(); + #ifdef __cplusplus } #endif