From 79b8395be055a9398fbd8f2105b0321fb186ebff Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev
Date: Thu, 3 Feb 2022 23:08:09 +0100
Subject: [PATCH] Add NLSML output

---
 python/example/test_nlsml.py | 31 +++++++++++++++++
 python/vosk/__init__.py      |  3 ++
 src/recognizer.cc            | 65 +++++++++++++++++++++++++++++++++++-
 src/recognizer.h             |  3 ++
 src/vosk_api.cc              |  5 +++
 src/vosk_api.h               |  6 ++++
 6 files changed, 112 insertions(+), 1 deletion(-)
 create mode 100755 python/example/test_nlsml.py

diff --git a/python/example/test_nlsml.py b/python/example/test_nlsml.py
new file mode 100755
index 00000000..18132093
--- /dev/null
+++ b/python/example/test_nlsml.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+
+from vosk import Model, KaldiRecognizer, SetLogLevel
+import sys
+import os
+import wave
+
+SetLogLevel(0)
+
+if not os.path.exists("model"):
+    print ("Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.")
+    exit (1)
+
+wf = wave.open(sys.argv[1], "rb")
+if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
+    print ("Audio file must be WAV format mono PCM.")
+    exit (1)
+
+model = Model("model")
+rec = KaldiRecognizer(model, wf.getframerate())
+rec.SetMaxAlternatives(10)
+rec.SetNLSML(True)
+
+while True:
+    data = wf.readframes(4000)
+    if len(data) == 0:
+        break
+    if rec.AcceptWaveform(data):
+        print(rec.Result())
+
+print(rec.FinalResult())
diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py
index 0e60c2ba..d8e384b9 100644
--- a/python/vosk/__init__.py
+++ b/python/vosk/__init__.py
@@ -69,6 +69,9 @@ def SetMaxAlternatives(self, max_alternatives):
     def SetWords(self, enable_words):
         _c.vosk_recognizer_set_words(self._handle, 1 if enable_words else 0)
 
+    def SetNLSML(self, enable_nlsml):
+        _c.vosk_recognizer_set_nlsml(self._handle, 1 if enable_nlsml else 0)
+
     def SetSpkModel(self, spk_model):
         _c.vosk_recognizer_set_spk_model(self._handle, spk_model._handle)
 
diff --git a/src/recognizer.cc b/src/recognizer.cc
index f25ff0ee..cfcf638a 100644
--- a/src/recognizer.cc
+++ b/src/recognizer.cc
@@ -246,6 +246,11 @@ void Recognizer::SetWords(bool words)
     words_ = words;
 }
 
+void Recognizer::SetNLSML(bool nlsml)
+{
+    nlsml_ = nlsml;
+}
+
 void Recognizer::SetSpkModel(SpkModel *spk_model)
 {
     if (state_ == RECOGNIZER_RUNNING) {
@@ -534,7 +539,6 @@ const char *Recognizer::NbestResult(CompactLattice &clat)
     fst::ConvertNbestToVector(nbest_lat, &nbest_lats);
 
     json::JSON obj;
-    std::stringstream ss;
     for (int k = 0; k < nbest_lats.size(); k++) {
 
       Lattice nlat = nbest_lats[k];
@@ -584,6 +588,63 @@ const char *Recognizer::NbestResult(CompactLattice &clat)
     return StoreReturn(obj.dump());
 }
 
+const char *Recognizer::NlsmlResult(CompactLattice &clat)
+{
+    Lattice lat;
+    Lattice nbest_lat;
+    std::vector<Lattice> nbest_lats;
+
+    ConvertLattice (clat, &lat);
+    fst::ShortestPath(lat, &nbest_lat, max_alternatives_);
+    fst::ConvertNbestToVector(nbest_lat, &nbest_lats);
+
+    std::stringstream ss;
+    ss << "<?xml version=\"1.0\"?>\n";
+    ss << "<result grammar=\"default\">\n";
+
+    for (int k = 0; k < nbest_lats.size(); k++) {
+
+      Lattice nlat = nbest_lats[k];
+
+      CompactLattice nclat;
+      fst::Invert(&nlat);
+      DeterminizeLattice(nlat, &nclat);
+
+      CompactLattice aligned_nclat;
+      if (model_->winfo_) {
+          WordAlignLattice(nclat, *model_->trans_model_, *model_->winfo_, 0, &aligned_nclat);
+      } else {
+          aligned_nclat = nclat;
+      }
+
+      std::vector<int32> words;
+      std::vector<int32> begin_times;
+      std::vector<int32> lengths;
+      CompactLattice::Weight weight;
+
+      CompactLatticeToWordAlignmentWeight(aligned_nclat, &words, &begin_times, &lengths, &weight);
+      float likelihood = -(weight.Weight().Value1() + weight.Weight().Value2());
+
+      stringstream text;
+      for (int i = 0; i < words.size(); i++) {
+        json::JSON word;
+        if (words[i] == 0)
+            continue;
+        if (i)
+            text << " ";
+        text << model_->word_syms_->Find(words[i]);
+      }
+
+      ss << "<interpretation grammar=\"default\" confidence=\"" << likelihood << "\">\n";
+      ss << "<input mode=\"speech\">" << text.str() << "</input>\n";
+      ss << "<instance>" << text.str() << "</instance>\n";
+      ss << "</interpretation>\n";
+    }
+    ss << "</result>\n";
+
+    return StoreReturn(ss.str());
+}
+
 const char* Recognizer::GetResult()
 {
     if (decoder_->NumFramesDecoded() == 0) {
@@ -638,6 +699,8 @@ const char* Recognizer::GetResult()
 
     if (max_alternatives_ == 0) {
         return MbrResult(rlat);
+    } else if (nlsml_) {
+        return NlsmlResult(rlat);
     } else {
         return NbestResult(rlat);
     }
diff --git a/src/recognizer.h b/src/recognizer.h
index e5a733d1..b0338a01 100644
--- a/src/recognizer.h
+++ b/src/recognizer.h
@@ -49,6 +49,7 @@ class Recognizer {
         void SetMaxAlternatives(int max_alternatives);
         void SetSpkModel(SpkModel *spk_model);
         void SetWords(bool words);
+        void SetNLSML(bool nlsml);
         bool AcceptWaveform(const char *data, int len);
         bool AcceptWaveform(const short *sdata, int len);
         bool AcceptWaveform(const float *fdata, int len);
@@ -69,6 +70,7 @@ class Recognizer {
         const char *StoreReturn(const string &res);
         const char *MbrResult(CompactLattice &clat);
         const char *NbestResult(CompactLattice &clat);
+        const char *NlsmlResult(CompactLattice &clat);
 
         Model *model_ = nullptr;
         SingleUtteranceNnet3Decoder *decoder_ = nullptr;
@@ -94,6 +96,7 @@ class Recognizer {
         // Other
         int max_alternatives_ = 0; // Disable alternatives by default
         bool words_ = false;
+        bool nlsml_ = false;
 
         float sample_frequency_;
         int32 frame_offset_;
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index 65356038..5df70715 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -103,6 +103,11 @@ void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words)
     ((Recognizer *)recognizer)->SetWords((bool)words);
 }
 
+void vosk_recognizer_set_nlsml(VoskRecognizer *recognizer, int nlsml)
+{
+    ((Recognizer *)recognizer)->SetNLSML((bool)nlsml);
+}
+
 void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk_model)
 {
     if (recognizer == nullptr || spk_model == nullptr) {
diff --git a/src/vosk_api.h b/src/vosk_api.h
index f6a981cb..c448087f 100644
--- a/src/vosk_api.h
+++ b/src/vosk_api.h
@@ -191,6 +191,12 @@ void vosk_recognizer_set_max_alternatives(VoskRecognizer *recognizer, int max_al
 void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words);
 
 
+/** Set NLSML output
+ * @param nlsml - boolean value
+ */
+void vosk_recognizer_set_nlsml(VoskRecognizer *recognizer, int nlsml);
+
+
 /** Accept voice data
  *
  *  accept and process new chunk of voice data