From 79b8395be055a9398fbd8f2105b0321fb186ebff Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Thu, 3 Feb 2022 23:08:09 +0100
Subject: [PATCH] Add NLSML output

---
 python/example/test_nlsml.py | 31 +++++++++++++++++
 python/vosk/__init__.py      |  3 ++
 src/recognizer.cc            | 65 +++++++++++++++++++++++++++++++++++-
 src/recognizer.h             |  3 ++
 src/vosk_api.cc              |  5 +++
 src/vosk_api.h               |  6 ++++
 6 files changed, 112 insertions(+), 1 deletion(-)
 create mode 100755 python/example/test_nlsml.py
diff --git a/python/example/test_nlsml.py b/python/example/test_nlsml.py
new file mode 100755
index 00000000..18132093
--- /dev/null
+++ b/python/example/test_nlsml.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+# Example: decode the mono 16-bit PCM WAV file given as argv[1] and print results with NLSML enabled.
+from vosk import Model, KaldiRecognizer, SetLogLevel
+import sys
+import os
+import wave
+
+SetLogLevel(0)
+
+if not os.path.exists("model"):
+    print ("Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.")
+    sys.exit(1)  # sys.exit() rather than the site-provided exit(), which is meant for interactive use
+
+wf = wave.open(sys.argv[1], "rb")
+if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
+    print ("Audio file must be WAV format mono PCM.")
+    sys.exit(1)
+
+model = Model("model")
+rec = KaldiRecognizer(model, wf.getframerate())
+rec.SetMaxAlternatives(10)  # Recognizer::GetResult picks NLSML only when max alternatives is non-zero
+rec.SetNLSML(True)
+
+while True:
+    data = wf.readframes(4000)
+    if len(data) == 0:
+        break
+    if rec.AcceptWaveform(data):
+        print(rec.Result())
+
+print(rec.FinalResult())
diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py
index 0e60c2ba..d8e384b9 100644
--- a/python/vosk/__init__.py
+++ b/python/vosk/__init__.py
@@ -69,6 +69,9 @@ def SetMaxAlternatives(self, max_alternatives):
     def SetWords(self, enable_words):
         _c.vosk_recognizer_set_words(self._handle, 1 if enable_words else 0)
 
+    def SetNLSML(self, enable_nlsml):  # truthy -> 1, falsy -> 0 for the C API's int flag
+        _c.vosk_recognizer_set_nlsml(self._handle, 1 if enable_nlsml else 0)
+
     def SetSpkModel(self, spk_model):
         _c.vosk_recognizer_set_spk_model(self._handle, spk_model._handle)
 
diff --git a/src/recognizer.cc b/src/recognizer.cc
index f25ff0ee..cfcf638a 100644
--- a/src/recognizer.cc
+++ b/src/recognizer.cc
@@ -246,6 +246,11 @@ void Recognizer::SetWords(bool words)
     words_ = words;
 }
 
+void Recognizer::SetNLSML(bool nlsml)
+{
+    nlsml_ = nlsml; // read in GetResult(): selects NlsmlResult over NbestResult when max_alternatives_ != 0
+}
+
 void Recognizer::SetSpkModel(SpkModel *spk_model)
 {
     if (state_ == RECOGNIZER_RUNNING) {
@@ -534,7 +539,6 @@ const char *Recognizer::NbestResult(CompactLattice &clat)
     fst::ConvertNbestToVector(nbest_lat, &nbest_lats);
 
     json::JSON obj;
-    std::stringstream ss;
     for (int k = 0; k < nbest_lats.size(); k++) {
 
       Lattice nlat = nbest_lats[k];
@@ -584,6 +588,63 @@ const char *Recognizer::NbestResult(CompactLattice &clat)
     return StoreReturn(obj.dump());
 }
 
+// Formats up to max_alternatives_ best paths of clat as an NLSML <result> document.
+// Word text is XML-escaped so symbols containing '<', '>' or '&' keep the output well-formed.
+const char *Recognizer::NlsmlResult(CompactLattice &clat)
+{
+    Lattice lat;
+    Lattice nbest_lat;
+    std::vector<Lattice> nbest_lats;
+
+    ConvertLattice (clat, &lat);
+    fst::ShortestPath(lat, &nbest_lat, max_alternatives_);
+    fst::ConvertNbestToVector(nbest_lat, &nbest_lats);
+
+    std::stringstream ss;
+    ss << "<?xml version=\"1.0\"?>\n";
+    ss << "<result grammar=\"default\">\n";
+
+    for (size_t k = 0; k < nbest_lats.size(); k++) {
+      Lattice nlat = nbest_lats[k];
+
+      CompactLattice nclat;
+      fst::Invert(&nlat);
+      DeterminizeLattice(nlat, &nclat);
+
+      CompactLattice aligned_nclat;
+      if (model_->winfo_) {
+          WordAlignLattice(nclat, *model_->trans_model_, *model_->winfo_, 0, &aligned_nclat);
+      } else {
+          aligned_nclat = nclat;
+      }
+
+      std::vector<int32> words, begin_times, lengths;
+      CompactLattice::Weight weight;
+      CompactLatticeToWordAlignmentWeight(aligned_nclat, &words, &begin_times, &lengths, &weight);
+      float likelihood = -(weight.Weight().Value1() + weight.Weight().Value2()); // NOTE(review): not a 0..1 confidence
+
+      std::string text;
+      for (size_t i = 0; i < words.size(); i++) {
+        if (words[i] == 0)
+            continue;
+        if (!text.empty())
+          text += ' '; // separator goes between emitted words only, so no leading space
+        for (char c : model_->word_syms_->Find(words[i])) {
+          if (c == '&') text += "&amp;";
+          else if (c == '<') text += "&lt;";
+          else if (c == '>') text += "&gt;";
+          else text += c;
+        }
+      }
+      ss << "<interpretation grammar=\"default\" confidence=\"" << likelihood << "\">\n";
+      ss << "<input mode=\"speech\">" << text << "</input>\n";
+      ss << "<instance>" << text << "</instance>\n";
+      ss << "</interpretation>\n";
+    }
+    ss << "</result>\n";
+    return StoreReturn(ss.str());
+}
+
 const char* Recognizer::GetResult()
 {
     if (decoder_->NumFramesDecoded() == 0) {
@@ -638,6 +699,8 @@ const char* Recognizer::GetResult()
 
     if (max_alternatives_ == 0) {
         return MbrResult(rlat);
+    } else if (nlsml_) {
+        return NlsmlResult(rlat);
     } else {
         return NbestResult(rlat);
     }
diff --git a/src/recognizer.h b/src/recognizer.h
index e5a733d1..b0338a01 100644
--- a/src/recognizer.h
+++ b/src/recognizer.h
@@ -49,6 +49,7 @@ class Recognizer {
         void SetMaxAlternatives(int max_alternatives);
         void SetSpkModel(SpkModel *spk_model);
         void SetWords(bool words);
+        void SetNLSML(bool nlsml);
         bool AcceptWaveform(const char *data, int len);
         bool AcceptWaveform(const short *sdata, int len);
         bool AcceptWaveform(const float *fdata, int len);
@@ -69,6 +70,7 @@ class Recognizer {
         const char *StoreReturn(const string &res);
         const char *MbrResult(CompactLattice &clat);
         const char *NbestResult(CompactLattice &clat);
+        const char *NlsmlResult(CompactLattice &clat);
 
         Model *model_ = nullptr;
         SingleUtteranceNnet3Decoder *decoder_ = nullptr;
@@ -94,6 +96,7 @@ class Recognizer {
         // Other
         int max_alternatives_ = 0; // Disable alternatives by default
         bool words_ = false;
+        bool nlsml_ = false;
 
         float sample_frequency_;
         int32 frame_offset_;
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index 65356038..5df70715 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -103,6 +103,11 @@ void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words)
     ((Recognizer *)recognizer)->SetWords((bool)words);
 }
 
+void vosk_recognizer_set_nlsml(VoskRecognizer *recognizer, int nlsml) // any non-zero nlsml enables NLSML
+{
+    ((Recognizer *)recognizer)->SetNLSML((bool)nlsml); // NOTE(review): recognizer is dereferenced without a null check
+}
+
 void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk_model)
 {
     if (recognizer == nullptr || spk_model == nullptr) {
diff --git a/src/vosk_api.h b/src/vosk_api.h
index f6a981cb..c448087f 100644
--- a/src/vosk_api.h
+++ b/src/vosk_api.h
@@ -191,6 +191,12 @@ void vosk_recognizer_set_max_alternatives(VoskRecognizer *recognizer, int max_al
 void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words);
 
 
+/** Enable or disable NLSML (XML) output; it is used only when max alternatives is non-zero
+ * @param nlsml - boolean value: non-zero enables NLSML, 0 disables it
+ */
+void vosk_recognizer_set_nlsml(VoskRecognizer *recognizer, int nlsml);
+
+
 /** Accept voice data
  *
  *  accept and process new chunk of voice data