Skip to content

Commit

Permalink
Add NLSML output
Browse files Browse the repository at this point in the history
  • Loading branch information
nshmyrev committed Feb 3, 2022
1 parent d2c11a6 commit 79b8395
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 1 deletion.
31 changes: 31 additions & 0 deletions python/example/test_nlsml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python3

from vosk import Model, KaldiRecognizer, SetLogLevel
import sys
import os
import wave

SetLogLevel(0)

# The WAV file to recognize is the only command-line argument; fail with a
# readable message instead of an IndexError when it is missing.
if len(sys.argv) < 2:
    print("Usage: %s <audio.wav>" % sys.argv[0])
    sys.exit(1)

if not os.path.exists("model"):
    print ("Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder.")
    sys.exit(1)

# The context manager closes the WAV file on every exit path, including the
# early exit on an unsupported format.
with wave.open(sys.argv[1], "rb") as wf:
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        print ("Audio file must be WAV format mono PCM.")
        sys.exit(1)

    model = Model("model")
    rec = KaldiRecognizer(model, wf.getframerate())
    # The recognizer only consults the NLSML flag when alternatives are
    # enabled (max alternatives != 0), so both settings are required.
    rec.SetMaxAlternatives(10)
    rec.SetNLSML(True)

    # Feed the audio in chunks of 4000 frames; print a result each time the
    # recognizer reports that one is ready.
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            print(rec.Result())

    print(rec.FinalResult())
3 changes: 3 additions & 0 deletions python/vosk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ def SetMaxAlternatives(self, max_alternatives):
def SetWords(self, enable_words):
    """Forward the ``enable_words`` switch to the native recognizer.

    Any truthy value is passed to the C library as 1, anything else as 0.
    """
    flag = int(bool(enable_words))
    _c.vosk_recognizer_set_words(self._handle, flag)

def SetNLSML(self, enable_nlsml):
    """Forward the ``enable_nlsml`` switch to the native recognizer.

    Any truthy value is passed to the C library as 1, anything else as 0.
    """
    flag = int(bool(enable_nlsml))
    _c.vosk_recognizer_set_nlsml(self._handle, flag)

def SetSpkModel(self, spk_model):
_c.vosk_recognizer_set_spk_model(self._handle, spk_model._handle)

Expand Down
65 changes: 64 additions & 1 deletion src/recognizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,11 @@ void Recognizer::SetWords(bool words)
words_ = words;
}

void Recognizer::SetNLSML(bool nlsml)
{
nlsml_ = nlsml;
}

void Recognizer::SetSpkModel(SpkModel *spk_model)
{
if (state_ == RECOGNIZER_RUNNING) {
Expand Down Expand Up @@ -534,7 +539,6 @@ const char *Recognizer::NbestResult(CompactLattice &clat)
fst::ConvertNbestToVector(nbest_lat, &nbest_lats);

json::JSON obj;
std::stringstream ss;
for (int k = 0; k < nbest_lats.size(); k++) {

Lattice nlat = nbest_lats[k];
Expand Down Expand Up @@ -584,6 +588,63 @@ const char *Recognizer::NbestResult(CompactLattice &clat)
return StoreReturn(obj.dump());
}

// Renders the n-best hypotheses of the lattice as an NLSML (XML) document and
// returns it through StoreReturn().
//
// Up to max_alternatives_ shortest paths are extracted from clat. Each one
// becomes an <interpretation> element whose <input> and <instance> both hold
// the recognized words joined by single spaces. Word symbols are XML-escaped
// so that symbols containing markup characters (for example bracketed special
// tokens) cannot produce a malformed document.
//
// NOTE(review): the "confidence" attribute carries the negated sum of the two
// lattice weight components, not a value normalised to [0, 1] - confirm that
// NLSML consumers accept this.
const char *Recognizer::NlsmlResult(CompactLattice &clat)
{
    // Replaces the five XML special characters with their predefined entities.
    auto xml_escape = [](const std::string &raw) -> std::string {
        std::string out;
        out.reserve(raw.size());
        for (char c : raw) {
            switch (c) {
            case '&':  out += "&amp;";  break;
            case '<':  out += "&lt;";   break;
            case '>':  out += "&gt;";   break;
            case '"':  out += "&quot;"; break;
            case '\'': out += "&apos;"; break;
            default:   out += c;        break;
            }
        }
        return out;
    };

    Lattice lat;
    Lattice nbest_lat;
    std::vector<Lattice> nbest_lats;

    ConvertLattice (clat, &lat);
    fst::ShortestPath(lat, &nbest_lat, max_alternatives_);
    fst::ConvertNbestToVector(nbest_lat, &nbest_lats);

    std::stringstream ss;
    ss << "<?xml version=\"1.0\"?>\n";
    ss << "<result grammar=\"default\">\n";

    for (size_t k = 0; k < nbest_lats.size(); k++) {

        // Work on a copy: Invert() modifies the lattice in place.
        Lattice nlat = nbest_lats[k];

        CompactLattice nclat;
        fst::Invert(&nlat);
        DeterminizeLattice(nlat, &nclat);

        // Word alignment is only applied when the model provides word
        // boundary info; otherwise the determinized lattice is used as is.
        CompactLattice aligned_nclat;
        if (model_->winfo_) {
            WordAlignLattice(nclat, *model_->trans_model_, *model_->winfo_, 0, &aligned_nclat);
        } else {
            aligned_nclat = nclat;
        }

        std::vector<int32> words;
        std::vector<int32> begin_times;
        std::vector<int32> lengths;
        CompactLattice::Weight weight;

        CompactLatticeToWordAlignmentWeight(aligned_nclat, &words, &begin_times, &lengths, &weight);
        float likelihood = -(weight.Weight().Value1() + weight.Weight().Value2());

        // Join the words with single spaces. The separator depends on whether
        // a word has already been written, not on the index, so a skipped
        // leading id 0 does not leave a leading space.
        std::stringstream text;
        bool have_word = false;
        for (size_t i = 0; i < words.size(); i++) {
            if (words[i] == 0)
                continue;
            if (have_word)
                text << " ";
            text << model_->word_syms_->Find(words[i]);
            have_word = true;
        }
        const std::string escaped = xml_escape(text.str());

        ss << "<interpretation grammar=\"default\" confidence=\"" << likelihood << "\">\n";
        ss << "<input mode=\"speech\">" << escaped << "</input>\n";
        ss << "<instance>" << escaped << "</instance>\n";
        ss << "</interpretation>\n";
    }
    ss << "</result>\n";

    return StoreReturn(ss.str());
}

const char* Recognizer::GetResult()
{
if (decoder_->NumFramesDecoded() == 0) {
Expand Down Expand Up @@ -638,6 +699,8 @@ const char* Recognizer::GetResult()

if (max_alternatives_ == 0) {
return MbrResult(rlat);
} else if (nlsml_) {
return NlsmlResult(rlat);
} else {
return NbestResult(rlat);
}
Expand Down
3 changes: 3 additions & 0 deletions src/recognizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class Recognizer {
void SetMaxAlternatives(int max_alternatives);
void SetSpkModel(SpkModel *spk_model);
void SetWords(bool words);
void SetNLSML(bool nlsml);
bool AcceptWaveform(const char *data, int len);
bool AcceptWaveform(const short *sdata, int len);
bool AcceptWaveform(const float *fdata, int len);
Expand All @@ -69,6 +70,7 @@ class Recognizer {
const char *StoreReturn(const string &res);
const char *MbrResult(CompactLattice &clat);
const char *NbestResult(CompactLattice &clat);
const char *NlsmlResult(CompactLattice &clat);

Model *model_ = nullptr;
SingleUtteranceNnet3Decoder *decoder_ = nullptr;
Expand All @@ -94,6 +96,7 @@ class Recognizer {
// Other
int max_alternatives_ = 0; // Disable alternatives by default
bool words_ = false;
bool nlsml_ = false;

float sample_frequency_;
int32 frame_offset_;
Expand Down
5 changes: 5 additions & 0 deletions src/vosk_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,11 @@ void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words)
((Recognizer *)recognizer)->SetWords((bool)words);
}

// C API entry point: forwards the NLSML on/off flag to the Recognizer behind
// the opaque handle. Any non-zero value enables NLSML output.
void vosk_recognizer_set_nlsml(VoskRecognizer *recognizer, int nlsml)
{
    Recognizer *rec = (Recognizer *)recognizer;
    const bool enable = (nlsml != 0);
    rec->SetNLSML(enable);
}

void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk_model)
{
if (recognizer == nullptr || spk_model == nullptr) {
Expand Down
6 changes: 6 additions & 0 deletions src/vosk_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,12 @@ void vosk_recognizer_set_max_alternatives(VoskRecognizer *recognizer, int max_al
void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words);


/** Enables or disables NLSML (XML) output of recognition results
 *
 *  The recognizer consults this flag only on the alternatives (n-best) result
 *  path, so it has an effect only when max alternatives is non-zero, see
 *  vosk_recognizer_set_max_alternatives().
 *
 *  @param recognizer - recognizer object
 *  @param nlsml - boolean value: non-zero enables NLSML output, zero disables it
 */
void vosk_recognizer_set_nlsml(VoskRecognizer *recognizer, int nlsml);


/** Accept voice data
*
* accept and process new chunk of voice data
Expand Down

0 comments on commit 79b8395

Please sign in to comment.