From 6977be7fb78a82dbae7c80eeab360f46c2035c5e Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Sun, 12 Dec 2021 21:37:44 +0100
Subject: [PATCH 1/8] Batch recognizer draft

---
 python/example/batch/test_batch.py         |  26 +++++
 python/vosk/__init__.py                    |  20 ++++
 src/Makefile                               |  11 ++-
 src/batch_recognizer.cc                    | 107 +++++++++++++++++++++
 src/batch_recognizer.h                     |  67 +++++++++++++
 src/model.h                                |   6 +-
 src/{kaldi_recognizer.cc => recognizer.cc} |  52 +++++-----
 src/{kaldi_recognizer.h => recognizer.h}   |  14 +--
 src/spk_model.h                            |   4 +-
 src/vosk_api.cc                            |  57 ++++++++---
 src/vosk_api.h                             |  24 +++++
 11 files changed, 333 insertions(+), 55 deletions(-)
 create mode 100755 python/example/batch/test_batch.py
 create mode 100644 src/batch_recognizer.cc
 create mode 100644 src/batch_recognizer.h
 rename src/{kaldi_recognizer.cc => recognizer.cc} (93%)
 rename src/{kaldi_recognizer.h => recognizer.h} (91%)

diff --git a/python/example/batch/test_batch.py b/python/example/batch/test_batch.py
new file mode 100755
index 00000000..fb1bb7e9
--- /dev/null
+++ b/python/example/batch/test_batch.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+from vosk import Model, BatchRecognizer
+import sys
+import os
+import wave
+
+model = Model("model")
+rec = BatchRecognizer(model, 16000.0)
+
+fnames = open("tedlium.list").readlines()
+fds = [open(x) for x in fnames]
+ended = set()
+while True:
+    for i, fd in fds:
+        if i in ended():
+            continue
+        data = fd.read(4000)
+        if len(data) == 0:
+            rec.FinishStream(i)
+            ended.add(i)
+        else:
+            rec.AcceptWaveform(i, data)
+    rec.Results()
+    if len(ended) == len(fds):
+        break
diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py
index cf39a472..02e1df97 100644
--- a/python/vosk/__init__.py
+++ b/python/vosk/__init__.py
@@ -101,3 +101,23 @@ def GpuInit():
 
 def GpuThreadInit():
     _c.vosk_gpu_thread_init()
+
+class BatchRecognizer(object):
+
+    def __init__(self, *args):
+        self._handle = _c.vosk_batch_recognizer_new(args[0]._handle, args[1])
+
+        if self._handle == _ffi.NULL:
+            raise Exception("Failed to create a recognizer")
+
+    def __del__(self):
+        _c.vosk_batch_recognizer_free(self._handle)
+
+    def AcceptWaveform(self, uid, data):
+        res = _c.vosk_batch_recognizer_accept_waveform(self._handle, uid, data, len(data))
+
+    def Results(self):
+        return _ffi.string(_c.vosk_batch_recognizer_result(self._handle)).decode('utf-8')
+
+    def FinishStream(self, uid):
+        _c.vosk_recognizer_final_result(self._handle, uid)
diff --git a/src/Makefile b/src/Makefile
index 54e96ca7..96c21949 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -18,16 +18,18 @@ EXTRA_LDFLAGS?=
 OUTDIR?=.
 
 VOSK_SOURCES= \
-	kaldi_recognizer.cc \
+	recognizer.cc \
 	language_model.cc \
 	model.cc \
 	spk_model.cc \
+	batch_recognizer.cc \
 	vosk_api.cc
 
 VOSK_HEADERS= \
-	kaldi_recognizer.h \
+	recognizer.h \
 	language_model.h \
 	model.h \
+	batch_recognizer.h \
 	spk_model.h \
 	vosk_api.h
 
@@ -76,7 +78,10 @@ endif
 
 ifeq ($(HAVE_CUDA), 1)
     CFLAGS+=-DHAVE_CUDA=1 -I$(CUDA_ROOT)/include
-    LIBS+=-L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt
+    LIBS+=\
+        $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \
+        $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \
+        -L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt
 endif
 
 all: $(OUTDIR)/libvosk.$(EXT)
diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
new file mode 100644
index 00000000..112f1fe9
--- /dev/null
+++ b/src/batch_recognizer.cc
@@ -0,0 +1,107 @@
+// Copyright 2019-2020 Alpha Cephei Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "batch_recognizer.h"
+
+#include "fstext/fstext-utils.h"
+#include "lat/sausages.h"
+
+using namespace fst;
+using namespace kaldi::nnet3;
+using CorrelationID = CudaOnlinePipelineDynamicBatcher::CorrelationID;
+
+BatchRecognizer::BatchRecognizer(Model *model, float sample_frequency) : model_(model), sample_frequency_(sample_frequency) {
+    model_->Ref();
+
+    BatchedThreadedNnet3CudaOnlinePipelineConfig batched_decoder_config;
+
+    cuda_pipeline_ = new BatchedThreadedNnet3CudaOnlinePipeline 
+         (batched_decoder_config, *model_->hclg_fst_, *model_->nnet_, *model_->trans_model_);
+
+    CudaOnlinePipelineDynamicBatcherConfig dynamic_batcher_config;
+    dynamic_batcher_ = new CudaOnlinePipelineDynamicBatcher(dynamic_batcher_config,
+                                                            *cuda_pipeline_);
+
+    InitRescoring();
+}
+
+BatchRecognizer::~BatchRecognizer() { // releases rescoring LMs, the CUDA decoding objects, then the model reference
+    delete lm_to_subtract_;
+    delete carpa_to_add_;
+    delete carpa_to_add_scale_;
+
+    delete dynamic_batcher_; // built from *cuda_pipeline_ (see constructor), so it must be destroyed before the pipeline
+    delete cuda_pipeline_;
+
+    model_->Unref(); // balances the Ref() taken in the constructor
+}
+
+void BatchRecognizer::InitRescoring()
+{
+    if (model_->graph_lm_fst_) {
+        fst::CacheOptions cache_opts(true, -1);
+        fst::ArcMapFstOptions mapfst_opts(cache_opts);
+        fst::StdToLatticeMapper<BaseFloat> mapper;
+        lm_to_subtract_ = new fst::ArcMapFst<fst::StdArc, LatticeArc, fst::StdToLatticeMapper<BaseFloat> >(*model_->graph_lm_fst_, mapper, mapfst_opts);
+        carpa_to_add_ = new ConstArpaLmDeterministicFst(model_->const_arpa_);
+    }
+}
+
+void BatchRecognizer::FinishStream(uint64_t id)
+{
+    streams_.erase(id);
+}
+
+void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len) // queue `len` bytes of 16-bit samples for stream `id`
+{
+    bool first = false; // becomes true when `id` has not been seen before
+
+    if (streams_.find(id) == streams_.end()) {
+        first = true;
+        streams_.insert(id);
+
+        // Define the callback for results.
+        cuda_pipeline_->SetBestPathCallback(
+          id,
+          [&, id](const std::string &str, bool partial,
+                       bool endpoint_detected) {
+              if (partial) {
+                  KALDI_LOG << "id #" << id << " [partial] : " << str;
+              }
+
+              if (endpoint_detected) {
+                  KALDI_LOG << "id #" << id << " [endpoint detected]";
+              }
+
+              if (!partial) {
+                  KALDI_LOG << "id #" << id << " : " << str;
+              }
+            });
+    }
+
+    Vector<BaseFloat> wave;
+    wave.Resize(len / 2, kUndefined); // two bytes per sample; every element is written by the loop below
+    for (int i = 0; i < len / 2; i++)
+        wave(i) = *(((short *)data) + i);
+    SubVector<BaseFloat> chunk(wave.Data(), wave.Dim()); // view over all converted samples; a length of 0 pushed no audio
+
+    dynamic_batcher_->Push(id, first, false, chunk);
+}
+
+const char* BatchRecognizer::PullResults()
+{
+    dynamic_batcher_->WaitForCompletion();
+    cudaDeviceSynchronize();
+    return "";
+}
diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h
new file mode 100644
index 00000000..00f4a0db
--- /dev/null
+++ b/src/batch_recognizer.h
@@ -0,0 +1,67 @@
+// Copyright 2019 Alpha Cephei Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//       http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef VOSK_GPU_RECOGNIZER_H
+#define VOSK_GPU_RECOGNIZER_H
+
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "fstext/fstext-lib.h"
+#include "fstext/fstext-utils.h"
+#include "decoder/lattice-faster-decoder.h"
+#include "feat/feature-mfcc.h"
+#include "lat/kaldi-lattice.h"
+#include "lat/word-align-lattice.h"
+#include "lat/compose-lattice-pruned.h"
+#include "nnet3/am-nnet-simple.h"
+#include "nnet3/nnet-am-decodable-simple.h"
+#include "nnet3/nnet-utils.h"
+
+#include "cudadecoder/cuda-online-pipeline-dynamic-batcher.h"
+#include "cudadecoder/batched-threaded-nnet3-cuda-online-pipeline.h"
+#include "cudadecoder/batched-threaded-nnet3-cuda-pipeline2.h"
+#include "cudadecoder/cuda-pipeline-common.h"
+
+#include "model.h"
+
+using namespace kaldi;
+using namespace kaldi::cuda_decoder;
+
+class BatchRecognizer { // feeds several audio streams, keyed by id, into one batched CUDA decoding pipeline
+    public:
+        BatchRecognizer(Model *model, float sample_frequency); // takes a reference on `model` for the object's lifetime
+        ~BatchRecognizer();
+
+        void FinishStream(uint64_t id); // forgets stream `id`; a later AcceptWaveform with the same id starts a new stream
+        void AcceptWaveform(uint64_t id, const char *data, int len); // `data` is read as 16-bit samples, `len` is in bytes
+        const char* PullResults(); // waits for the batcher to complete the work queued so far
+
+    private:
+        void InitRescoring(); // builds the rescoring LM objects when the model provides graph_lm_fst_
+
+        Model *model_ = nullptr;
+        BatchedThreadedNnet3CudaOnlinePipeline *cuda_pipeline_ = nullptr;
+        CudaOnlinePipelineDynamicBatcher *dynamic_batcher_ = nullptr; // constructed from *cuda_pipeline_
+
+        std::set<uint64_t> streams_; // ids of streams that have received audio; same type as the `id` parameters above
+
+        // Rescoring
+        fst::ArcMapFst<fst::StdArc, LatticeArc, fst::StdToLatticeMapper<BaseFloat> > *lm_to_subtract_ = nullptr;
+        kaldi::ConstArpaLmDeterministicFst *carpa_to_add_ = nullptr;
+        fst::ScaleDeterministicOnDemandFst *carpa_to_add_scale_ = nullptr;
+
+        float sample_frequency_;
+};
+
+#endif /* VOSK_GPU_RECOGNIZER_H */
diff --git a/src/model.h b/src/model.h
index d5feedd0..c36a96aa 100644
--- a/src/model.h
+++ b/src/model.h
@@ -36,7 +36,8 @@
 using namespace kaldi;
 using namespace std;
 
-class KaldiRecognizer;
+class Recognizer;
+class BatchRecognizer;
 
 class Model {
 
@@ -52,7 +53,8 @@ class Model {
     void ConfigureV2();
     void ReadDataFiles();
 
-    friend class KaldiRecognizer;
+    friend class Recognizer;
+    friend class BatchRecognizer;
 
     string model_path_str_;
     string nnet3_rxfilename_;
diff --git a/src/kaldi_recognizer.cc b/src/recognizer.cc
similarity index 93%
rename from src/kaldi_recognizer.cc
rename to src/recognizer.cc
index 86cf9bdd..f25ff0ee 100644
--- a/src/kaldi_recognizer.cc
+++ b/src/recognizer.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kaldi_recognizer.h"
+#include "recognizer.h"
 #include "json.h"
 #include "fstext/fstext-utils.h"
 #include "lat/sausages.h"
@@ -21,7 +21,7 @@
 using namespace fst;
 using namespace kaldi::nnet3;
 
-KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency) : model_(model), spk_model_(0), sample_frequency_(sample_frequency) {
+Recognizer::Recognizer(Model *model, float sample_frequency) : model_(model), spk_model_(0), sample_frequency_(sample_frequency) {
 
     model_->Ref();
 
@@ -46,7 +46,7 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency) : model_(
     InitRescoring();
 }
 
-KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, char const *grammar) : model_(model), spk_model_(0), sample_frequency_(sample_frequency)
+Recognizer::Recognizer(Model *model, float sample_frequency, char const *grammar) : model_(model), spk_model_(0), sample_frequency_(sample_frequency)
 {
     model_->Ref();
 
@@ -107,7 +107,7 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, char cons
     InitRescoring();
 }
 
-KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, SpkModel *spk_model) : model_(model), spk_model_(spk_model), sample_frequency_(sample_frequency) {
+Recognizer::Recognizer(Model *model, float sample_frequency, SpkModel *spk_model) : model_(model), spk_model_(spk_model), sample_frequency_(sample_frequency) {
 
     model_->Ref();
     spk_model->Ref();
@@ -135,7 +135,7 @@ KaldiRecognizer::KaldiRecognizer(Model *model, float sample_frequency, SpkModel
     InitRescoring();
 }
 
-KaldiRecognizer::~KaldiRecognizer() {
+Recognizer::~Recognizer() {
     delete decoder_;
     delete feature_pipeline_;
     delete silence_weighting_;
@@ -155,7 +155,7 @@ KaldiRecognizer::~KaldiRecognizer() {
          spk_model_->Unref();
 }
 
-void KaldiRecognizer::InitState()
+void Recognizer::InitState()
 {
     frame_offset_ = 0;
     samples_processed_ = 0;
@@ -164,7 +164,7 @@ void KaldiRecognizer::InitState()
     state_ = RECOGNIZER_INITIALIZED;
 }
 
-void KaldiRecognizer::InitRescoring()
+void Recognizer::InitRescoring()
 {
     if (model_->graph_lm_fst_) {
 
@@ -185,7 +185,7 @@ void KaldiRecognizer::InitRescoring()
     }
 }
 
-void KaldiRecognizer::CleanUp()
+void Recognizer::CleanUp()
 {
     delete silence_weighting_;
     silence_weighting_ = new kaldi::OnlineSilenceWeighting(*model_->trans_model_, model_->feature_info_.silence_weighting_config, 3);
@@ -223,7 +223,7 @@ void KaldiRecognizer::CleanUp()
     }
 }
 
-void KaldiRecognizer::UpdateSilenceWeights()
+void Recognizer::UpdateSilenceWeights()
 {
     if (silence_weighting_->Active() && feature_pipeline_->NumFramesReady() > 0 &&
         feature_pipeline_->IvectorFeature() != nullptr) {
@@ -236,17 +236,17 @@ void KaldiRecognizer::UpdateSilenceWeights()
     }
 }
 
-void KaldiRecognizer::SetMaxAlternatives(int max_alternatives)
+void Recognizer::SetMaxAlternatives(int max_alternatives)
 {
     max_alternatives_ = max_alternatives;
 }
 
-void KaldiRecognizer::SetWords(bool words)
+void Recognizer::SetWords(bool words)
 {
     words_ = words;
 }
 
-void KaldiRecognizer::SetSpkModel(SpkModel *spk_model)
+void Recognizer::SetSpkModel(SpkModel *spk_model)
 {
     if (state_ == RECOGNIZER_RUNNING) {
         KALDI_ERR << "Can't add speaker model to already running recognizer";
@@ -257,7 +257,7 @@ void KaldiRecognizer::SetSpkModel(SpkModel *spk_model)
     spk_feature_ = new OnlineMfcc(spk_model_->spkvector_mfcc_opts);
 }
 
-bool KaldiRecognizer::AcceptWaveform(const char *data, int len)
+bool Recognizer::AcceptWaveform(const char *data, int len)
 {
     Vector<BaseFloat> wave;
     wave.Resize(len / 2, kUndefined);
@@ -266,7 +266,7 @@ bool KaldiRecognizer::AcceptWaveform(const char *data, int len)
     return AcceptWaveform(wave);
 }
 
-bool KaldiRecognizer::AcceptWaveform(const short *sdata, int len)
+bool Recognizer::AcceptWaveform(const short *sdata, int len)
 {
     Vector<BaseFloat> wave;
     wave.Resize(len, kUndefined);
@@ -275,7 +275,7 @@ bool KaldiRecognizer::AcceptWaveform(const short *sdata, int len)
     return AcceptWaveform(wave);
 }
 
-bool KaldiRecognizer::AcceptWaveform(const float *fdata, int len)
+bool Recognizer::AcceptWaveform(const float *fdata, int len)
 {
     Vector<BaseFloat> wave;
     wave.Resize(len, kUndefined);
@@ -284,7 +284,7 @@ bool KaldiRecognizer::AcceptWaveform(const float *fdata, int len)
     return AcceptWaveform(wave);
 }
 
-bool KaldiRecognizer::AcceptWaveform(Vector<BaseFloat> &wdata)
+bool Recognizer::AcceptWaveform(Vector<BaseFloat> &wdata)
 {
     // Cleanup if we finalized previous utterance or the whole feature pipeline
     if (!(state_ == RECOGNIZER_RUNNING || state_ == RECOGNIZER_INITIALIZED)) {
@@ -343,7 +343,7 @@ static void RunNnetComputation(const MatrixBase<BaseFloat> &features,
 
 #define MIN_SPK_FEATS 50
 
-bool KaldiRecognizer::GetSpkVector(Vector<BaseFloat> &out_xvector, int *num_spk_frames)
+bool Recognizer::GetSpkVector(Vector<BaseFloat> &out_xvector, int *num_spk_frames)
 {
     vector<int32> nonsilence_frames;
     if (silence_weighting_->Active() && feature_pipeline_->NumFramesReady() > 0) {
@@ -409,7 +409,7 @@ bool KaldiRecognizer::GetSpkVector(Vector<BaseFloat> &out_xvector, int *num_spk_
 }
 
 
-const char *KaldiRecognizer::MbrResult(CompactLattice &rlat)
+const char *Recognizer::MbrResult(CompactLattice &rlat)
 {
     CompactLattice aligned_lat;
     if (model_->winfo_) {
@@ -523,7 +523,7 @@ static bool CompactLatticeToWordAlignmentWeight(const CompactLattice &clat,
 }
 
 
-const char *KaldiRecognizer::NbestResult(CompactLattice &clat)
+const char *Recognizer::NbestResult(CompactLattice &clat)
 {
     Lattice lat;
     Lattice nbest_lat;
@@ -584,7 +584,7 @@ const char *KaldiRecognizer::NbestResult(CompactLattice &clat)
     return StoreReturn(obj.dump());
 }
 
-const char* KaldiRecognizer::GetResult()
+const char* Recognizer::GetResult()
 {
     if (decoder_->NumFramesDecoded() == 0) {
         return StoreEmptyReturn();
@@ -645,7 +645,7 @@ const char* KaldiRecognizer::GetResult()
 }
 
 
-const char* KaldiRecognizer::PartialResult()
+const char* Recognizer::PartialResult()
 {
     if (state_ != RECOGNIZER_RUNNING) {
         return StoreEmptyReturn();
@@ -676,7 +676,7 @@ const char* KaldiRecognizer::PartialResult()
     return StoreReturn(res.dump());
 }
 
-const char* KaldiRecognizer::Result()
+const char* Recognizer::Result()
 {
     if (state_ != RECOGNIZER_RUNNING) {
         return StoreEmptyReturn();
@@ -686,7 +686,7 @@ const char* KaldiRecognizer::Result()
     return GetResult();
 }
 
-const char* KaldiRecognizer::FinalResult()
+const char* Recognizer::FinalResult()
 {
     if (state_ != RECOGNIZER_RUNNING) {
         return StoreEmptyReturn();
@@ -714,7 +714,7 @@ const char* KaldiRecognizer::FinalResult()
     return last_result_.c_str();
 }
 
-void KaldiRecognizer::Reset()
+void Recognizer::Reset()
 {
     if (state_ == RECOGNIZER_RUNNING) {
         decoder_->FinalizeDecoding();
@@ -723,7 +723,7 @@ void KaldiRecognizer::Reset()
     state_ = RECOGNIZER_ENDPOINT;
 }
 
-const char *KaldiRecognizer::StoreEmptyReturn()
+const char *Recognizer::StoreEmptyReturn()
 {
     if (!max_alternatives_) {
         return StoreReturn("{\"text\": \"\"}");
@@ -733,7 +733,7 @@ const char *KaldiRecognizer::StoreEmptyReturn()
 }
 
 // Store result in recognizer and return as const string
-const char *KaldiRecognizer::StoreReturn(const string &res)
+const char *Recognizer::StoreReturn(const string &res)
 {
     last_result_ = res;
     return last_result_.c_str();
diff --git a/src/kaldi_recognizer.h b/src/recognizer.h
similarity index 91%
rename from src/kaldi_recognizer.h
rename to src/recognizer.h
index 934e237e..e5a733d1 100644
--- a/src/kaldi_recognizer.h
+++ b/src/recognizer.h
@@ -33,19 +33,19 @@
 
 using namespace kaldi;
 
-enum KaldiRecognizerState {
+enum RecognizerState {
     RECOGNIZER_INITIALIZED,
     RECOGNIZER_RUNNING,
     RECOGNIZER_ENDPOINT,
     RECOGNIZER_FINALIZED
 };
 
-class KaldiRecognizer {
+class Recognizer {
     public:
-        KaldiRecognizer(Model *model, float sample_frequency);
-        KaldiRecognizer(Model *model, float sample_frequency, SpkModel *spk_model);
-        KaldiRecognizer(Model *model, float sample_frequency, char const *grammar);
-        ~KaldiRecognizer();
+        Recognizer(Model *model, float sample_frequency);
+        Recognizer(Model *model, float sample_frequency, SpkModel *spk_model);
+        Recognizer(Model *model, float sample_frequency, char const *grammar);
+        ~Recognizer();
         void SetMaxAlternatives(int max_alternatives);
         void SetSpkModel(SpkModel *spk_model);
         void SetWords(bool words);
@@ -101,7 +101,7 @@ class KaldiRecognizer {
         int64 samples_processed_;
         int64 samples_round_start_;
 
-        KaldiRecognizerState state_;
+        RecognizerState state_;
         string last_result_;
 };
 
diff --git a/src/spk_model.h b/src/spk_model.h
index 07cbd4b0..9a76c62a 100644
--- a/src/spk_model.h
+++ b/src/spk_model.h
@@ -22,7 +22,7 @@
 
 using namespace kaldi;
 
-class KaldiRecognizer;
+class Recognizer;
 
 class SpkModel {
 
@@ -32,7 +32,7 @@ class SpkModel {
     void Unref();
 
 protected:
-    friend class KaldiRecognizer;
+    friend class Recognizer;
     ~SpkModel() {};
 
     kaldi::nnet3::Nnet speaker_nnet;
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index ba76a73b..2c5b3b82 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -13,7 +13,9 @@
 // limitations under the License.
 
 #include "vosk_api.h"
-#include "kaldi_recognizer.h"
+
+#include "recognizer.h"
+#include "batch_recognizer.h"
 #include "model.h"
 #include "spk_model.h"
 
@@ -67,7 +69,7 @@ void vosk_spk_model_free(VoskSpkModel *model)
 VoskRecognizer *vosk_recognizer_new(VoskModel *model, float sample_rate)
 {
     try {
-        return (VoskRecognizer *)new KaldiRecognizer((Model *)model, sample_rate);
+        return (VoskRecognizer *)new Recognizer((Model *)model, sample_rate);
     } catch (...) {
         return nullptr;
     }
@@ -76,7 +78,7 @@ VoskRecognizer *vosk_recognizer_new(VoskModel *model, float sample_rate)
 VoskRecognizer *vosk_recognizer_new_spk(VoskModel *model, float sample_rate, VoskSpkModel *spk_model)
 {
     try {
-        return (VoskRecognizer *)new KaldiRecognizer((Model *)model, sample_rate, (SpkModel *)spk_model);
+        return (VoskRecognizer *)new Recognizer((Model *)model, sample_rate, (SpkModel *)spk_model);
     } catch (...) {
         return nullptr;
     }
@@ -85,7 +87,7 @@ VoskRecognizer *vosk_recognizer_new_spk(VoskModel *model, float sample_rate, Vos
 VoskRecognizer *vosk_recognizer_new_grm(VoskModel *model, float sample_rate, const char *grammar)
 {
     try {
-        return (VoskRecognizer *)new KaldiRecognizer((Model *)model, sample_rate, grammar);
+        return (VoskRecognizer *)new Recognizer((Model *)model, sample_rate, grammar);
     } catch (...) {
         return nullptr;
     }
@@ -93,12 +95,12 @@ VoskRecognizer *vosk_recognizer_new_grm(VoskModel *model, float sample_rate, con
 
 void vosk_recognizer_set_max_alternatives(VoskRecognizer *recognizer, int max_alternatives)
 {
-    ((KaldiRecognizer *)recognizer)->SetMaxAlternatives(max_alternatives);
+    ((Recognizer *)recognizer)->SetMaxAlternatives(max_alternatives);
 }
 
 void vosk_recognizer_set_words(VoskRecognizer *recognizer, int words)
 {
-    ((KaldiRecognizer *)recognizer)->SetWords((bool)words);
+    ((Recognizer *)recognizer)->SetWords((bool)words);
 }
 
 void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk_model)
@@ -106,13 +108,13 @@ void vosk_recognizer_set_spk_model(VoskRecognizer *recognizer, VoskSpkModel *spk
     if (recognizer == nullptr || spk_model == nullptr) {
        return;
     }
-    ((KaldiRecognizer *)recognizer)->SetSpkModel((SpkModel *)spk_model);
+    ((Recognizer *)recognizer)->SetSpkModel((SpkModel *)spk_model);
 }
 
 int vosk_recognizer_accept_waveform(VoskRecognizer *recognizer, const char *data, int length)
 {
     try {
-        return ((KaldiRecognizer *)(recognizer))->AcceptWaveform(data, length);
+        return ((Recognizer *)(recognizer))->AcceptWaveform(data, length);
     } catch (...) {
         return -1;
     }
@@ -121,7 +123,7 @@ int vosk_recognizer_accept_waveform(VoskRecognizer *recognizer, const char *data
 int vosk_recognizer_accept_waveform_s(VoskRecognizer *recognizer, const short *data, int length)
 {
     try {
-        return ((KaldiRecognizer *)(recognizer))->AcceptWaveform(data, length);
+        return ((Recognizer *)(recognizer))->AcceptWaveform(data, length);
     } catch (...) {
         return -1;
     }
@@ -130,7 +132,7 @@ int vosk_recognizer_accept_waveform_s(VoskRecognizer *recognizer, const short *d
 int vosk_recognizer_accept_waveform_f(VoskRecognizer *recognizer, const float *data, int length)
 {
     try {
-        return ((KaldiRecognizer *)(recognizer))->AcceptWaveform(data, length);
+        return ((Recognizer *)(recognizer))->AcceptWaveform(data, length);
     } catch (...) {
         return -1;
     }
@@ -138,27 +140,27 @@ int vosk_recognizer_accept_waveform_f(VoskRecognizer *recognizer, const float *d
 
 const char *vosk_recognizer_result(VoskRecognizer *recognizer)
 {
-    return ((KaldiRecognizer *)recognizer)->Result();
+    return ((Recognizer *)recognizer)->Result();
 }
 
 const char *vosk_recognizer_partial_result(VoskRecognizer *recognizer)
 {
-    return ((KaldiRecognizer *)recognizer)->PartialResult();
+    return ((Recognizer *)recognizer)->PartialResult();
 }
 
 const char *vosk_recognizer_final_result(VoskRecognizer *recognizer)
 {
-    return ((KaldiRecognizer *)recognizer)->FinalResult();
+    return ((Recognizer *)recognizer)->FinalResult();
 }
 
 void vosk_recognizer_reset(VoskRecognizer *recognizer)
 {
-    ((KaldiRecognizer *)recognizer)->Reset();
+    ((Recognizer *)recognizer)->Reset();
 }
 
 void vosk_recognizer_free(VoskRecognizer *recognizer)
 {
-    delete (KaldiRecognizer *)(recognizer);
+    delete (Recognizer *)(recognizer);
 }
 
 void vosk_set_log_level(int log_level)
@@ -180,3 +182,28 @@ void vosk_gpu_thread_init()
     kaldi::CuDevice::Instantiate();
 #endif
 }
+
+VoskBatchRecognizer *vosk_batch_recognizer_new(VoskModel *model, float sample_frequency)
+{
+    return (VoskBatchRecognizer *)(new BatchRecognizer((Model *)model, sample_frequency));
+}
+
+void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer)
+{
+    delete ((BatchRecognizer *)recognizer);
+}
+
+void vosk_batch_recognizer_accept_waveform(VoskRecognizer *recognizer, int id, const char *data, int length)
+{
+    ((BatchRecognizer *)recognizer)->AcceptWaveform(id, data, length);
+}
+
+void vosk_batch_recognizer_finish_stream(VoskRecognizer *recognizer, int id)
+{
+    ((BatchRecognizer *)recognizer)->FinishStream(id);
+}
+
+const char *vosk_batch_recognizer_results(VoskBatchRecognizer *recognizer)
+{
+    return ((BatchRecognizer *)recognizer)->PullResults();
+}
diff --git a/src/vosk_api.h b/src/vosk_api.h
index 7636caa6..df951858 100644
--- a/src/vosk_api.h
+++ b/src/vosk_api.h
@@ -39,6 +39,10 @@ typedef struct VoskSpkModel VoskSpkModel;
  *  speaker information and so on */
 typedef struct VoskRecognizer VoskRecognizer;
 
+/**
+ * Batch recognizer object
+ */
+typedef struct VoskBatchRecognizer VoskBatchRecognizer;
 
 /** Loads model data from the file and returns the model object
  *
@@ -285,6 +289,26 @@ void vosk_gpu_init();
  */
 void vosk_gpu_thread_init();
 
+/** Creates the batch recognizer object
+ *  The recognizers process the speech and return text using shared model data
+ *  @param model       VoskModel containing static data for recognizer. Model can be
+ *                     shared across recognizers, even running in different threads.
+ *  @returns recognizer object or NULL if a problem occurred */
+VoskBatchRecognizer *vosk_batch_recognizer_new(VoskModel *model, float sample_frequency);
+
+/** Releases batch recognizer object
+ *  Underlying model is also unreferenced and if needed released */
+void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer);
+
+/** Accept batch voice data */
+void vosk_batch_recognizer_accept_waveform(VoskRecognizer *recognizer, int id, const char *data, int length);
+
+/** Closes the stream */
+void vosk_batch_recognizer_finish_stream(VoskRecognizer *recognizer, int id);
+
+/** Return results */
+const char *vosk_batch_recognizer_results(VoskBatchRecognizer *recognizer);
+
 #ifdef __cplusplus
 }
 #endif

From 344e137a61f81887afc974027a93500c0c986436 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Mon, 13 Dec 2021 01:21:59 +0100
Subject: [PATCH 2/8] Decoding works, results are empty yet

---
 python/example/batch/test_batch.py | 13 ++++++++-----
 python/vosk/__init__.py            |  4 ++--
 src/Makefile                       |  8 ++++----
 src/batch_recognizer.cc            | 18 +++++++++++++++++-
 src/model.cc                       |  6 +++---
 src/vosk_api.cc                    |  4 ++--
 src/vosk_api.h                     |  4 ++--
 7 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/python/example/batch/test_batch.py b/python/example/batch/test_batch.py
index fb1bb7e9..3fadab6a 100755
--- a/python/example/batch/test_batch.py
+++ b/python/example/batch/test_batch.py
@@ -1,21 +1,24 @@
 #!/usr/bin/env python3
 
-from vosk import Model, BatchRecognizer
+from vosk import Model, BatchRecognizer, GpuInit, GpuThreadInit
 import sys
 import os
 import wave
 
+GpuInit()
+GpuThreadInit()
+
 model = Model("model")
 rec = BatchRecognizer(model, 16000.0)
 
 fnames = open("tedlium.list").readlines()
-fds = [open(x) for x in fnames]
+fds = [open(x.strip(), "rb") for x in fnames]
 ended = set()
 while True:
-    for i, fd in fds:
-        if i in ended():
+    for i, fd in enumerate(fds):
+        if i in ended:
             continue
-        data = fd.read(4000)
+        data = fd.read(16000)
         if len(data) == 0:
             rec.FinishStream(i)
             ended.add(i)
diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py
index 02e1df97..9e25229c 100644
--- a/python/vosk/__init__.py
+++ b/python/vosk/__init__.py
@@ -117,7 +117,7 @@ def AcceptWaveform(self, uid, data):
         res = _c.vosk_batch_recognizer_accept_waveform(self._handle, uid, data, len(data))
 
     def Results(self):
-        return _ffi.string(_c.vosk_batch_recognizer_result(self._handle)).decode('utf-8')
+        return _ffi.string(_c.vosk_batch_recognizer_results(self._handle)).decode('utf-8')
 
     def FinishStream(self, uid):
-        _c.vosk_recognizer_final_result(self._handle, uid)
+        _c.vosk_batch_recognizer_finish_stream(self._handle, uid)
diff --git a/src/Makefile b/src/Makefile
index 96c21949..823a4aaf 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -37,17 +37,19 @@ CFLAGS=-g -O3 -std=c++17 -Wno-deprecated-declarations -fPIC -DFST_NO_DYNAMIC_LIN
 	-I. -I$(KALDI_ROOT)/src -I$(OPENFST_ROOT)/include $(EXTRA_CFLAGS) 
 
 LIBS= \
+        $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \
+        $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \
 	$(KALDI_ROOT)/src/online2/kaldi-online2.a \
 	$(KALDI_ROOT)/src/decoder/kaldi-decoder.a \
 	$(KALDI_ROOT)/src/ivector/kaldi-ivector.a \
 	$(KALDI_ROOT)/src/gmm/kaldi-gmm.a \
-	$(KALDI_ROOT)/src/nnet3/kaldi-nnet3.a \
 	$(KALDI_ROOT)/src/tree/kaldi-tree.a \
 	$(KALDI_ROOT)/src/feat/kaldi-feat.a \
 	$(KALDI_ROOT)/src/lat/kaldi-lat.a \
 	$(KALDI_ROOT)/src/lm/kaldi-lm.a \
 	$(KALDI_ROOT)/src/rnnlm/kaldi-rnnlm.a \
 	$(KALDI_ROOT)/src/hmm/kaldi-hmm.a \
+	$(KALDI_ROOT)/src/nnet3/kaldi-nnet3.a \
 	$(KALDI_ROOT)/src/transform/kaldi-transform.a \
 	$(KALDI_ROOT)/src/cudamatrix/kaldi-cudamatrix.a \
 	$(KALDI_ROOT)/src/matrix/kaldi-matrix.a \
@@ -68,7 +70,7 @@ ifeq ($(HAVE_OPENBLAS_CLAPACK), 1)
 endif
 
 ifeq ($(HAVE_MKL), 1)
-    CFLAGS += -I$(MKL_ROOT)/include
+    CFLAGS += -DHAVE_MKL=1 -I$(MKL_ROOT)/include
     LIBS += -L$(MKL_ROOT)/lib/intel64 -Wl,-rpath=$(MKL_ROOT)/lib/intel64 -lmkl_rt -lmkl_intel_lp64 -lmkl_core -lmkl_sequential
 endif
 
@@ -79,8 +81,6 @@ endif
 ifeq ($(HAVE_CUDA), 1)
     CFLAGS+=-DHAVE_CUDA=1 -I$(CUDA_ROOT)/include
     LIBS+=\
-        $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \
-        $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \
         -L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt
 endif
 
diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index 112f1fe9..969a62aa 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -25,9 +25,22 @@ BatchRecognizer::BatchRecognizer(Model *model, float sample_frequency) : model_(
     model_->Ref();
 
     BatchedThreadedNnet3CudaOnlinePipelineConfig batched_decoder_config;
+    batched_decoder_config.num_worker_threads = 4;
+    batched_decoder_config.max_batch_size = 100;
+
+    batched_decoder_config.feature_opts.feature_type = "mfcc";
+    batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf";
+    batched_decoder_config.feature_opts.ivector_extraction_config = "model/conf/ivector.conf";
+    batched_decoder_config.decoder_opts.max_active = 7000;
+    batched_decoder_config.decoder_opts.default_beam = 13.0;
+    batched_decoder_config.decoder_opts.lattice_beam = 8.0;
+    batched_decoder_config.compute_opts.acoustic_scale = 1.0;
+    batched_decoder_config.compute_opts.frame_subsampling_factor = 3;
+    batched_decoder_config.compute_opts.frames_per_chunk = 312;
 
     cuda_pipeline_ = new BatchedThreadedNnet3CudaOnlinePipeline 
          (batched_decoder_config, *model_->hclg_fst_, *model_->nnet_, *model_->trans_model_);
+    cuda_pipeline_->SetSymbolTable(*model_->word_syms_);
 
     CudaOnlinePipelineDynamicBatcherConfig dynamic_batcher_config;
     dynamic_batcher_ = new CudaOnlinePipelineDynamicBatcher(dynamic_batcher_config,
@@ -60,6 +73,9 @@ void BatchRecognizer::InitRescoring()
 
 void BatchRecognizer::FinishStream(uint64_t id)
 {
+    Vector<BaseFloat> wave;
+    SubVector<BaseFloat> chunk(wave.Data(), 0);
+    dynamic_batcher_->Push(id, false, true, chunk);
     streams_.erase(id);
 }
 
@@ -77,7 +93,7 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
           [&, id](const std::string &str, bool partial,
                        bool endpoint_detected) {
               if (partial) {
-                  KALDI_LOG << "id #" << id << " [partial] : " << str;
+                  KALDI_LOG << "id #" << id << " [partial] : " << str << ":";
               }
 
               if (endpoint_detected) {
diff --git a/src/model.cc b/src/model.cc
index 8b5e12cc..eecaed97 100644
--- a/src/model.cc
+++ b/src/model.cc
@@ -241,9 +241,9 @@ void Model::ReadDataFiles()
         SetDropoutTestMode(true, &(nnet_->GetNnet()));
         nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(nnet_->GetNnet()));
     }
-    decodable_info_ = new nnet3::DecodableNnetSimpleLoopedInfo(decodable_opts_,
-                                                               nnet_);
 
+/*    decodable_info_ = new nnet3::DecodableNnetSimpleLoopedInfo(decodable_opts_,
+                                                               nnet_);
     if (stat(final_ie_rxfilename_.c_str(), &buffer) == 0) {
         KALDI_LOG << "Loading i-vector extractor from " << final_ie_rxfilename_;
 
@@ -261,7 +261,7 @@ void Model::ReadDataFiles()
     } else {
         feature_info_.use_ivectors = false;
     }
-
+*/
     if (stat(global_cmvn_stats_rxfilename_.c_str(), &buffer) == 0) {
         KALDI_LOG << "Reading CMVN stats from " << global_cmvn_stats_rxfilename_;
         feature_info_.use_cmvn = true;
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index 2c5b3b82..f95adc07 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -193,12 +193,12 @@ void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer)
     delete ((BatchRecognizer *)recognizer);
 }
 
-void vosk_batch_recognizer_accept_waveform(VoskRecognizer *recognizer, int id, const char *data, int length)
+void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, int id, const char *data, int length)
 {
     ((BatchRecognizer *)recognizer)->AcceptWaveform(id, data, length);
 }
 
-void vosk_batch_recognizer_finish_stream(VoskRecognizer *recognizer, int id)
+void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id)
 {
     ((BatchRecognizer *)recognizer)->FinishStream(id);
 }
diff --git a/src/vosk_api.h b/src/vosk_api.h
index df951858..e085afe7 100644
--- a/src/vosk_api.h
+++ b/src/vosk_api.h
@@ -301,10 +301,10 @@ VoskBatchRecognizer *vosk_batch_recognizer_new(VoskModel *model, float sample_fr
 void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer);
 
 /** Accept batch voice data */
-void vosk_batch_recognizer_accept_waveform(VoskRecognizer *recognizer, int id, const char *data, int length);
+void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, int id, const char *data, int length);
 
 /** Closes the stream */
-void vosk_batch_recognizer_finish_stream(VoskRecognizer *recognizer, int id);
+void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id);
 
 /** Return results */
 const char *vosk_batch_recognizer_results(VoskBatchRecognizer *recognizer);

From 60f0396fe0647d57b73ff59e51f09bba69c54ad5 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Fri, 17 Dec 2021 01:13:09 +0100
Subject: [PATCH 3/8] Reset lattice on endpoint

---
 python/example/batch/test_batch.py |  5 +-
 python/vosk/__init__.py            |  2 +-
 src/batch_recognizer.cc            | 95 +++++++++++++++++++++++-------
 src/batch_recognizer.h             | 15 ++++-
 src/model.cc                       |  4 +-
 src/vosk_api.cc                    |  4 +-
 src/vosk_api.h                     |  6 +-
 7 files changed, 94 insertions(+), 37 deletions(-)

diff --git a/python/example/batch/test_batch.py b/python/example/batch/test_batch.py
index 3fadab6a..f93eb6ea 100755
--- a/python/example/batch/test_batch.py
+++ b/python/example/batch/test_batch.py
@@ -8,8 +8,7 @@
 GpuInit()
 GpuThreadInit()
 
-model = Model("model")
-rec = BatchRecognizer(model, 16000.0)
+rec = BatchRecognizer()
 
 fnames = open("tedlium.list").readlines()
 fds = [open(x.strip(), "rb") for x in fnames]
@@ -18,7 +17,7 @@
     for i, fd in enumerate(fds):
         if i in ended:
             continue
-        data = fd.read(16000)
+        data = fd.read(8000)
         if len(data) == 0:
             rec.FinishStream(i)
             ended.add(i)
diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py
index 9e25229c..964a0ac2 100644
--- a/python/vosk/__init__.py
+++ b/python/vosk/__init__.py
@@ -105,7 +105,7 @@ def GpuThreadInit():
 class BatchRecognizer(object):
 
     def __init__(self, *args):
-        self._handle = _c.vosk_batch_recognizer_new(args[0]._handle, args[1])
+        self._handle = _c.vosk_batch_recognizer_new()
 
         if self._handle == _ffi.NULL:
             raise Exception("Failed to create a recognizer")
diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index 969a62aa..184fb8a2 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -17,16 +17,22 @@
 #include "fstext/fstext-utils.h"
 #include "lat/sausages.h"
 
+#include <sys/stat.h>
+
 using namespace fst;
 using namespace kaldi::nnet3;
 using CorrelationID = CudaOnlinePipelineDynamicBatcher::CorrelationID;
 
-BatchRecognizer::BatchRecognizer(Model *model, float sample_frequency) : model_(model), sample_frequency_(sample_frequency) {
-    model_->Ref();
-
+BatchRecognizer::BatchRecognizer() {
     BatchedThreadedNnet3CudaOnlinePipelineConfig batched_decoder_config;
+
+    kaldi::ParseOptions po("something");
+    batched_decoder_config.Register(&po);
+    po.ReadConfigFile("model/conf/model.conf");
+
     batched_decoder_config.num_worker_threads = 4;
     batched_decoder_config.max_batch_size = 100;
+    batched_decoder_config.reset_on_endpoint = true;
 
     batched_decoder_config.feature_opts.feature_type = "mfcc";
     batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf";
@@ -38,37 +44,78 @@ BatchRecognizer::BatchRecognizer(Model *model, float sample_frequency) : model_(
     batched_decoder_config.compute_opts.frame_subsampling_factor = 3;
     batched_decoder_config.compute_opts.frames_per_chunk = 312;
 
+    struct stat buffer;
+
+    string nnet3_rxfilename_ = "model/am/final.mdl";
+    string hclg_fst_rxfilename_ = "model/graph/HCLG.fst";
+    string word_syms_rxfilename_ = "model/graph/words.txt";
+    string winfo_rxfilename_ = "model/graph/phones/word_boundary.int";
+    string std_fst_rxfilename_ = "model/rescore/G.fst";
+    string carpa_rxfilename_ = "model/rescore/G.carpa";
+
+    trans_model_ = new kaldi::TransitionModel();
+    nnet_ = new kaldi::nnet3::AmNnetSimple();
+    {
+        bool binary;
+        kaldi::Input ki(nnet3_rxfilename_, &binary);
+        trans_model_->Read(ki.Stream(), binary);
+        nnet_->Read(ki.Stream(), binary);
+        SetBatchnormTestMode(true, &(nnet_->GetNnet()));
+        SetDropoutTestMode(true, &(nnet_->GetNnet()));
+        nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(nnet_->GetNnet()));
+    }
+
+    if (stat(hclg_fst_rxfilename_.c_str(), &buffer) == 0) {
+        KALDI_LOG << "Loading HCLG from " << hclg_fst_rxfilename_;
+        hclg_fst_ = fst::ReadFstKaldiGeneric(hclg_fst_rxfilename_);
+    }
+
+    KALDI_LOG << "Loading words from " << word_syms_rxfilename_;
+    if (!(word_syms_ = fst::SymbolTable::ReadText(word_syms_rxfilename_))) {
+        KALDI_ERR << "Could not read symbol table from file "
+                  << word_syms_rxfilename_;
+    }
+    KALDI_ASSERT(word_syms_);
+
+    if (stat(winfo_rxfilename_.c_str(), &buffer) == 0) {
+        KALDI_LOG << "Loading winfo " << winfo_rxfilename_;
+        kaldi::WordBoundaryInfoNewOpts opts;
+        winfo_ = new kaldi::WordBoundaryInfo(opts, winfo_rxfilename_);
+    }
+
+    if (stat(carpa_rxfilename_.c_str(), &buffer) == 0) {
+        KALDI_LOG << "Loading subtract G.fst model from " << std_fst_rxfilename_;
+        graph_lm_fst_ = fst::ReadAndPrepareLmFst(std_fst_rxfilename_);
+        KALDI_LOG << "Loading CARPA model from " << carpa_rxfilename_;
+        ReadKaldiObject(carpa_rxfilename_, &const_arpa_);
+    }
+
+
+
     cuda_pipeline_ = new BatchedThreadedNnet3CudaOnlinePipeline 
-         (batched_decoder_config, *model_->hclg_fst_, *model_->nnet_, *model_->trans_model_);
-    cuda_pipeline_->SetSymbolTable(*model_->word_syms_);
+         (batched_decoder_config, *hclg_fst_, *nnet_, *trans_model_);
+    cuda_pipeline_->SetSymbolTable(*word_syms_);
 
     CudaOnlinePipelineDynamicBatcherConfig dynamic_batcher_config;
     dynamic_batcher_ = new CudaOnlinePipelineDynamicBatcher(dynamic_batcher_config,
                                                             *cuda_pipeline_);
-
-    InitRescoring();
 }
 
 BatchRecognizer::~BatchRecognizer() {
+
+    delete trans_model_;
+    delete nnet_;
+    delete word_syms_;
+    delete winfo_;
+    delete hclg_fst_;
+    delete graph_lm_fst_;
+
     delete lm_to_subtract_;
     delete carpa_to_add_;
     delete carpa_to_add_scale_;
 
     delete cuda_pipeline_;
     delete dynamic_batcher_;
-
-    model_->Unref();
-}
-
-void BatchRecognizer::InitRescoring()
-{
-    if (model_->graph_lm_fst_) {
-        fst::CacheOptions cache_opts(true, -1);
-        fst::ArcMapFstOptions mapfst_opts(cache_opts);
-        fst::StdToLatticeMapper<BaseFloat> mapper;
-        lm_to_subtract_ = new fst::ArcMapFst<fst::StdArc, LatticeArc, fst::StdToLatticeMapper<BaseFloat> >(*model_->graph_lm_fst_, mapper, mapfst_opts);
-        carpa_to_add_ = new ConstArpaLmDeterministicFst(model_->const_arpa_);
-    }
 }
 
 void BatchRecognizer::FinishStream(uint64_t id)
@@ -104,13 +151,18 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
                   KALDI_LOG << "id #" << id << " : " << str;
               }
             });
+        cuda_pipeline_->SetLatticeCallback(
+          id,
+          [&, id](CompactLattice &clat) {
+              KALDI_LOG << "Got lattice from the stream " << id;
+          });
     }
 
     Vector<BaseFloat> wave;
     wave.Resize(len / 2, kUndefined);
     for (int i = 0; i < len / 2; i++)
         wave(i) = *(((short *)data) + i);
-    SubVector<BaseFloat> chunk(wave.Data(), 0);
+    SubVector<BaseFloat> chunk(wave.Data(), wave.Dim());
 
     dynamic_batcher_->Push(id, first, false, chunk);
 }
@@ -118,6 +170,5 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
 const char* BatchRecognizer::PullResults()
 {
     dynamic_batcher_->WaitForCompletion();
-    cudaDeviceSynchronize();
     return "";
 }
diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h
index 00f4a0db..c8045d53 100644
--- a/src/batch_recognizer.h
+++ b/src/batch_recognizer.h
@@ -40,7 +40,7 @@ using namespace kaldi::cuda_decoder;
 
 class BatchRecognizer {
     public:
-        BatchRecognizer(Model *model, float sample_frequency);
+        BatchRecognizer();
         ~BatchRecognizer();
 
         void FinishStream(uint64_t id);
@@ -48,12 +48,21 @@ class BatchRecognizer {
         const char* PullResults();
 
     private:
-        void InitRescoring();
 
-        Model *model_ = nullptr;
+        kaldi::TransitionModel *trans_model_ = nullptr;
+        kaldi::nnet3::AmNnetSimple *nnet_ = nullptr;
+        const fst::SymbolTable *word_syms_ = nullptr;
+
+        fst::Fst<fst::StdArc> *hclg_fst_ = nullptr;
+        kaldi::WordBoundaryInfo *winfo_ = nullptr;
+
+        fst::VectorFst<fst::StdArc> *graph_lm_fst_ = nullptr;
+        kaldi::ConstArpaLm const_arpa_;
+
         BatchedThreadedNnet3CudaOnlinePipeline *cuda_pipeline_ = nullptr;
         CudaOnlinePipelineDynamicBatcher *dynamic_batcher_ = nullptr;
 
+
         std::set<int> streams_;
 
         // Rescoring
diff --git a/src/model.cc b/src/model.cc
index eecaed97..c83d07a8 100644
--- a/src/model.cc
+++ b/src/model.cc
@@ -242,7 +242,7 @@ void Model::ReadDataFiles()
         nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(nnet_->GetNnet()));
     }
 
-/*    decodable_info_ = new nnet3::DecodableNnetSimpleLoopedInfo(decodable_opts_,
+    decodable_info_ = new nnet3::DecodableNnetSimpleLoopedInfo(decodable_opts_,
                                                                nnet_);
     if (stat(final_ie_rxfilename_.c_str(), &buffer) == 0) {
         KALDI_LOG << "Loading i-vector extractor from " << final_ie_rxfilename_;
@@ -261,7 +261,7 @@ void Model::ReadDataFiles()
     } else {
         feature_info_.use_ivectors = false;
     }
-*/
+
     if (stat(global_cmvn_stats_rxfilename_.c_str(), &buffer) == 0) {
         KALDI_LOG << "Reading CMVN stats from " << global_cmvn_stats_rxfilename_;
         feature_info_.use_cmvn = true;
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index f95adc07..a53dbf87 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -183,9 +183,9 @@ void vosk_gpu_thread_init()
 #endif
 }
 
-VoskBatchRecognizer *vosk_batch_recognizer_new(VoskModel *model, float sample_frequency)
+VoskBatchRecognizer *vosk_batch_recognizer_new()
 {
-    return (VoskBatchRecognizer *)(new BatchRecognizer((Model *)model, sample_frequency));
+    return (VoskBatchRecognizer *)(new BatchRecognizer());
 }
 
 void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer)
diff --git a/src/vosk_api.h b/src/vosk_api.h
index e085afe7..c5b92f1c 100644
--- a/src/vosk_api.h
+++ b/src/vosk_api.h
@@ -290,11 +290,9 @@ void vosk_gpu_init();
 void vosk_gpu_thread_init();
 
 /** Creates the batch recognizer object
- *  The recognizers process the speech and return text using shared model data
- *  @param model       VoskModel containing static data for recognizer. Model can be
- *                     shared across recognizers, even running in different threads.
+ *
  *  @returns recognizer object or NULL if problem occured */
-VoskBatchRecognizer *vosk_batch_recognizer_new(VoskModel *model, float sample_frequency);
+VoskBatchRecognizer *vosk_batch_recognizer_new();
 
 /** Releases batch recognizer object
  *  Underlying model is also unreferenced and if needed released */

From 848b2dc753a823c2a3f1ca6e2bb4fd4f1d7eab31 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Fri, 17 Dec 2021 22:22:30 +0100
Subject: [PATCH 4/8] Expose results in Python

---
 python/example/batch/asr_server_gpu.py | 85 +++++++++++++++++++++++++
 python/example/batch/test_batch.py     | 24 +++++--
 python/vosk/__init__.py                | 10 ++-
 src/batch_recognizer.cc                | 87 +++++++++++++++++++++++---
 src/batch_recognizer.h                 |  6 +-
 src/json.h                             |  8 +--
 src/vosk_api.cc                        | 14 ++++-
 src/vosk_api.h                         |  8 ++-
 8 files changed, 217 insertions(+), 25 deletions(-)
 create mode 100755 python/example/batch/asr_server_gpu.py

diff --git a/python/example/batch/asr_server_gpu.py b/python/example/batch/asr_server_gpu.py
new file mode 100755
index 00000000..f58587c9
--- /dev/null
+++ b/python/example/batch/asr_server_gpu.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+
+import json
+import os
+import sys
+import asyncio
+import pathlib
+import websockets
+import logging
+
+from vosk import BatchRecognizer, GpuInit
+
+
+async def recognize(websocket, path):
+    global args
+    global loop
+    global pool
+    global rec
+    global client_cnt
+
+    uid = client_cnt
+    client_cnt += 1
+
+    logging.info('Connection %d from %s', uid, websocket.remote_address);
+
+    while True:
+
+        message = await websocket.recv()
+
+        if message == '{"eof" : 1}':
+            rec.FinishStream(uid)
+            break
+
+        if isinstance(message, str) and 'config' in message:
+            continue
+
+        rec.AcceptWaveform(uid, message)
+        await asyncio.sleep(len(message) / 16000.0 / 2)
+        res = rec.Result(uid)
+        if len(res) == 0:
+            await websocket.send('{ "partial" : "" }')
+        else:
+            await websocket.send(res)
+
+    rec.Wait()
+    res = rec.Result(uid)
+    await websocket.send(res)
+
+def start():
+
+    global rec
+    global args
+    global loop
+    global client_cnt
+
+    # Enable logging if needed
+    #
+    # logger = logging.getLogger('websockets')
+    # logger.setLevel(logging.INFO)
+    # logger.addHandler(logging.StreamHandler())
+    logging.basicConfig(level=logging.INFO)
+
+    args = type('', (), {})()
+
+    args.interface = os.environ.get('VOSK_SERVER_INTERFACE', '0.0.0.0')
+    args.port = int(os.environ.get('VOSK_SERVER_PORT', 2700))
+
+    GpuInit()
+
+    rec = BatchRecognizer()
+
+    client_cnt = 0
+
+    loop = asyncio.get_event_loop()
+
+    start_server = websockets.serve(
+        recognize, args.interface, args.port)
+
+    logging.info("Listening on %s:%d", args.interface, args.port)
+    loop.run_until_complete(start_server)
+    loop.run_forever()
+
+
+if __name__ == '__main__':
+    start()
diff --git a/python/example/batch/test_batch.py b/python/example/batch/test_batch.py
index f93eb6ea..32aa021e 100755
--- a/python/example/batch/test_batch.py
+++ b/python/example/batch/test_batch.py
@@ -1,12 +1,13 @@
 #!/usr/bin/env python3
 
-from vosk import Model, BatchRecognizer, GpuInit, GpuThreadInit
 import sys
 import os
 import wave
+from time import sleep
+
+from vosk import Model, BatchRecognizer, GpuInit
 
 GpuInit()
-GpuThreadInit()
 
 rec = BatchRecognizer()
 
@@ -14,6 +15,7 @@
 fds = [open(x.strip(), "rb") for x in fnames]
 ended = set()
 while True:
+
     for i, fd in enumerate(fds):
         if i in ended:
             continue
@@ -21,8 +23,20 @@
         if len(data) == 0:
             rec.FinishStream(i)
             ended.add(i)
-        else:
-            rec.AcceptWaveform(i, data)
-    rec.Results()
+            continue
+        rec.AcceptWaveform(i, data)
+
+    sleep(0.3)
+    for i, fd in enumerate(fds):
+       res = rec.Result(i)
+       print (i, res)
+
     if len(ended) == len(fds):
         break
+
+sleep(20)
+print ("Done")
+for i, fd in enumerate(fds):
+   res = rec.Result(i)
+   print (i, res)
+rec.Wait()
diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py
index 964a0ac2..c83a7e34 100644
--- a/python/vosk/__init__.py
+++ b/python/vosk/__init__.py
@@ -116,8 +116,14 @@ def __del__(self):
     def AcceptWaveform(self, uid, data):
         res = _c.vosk_batch_recognizer_accept_waveform(self._handle, uid, data, len(data))
 
-    def Results(self):
-        return _ffi.string(_c.vosk_batch_recognizer_results(self._handle)).decode('utf-8')
+    def Result(self, uid):
+        ptr = _c.vosk_batch_recognizer_front_result(self._handle, uid)
+        res = _ffi.string(ptr).decode('utf-8')
+        _c.vosk_batch_recognizer_pop(self._handle, uid)
+        return res
 
     def FinishStream(self, uid):
         _c.vosk_batch_recognizer_finish_stream(self._handle, uid)
+
+    def Wait(self):
+        _c.vosk_batch_recognizer_wait(self._handle)
diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index 184fb8a2..1773fc0e 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -16,6 +16,7 @@
 
 #include "fstext/fstext-utils.h"
 #include "lat/sausages.h"
+#include "json.h"
 
 #include <sys/stat.h>
 
@@ -37,12 +38,12 @@ BatchRecognizer::BatchRecognizer() {
     batched_decoder_config.feature_opts.feature_type = "mfcc";
     batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf";
     batched_decoder_config.feature_opts.ivector_extraction_config = "model/conf/ivector.conf";
-    batched_decoder_config.decoder_opts.max_active = 7000;
-    batched_decoder_config.decoder_opts.default_beam = 13.0;
-    batched_decoder_config.decoder_opts.lattice_beam = 8.0;
+    batched_decoder_config.decoder_opts.max_active = 5000;
+    batched_decoder_config.decoder_opts.default_beam = 10.0;
+    batched_decoder_config.decoder_opts.lattice_beam = 4.0;
     batched_decoder_config.compute_opts.acoustic_scale = 1.0;
     batched_decoder_config.compute_opts.frame_subsampling_factor = 3;
-    batched_decoder_config.compute_opts.frames_per_chunk = 312;
+    batched_decoder_config.compute_opts.frames_per_chunk = 51;
 
     struct stat buffer;
 
@@ -126,6 +127,47 @@ void BatchRecognizer::FinishStream(uint64_t id)
     streams_.erase(id);
 }
 
+
+void BatchRecognizer::PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset)
+{
+    fst::ScaleLattice(fst::GraphLatticeScale(0.9), &clat);
+
+    CompactLattice aligned_lat;
+    WordAlignLattice(clat, *trans_model_, *winfo_, 0, &aligned_lat);
+
+    MinimumBayesRisk mbr(aligned_lat);
+    const vector<BaseFloat> &conf = mbr.GetOneBestConfidences();
+    const vector<int32> &words = mbr.GetOneBest();
+    const vector<pair<BaseFloat, BaseFloat> > &times =
+          mbr.GetOneBestTimes();
+
+    int size = words.size();
+
+    json::JSON obj;
+    stringstream text;
+
+    // Create JSON object
+    for (int i = 0; i < size; i++) {
+        json::JSON word;
+
+        word["word"] = word_syms_->Find(words[i]);
+        word["start"] = times[i].first * 0.03 + offset;
+        word["end"] = times[i].second * 0.03 + offset;
+        word["conf"] = conf[i];
+        obj["result"].append(word);
+
+        if (i) {
+            text << " ";
+        }
+        text << word_syms_->Find(words[i]);
+    }
+    obj["text"] = text.str();
+
+//    KALDI_LOG << "Result " << id << " " << obj.dump();
+
+    results_[id].push(obj.dump());
+}
+
 void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
 {
     bool first = false;
@@ -135,7 +177,8 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
         streams_.insert(id);
 
         // Define the callback for results.
-        cuda_pipeline_->SetBestPathCallback(
+#if 0
+         cuda_pipeline_->SetBestPathCallback(
           id,
           [&, id](const std::string &str, bool partial,
                        bool endpoint_detected) {
@@ -151,11 +194,19 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
                   KALDI_LOG << "id #" << id << " : " << str;
               }
             });
+#endif
         cuda_pipeline_->SetLatticeCallback(
           id,
-          [&, id](CompactLattice &clat) {
-              KALDI_LOG << "Got lattice from the stream " << id;
-          });
+          [&, id](SegmentedLatticeCallbackParams& params) {
+              if (params.results.empty()) {
+                  KALDI_WARN << "Empty result for callback";
+                  return;
+              }
+              CompactLattice *clat = params.results[0].GetLatticeResult();
+              BaseFloat offset = params.results[0].GetTimeOffsetSeconds();
+              PushLattice(id, *clat, offset);
+          },
+          CudaPipelineResult::RESULT_TYPE_LATTICE);
     }
 
     Vector<BaseFloat> wave;
@@ -167,8 +218,24 @@ void BatchRecognizer::AcceptWaveform(uint64_t id, const char *data, int len)
     dynamic_batcher_->Push(id, first, false, chunk);
 }
 
-const char* BatchRecognizer::PullResults()
+const char* BatchRecognizer::FrontResult(uint64_t id)
+{
+    if (results_[id].empty()) {
+        return "";
+    }
+    return results_[id].front().c_str();
+}
+
+void BatchRecognizer::Pop(uint64_t id)
+{
+    if (results_[id].empty()) {
+        return;
+    }
+    results_[id].pop();
+}
+
+void BatchRecognizer::WaitForCompletion()
 {
     dynamic_batcher_->WaitForCompletion();
-    return "";
 }
+
diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h
index c8045d53..0082a364 100644
--- a/src/batch_recognizer.h
+++ b/src/batch_recognizer.h
@@ -45,9 +45,12 @@ class BatchRecognizer {
 
         void FinishStream(uint64_t id);
         void AcceptWaveform(uint64_t id, const char *data, int len);
-        const char* PullResults();
+        const char *FrontResult(uint64_t id);
+        void Pop(uint64_t id);
+        void WaitForCompletion();
 
     private:
+        void PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset);
 
         kaldi::TransitionModel *trans_model_ = nullptr;
         kaldi::nnet3::AmNnetSimple *nnet_ = nullptr;
@@ -64,6 +67,7 @@ class BatchRecognizer {
 
 
         std::set<int> streams_;
+        std::map<int, std::queue<std::string> > results_;
 
         // Rescoring
         fst::ArcMapFst<fst::StdArc, LatticeArc, fst::StdToLatticeMapper<BaseFloat> > *lm_to_subtract_ = nullptr;
diff --git a/src/json.h b/src/json.h
index 463912ec..2159392b 100644
--- a/src/json.h
+++ b/src/json.h
@@ -424,7 +424,7 @@ class JSON
         Class Type = Class::Null;
 };
 
-JSON Array() {
+inline JSON Array() {
     return JSON::Make( JSON::Class::Array );
 }
 
@@ -435,11 +435,11 @@ JSON Array( T... args ) {
     return arr;
 }
 
-JSON Object() {
+inline JSON Object() {
     return JSON::Make( JSON::Class::Object );
 }
 
-std::ostream& operator<<( std::ostream &os, const JSON &json ) {
+inline std::ostream& operator<<( std::ostream &os, const JSON &json ) {
     os << json.dump();
     return os;
 }
@@ -647,7 +647,7 @@ namespace {
     }
 }
 
-JSON JSON::Load( const string &str ) {
+inline JSON JSON::Load( const string &str ) {
     size_t offset = 0;
     return parse_next( str, offset );
 }
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index a53dbf87..b2a7a6a4 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -203,7 +203,17 @@ void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id
     ((BatchRecognizer *)recognizer)->FinishStream(id);
 }
 
-const char *vosk_batch_recognizer_results(VoskBatchRecognizer *recognizer)
+const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer, int id)
 {
-    return ((BatchRecognizer *)recognizer)->PullResults();
+    return ((BatchRecognizer *)recognizer)->FrontResult(id);
+}
+
+void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id)
+{
+    return ((BatchRecognizer *)recognizer)->Pop(id);
+}
+
+void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer)
+{
+    ((BatchRecognizer *)recognizer)->WaitForCompletion();
 }
diff --git a/src/vosk_api.h b/src/vosk_api.h
index c5b92f1c..7177009c 100644
--- a/src/vosk_api.h
+++ b/src/vosk_api.h
@@ -305,7 +305,13 @@ void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, int
 void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id);
 
 /** Return results */
-const char *vosk_batch_recognizer_results(VoskBatchRecognizer *recognizer);
+const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer, int id);
+
+/** Release and free first retrieved result */
+void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id);
+
+/** Wait for the processing */
+void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer);
 
 #ifdef __cplusplus
 }

From cb0f8e64110ad502f8660a2e3066e490bedfcddc Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Thu, 23 Dec 2021 22:34:47 +0100
Subject: [PATCH 5/8] Per-stream wait API

---
 python/example/batch/asr_server_gpu.py |  9 ++++++--
 python/example/batch/test_batch.py     | 32 +++++++++++++++++++-------
 python/vosk/__init__.py                |  3 +++
 src/batch_recognizer.cc                | 10 +++++---
 src/batch_recognizer.h                 |  1 +
 src/vosk_api.cc                        |  7 +++++-
 src/vosk_api.h                         |  3 +++
 7 files changed, 51 insertions(+), 14 deletions(-)

diff --git a/python/example/batch/asr_server_gpu.py b/python/example/batch/asr_server_gpu.py
index f58587c9..11885e9f 100755
--- a/python/example/batch/asr_server_gpu.py
+++ b/python/example/batch/asr_server_gpu.py
@@ -35,14 +35,19 @@ async def recognize(websocket, path):
             continue
 
         rec.AcceptWaveform(uid, message)
-        await asyncio.sleep(len(message) / 16000.0 / 2)
+
+        while rec.GetPendingChunks(uid) > 0:
+            await asyncio.sleep(0.1)
+
         res = rec.Result(uid)
         if len(res) == 0:
             await websocket.send('{ "partial" : "" }')
         else:
             await websocket.send(res)
 
-    rec.Wait()
+    while rec.GetPendingChunks(uid) > 0:
+        await asyncio.sleep(0.1)
+
     res = rec.Result(uid)
     await websocket.send(res)
 
diff --git a/python/example/batch/test_batch.py b/python/example/batch/test_batch.py
index 32aa021e..8737a746 100755
--- a/python/example/batch/test_batch.py
+++ b/python/example/batch/test_batch.py
@@ -4,6 +4,9 @@
 import os
 import wave
 from time import sleep
+import json
+from timeit import default_timer as timer
+
 
 from vosk import Model, BatchRecognizer, GpuInit
 
@@ -13,9 +16,16 @@
 
 fnames = open("tedlium.list").readlines()
 fds = [open(x.strip(), "rb") for x in fnames]
+uids = [fname.strip().split('/')[-1][:-4] for fname in fnames]
+results = [""] * len(fnames)
 ended = set()
+tot_samples = 0
+
+start_time = timer()
+
 while True:
 
+    # Feed in the data
     for i, fd in enumerate(fds):
         if i in ended:
             continue
@@ -25,18 +35,24 @@
             ended.add(i)
             continue
         rec.AcceptWaveform(i, data)
+        tot_samples += len(data)
 
-    sleep(0.3)
+    # Wait for results from CUDA
+    rec.Wait()
+
+    # Retrieve and add results
     for i, fd in enumerate(fds):
        res = rec.Result(i)
-       print (i, res)
+       if len(res) != 0:
+           results[i] = results[i] + " " + json.loads(res)['text']
 
     if len(ended) == len(fds):
         break
 
-sleep(20)
-print ("Done")
-for i, fd in enumerate(fds):
-   res = rec.Result(i)
-   print (i, res)
-rec.Wait()
+end_time = timer()
+
+for i in range(len(results)):
+    print (uids[i], results[i].strip())
+
+print ("Processed %d seconds of audio in %d seconds (%f xRT)" % (tot_samples / 16000.0 / 2, end_time - start_time, 
+    (tot_samples / 16000.0 / 2 / (end_time - start_time))), file=sys.stderr)
diff --git a/python/vosk/__init__.py b/python/vosk/__init__.py
index c83a7e34..0e60c2ba 100644
--- a/python/vosk/__init__.py
+++ b/python/vosk/__init__.py
@@ -127,3 +127,6 @@ def FinishStream(self, uid):
 
     def Wait(self):
         _c.vosk_batch_recognizer_wait(self._handle)
+
+    def GetPendingChunks(self, uid):
+        return _c.vosk_batch_recognizer_get_pending_chunks(self._handle, uid)
diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index 1773fc0e..972e31dc 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -38,9 +38,9 @@ BatchRecognizer::BatchRecognizer() {
     batched_decoder_config.feature_opts.feature_type = "mfcc";
     batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf";
     batched_decoder_config.feature_opts.ivector_extraction_config = "model/conf/ivector.conf";
-    batched_decoder_config.decoder_opts.max_active = 5000;
-    batched_decoder_config.decoder_opts.default_beam = 10.0;
-    batched_decoder_config.decoder_opts.lattice_beam = 4.0;
+    batched_decoder_config.decoder_opts.max_active = 7000;
+    batched_decoder_config.decoder_opts.default_beam = 13.0;
+    batched_decoder_config.decoder_opts.lattice_beam = 6.0;
     batched_decoder_config.compute_opts.acoustic_scale = 1.0;
     batched_decoder_config.compute_opts.frame_subsampling_factor = 3;
     batched_decoder_config.compute_opts.frames_per_chunk = 51;
@@ -239,3 +239,7 @@ void BatchRecognizer::WaitForCompletion()
     dynamic_batcher_->WaitForCompletion();
 }
 
+int BatchRecognizer::GetPendingChunks(uint64_t id)
+{
+    return dynamic_batcher_->GetPendingChunks(id);
+}
diff --git a/src/batch_recognizer.h b/src/batch_recognizer.h
index 0082a364..f26dd54b 100644
--- a/src/batch_recognizer.h
+++ b/src/batch_recognizer.h
@@ -48,6 +48,7 @@ class BatchRecognizer {
         const char *FrontResult(uint64_t id);
         void Pop(uint64_t id);
         void WaitForCompletion();
+        int GetPendingChunks(uint64_t id);
 
     private:
         void PushLattice(uint64_t id, CompactLattice &clat, BaseFloat offset);
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index b2a7a6a4..1f77eb6c 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -210,10 +210,15 @@ const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer,
 
 void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id)
 {
-    return ((BatchRecognizer *)recognizer)->Pop(id);
+    ((BatchRecognizer *)recognizer)->Pop(id);
 }
 
 void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer)
 {
     ((BatchRecognizer *)recognizer)->WaitForCompletion();
 }
+
+int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer, int id)
+{
+    return ((BatchRecognizer *)recognizer)->GetPendingChunks(id);
+}
diff --git a/src/vosk_api.h b/src/vosk_api.h
index 7177009c..f6a981cb 100644
--- a/src/vosk_api.h
+++ b/src/vosk_api.h
@@ -313,6 +313,9 @@ void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id);
 /** Wait for the processing */
 void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer);
 
+/** Get the number of pending chunks for the given id, to allow more intelligent waiting */
+int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer, int id);
+
 #ifdef __cplusplus
 }
 #endif

From 93e81c3bc8ed3960754b4eb6962b6dcc1fa26541 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Fri, 24 Dec 2021 00:22:42 +0100
Subject: [PATCH 6/8] Bigger frames per chunk for our big models

---
 src/batch_recognizer.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index 972e31dc..78cfc6f2 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -43,7 +43,7 @@ BatchRecognizer::BatchRecognizer() {
     batched_decoder_config.decoder_opts.lattice_beam = 6.0;
     batched_decoder_config.compute_opts.acoustic_scale = 1.0;
     batched_decoder_config.compute_opts.frame_subsampling_factor = 3;
-    batched_decoder_config.compute_opts.frames_per_chunk = 51;
+    batched_decoder_config.compute_opts.frames_per_chunk = 180;
 
     struct stat buffer;
 

From 72bf210164ed6f347abce642025751f285b8284c Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Fri, 24 Dec 2021 01:07:38 +0100
Subject: [PATCH 7/8] Put the demo into main folder

---
 python/example/batch/asr_server_gpu.py        | 90 -------------------
 .../test_batch.py => test_gpu_batch.py}       |  2 +-
 src/batch_recognizer.cc                       |  5 +-
 src/vosk_api.cc                               |  2 +
 4 files changed, 6 insertions(+), 93 deletions(-)
 delete mode 100755 python/example/batch/asr_server_gpu.py
 rename python/example/{batch/test_batch.py => test_gpu_batch.py} (97%)

diff --git a/python/example/batch/asr_server_gpu.py b/python/example/batch/asr_server_gpu.py
deleted file mode 100755
index 11885e9f..00000000
--- a/python/example/batch/asr_server_gpu.py
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/usr/bin/env python3
-
-import json
-import os
-import sys
-import asyncio
-import pathlib
-import websockets
-import logging
-
-from vosk import BatchRecognizer, GpuInit
-
-
-async def recognize(websocket, path):
-    global args
-    global loop
-    global pool
-    global rec
-    global client_cnt
-
-    uid = client_cnt
-    client_cnt += 1
-
-    logging.info('Connection %d from %s', uid, websocket.remote_address);
-
-    while True:
-
-        message = await websocket.recv()
-
-        if message == '{"eof" : 1}':
-            rec.FinishStream(uid)
-            break
-
-        if isinstance(message, str) and 'config' in message:
-            continue
-
-        rec.AcceptWaveform(uid, message)
-
-        while rec.GetPendingChunks(uid) > 0:
-            await asyncio.sleep(0.1)
-
-        res = rec.Result(uid)
-        if len(res) == 0:
-            await websocket.send('{ "partial" : "" }')
-        else:
-            await websocket.send(res)
-
-    while rec.GetPendingChunks(uid) > 0:
-        await asyncio.sleep(0.1)
-
-    res = rec.Result(uid)
-    await websocket.send(res)
-
-def start():
-
-    global rec
-    global args
-    global loop
-    global client_cnt
-
-    # Enable loging if needed
-    #
-    # logger = logging.getLogger('websockets')
-    # logger.setLevel(logging.INFO)
-    # logger.addHandler(logging.StreamHandler())
-    logging.basicConfig(level=logging.INFO)
-
-    args = type('', (), {})()
-
-    args.interface = os.environ.get('VOSK_SERVER_INTERFACE', '0.0.0.0')
-    args.port = int(os.environ.get('VOSK_SERVER_PORT', 2700))
-
-    GpuInit()
-
-    rec = BatchRecognizer()
-
-    client_cnt = 0
-
-    loop = asyncio.get_event_loop()
-
-    start_server = websockets.serve(
-        recognize, args.interface, args.port)
-
-    logging.info("Listening on %s:%d", args.interface, args.port)
-    loop.run_until_complete(start_server)
-    loop.run_forever()
-
-
-if __name__ == '__main__':
-    start()
diff --git a/python/example/batch/test_batch.py b/python/example/test_gpu_batch.py
similarity index 97%
rename from python/example/batch/test_batch.py
rename to python/example/test_gpu_batch.py
index 8737a746..3a65bda8 100755
--- a/python/example/batch/test_batch.py
+++ b/python/example/test_gpu_batch.py
@@ -29,7 +29,7 @@
     for i, fd in enumerate(fds):
         if i in ended:
             continue
-        data = fd.read(8000)
+        data = fd.read(16000)
         if len(data) == 0:
             rec.FinishStream(i)
             ended.add(i)
diff --git a/src/batch_recognizer.cc b/src/batch_recognizer.cc
index 78cfc6f2..3337ee10 100644
--- a/src/batch_recognizer.cc
+++ b/src/batch_recognizer.cc
@@ -31,9 +31,10 @@ BatchRecognizer::BatchRecognizer() {
     batched_decoder_config.Register(&po);
     po.ReadConfigFile("model/conf/model.conf");
 
-    batched_decoder_config.num_worker_threads = 4;
-    batched_decoder_config.max_batch_size = 100;
+    batched_decoder_config.num_worker_threads = -1;
+    batched_decoder_config.max_batch_size = 200;
     batched_decoder_config.reset_on_endpoint = true;
+    batched_decoder_config.use_gpu_feature_extraction = true;
 
     batched_decoder_config.feature_opts.feature_type = "mfcc";
     batched_decoder_config.feature_opts.mfcc_config = "model/conf/mfcc.conf";
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index 1f77eb6c..3f740d7b 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -171,6 +171,8 @@ void vosk_set_log_level(int log_level)
 void vosk_gpu_init()
 {
 #if HAVE_CUDA
+//    kaldi::CuDevice::EnableTensorCores(true);
+//    kaldi::CuDevice::EnableTf32Compute(true);
     kaldi::CuDevice::Instantiate().SelectGpuId("yes");
     kaldi::CuDevice::Instantiate().AllowMultithreading();
 #endif

From 525b722c44e6b152926178ea226e9ce1c7ba3154 Mon Sep 17 00:00:00 2001
From: Nickolay Shmyrev <nshmyrev@gmail.com>
Date: Fri, 24 Dec 2021 01:35:06 +0100
Subject: [PATCH 8/8] Compile without CUDA too

---
 src/Makefile    | 13 ++++++++-----
 src/vosk_api.cc | 24 +++++++++++++++++++++++-
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/Makefile b/src/Makefile
index 823a4aaf..9965db65 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -22,14 +22,12 @@ VOSK_SOURCES= \
 	language_model.cc \
 	model.cc \
 	spk_model.cc \
-	batch_recognizer.cc \
 	vosk_api.cc
 
 VOSK_HEADERS= \
 	recognizer.h \
 	language_model.h \
 	model.h \
-	batch_recognizer.h \
 	spk_model.h \
 	vosk_api.h
 
@@ -37,8 +35,6 @@ CFLAGS=-g -O3 -std=c++17 -Wno-deprecated-declarations -fPIC -DFST_NO_DYNAMIC_LIN
 	-I. -I$(KALDI_ROOT)/src -I$(OPENFST_ROOT)/include $(EXTRA_CFLAGS) 
 
 LIBS= \
-        $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \
-        $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \
 	$(KALDI_ROOT)/src/online2/kaldi-online2.a \
 	$(KALDI_ROOT)/src/decoder/kaldi-decoder.a \
 	$(KALDI_ROOT)/src/ivector/kaldi-ivector.a \
@@ -79,8 +75,15 @@ ifeq ($(HAVE_ACCELERATE), 1)
 endif
 
 ifeq ($(HAVE_CUDA), 1)
+    VOSK_SOURCES += batch_recognizer.cc
+    VOSK_HEADERS += batch_recognizer.h
+
     CFLAGS+=-DHAVE_CUDA=1 -I$(CUDA_ROOT)/include
-    LIBS+=\
+
+    LIBS := \
+        $(KALDI_ROOT)/src/cudadecoder/kaldi-cudadecoder.a \
+        $(KALDI_ROOT)/src/cudafeat/kaldi-cudafeat.a \
+        $(LIBS) \
         -L$(CUDA_ROOT)/lib64 -lcuda -lcublas -lcusparse -lcudart -lcurand -lcufft -lcusolver -lnvToolsExt
 endif
 
diff --git a/src/vosk_api.cc b/src/vosk_api.cc
index 3f740d7b..65356038 100644
--- a/src/vosk_api.cc
+++ b/src/vosk_api.cc
@@ -15,12 +15,12 @@
 #include "vosk_api.h"
 
 #include "recognizer.h"
-#include "batch_recognizer.h"
 #include "model.h"
 #include "spk_model.h"
 
 #if HAVE_CUDA
 #include "cudamatrix/cu-device.h"
+#include "batch_recognizer.h"
 #endif
 
 #include <string.h>
@@ -187,40 +187,62 @@ void vosk_gpu_thread_init()
 
 VoskBatchRecognizer *vosk_batch_recognizer_new()
 {
+#if HAVE_CUDA
     return (VoskBatchRecognizer *)(new BatchRecognizer());
+#else
+    return NULL;
+#endif
 }
 
 void vosk_batch_recognizer_free(VoskBatchRecognizer *recognizer)
 {
+#if HAVE_CUDA
     delete ((BatchRecognizer *)recognizer);
+#endif
 }
 
 void vosk_batch_recognizer_accept_waveform(VoskBatchRecognizer *recognizer, int id, const char *data, int length)
 {
+#if HAVE_CUDA
     ((BatchRecognizer *)recognizer)->AcceptWaveform(id, data, length);
+#endif
 }
 
 void vosk_batch_recognizer_finish_stream(VoskBatchRecognizer *recognizer, int id)
 {
+#if HAVE_CUDA
     ((BatchRecognizer *)recognizer)->FinishStream(id);
+#endif
 }
 
 const char *vosk_batch_recognizer_front_result(VoskBatchRecognizer *recognizer, int id)
 {
+#if HAVE_CUDA
     return ((BatchRecognizer *)recognizer)->FrontResult(id);
+#else
+    return NULL;
+#endif
 }
 
 void vosk_batch_recognizer_pop(VoskBatchRecognizer *recognizer, int id)
 {
+#if HAVE_CUDA
     ((BatchRecognizer *)recognizer)->Pop(id);
+#endif
 }
 
 void vosk_batch_recognizer_wait(VoskBatchRecognizer *recognizer)
 {
+#if HAVE_CUDA
     ((BatchRecognizer *)recognizer)->WaitForCompletion();
+#endif
 }
 
 int vosk_batch_recognizer_get_pending_chunks(VoskBatchRecognizer *recognizer, int id)
 {
+#if HAVE_CUDA
     return ((BatchRecognizer *)recognizer)->GetPendingChunks(id);
+#else
+    return 0;
+#endif
 }