nvidia-riva · atomer-nvidia · Nov 8, 2024 · Aug 12, 2024 · Aug 22, 2024 · Aug 28, 2024
diff --git a/WORKSPACE b/WORKSPACE
@@ -70,7 +70,7 @@ grpc_extra_deps()
 git_repository(
      name = "nvriva_common",
      remote = "https://github.com/nvidia-riva/common.git",
-     commit = "1c7da5aed4e4df3a296d2672379c5099a193aaae"
+     commit = "d73d7c13d3e291aace10f619a8f0e6fc6be78156"
 )
 
 http_archive(

diff --git a/riva/clients/asr/client_call.cc b/riva/clients/asr/client_call.cc
@@ -13,45 +13,64 @@ ClientCall::ClientCall(uint32_t corr_id, bool word_time_offsets)
   recv_final_flags.reserve(1000);
 }
 
+// Closes the pipeline-states log file if AppendResult() ever opened it.
+ClientCall::~ClientCall()
+{
+  const bool log_was_opened = pipeline_states_logs_.is_open();
+  if (log_was_opened) pipeline_states_logs_.close();
+}
+
 void
 ClientCall::AppendResult(const nr_asr::StreamingRecognitionResult& result)
 {
-  bool is_final = result.is_final();
   if (latest_result_.final_transcripts.size() < 1) {
     latest_result_.final_transcripts.resize(1);
     latest_result_.final_transcripts[0] = "";
   }
-
-  if (is_final) {
-    int num_alternatives = result.alternatives_size();
-    latest_result_.final_transcripts.resize(num_alternatives);
-    latest_result_.final_scores.resize(num_alternatives);
-    latest_result_.final_time_stamps.resize(num_alternatives);
-    for (int a = 0; a < num_alternatives; ++a) {
-      // Append to transcript
-      latest_result_.final_transcripts[a] += result.alternatives(a).transcript();
-      latest_result_.final_scores[a] += result.alternatives(a).confidence();
+  if (result.has_pipeline_states()) {
+    auto pipeline_states = result.pipeline_states();
+    size_t prob_states_count = pipeline_states.vad_probabilities_size();
+    std::string vad_log = "";
+    for (size_t i = 0; i < prob_states_count; i++) {
+      vad_log += std::to_string(pipeline_states.vad_probabilities(i)) + " ";
+    }
+    if (!pipeline_states_logs_.is_open()) {
+      pipeline_states_logs_.open("riva_asr_pipeline_states.log");
     }
-    VLOG(1) << "Final transcript: " << result.alternatives(0).transcript();
+    pipeline_states_logs_ << "VAD states: " << vad_log << std::endl;
+  } else {
+    bool is_final = result.is_final();
+    if (is_final) {
+      int num_alternatives = result.alternatives_size();
+      latest_result_.final_transcripts.resize(num_alternatives);
+      latest_result_.final_scores.resize(num_alternatives);
+      latest_result_.final_time_stamps.resize(num_alternatives);
+      for (int a = 0; a < num_alternatives; ++a) {
+        // Append to transcript
+        latest_result_.final_transcripts[a] += result.alternatives(a).transcript();
+        latest_result_.final_scores[a] += result.alternatives(a).confidence();
+      }
+      VLOG(1) << "Final transcript: " << result.alternatives(0).transcript();
 
-    if (word_time_offsets_) {
-      if (num_alternatives > 0) {
-        for (int a = 0; a < num_alternatives; ++a) {
-          for (int w = 0; w < result.alternatives(a).words_size(); ++w) {
-            latest_result_.final_time_stamps[a].push_back(result.alternatives(a).words(w));
+      if (word_time_offsets_) {
+        if (num_alternatives > 0) {
+          for (int a = 0; a < num_alternatives; ++a) {
+            for (int w = 0; w < result.alternatives(a).words_size(); ++w) {
+              latest_result_.final_time_stamps[a].push_back(result.alternatives(a).words(w));
+            }
           }
         }
       }
-    }
-  } else {
-    if (result.alternatives_size() > 0) {
-      if (result.stability() == 1) {
-        VLOG(1) << "Intermediate transcript: " << result.alternatives(0).transcript();
-      } else {
-        latest_result_.partial_transcript += result.alternatives(0).transcript();
-        if (word_time_offsets_) {
-          for (int w = 0; w < result.alternatives(0).words_size(); ++w) {
-            latest_result_.partial_time_stamps.emplace_back(result.alternatives(0).words(w));
+    } else {
+      if (result.alternatives_size() > 0) {
+        if (result.stability() == 1) {
+          VLOG(1) << "Intermediate transcript: " << result.alternatives(0).transcript();
+        } else {
+          latest_result_.partial_transcript += result.alternatives(0).transcript();
+          if (word_time_offsets_) {
+            for (int w = 0; w < result.alternatives(0).words_size(); ++w) {
+              latest_result_.partial_time_stamps.emplace_back(result.alternatives(0).words(w));
+            }
           }
         }
       }

diff --git a/riva/clients/asr/client_call.h b/riva/clients/asr/client_call.h
@@ -37,6 +37,7 @@ namespace nr_asr = nvidia::riva::asr;
 class ClientCall {
  public:
   ClientCall(uint32_t _corr_id, bool word_time_offsets);
+  ~ClientCall();
 
   void AppendResult(const nr_asr::StreamingRecognitionResult& result);
 
@@ -66,5 +67,6 @@ class ClientCall {
   std::vector<bool> recv_final_flags;
 
   grpc::Status finish_status;
+  std::ofstream pipeline_states_logs_;
 
 };  // ClientCall
diff --git a/riva/clients/asr/riva_asr_client.cc b/riva/clients/asr/riva_asr_client.cc
@@ -67,7 +67,9 @@ DEFINE_bool(
     "Whether to use SSL credentials or not. If ssl_cert is specified, "
     "this is assumed to be true");
 DEFINE_bool(speaker_diarization, false, "Flag that controls if speaker diarization is requested");
-DEFINE_int32(diarization_max_speakers, 3, "Max number of speakers to detect when performing speaker diarization");
+DEFINE_int32(
+    diarization_max_speakers, 3,
+    "Max number of speakers to detect when performing speaker diarization");
 DEFINE_string(metadata, "", "Comma separated key-value pair(s) of metadata to be sent to server");
 DEFINE_int32(start_history, -1, "Value to detect and initiate start of speech utterance");
 DEFINE_double(
@@ -92,14 +94,15 @@ class RecognizeClient {
       bool automatic_punctuation, bool separate_recognition_per_channel, bool print_transcripts,
       std::string output_filename, std::string model_name, bool ctm, bool verbatim_transcripts,
       const std::string& boosted_phrases_file, float boosted_phrases_score,
-      bool speaker_diarization, int32_t diarization_max_speakers, int32_t start_history, float start_threshold, int32_t stop_history,
-      int32_t stop_history_eou, float stop_threshold, float stop_threshold_eou,
-      std::string custom_configuration)
+      bool speaker_diarization, int32_t diarization_max_speakers, int32_t start_history,
+      float start_threshold, int32_t stop_history, int32_t stop_history_eou, float stop_threshold,
+      float stop_threshold_eou, std::string custom_configuration)
       : stub_(nr_asr::RivaSpeechRecognition::NewStub(channel)), language_code_(language_code),
         max_alternatives_(max_alternatives), profanity_filter_(profanity_filter),
         word_time_offsets_(word_time_offsets), automatic_punctuation_(automatic_punctuation),
         separate_recognition_per_channel_(separate_recognition_per_channel),
-        speaker_diarization_(speaker_diarization), diarization_max_speakers_(diarization_max_speakers), print_transcripts_(print_transcripts),
+        speaker_diarization_(speaker_diarization),
+        diarization_max_speakers_(diarization_max_speakers), print_transcripts_(print_transcripts),
         done_sending_(false), num_requests_(0), num_responses_(0), num_failed_requests_(0),
         total_audio_processed_(0.), model_name_(model_name), output_filename_(output_filename),
         verbatim_transcripts_(verbatim_transcripts), boosted_phrases_score_(boosted_phrases_score),
@@ -533,9 +536,9 @@ main(int argc, char** argv)
       FLAGS_word_time_offsets, FLAGS_automatic_punctuation,
       /* separate_recognition_per_channel*/ false, FLAGS_print_transcripts, FLAGS_output_filename,
       FLAGS_model_name, FLAGS_output_ctm, FLAGS_verbatim_transcripts, FLAGS_boosted_words_file,
-      (float)FLAGS_boosted_words_score, FLAGS_speaker_diarization, FLAGS_diarization_max_speakers, FLAGS_start_history,
-      FLAGS_start_threshold, FLAGS_stop_history, FLAGS_stop_history_eou, FLAGS_stop_threshold,
-      FLAGS_stop_threshold_eou, FLAGS_custom_configuration);
+      (float)FLAGS_boosted_words_score, FLAGS_speaker_diarization, FLAGS_diarization_max_speakers,
+      FLAGS_start_history, FLAGS_start_threshold, FLAGS_stop_history, FLAGS_stop_history_eou,
+      FLAGS_stop_threshold, FLAGS_stop_threshold_eou, FLAGS_custom_configuration);
 
   // Preload all wav files, sort by size to reduce tail effects
   std::vector<std::shared_ptr<WaveData>> all_wav;

diff --git a/riva/clients/nmt/riva_nmt_t2t_client.cc b/riva/clients/nmt/riva_nmt_t2t_client.cc
@@ -200,6 +200,7 @@ main(int argc, char** argv)
   if (FLAGS_text != "") {
     nr_nmt::TranslateTextRequest request;
     nr_nmt::TranslateTextResponse response;
+    VLOG(1) << "Setting up t2t config.";
     request.set_model(FLAGS_model_name);
     request.set_source_language(FLAGS_source_language_code);
     request.set_target_language(FLAGS_target_language_code);