You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
#include <iostream>
#include <vector>
#include <sstream>
#include <cstring>
#include <limits>
#include <chrono>
#include <memory>
#include <string>
#include <stdexcept>
#include <iostream>
#include <string>
#include "onnxruntime_cxx_api.h"
#include "wav.h"
#include <cstdio>
#include <cstdarg>
#if __cplusplus < 201703L
#include <memory>
#endif
//#define __DEBUG_SPEECH_PROB___
class timestamp_t
{
public:
int start;
int end;
// default + parameterized constructor
timestamp_t(int start = -1, int end = -1)
: start(start), end(end)
{
};
// assignment operator modifies object, therefore non-const
timestamp_t& operator=(const timestamp_t& a)
{
start = a.start;
end = a.end;
return *this;
};
// equality comparison. doesn't modify object. therefore const.
bool operator==(const timestamp_t& a) const
{
return (start == a.start && end == a.end);
};
std::string c_str()
{
//return std::format("timestamp {:08d}, {:08d}", start, end);
return format("{start:%.3f,end:%.3f}", start/16000., end/16000.);
};
private:
std::string format(const char* fmt, ...)
{
char buf[256];
va_list args;
va_start(args, fmt);
const auto r = std::vsnprintf(buf, sizeof buf, fmt, args);
va_end(args);
if (r < 0)
// conversion failed
return {};
const size_t len = r;
if (len < sizeof buf)
// we fit in the buffer
return { buf, len };
#if __cplusplus >= 201703L
// C++17: Create a string and write to its underlying array
std::string s(len, '\0');
va_start(args, fmt);
std::vsnprintf(s.data(), len + 1, fmt, args);
va_end(args);
return s;
#else
// C++11 or C++14: We need to allocate scratch memory
auto vbuf = std::unique_ptr<char[]>(new char[len + 1]);
va_start(args, fmt);
std::vsnprintf(vbuf.get(), len + 1, fmt, args);
va_end(args);
return { vbuf.get(), len };
#endif
};
};
class VadIterator
{
private:
// OnnxRuntime resources
Ort::Env env;
Ort::SessionOptions session_options;
std::shared_ptr<Ort::Session> session = nullptr;
Ort::AllocatorWithDefaultOptions allocator;
Ort::MemoryInfo memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeCPU);
private:
void init_engine_threads(int inter_threads, int intra_threads)
{
// The method should be called in each thread/proc in multi-thread/proc work
session_options.SetIntraOpNumThreads(intra_threads);
session_options.SetInterOpNumThreads(inter_threads);
session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL);
};
void init_onnx_model(const std::string& model_path)
{
// Init threads = 1 for
init_engine_threads(1, 1);
// Load model
session = std::make_shared<Ort::Session>(env, model_path.c_str(), session_options);
};
void reset_states()
{
// Call reset before each audio start
std::memset(_state.data(), 0.0f, _state.size() * sizeof(float));
triggered = false;
temp_end = 0;
current_sample = 0;
prev_end = next_start = 0;
speeches.clear();
current_speech = timestamp_t();
};
void predict(const std::vector<float> &data)
{
// Infer
// Create ort tensors
input.assign(data.begin(), data.end());
Ort::Value input_ort = Ort::Value::CreateTensor<float>(
memory_info, input.data(), input.size(), input_node_dims, 2);
Ort::Value state_ort = Ort::Value::CreateTensor<float>(
memory_info, _state.data(), _state.size(), state_node_dims, 3);
Ort::Value sr_ort = Ort::Value::CreateTensor<int64_t>(
memory_info, sr.data(), sr.size(), sr_node_dims, 1);
// Clear and add inputs
ort_inputs.clear();
ort_inputs.emplace_back(std::move(input_ort));
ort_inputs.emplace_back(std::move(state_ort));
ort_inputs.emplace_back(std::move(sr_ort));
// Infer
ort_outputs = session->Run(
Ort::RunOptions{nullptr},
input_node_names.data(), ort_inputs.data(), ort_inputs.size(),
output_node_names.data(), output_node_names.size());
// Output probability & update h,c recursively
float speech_prob = ort_outputs[0].GetTensorMutableData<float>()[0];
float *stateN = ort_outputs[1].GetTensorMutableData<float>();
std::memcpy(_state.data(), stateN, size_state * sizeof(float));
// Push forward sample index
current_sample += window_size_samples;
// Reset temp_end when > threshold
if ((speech_prob >= threshold))
{
#ifdef __DEBUG_SPEECH_PROB___
float speech = current_sample - window_size_samples; // minus window_size_samples to get precise start time point.
printf("{ start: %.3f s (%.3f) %08d}\n", 1.0 * speech / sample_rate, speech_prob, current_sample- window_size_samples);
#endif //__DEBUG_SPEECH_PROB___
if (temp_end != 0)
{
temp_end = 0;
if (next_start < prev_end)
next_start = current_sample - window_size_samples;
}
if (triggered == false)
{
triggered = true;
current_speech.start = current_sample - window_size_samples;
}
return;
}
if (
(triggered == true)
&& ((current_sample - current_speech.start) > max_speech_samples)
) {
if (prev_end > 0) {
current_speech.end = prev_end;
speeches.push_back(current_speech);
current_speech = timestamp_t();
// previously reached silence(< neg_thres) and is still not speech(< thres)
if (next_start < prev_end)
triggered = false;
else{
current_speech.start = next_start;
}
prev_end = 0;
next_start = 0;
temp_end = 0;
}
else{
current_speech.end = current_sample;
speeches.push_back(current_speech);
current_speech = timestamp_t();
prev_end = 0;
next_start = 0;
temp_end = 0;
triggered = false;
}
return;
}
if ((speech_prob >= (threshold - 0.15)) && (speech_prob < threshold))
{
if (triggered) {
#ifdef __DEBUG_SPEECH_PROB___
float speech = current_sample - window_size_samples; // minus window_size_samples to get precise start time point.
printf("{ speeking: %.3f s (%.3f) %08d}\n", 1.0 * speech / sample_rate, speech_prob, current_sample - window_size_samples);
#endif //__DEBUG_SPEECH_PROB___
}
else {
#ifdef __DEBUG_SPEECH_PROB___
float speech = current_sample - window_size_samples; // minus window_size_samples to get precise start time point.
printf("{ silence: %.3f s (%.3f) %08d}\n", 1.0 * speech / sample_rate, speech_prob, current_sample - window_size_samples);
#endif //__DEBUG_SPEECH_PROB___
}
return;
}
// 4) End
if ((speech_prob < (threshold - 0.15)))
{
#ifdef __DEBUG_SPEECH_PROB___
float speech = current_sample - window_size_samples - speech_pad_samples; // minus window_size_samples to get precise start time point.
printf("{ end: %.3f s (%.3f) %08d}\n", 1.0 * speech / sample_rate, speech_prob, current_sample - window_size_samples);
#endif //__DEBUG_SPEECH_PROB___
if (triggered == true)
{
if (temp_end == 0)
{
temp_end = current_sample;
}
if (current_sample - temp_end > min_silence_samples_at_max_speech)
prev_end = temp_end;
// a. silence < min_slience_samples, continue speaking
if ((current_sample - temp_end) < min_silence_samples)
{
}
// b. silence >= min_slience_samples, end speaking
else
{
current_speech.end = temp_end;
if (current_speech.end - current_speech.start > min_speech_samples)
{
speeches.push_back(current_speech);
current_speech = timestamp_t();
prev_end = 0;
next_start = 0;
temp_end = 0;
triggered = false;
}
}
}
else {
// may first windows see end state.
}
return;
}
};
public:
void process(const std::vector<float>& input_wav)
{
reset_states();
audio_length_samples = input_wav.size();
for (int j = 0; j < audio_length_samples; j += window_size_samples)
{
if (j + window_size_samples > audio_length_samples)
break;
std::vector<float> r{ &input_wav[0] + j, &input_wav[0] + j + window_size_samples };
predict(r);
}
if (current_speech.start >= 0) {
current_speech.end = audio_length_samples;
speeches.push_back(current_speech);
current_speech = timestamp_t();
prev_end = 0;
next_start = 0;
temp_end = 0;
triggered = false;
}
};
void process(const std::vector<float>& input_wav, std::vector<float>& output_wav)
{
process(input_wav);
collect_chunks(input_wav, output_wav);
}
void collect_chunks(const std::vector<float>& input_wav, std::vector<float>& output_wav)
{
output_wav.clear();
for (int i = 0; i < speeches.size(); i++) {
#ifdef __DEBUG_SPEECH_PROB___
std::cout << speeches[i].c_str() << std::endl;
#endif //#ifdef __DEBUG_SPEECH_PROB___
std::vector<float> slice(&input_wav[speeches[i].start], &input_wav[speeches[i].end]);
output_wav.insert(output_wav.end(),slice.begin(),slice.end());
}
};
const std::vector<timestamp_t> get_speech_timestamps() const
{
return speeches;
}
void drop_chunks(const std::vector<float>& input_wav, std::vector<float>& output_wav)
{
output_wav.clear();
int current_start = 0;
for (int i = 0; i < speeches.size(); i++) {
std::vector<float> slice(&input_wav[current_start],&input_wav[speeches[i].start]);
output_wav.insert(output_wav.end(), slice.begin(), slice.end());
current_start = speeches[i].end;
}
std::vector<float> slice(&input_wav[current_start], &input_wav[input_wav.size()]);
output_wav.insert(output_wav.end(), slice.begin(), slice.end());
};
private:
// model config
int64_t window_size_samples; // Assign when init, support 256 512 768 for 8k; 512 1024 1536 for 16k.
int sample_rate; //Assign when init support 16000 or 8000
int sr_per_ms; // Assign when init, support 8 or 16
float threshold;
int min_silence_samples; // sr_per_ms * #ms
int min_silence_samples_at_max_speech; // sr_per_ms * #98
int min_speech_samples; // sr_per_ms * #ms
float max_speech_samples;
int speech_pad_samples; // usually a
int audio_length_samples;
// model states
bool triggered = false;
unsigned int temp_end = 0;
unsigned int current_sample = 0;
// MAX 4294967295 samples / 8sample per ms / 1000 / 60 = 8947 minutes
int prev_end;
int next_start = 0;
//Output timestamp
std::vector<timestamp_t> speeches;
timestamp_t current_speech;
// Onnx model
// Inputs
std::vector<Ort::Value> ort_inputs;
std::vector<const char *> input_node_names = {"input", "state", "sr"};
std::vector<float> input;
unsigned int size_state = 2 * 1 * 128; // It's FIXED.
std::vector<float> _state;
std::vector<int64_t> sr;
int64_t input_node_dims[2] = {};
const int64_t state_node_dims[3] = {2, 1, 128};
const int64_t sr_node_dims[1] = {1};
// Outputs
std::vector<Ort::Value> ort_outputs;
std::vector<const char *> output_node_names = {"output", "stateN"};
public:
// Construction
VadIterator(const std::string ModelPath,
int Sample_rate = 16000, int windows_frame_size = 32,
float Threshold = 0.5, int min_silence_duration_ms = 0,
int speech_pad_ms = 32, int min_speech_duration_ms = 32,
float max_speech_duration_s = 20)
{
init_onnx_model(ModelPath);
threshold = Threshold;
sample_rate = Sample_rate;
sr_per_ms = sample_rate / 1000;
window_size_samples = windows_frame_size * sr_per_ms;
min_speech_samples = sr_per_ms * min_speech_duration_ms;
speech_pad_samples = sr_per_ms * speech_pad_ms;
max_speech_samples = (
sample_rate * max_speech_duration_s
- window_size_samples
- 2 * speech_pad_samples
);
min_silence_samples = sr_per_ms * min_silence_duration_ms;
min_silence_samples_at_max_speech = sr_per_ms * 98;
input.resize(window_size_samples);
input_node_dims[0] = 1;
input_node_dims[1] = window_size_samples;
_state.resize(size_state);
sr.resize(1);
sr[0] = sample_rate;
};
};
int main(int argc, char* argv[])
{
std::vector<timestamp_t> stamps;
// Read wav
std::string file_path = argv[1];
wav::WavReader wav_reader(file_path.c_str()); //16000,1,32float
std::vector<float> input_wav(wav_reader.num_samples());
std::vector<float> output_wav;
for (int i = 0; i < wav_reader.num_samples(); i++)
{
input_wav[i] = static_cast<float>(*(wav_reader.data() + i));
}
// ===== Test configs =====
std::string path = "silero_vad/data/silero_vad.onnx";
VadIterator vad(path);
// ==============================================
// ==== = Example 1 of full function =====
// ==============================================
vad.process(input_wav);
// 1.a get_speech_timestamps
stamps = vad.get_speech_timestamps();
for (int i = 0; i < stamps.size(); i++) {
std::cout << stamps[i].c_str() << std::endl;
}
}
compile the cpp code
Provide a 16k16bit audio, in which there is a sound interval exceeding 20 seconds, like this:
expect result: vad will cut each segment containing sound, and the maximum segment length shall not exceed 20 seconds (I have set max_speech_duration_s = 20). But acctually I get the result like this:
compare with python version. I write a python code like this
import sys
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
model = load_silero_vad()
wav = read_audio('easy_2speakers_tmp.wav')
speech_timestamps = get_speech_timestamps(
wav,
model,
return_seconds=True, # Return speech timestamps in seconds (default is samples)
max_speech_duration_s=20
)
print(speech_timestamps)
The running results of the python version are as expected(Force the length of each interval to be divided into less than 20 seconds)
Expected behavior
Environment
Please copy and paste the output from this environment collection script
(or fill out the checklist below manually).
You can get the script and run it with:
wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py
# For security purposes, please check the contents of collect_env.py before running it.
python collect_env.py
PyTorch Version (e.g., 1.0):
OS (e.g., Linux):
How you installed PyTorch (conda, pip, source):
Build command you used (if compiling from source):
Python version:
CUDA/cuDNN version:
GPU models and configuration:
Any other relevant information:
Additional context
The text was updated successfully, but these errors were encountered:
snakers4
changed the title
Bug report - [X] max_speech_duration_s does not take effect in C++ version
[C++] max_speech_duration_s does not take effect in C++ version
Dec 20, 2024
🐛 Bug
To Reproduce
Steps to reproduce the behavior:
The running results of the python version are as expected(Force the length of each interval to be divided into less than 20 seconds)
Expected behavior
Environment
Please copy and paste the output from this
environment collection script
(or fill out the checklist below manually).
You can get the script and run it with:
conda
,pip
, source):Additional context
The text was updated successfully, but these errors were encountered: