diff --git a/go-api-examples/README.md b/go-api-examples/README.md
index ff7fe9bc4..51a44b389 100644
--- a/go-api-examples/README.md
+++ b/go-api-examples/README.md
@@ -23,4 +23,7 @@ for details.
 - [./vad-asr-paraformer](./vad-asr-paraformer) It shows how to use silero VAD + Paraformer
   for speech recognition.
 
+- [./vad-spoken-language-identification](./vad-spoken-language-identification) It shows how to use silero VAD + Whisper
+  for spoken language identification.
+
 [sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx
diff --git a/go-api-examples/vad-spoken-language-identification/go.mod b/go-api-examples/vad-spoken-language-identification/go.mod
new file mode 100644
index 000000000..9a806cbd1
--- /dev/null
+++ b/go-api-examples/vad-spoken-language-identification/go.mod
@@ -0,0 +1,3 @@
+module vad-spoken-language-identification
+
+go 1.12
diff --git a/go-api-examples/vad-spoken-language-identification/main.go b/go-api-examples/vad-spoken-language-identification/main.go
new file mode 100644
index 000000000..5db250e84
--- /dev/null
+++ b/go-api-examples/vad-spoken-language-identification/main.go
@@ -0,0 +1,156 @@
+package main
+
+import (
+	"fmt"
+	iso639 "github.com/barbashov/iso639-3"
+	"github.com/gordonklaus/portaudio"
+	sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
+	"log"
+)
+
+// main records audio from the default input device, cuts it into speech
+// segments with silero VAD, and identifies the language of each segment
+// with Whisper.
+func main() {
+	log.SetFlags(log.LstdFlags | log.Lmicroseconds)
+
+	// 1. Create VAD
+	config := sherpa.VadModelConfig{}
+
+	// Please download silero_vad.onnx from
+	// https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
+
+	config.SileroVad.Model = "./silero_vad.onnx"
+	config.SileroVad.Threshold = 0.5
+	config.SileroVad.MinSilenceDuration = 0.5
+	config.SileroVad.MinSpeechDuration = 0.25
+	config.SileroVad.WindowSize = 512
+	config.SampleRate = 16000
+	config.NumThreads = 1
+	config.Provider = "cpu"
+	config.Debug = 1
+
+	var bufferSizeInSeconds float32 = 20
+
+	vad := sherpa.NewVoiceActivityDetector(&config, bufferSizeInSeconds)
+	defer sherpa.DeleteVoiceActivityDetector(vad)
+
+	// 2. Create spoken language identifier
+
+	c := sherpa.SpokenLanguageIdentificationConfig{}
+	c.Whisper.Encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx"
+	c.Whisper.Decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx"
+	c.NumThreads = 2
+	c.Debug = 1
+	c.Provider = "cpu"
+
+	slid := sherpa.NewSpokenLanguageIdentification(&c)
+	defer sherpa.DeleteSpokenLanguageIdentification(slid)
+
+	// 3. Open the default input device
+	err := portaudio.Initialize()
+	if err != nil {
+		log.Fatalf("Unable to initialize portaudio: %v\n", err)
+	}
+	defer portaudio.Terminate()
+
+	defaultDevice, err := portaudio.DefaultInputDevice()
+	if err != nil {
+		log.Fatalf("Failed to get default input device: %v\n", err)
+	}
+	log.Printf("Selected default input device: %s\n", defaultDevice.Name)
+	param := portaudio.StreamParameters{}
+	param.Input.Device = defaultDevice
+	param.Input.Channels = 1
+	param.Input.Latency = defaultDevice.DefaultHighInputLatency
+
+	param.SampleRate = float64(config.SampleRate)
+	param.FramesPerBuffer = 0
+	param.Flags = portaudio.ClipOff
+
+	// you can choose another value for 0.1 if you want
+	samplesPerCall := int32(param.SampleRate * 0.1) // 0.1 second
+	samples := make([]float32, samplesPerCall)
+
+	s, err := portaudio.OpenStream(param, samples)
+	if err != nil {
+		log.Fatalf("Failed to open the stream: %v\n", err)
+	}
+
+	defer s.Close()
+	chk(s.Start())
+	// The loop below only ends when chk panics, so the stream is stopped from
+	// a deferred call, which runs before the deferred s.Close(). The returned
+	// error is deliberately ignored: the program is already terminating.
+	defer s.Stop()
+
+	log.Print("Started! Please speak")
+	printed := false
+
+	k := 0
+	for {
+		chk(s.Read())
+		vad.AcceptWaveform(samples)
+
+		if vad.IsSpeech() && !printed {
+			printed = true
+			log.Print("Detected speech\n")
+		}
+
+		if !vad.IsSpeech() {
+			printed = false
+		}
+
+		for !vad.IsEmpty() {
+			speechSegment := vad.Front()
+			vad.Pop()
+
+			duration := float32(len(speechSegment.Samples)) / float32(config.SampleRate)
+
+			audio := &sherpa.GeneratedAudio{}
+			audio.Samples = speechSegment.Samples
+			audio.SampleRate = config.SampleRate
+
+			// Now decode it
+			go decode(slid, audio, k)
+
+			k++
+
+			log.Printf("Duration: %.2f seconds\n", duration)
+		}
+	}
+}
+
+// decode identifies the language spoken in audio and saves the samples to a
+// wave file whose name contains id, the segment duration, and the language.
+func decode(slid *sherpa.SpokenLanguageIdentification, audio *sherpa.GeneratedAudio, id int) {
+	stream := slid.CreateStream()
+	defer sherpa.DeleteOfflineStream(stream)
+
+	stream.AcceptWaveform(audio.SampleRate, audio.Samples)
+	result := slid.Compute(stream)
+
+	// FromPart1Code returns nil for a code it does not know, so fall back to
+	// the raw code instead of dereferencing a nil pointer.
+	lang := result.Lang
+	if l := iso639.FromPart1Code(result.Lang); l != nil {
+		lang = l.Name
+	}
+	log.Printf("Detected language: %v", lang)
+
+	duration := float32(len(audio.Samples)) / float32(audio.SampleRate)
+
+	filename := fmt.Sprintf("seg-%d-%.2f-seconds-%s.wav", id, duration, lang)
+	ok := audio.Save(filename)
+	if ok {
+		log.Printf("Saved to %s", filename)
+	}
+	log.Print("----------\n")
+}
+
+// chk panics if err is not nil.
+func chk(err error) {
+	if err != nil {
+		panic(err)
+	}
+}
diff --git a/go-api-examples/vad-spoken-language-identification/run.sh b/go-api-examples/vad-spoken-language-identification/run.sh
new file mode 100755
index 000000000..fc3c219e7
--- /dev/null
+++ b/go-api-examples/vad-spoken-language-identification/run.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+set -e
+
+if [ ! -f ./silero_vad.onnx ]; then
+  # Use the "raw" URL: the "blob" URL returns an HTML page, not the model.
+  curl -SL -O https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
+fi
+
+if [ ! -f ./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
+  tar xvf sherpa-onnx-whisper-tiny.tar.bz2
+  rm sherpa-onnx-whisper-tiny.tar.bz2
+fi
+
+go mod tidy
+go build
+./vad-spoken-language-identification
diff --git a/scripts/go/_internal/vad-spoken-language-identification/.gitignore b/scripts/go/_internal/vad-spoken-language-identification/.gitignore
new file mode 100644
index 000000000..8ca38fe20
--- /dev/null
+++ b/scripts/go/_internal/vad-spoken-language-identification/.gitignore
@@ -0,0 +1,2 @@
+vad-spoken-language-identification
+
diff --git a/scripts/go/_internal/vad-spoken-language-identification/go.mod b/scripts/go/_internal/vad-spoken-language-identification/go.mod
new file mode 100644
index 000000000..9b66564ed
--- /dev/null
+++ b/scripts/go/_internal/vad-spoken-language-identification/go.mod
@@ -0,0 +1,5 @@
+module vad-spoken-language-identification
+
+go 1.12
+
+replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../
diff --git a/scripts/go/_internal/vad-spoken-language-identification/main.go b/scripts/go/_internal/vad-spoken-language-identification/main.go
new file mode 120000
index 000000000..f831629b7
--- /dev/null
+++ b/scripts/go/_internal/vad-spoken-language-identification/main.go
@@ -0,0 +1 @@
+../../../../go-api-examples/vad-spoken-language-identification/main.go
\ No newline at end of file
diff --git a/scripts/go/_internal/vad-spoken-language-identification/run.sh b/scripts/go/_internal/vad-spoken-language-identification/run.sh
new file mode 120000
index 000000000..9e71cc969
--- /dev/null
+++ b/scripts/go/_internal/vad-spoken-language-identification/run.sh
@@ -0,0 +1 @@
+../../../../go-api-examples/vad-spoken-language-identification/run.sh
\ No newline at end of file
diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go
index 03f093fb0..01dc5948d 100644
--- a/scripts/go/sherpa_onnx.go
+++ b/scripts/go/sherpa_onnx.go
@@ -783,3 +783,89 @@ func (vad *VoiceActivityDetector) Front() *SpeechSegment {
 func (vad *VoiceActivityDetector) Reset() {
 	C.SherpaOnnxVoiceActivityDetectorReset(vad.impl)
 }
+
+// Spoken language identification
+
+// SpokenLanguageIdentificationWhisperConfig configures the Whisper model used
+// for spoken language identification.
+type SpokenLanguageIdentificationWhisperConfig struct {
+	Encoder      string // Path to the Whisper encoder model
+	Decoder      string // Path to the Whisper decoder model
+	TailPaddings int    // Forwarded to the C API as whisper.tail_paddings
+}
+
+// SpokenLanguageIdentificationConfig configures a SpokenLanguageIdentification.
+type SpokenLanguageIdentificationConfig struct {
+	Whisper    SpokenLanguageIdentificationWhisperConfig
+	NumThreads int    // Forwarded to the C API as num_threads
+	Debug      int    // Forwarded to the C API as debug
+	Provider   string // Forwarded to the C API as provider, e.g. "cpu"
+}
+
+// SpokenLanguageIdentification wraps the C spoken language identifier.
+// Free it with DeleteSpokenLanguageIdentification().
+type SpokenLanguageIdentification struct {
+	impl *C.struct_SherpaOnnxSpokenLanguageIdentification
+}
+
+// SpokenLanguageIdentificationResult is the value returned by Compute.
+type SpokenLanguageIdentificationResult struct {
+	Lang string // Language string copied from the C result
+}
+
+// NewSpokenLanguageIdentification creates a spoken language identifier from
+// config. The user has to invoke DeleteSpokenLanguageIdentification() to free
+// the returned value to avoid memory leak.
+func NewSpokenLanguageIdentification(config *SpokenLanguageIdentificationConfig) *SpokenLanguageIdentification {
+	c := C.struct_SherpaOnnxSpokenLanguageIdentificationConfig{}
+
+	// Every C string allocated here is freed when this function returns.
+	c.whisper.encoder = C.CString(config.Whisper.Encoder)
+	defer C.free(unsafe.Pointer(c.whisper.encoder))
+
+	c.whisper.decoder = C.CString(config.Whisper.Decoder)
+	defer C.free(unsafe.Pointer(c.whisper.decoder))
+
+	c.whisper.tail_paddings = C.int(config.Whisper.TailPaddings)
+
+	c.num_threads = C.int(config.NumThreads)
+	c.debug = C.int(config.Debug)
+
+	c.provider = C.CString(config.Provider)
+	defer C.free(unsafe.Pointer(c.provider))
+
+	slid := &SpokenLanguageIdentification{}
+	slid.impl = C.SherpaOnnxCreateSpokenLanguageIdentification(&c)
+
+	return slid
+}
+
+// DeleteSpokenLanguageIdentification destroys the C object held by slid and
+// sets its handle to nil.
+func DeleteSpokenLanguageIdentification(slid *SpokenLanguageIdentification) {
+	C.SherpaOnnxDestroySpokenLanguageIdentification(slid.impl)
+	slid.impl = nil
+}
+
+// CreateStream creates an offline stream for slid.
+//
+// The user has to invoke DeleteOfflineStream() to free the returned value
+// to avoid memory leak
+func (slid *SpokenLanguageIdentification) CreateStream() *OfflineStream {
+	stream := &OfflineStream{}
+	stream.impl = C.SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(slid.impl)
+	return stream
+}
+
+// Compute runs spoken language identification on stream and returns the result.
+func (slid *SpokenLanguageIdentification) Compute(stream *OfflineStream) *SpokenLanguageIdentificationResult {
+	r := C.SherpaOnnxSpokenLanguageIdentificationCompute(slid.impl, stream.impl)
+	// C.GoString below copies r.lang into Go memory, so the C result can be
+	// released when this function returns; otherwise every call leaks it.
+	defer C.SherpaOnnxDestroySpokenLanguageIdentificationResult(r)
+
+	ans := &SpokenLanguageIdentificationResult{}
+	ans.Lang = C.GoString(r.lang)
+
+	return ans
+}
diff --git a/sherpa-onnx/csrc/spoken-language-identification.cc b/sherpa-onnx/csrc/spoken-language-identification.cc
index 868382835..2cb2d99a9 100644
--- a/sherpa-onnx/csrc/spoken-language-identification.cc
+++ b/sherpa-onnx/csrc/spoken-language-identification.cc
@@ -91,7 +91,7 @@ std::string SpokenLanguageIdentificationConfig::ToString() const {
   std::ostringstream os;
 
   os << "SpokenLanguageIdentificationConfig(";
-  os << "whisper=\"" << whisper.ToString() << "\", ";
+  os << "whisper=" << whisper.ToString() << ", ";
   os << "num_threads=" << num_threads << ", ";
   os << "debug=" << (debug ? "True" : "False") << ", ";
   os << "provider=\"" << provider << "\")";