Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Golang API for spoken language identification. #709

Merged
merged 1 commit into from
Mar 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions go-api-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,7 @@ for details.
- [./vad-asr-paraformer](./vad-asr-paraformer) It shows how to use silero VAD + Paraformer
for speech recognition.

- [./vad-spoken-language-identification](./vad-spoken-language-identification) It shows how to use silero VAD + Whisper
for spoken language identification.

[sherpa-onnx]: https://github.com/k2-fsa/sherpa-onnx
3 changes: 3 additions & 0 deletions go-api-examples/vad-spoken-language-identification/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module vad-spoken-language-identification

go 1.12
141 changes: 141 additions & 0 deletions go-api-examples/vad-spoken-language-identification/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
package main

import (
"fmt"
iso639 "github.com/barbashov/iso639-3"
"github.com/gordonklaus/portaudio"
sherpa "github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx"
"log"
)

// main runs a microphone demo: audio is captured with portaudio, split into
// speech segments by the silero VAD, and every finished segment is handed to
// decode, which identifies the spoken language with a Whisper model and saves
// the segment to a wave file. The capture loop runs until the process is
// killed or a read from the stream fails.
func main() {
	log.SetFlags(log.LstdFlags | log.Lmicroseconds)

	// 1. Create VAD
	config := sherpa.VadModelConfig{}

	// Please download silero_vad.onnx from
	// https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx

	config.SileroVad.Model = "./silero_vad.onnx"
	config.SileroVad.Threshold = 0.5
	config.SileroVad.MinSilenceDuration = 0.5
	config.SileroVad.MinSpeechDuration = 0.25
	config.SileroVad.WindowSize = 512
	config.SampleRate = 16000
	config.NumThreads = 1
	config.Provider = "cpu"
	config.Debug = 1

	var bufferSizeInSeconds float32 = 20

	vad := sherpa.NewVoiceActivityDetector(&config, bufferSizeInSeconds)
	defer sherpa.DeleteVoiceActivityDetector(vad)

	// 2. Create spoken language identifier

	c := sherpa.SpokenLanguageIdentificationConfig{}
	c.Whisper.Encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx"
	c.Whisper.Decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx"
	c.NumThreads = 2
	c.Debug = 1
	c.Provider = "cpu"

	slid := sherpa.NewSpokenLanguageIdentification(&c)
	defer sherpa.DeleteSpokenLanguageIdentification(slid)

	// 3. Open the default microphone
	err := portaudio.Initialize()
	if err != nil {
		log.Fatalf("Unable to initialize portaudio: %v\n", err)
	}
	defer portaudio.Terminate()

	defaultDevice, err := portaudio.DefaultInputDevice()
	if err != nil {
		// Fatalf, not Fatal: the message contains a format verb.
		log.Fatalf("Failed to get default input device: %v\n", err)
	}
	log.Printf("Selected default input device: %s\n", defaultDevice.Name)

	param := portaudio.StreamParameters{}
	param.Input.Device = defaultDevice
	param.Input.Channels = 1
	param.Input.Latency = defaultDevice.DefaultHighInputLatency

	param.SampleRate = float64(config.SampleRate)
	param.FramesPerBuffer = 0
	param.Flags = portaudio.ClipOff

	// you can choose another value for 0.1 if you want
	samplesPerCall := int32(param.SampleRate * 0.1) // 0.1 second
	samples := make([]float32, samplesPerCall)

	s, err := portaudio.OpenStream(param, samples)
	if err != nil {
		log.Fatalf("Failed to open the stream: %v\n", err)
	}
	defer s.Close()

	chk(s.Start())
	// The capture loop below never returns normally, so stopping the stream
	// is deferred; it then also runs when chk panics on a read error.
	defer func() {
		if stopErr := s.Stop(); stopErr != nil {
			log.Printf("Failed to stop the stream: %v\n", stopErr)
		}
	}()

	log.Print("Started! Please speak")
	printed := false

	k := 0
	for {
		// Each Read fills samples with the next samplesPerCall frames.
		chk(s.Read())
		vad.AcceptWaveform(samples)

		// Log only once per stretch of speech.
		if vad.IsSpeech() && !printed {
			printed = true
			log.Print("Detected speech\n")
		}

		if !vad.IsSpeech() {
			printed = false
		}

		// Drain every segment the VAD has finished.
		for !vad.IsEmpty() {
			speechSegment := vad.Front()
			vad.Pop()

			duration := float32(len(speechSegment.Samples)) / float32(config.SampleRate)

			audio := &sherpa.GeneratedAudio{}
			audio.Samples = speechSegment.Samples
			audio.SampleRate = config.SampleRate

			// Now decode it in the background so capturing is not blocked.
			go decode(slid, audio, k)

			k++

			log.Printf("Duration: %.2f seconds\n", duration)
		}
	}
}

// decode identifies the language spoken in audio and saves the segment to a
// wave file named after the segment id, its duration and the detected language.
//
// slid is the identifier used to create the stream and compute the result,
// audio holds the samples of one speech segment, and id is the running index
// of the segment, used only in the output filename.
func decode(slid *sherpa.SpokenLanguageIdentification, audio *sherpa.GeneratedAudio, id int) {
	stream := slid.CreateStream()
	defer sherpa.DeleteOfflineStream(stream)

	stream.AcceptWaveform(audio.SampleRate, audio.Samples)
	result := slid.Compute(stream)

	// FromPart1Code returns nil for codes it does not know; fall back to the
	// raw code instead of dereferencing a nil pointer.
	lang := result.Lang
	if info := iso639.FromPart1Code(result.Lang); info != nil {
		lang = info.Name
	}
	if lang == "" {
		lang = "unknown"
	}
	log.Printf("Detected language: %v", lang)

	duration := float32(len(audio.Samples)) / float32(audio.SampleRate)

	filename := fmt.Sprintf("seg-%d-%.2f-seconds-%s.wav", id, duration, lang)
	ok := audio.Save(filename)
	if ok {
		log.Printf("Saved to %s", filename)
	}
	log.Print("----------\n")
}

// chk panics with err when it is non-nil and does nothing otherwise.
func chk(err error) {
	if err == nil {
		return
	}
	panic(err)
}
16 changes: 16 additions & 0 deletions go-api-examples/vad-spoken-language-identification/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/env bash
#
# Downloads the models needed by this example (if they are not present yet),
# then builds and runs it.

# Stop at the first failing command so that a failed download does not lead
# to building and running against missing or broken model files.
set -e

if [ ! -f ./silero_vad.onnx ]; then
  # Fetch the model file itself. A github.com/.../blob/... URL returns the
  # HTML page that displays the file, not the file.
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
fi

if [ ! -f ./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
  tar xvf sherpa-onnx-whisper-tiny.tar.bz2
  rm sherpa-onnx-whisper-tiny.tar.bz2
fi

go mod tidy
go build
./vad-spoken-language-identification
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
vad-spoken-language-identification

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
module vad-spoken-language-identification

go 1.12

replace github.com/k2-fsa/sherpa-onnx-go/sherpa_onnx => ../
69 changes: 69 additions & 0 deletions scripts/go/sherpa_onnx.go
Original file line number Diff line number Diff line change
Expand Up @@ -783,3 +783,72 @@ func (vad *VoiceActivityDetector) Front() *SpeechSegment {
// Reset forwards to the C function SherpaOnnxVoiceActivityDetectorReset on
// the detector's underlying handle.
func (vad *VoiceActivityDetector) Reset() {
C.SherpaOnnxVoiceActivityDetectorReset(vad.impl)
}

// Spoken language identification

// SpokenLanguageIdentificationWhisperConfig holds the Whisper model settings
// that NewSpokenLanguageIdentification copies into the C configuration.
type SpokenLanguageIdentificationWhisperConfig struct {
// Encoder is copied to the C field whisper.encoder.
// Presumably the path of the Whisper encoder model file; confirm against the C API.
Encoder string
// Decoder is copied to the C field whisper.decoder.
// Presumably the path of the Whisper decoder model file; confirm against the C API.
Decoder string
// TailPaddings is copied to the C field whisper.tail_paddings.
TailPaddings int
}

// SpokenLanguageIdentificationConfig is the Go-side configuration accepted by
// NewSpokenLanguageIdentification, which copies every field into the
// corresponding C configuration struct.
type SpokenLanguageIdentificationConfig struct {
// Whisper holds the Whisper model settings.
Whisper SpokenLanguageIdentificationWhisperConfig
// NumThreads is copied to the C field num_threads.
NumThreads int
// Debug is copied to the C field debug as an integer.
Debug int
// Provider is copied to the C field provider.
Provider string
}

// SpokenLanguageIdentification wraps the handle returned by the C function
// SherpaOnnxCreateSpokenLanguageIdentification. Create it with
// NewSpokenLanguageIdentification and release it with
// DeleteSpokenLanguageIdentification.
type SpokenLanguageIdentification struct {
// impl is the underlying C object.
impl *C.struct_SherpaOnnxSpokenLanguageIdentification
}

// SpokenLanguageIdentificationResult is the value returned by
// (*SpokenLanguageIdentification).Compute.
type SpokenLanguageIdentificationResult struct {
// Lang is a Go copy of the C result's lang string.
// Presumably a language code such as "en"; confirm against the C API.
Lang string
}

// NewSpokenLanguageIdentification creates a spoken language identifier from
// config. The string fields are converted to temporary C strings that are
// freed when this function returns.
//
// The returned object wraps the handle produced by the C API; release it with
// DeleteSpokenLanguageIdentification.
func NewSpokenLanguageIdentification(config *SpokenLanguageIdentificationConfig) *SpokenLanguageIdentification {
	var cfg C.struct_SherpaOnnxSpokenLanguageIdentificationConfig

	// Temporary C copies of the Go strings, released on return.
	encoder := C.CString(config.Whisper.Encoder)
	defer C.free(unsafe.Pointer(encoder))

	decoder := C.CString(config.Whisper.Decoder)
	defer C.free(unsafe.Pointer(decoder))

	provider := C.CString(config.Provider)
	defer C.free(unsafe.Pointer(provider))

	cfg.whisper.encoder = encoder
	cfg.whisper.decoder = decoder
	cfg.whisper.tail_paddings = C.int(config.Whisper.TailPaddings)
	cfg.num_threads = C.int(config.NumThreads)
	cfg.debug = C.int(config.Debug)
	cfg.provider = provider

	return &SpokenLanguageIdentification{
		impl: C.SherpaOnnxCreateSpokenLanguageIdentification(&cfg),
	}
}

// DeleteSpokenLanguageIdentification passes the underlying handle of slid to
// the C function SherpaOnnxDestroySpokenLanguageIdentification and then
// clears the handle stored in slid.
func DeleteSpokenLanguageIdentification(slid *SpokenLanguageIdentification) {
C.SherpaOnnxDestroySpokenLanguageIdentification(slid.impl)
// Drop the stale handle so it is not kept around after destruction.
slid.impl = nil
}

// CreateStream returns a new offline stream obtained from the identifier's
// underlying C object.
//
// The caller must pass the returned stream to DeleteOfflineStream() once it
// is no longer needed; otherwise its memory is leaked.
func (slid *SpokenLanguageIdentification) CreateStream() *OfflineStream {
	return &OfflineStream{
		impl: C.SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(slid.impl),
	}
}

// Compute runs language identification on stream and returns the result.
//
// stream must have been created with CreateStream and fed with audio before
// this call. The returned value is a Go-side copy and needs no manual
// release.
func (slid *SpokenLanguageIdentification) Compute(stream *OfflineStream) *SpokenLanguageIdentificationResult {
	r := C.SherpaOnnxSpokenLanguageIdentificationCompute(slid.impl, stream.impl)
	// Release the C result on return so it is not leaked on every call.
	// C.GoString below copies the string into Go memory before this runs.
	defer C.SherpaOnnxDestroySpokenLanguageIdentificationResult(r)

	ans := &SpokenLanguageIdentificationResult{}
	ans.Lang = C.GoString(r.lang)

	return ans
}
2 changes: 1 addition & 1 deletion sherpa-onnx/csrc/spoken-language-identification.cc
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ std::string SpokenLanguageIdentificationConfig::ToString() const {
std::ostringstream os;

os << "SpokenLanguageIdentificationConfig(";
os << "whisper=\"" << whisper.ToString() << "\", ";
os << "whisper=" << whisper.ToString() << ", ";
os << "num_threads=" << num_threads << ", ";
os << "debug=" << (debug ? "True" : "False") << ", ";
os << "provider=\"" << provider << "\")";
Expand Down
Loading