
Merge branch 'pr/53' into dev
t41372 committed Dec 14, 2024
2 parents 38a1318 + 6e964ad commit 9518cf9
Showing 2 changed files with 80 additions and 14 deletions.
16 changes: 16 additions & 0 deletions README.md
@@ -445,6 +445,22 @@ Install the respective package and turn it on using the `TTS_MODEL` option in `c
If you're using macOS, you need to grant microphone permission to your terminal emulator (this program runs inside your terminal, so the terminal needs access to the microphone). Otherwise, speech recognition will not be able to hear you because it lacks permission to use your microphone.


## VAD Tuning

For the web interface, this project uses client-side Voice Activity Detection (VAD) via the [ricky0123/vad-web](https://github.com/ricky0123/vad) library for efficient speech detection.

**Web Interface Controls:**

The following settings are available in the web interface to fine-tune the VAD:

* **Speech Prob. Threshold:** Controls the minimum speech probability for initial speech detection. Higher values require stronger speech input to trigger detection.
* **Negative Speech Threshold:** The probability threshold below which a frame is considered not to contain speech (i.e., part of a silent stretch).
* **Redemption Frames:** Specifies how many consecutive frames of silence are required to end a speech segment. Higher values allow for more pause tolerance.

**Tuning Tips:**

Experiment with these parameters to find the best balance between sensitivity and accuracy for your environment and speaking style. The sketch below shows how these settings map onto the library's options.
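For reference, here is a minimal sketch (modeled on how `static/index.html` wires these settings) of how the web interface values map onto `vad-web` options; the percent inputs are divided by 100 to become probabilities:

```javascript
// Minimal sketch: mapping the web UI settings onto vad-web options.
// Percent inputs (0-100) become probabilities (0-1); the values shown
// are the web interface defaults.
const myvad = await vad.MicVAD.new({
  positiveSpeechThreshold: 97 / 100, // Speech Prob. Threshold
  negativeSpeechThreshold: 15 / 100, // Negative Speech Threshold
  redemptionFrames: 20,              // silent frames before a segment ends
  onSpeechStart: () => {
    console.log("Speech start detected");
  },
  onSpeechEnd: (audio) => {
    console.log("Speech segment captured:", audio.length, "samples");
  },
});
await myvad.start();
```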

## Some other things

### Translation
78 changes: 64 additions & 14 deletions static/index.html
@@ -40,6 +40,15 @@
<div class="sensitivity-container">
<span class="sensitivity-label">Speech Prob. Threshold:</span>
<input type="number" id="speechProbThreshold" min="1" max="100" value="97" title="Speech Detection Confidence Level Threshold (%)">
</div>
<div class="sensitivity-container">
<span class="sensitivity-label">Negative Speech Threshold:</span>
<input type="number" id="negativeSpeechThreshold" min="0" max="100" value="15"
title="Speech detection absence threshold (%)">
</div>
<div class="sensitivity-container">
<span class="sensitivity-label">Redemption Frames:</span>
<input type="number" id="redemptionFrames" min="1" max="100" value="20" title="Number of silent frames to wait before ending speech">
</div>
<select id="configDropdown" aria-label="Configuration Selection">
<option value="">Select Configuration</option>
@@ -99,13 +108,25 @@
let myvad;
let previousTriggeredProbability = 0; // the probability that triggered the last speech start
let speechProbThreshold = document.getElementById('speechProbThreshold');
let negativeSpeechThreshold = document.getElementById('negativeSpeechThreshold');
let redemptionFrames = document.getElementById('redemptionFrames');

window.addEventListener('load', function() {
const savedThreshold = localStorage.getItem('speechProbThreshold');
if (savedThreshold) {
speechProbThreshold.value = savedThreshold;
}

const savedNegativeThreshold = localStorage.getItem('negativeSpeechThreshold');
if (savedNegativeThreshold) {
negativeSpeechThreshold.value = savedNegativeThreshold;
}

const savedRedemptionFrames = localStorage.getItem('redemptionFrames');
if (savedRedemptionFrames) {
redemptionFrames.value = savedRedemptionFrames;
}

const savedBackground = localStorage.getItem('selectedBackground');
if (savedBackground) {
setTimeout(() => {
@@ -119,6 +140,8 @@
myvad = await vad.MicVAD.new({
preSpeechPadFrames: 20,
positiveSpeechThreshold: speechProbThreshold.value / 100,
negativeSpeechThreshold: negativeSpeechThreshold.value / 100,
redemptionFrames: parseInt(redemptionFrames.value),
onSpeechStart: () => {
console.log("Speech start detected: " + previousTriggeredProbability);
if (state === "thinking-speaking") {
@@ -168,6 +191,33 @@
}
}
});

negativeSpeechThreshold.addEventListener('change', async function() {
localStorage.setItem('negativeSpeechThreshold', this.value);
if (myvad) {
await myvad.pause();
await init_vad();
if (micToggleState) {
await myvad.start();
} else {
await myvad.pause();
}
}
});

redemptionFrames.addEventListener('change', async function() {
localStorage.setItem('redemptionFrames', this.value);
if (myvad) {
await myvad.pause();
await init_vad();
if (micToggleState) {
await myvad.start();
} else {
await myvad.pause();
}
}
});
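
// Note: both 'change' handlers above share the same persist-and-rebuild
// pattern. A possible refactor (a sketch; 'restartVadSetting' is a
// hypothetical helper, not part of this commit):
async function restartVadSetting(storageKey, value) {
localStorage.setItem(storageKey, value);
if (!myvad) return;
await myvad.pause();
await init_vad(); // rebuilds myvad using the current input values
if (micToggleState) {
await myvad.start(); // resume only if the mic was on
}
}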


const chunkSize = 4096;
async function sendAudioPartition(audio) {
@@ -296,7 +346,7 @@
}
micStateBeforeConfigSwitch = null; // reset the state
break;
case "background-files":
case "background-files":
populateBgDropdown(message.files);
break;
default:
@@ -308,8 +358,8 @@
function fetchConfigurations() {
ws.send(JSON.stringify({ type: "fetch-configs" }));
}

function fetchBackgrounds() {
ws.send(JSON.stringify({ type: "fetch-backgrounds" }));
}

@@ -323,7 +373,7 @@
});
}

function populateBgDropdown(files) {
bgDropdown.innerHTML = '<option value="">Select Background</option>';
files.forEach(file => {
const option = document.createElement('option');
@@ -332,24 +382,24 @@
bgDropdown.appendChild(option);
});
}

configDropdown.addEventListener('change', function () {
const selectedConfig = configDropdown.value;
if (selectedConfig) {
setState("switching-config");
document.getElementById("message").textContent = "Switching configuration...";
// avoid the mic being on when switching config
micStateBeforeConfigSwitch = micToggleState;
if (micToggleState) {
stop_mic();
}

interrupt();
ws.send(JSON.stringify({ type: "switch-config", file: selectedConfig }));
}
});

bgDropdown.addEventListener('change', function () {
const selectedBg = bgDropdown.value;
if (selectedBg) {
document.body.style.backgroundImage = `url('./bg/${selectedBg}')`;
@@ -397,14 +447,14 @@

audioTaskQueue = new TaskQueue(20); // 100ms delay between tasks
async function addAudioTask(audio_base64, volumes, slice_length, text = null, expression_list = null) {
console.log(`1. Adding audio task ${text} to queue`);
// skip the task if interrupted
if (state === "interrupted") {
console.log("Skipping audio task due to interrupted state");
return;
}

audioTaskQueue.addTask(() => {
return new Promise((resolve, reject) => {
playAudioLipSync(audio_base64, volumes, slice_length, text, expression_list, resolve); // pass resolve as the onComplete callback
@@ -415,7 +465,7 @@
}

function playAudioLipSync(audio_base64, volumes, slice_length, text = null, expression_list = null, onComplete) {
if (state === "interrupted") {
console.error("Audio playback blocked. State:", state);
onComplete();
return;
@@ -429,7 +479,7 @@

const displayExpression = expression_list ? expression_list[0] : null;
console.log("Start playing audio: ", text);

try {
model2.speak("data:audio/wav;base64," + audio_base64, {
expression: displayExpression,
