
Merge branch 'pr/53' into dev
t41372 committed Dec 14, 2024
2 parents 38a1318 + 6e964ad commit 9518cf9
Showing 2 changed files with 80 additions and 14 deletions.
16 changes: 16 additions & 0 deletions README.md
@@ -445,6 +445,22 @@ Install the respective package and turn it on using the `TTS_MODEL` option in `c
If you're using macOS, you need to grant microphone permission to your terminal emulator (this program runs inside your terminal, so the terminal needs access to the microphone). Otherwise, speech recognition will not be able to hear you because it lacks permission to use your microphone.


## VAD Tuning

For the web interface, this project uses client-side Voice Activity Detection (VAD) via the [ricky0123/vad-web](https://github.com/ricky0123/vad) library for efficient speech detection.

**Web Interface Controls:**

The following settings are available in the web interface to fine-tune the VAD:

* **Speech Prob. Threshold:** Controls the minimum speech probability for initial speech detection. Higher values require stronger speech input to trigger detection.
* **Negative Speech Threshold:** The probability threshold below which a frame is considered not to contain speech (i.e., part of a silent stretch).
* **Redemption Frames:** Specifies how many consecutive frames of silence are required to end a speech segment. Higher values allow for more pause tolerance.

**Tuning Tips:**

Experiment with these parameters to find the best balance between sensitivity and accuracy for your environment and speaking style. The sketch below shows how these settings map onto the library's options.
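For reference, here is a minimal sketch (modeled on how `static/index.html` wires these settings) of how the web interface values map onto `vad-web` options; the percent inputs are divided by 100 to become probabilities:

```javascript
// Minimal sketch: mapping the web UI settings onto vad-web options.
// Percent inputs (0-100) become probabilities (0-1); the values shown
// are the web interface defaults.
const myvad = await vad.MicVAD.new({
  positiveSpeechThreshold: 97 / 100, // Speech Prob. Threshold
  negativeSpeechThreshold: 15 / 100, // Negative Speech Threshold
  redemptionFrames: 20,              // silent frames before a segment ends
  onSpeechStart: () => {
    console.log("Speech start detected");
  },
  onSpeechEnd: (audio) => {
    console.log("Speech segment captured:", audio.length, "samples");
  },
});
await myvad.start();
```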

## Some other things

### Translation
78 changes: 64 additions & 14 deletions static/index.html
@@ -40,6 +40,15 @@
<div class="sensitivity-container">
<span class="sensitivity-label">Speech Prob. Threshold:</span>
<input type="number" id="speechProbThreshold" min="1" max="100" value="97" title="Speech Detection Confidence Level Threshold (%)">
</div>
<div class="sensitivity-container">
<span class="sensitivity-label">Negative Speech Threshold:</span>
<input type="number" id="negativeSpeechThreshold" min="0" max="100" value="15"
title="Speech detection absence threshold (%)">
</div>
<div class="sensitivity-container">
<span class="sensitivity-label">Redemption Frames:</span>
<input type="number" id="redemptionFrames" min="1" max="100" value="20" title="Number of silent frames to wait before ending speech">
</div>
<select id="configDropdown" aria-label="Configuration Selection">
<option value="">Select Configuration</option>
@@ -99,13 +108,25 @@
let myvad;
let previousTriggeredProbability = 0; // the probability that triggered the last speech start
let speechProbThreshold = document.getElementById('speechProbThreshold');
let negativeSpeechThreshold = document.getElementById('negativeSpeechThreshold');
let redemptionFrames = document.getElementById('redemptionFrames');

window.addEventListener('load', function() {
const savedThreshold = localStorage.getItem('speechProbThreshold');
if (savedThreshold) {
speechProbThreshold.value = savedThreshold;
}

const savedNegativeThreshold = localStorage.getItem('negativeSpeechThreshold');
if (savedNegativeThreshold) {
negativeSpeechThreshold.value = savedNegativeThreshold;
}

const savedRedemptionFrames = localStorage.getItem('redemptionFrames');
if (savedRedemptionFrames) {
redemptionFrames.value = savedRedemptionFrames;
}

const savedBackground = localStorage.getItem('selectedBackground');
if (savedBackground) {
setTimeout(() => {
@@ -119,6 +140,8 @@
myvad = await vad.MicVAD.new({
preSpeechPadFrames: 20,
positiveSpeechThreshold: speechProbThreshold.value / 100,
negativeSpeechThreshold: negativeSpeechThreshold.value / 100,
redemptionFrames: parseInt(redemptionFrames.value),
onSpeechStart: () => {
console.log("Speech start detected: " + previousTriggeredProbability);
if (state === "thinking-speaking") {
@@ -168,6 +191,33 @@
}
}
});

negativeSpeechThreshold.addEventListener('change', async function() {
localStorage.setItem('negativeSpeechThreshold', this.value);
if (myvad) {
await myvad.pause();
await init_vad();
if (micToggleState) {
await myvad.start();
} else {
await myvad.pause();
}
}
});

redemptionFrames.addEventListener('change', async function() {
localStorage.setItem('redemptionFrames', this.value);
if (myvad) {
await myvad.pause();
await init_vad();
if (micToggleState) {
await myvad.start();
} else {
await myvad.pause();
}
}
});
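
// Note: both 'change' handlers above share the same persist-and-rebuild
// pattern. A possible refactor (a sketch; 'restartVadSetting' is a
// hypothetical helper, not part of this commit):
async function restartVadSetting(storageKey, value) {
localStorage.setItem(storageKey, value);
if (!myvad) return;
await myvad.pause();
await init_vad(); // rebuilds myvad using the current input values
if (micToggleState) {
await myvad.start(); // resume only if the mic was on
}
}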


const chunkSize = 4096;
async function sendAudioPartition(audio) {
@@ -296,7 +346,7 @@
}
micStateBeforeConfigSwitch = null; // reset the state
break;
case "background-files":
case "background-files":
populateBgDropdown(message.files);
break;
default:
@@ -308,8 +358,8 @@
function fetchConfigurations() {
ws.send(JSON.stringify({ type: "fetch-configs" }));
}

function fetchBackgrounds() {
ws.send(JSON.stringify({ type: "fetch-backgrounds" }));
}

@@ -323,7 +373,7 @@
});
}

function populateBgDropdown(files) {
bgDropdown.innerHTML = '<option value="">Select Background</option>';
files.forEach(file => {
const option = document.createElement('option');
@@ -332,24 +382,24 @@
bgDropdown.appendChild(option);
});
}

configDropdown.addEventListener('change', function () {
const selectedConfig = configDropdown.value;
if (selectedConfig) {
setState("switching-config");
document.getElementById("message").textContent = "Switching configuration...";
// avoid the mic being on when switching config
micStateBeforeConfigSwitch = micToggleState;
if (micToggleState) {
stop_mic();
}

interrupt();
ws.send(JSON.stringify({ type: "switch-config", file: selectedConfig }));
}
});

bgDropdown.addEventListener('change', function () {
const selectedBg = bgDropdown.value;
if (selectedBg) {
document.body.style.backgroundImage = `url('./bg/${selectedBg}')`;
@@ -397,14 +447,14 @@

audioTaskQueue = new TaskQueue(20); // 100ms delay between tasks
async function addAudioTask(audio_base64, volumes, slice_length, text = null, expression_list = null) {
console.log(`1. Adding audio task ${text} to queue`);
// skip the task if interrupted
if (state === "interrupted") {
console.log("Skipping audio task due to interrupted state");
return;
}

audioTaskQueue.addTask(() => {
return new Promise((resolve, reject) => {
playAudioLipSync(audio_base64, volumes, slice_length, text, expression_list, resolve); // pass resolve as the onComplete callback
@@ -415,7 +465,7 @@
}

function playAudioLipSync(audio_base64, volumes, slice_length, text = null, expression_list = null, onComplete) {
if (state === "interrupted") {
console.error("Audio playback blocked. State:", state);
onComplete();
return;
@@ -429,7 +479,7 @@

const displayExpression = expression_list ? expression_list[0] : null;
console.log("Start playing audio: ", text);

try {
model2.speak("data:audio/wav;base64," + audio_base64, {
expression: displayExpression,
