From 6dfda836ab5bf15efa7ed732ef8b3ab0fbd7d7d8 Mon Sep 17 00:00:00 2001 From: Akshat2512 Date: Sun, 1 Dec 2024 03:27:24 +0530 Subject: [PATCH] deploy --- app.py | 110 +++++++++-------- backend/speech_proccessing.py | 93 +++++++------- static/mystyle.css | 164 +++++++++++++++++++++---- static/script.js | 223 ++++++++++++++++++++++++---------- templates/index.html | 25 ++-- 5 files changed, 423 insertions(+), 192 deletions(-) diff --git a/app.py b/app.py index cfa94f1..ea65709 100644 --- a/app.py +++ b/app.py @@ -56,8 +56,7 @@ async def chat(websocket: WebSocket, user_id: str): try: result = await handle_audio_new(websocket, audio_queue) - - if not result: + if result==False: print('Stopping background process') process_task.cancel() break @@ -65,10 +64,11 @@ async def chat(websocket: WebSocket, user_id: str): # await audio_queue.put(result) # print(audio_queue.qsize()) - if not response_queue.empty(): + if not response_queue.empty() or result: await asyncio.sleep(0.1) - await generate_ai_response(response_queue, websocket, user_id, chat_history) # for generating ai responses and send it back to the client - + await generate_ai_response(response_queue, websocket, user_id, chat_history, prompt = result) # for generating ai responses and send it back to the client + + if not websocket.application_state.CONNECTED: break @@ -78,7 +78,7 @@ async def chat(websocket: WebSocket, user_id: str): except Exception as e: # print(f"Connection error: {e}") - await websocket.send_text('connection_closed') + await websocket.send_json({"status":"connection closed"}) await websocket.close() @@ -86,19 +86,23 @@ async def chat(websocket: WebSocket, user_id: str): async def handle_audio_new(websocket: WebSocket, audio_queue): try: - audio_data = await websocket.receive_bytes() # receives the audio stream from clients + message = await websocket.receive() # receives the audio stream from clients + if "bytes" in message: + with wave.open(io.BytesIO(message["bytes"]), 'rb') as wav_file: + # print(wav_file.getframerate(), wav_file.getsampwidth(), wav_file.getnchannels(), wav_file.getnframes()) + + while True: + audio_data = wav_file.readframes(1024) + + if not audio_data: + break + await audio_queue.put(audio_data) + return None - with wave.open(io.BytesIO(audio_data), 'rb') as wav_file: - # print(wav_file.getframerate(), wav_file.getsampwidth(), wav_file.getnchannels(), wav_file.getnframes()) - - while True: - audio_data = wav_file.readframes(1024) - - if not audio_data: - break - await audio_queue.put(audio_data) + elif "text" in message: + prompt = message['text'] + return prompt - return True except Exception as e: print(e) print("Websocket gets Disconnected") @@ -107,8 +111,9 @@ async def handle_audio_new(websocket: WebSocket, audio_queue): -async def generate_ai_response(response_queue, websocket, user_id, chat_history): - +async def generate_ai_response(response_queue, websocket, user_id, chat_history, prompt): + + if prompt == None: audio_path = "backend/temp/" if not os.path.exists(audio_path): os.makedirs(audio_path) @@ -118,43 +123,42 @@ async def generate_ai_response(response_queue, websocket, user_id, chat_history) if result == 'file saved': prompt = transcribe_audio(audio_path+f'recording_{user_id}.wav', os.getenv('OPENAI_API_KEY')) # generate texts from audio using whisper-1 model - - if len(prompt) >= 2: - - logger.info('Transcribing: %s', prompt) - - message = {"responseType" : "user", "text" : prompt[:-1]} + + if len(prompt) >= 2: + + logger.info('User: %s', prompt) + message = {"responseType" : 
"user", "text" : prompt} + # message = json.dumps(message) + await websocket.send_json(message) + + + response = generate_response(prompt, os.getenv('OPENAI_API_KEY'), chat_history) # generate natural language using gpt-4o model + await asyncio.sleep(0.1) + + if "CALL DALL-E" == response: + message = {"responseType" : "assistant", "text": response} # message = json.dumps(message) await websocket.send_json(message) - - - response = generate_response(prompt, os.getenv('OPENAI_API_KEY'), chat_history) # generate natural language using gpt-4o model await asyncio.sleep(0.1) - - if "CALL DALL-E" == response: - message = {"responseType" : "assistant", "text": response} - # message = json.dumps(message) - await websocket.send_json(message) - await asyncio.sleep(0.1) - - print('Generating Image ...') - - image = generate_image_response(prompt, os.getenv('OPENAI_API_KEY')) # generate image from text using DALL-E-3 model - - try: - message = {"responseType" : "assistant", "revised_prompt":image.revised_prompt, "image_url": image.url} - except Exception as e: - await websocket.send_json('{"status": "error"}') - - # message = json.dumps(message) - await websocket.send_json(message) - # print(message) - else: - message = {"responseType" : "assistant", "text" : response} - # message = json.dumps(message) - await websocket.send_json(message) - - logger.info('GPT-4o AI: %s', response) + + print('Generating Image ...') + + image = generate_image_response(prompt, os.getenv('OPENAI_API_KEY')) # generate image from text using DALL-E-3 model + + try: + message = {"responseType" : "assistant", "revised_prompt":image.revised_prompt, "image_url": image.url} + except Exception as e: + await websocket.send_json({"status": "error"}) + return False + # message = json.dumps(message) + await websocket.send_json(message) + # print(message) + else: + message = {"responseType" : "assistant", "text" : response} + # message = json.dumps(message) + await websocket.send_json(message) + + logger.info('GPT-4o AI: %s', response) diff --git a/backend/speech_proccessing.py b/backend/speech_proccessing.py index ceb6086..f8422ad 100644 --- a/backend/speech_proccessing.py +++ b/backend/speech_proccessing.py @@ -4,6 +4,7 @@ # import tensorflow.lite as tflite import zipfile import time +import asyncio import tflite_runtime.interpreter as tflite import logging @@ -42,55 +43,63 @@ async def process_audio_stream(audio_queue, response_queue): speak = 0 silence = 0 - + speech = 0 while True: try: - audio_data = await audio_queue.get() + audio_data = await audio_queue.get() + + audio_chunk = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0 + + + audio_thres = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) # Calculate the root mean square (RMS) as the threshold value + threshold = np.sqrt(np.mean(np.square(audio_thres))) + if(threshold >= 5000): + speech = 1 + + audio_buffer = np.roll(audio_buffer, -len(audio_chunk)) + audio_buffer[-len(audio_chunk):] = audio_chunk + + # Set the tensor data + interpreter.set_tensor(waveform_input_index, audio_buffer) + + # Run the model + interpreter.invoke() + scores = interpreter.get_tensor(scores_output_index) - audio_chunk = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0 + # Get the top classification result + top_class_index = scores.argmax() + prediction = labels[top_class_index] + # await response_queue.put(prediction) + # print(response_queue.qsize()) + # print(prediction, len(audio_data) ) - audio_buffer = np.roll(audio_buffer, 
-len(audio_chunk)) - audio_buffer[-len(audio_chunk):] = audio_chunk - - # Set the tensor data - interpreter.set_tensor(waveform_input_index, audio_buffer) - - # Run the model - interpreter.invoke() - scores = interpreter.get_tensor(scores_output_index) + logger.info("%s, %d, %d", prediction, len(audio_data), threshold) + + if( prediction == 'Speech' and speech == 1): + audio_chunks.append(audio_data) + # await response_queue.put(audio_data) + speak = speak+1 + # silence = 0 + # i=5 + + elif(speak >= 20 and prediction !='Speech'): + audio_data = b''.join(audio_chunks) + await response_queue.put(audio_data) + audio_chunks = [] + silence = 0 + speak = 0 + speech = 0 + + elif(prediction !='Speech'): + silence = silence+1 + + if(silence == 5): + audio_chunks = [] + silence = 0 + speak = 0 - # Get the top classification result - top_class_index = scores.argmax() - prediction = labels[top_class_index] - # await response_queue.put(prediction) - # print(response_queue.qsize()) - # print(prediction, len(audio_data) ) - - logger.info("%s, %d", prediction, len(audio_data)) - - if( prediction == 'Speech'): - audio_chunks.append(audio_data) - # await response_queue.put(audio_data) - speak = speak+1 - # silence = 0 - # i=5 - elif(speak < 10 and prediction !='Speech'): - silence = silence+1 - - elif(speak >= 10 and prediction !='Speech'): - audio_data = b''.join(audio_chunks) - await response_queue.put(audio_data) - audio_chunks = [] - silence = 0 - speak = 0 - - if(silence == 5): - audio_chunks = [] - silence = 0 - speak = 0 - except Exception as e: # print(audio_data) print(e) diff --git a/static/mystyle.css b/static/mystyle.css index 24a91e4..7df9df3 100644 --- a/static/mystyle.css +++ b/static/mystyle.css @@ -1,12 +1,15 @@ -body { +body, html { font-family: Arial, sans-serif; display: flex; flex-direction: column; - justify-content: center; align-items: center; + justify-content:center; margin:0px; + /* padding: 20px 0px; */ + gap: 20px; overflow:hidden; height: 100vh; + width:100vw; background-color: #f0f0f0; transition: ease-in-out 0.7s; } @@ -22,7 +25,6 @@ body { border-radius: 10px; box-shadow: 0 0 10px rgba(0,0,0,0.1); text-align: center; - vertical-align: middle; transform: translateY(0%); transition: ease-in-out 0.7s; opacity: 0; @@ -32,14 +34,15 @@ body { overflow-x: hidden; */ } #title-container{ - z-index: 1; + z-index: 1; display: flex; align-items: center; justify-content: center; - /* border:1px solid black; */ + flex-direction: column; /* transform: translateY(50%); */ transition: ease-in-out 0.7s; + width: 100%; } #title-container > div{ display: flex; @@ -51,47 +54,148 @@ body { /* transform: translateY(0%); */ transition: ease-in-out 0.7s; } -#container-2 > h2{ +#uname{ + width: 100%; + box-shadow: 0 0 10px rgba(0,0,0,0.1); + border: 1px solid grey; + max-width:200px; + text-align: center; +} + +#send-container{ + /* display: none; */ + position: relative; opacity: 0; + display: flex; + justify-content: flex-start; + /* border: 1px solid black; */ + flex-direction: row; + pointer-events: none; + user-select:none; transition: ease-in-out 0.7s; + width: 90vw; } -#container-2{ + + +#writer{ + opacity: 1; + display: flex; + align-items: center; + gap:40px; + /* border: 1px solid black; */ + padding: 20px; + z-index:2; + transition: ease-in-out 0.7s; + /* height: inherit; */ + max-width: 80%; + width: 70%; +} + +#writer> :nth-child(1) { + resize: none; + overflow: auto; + width: 100%; + height: 40px; +} + +#writer>div:nth-child(2){ + display: flex; + align-items: center; + height:10px; + 
padding: 8px 10px 8px 8px; + background: #0288d1; + border-radius: 10px; + transform: scale(1.5); + box-shadow: 0px 0px 10px 0px rgb(0 0 0 / 48%); + + +} + +#writer>div:nth-child(2) i{ + transform: rotateZ(40deg); +} + +#listener > h2{ + /* display:none; */ + opacity: 0; + z-index: 1; + text-wrap: nowrap; + transition: ease-in-out 0.7s; + background-color: #f0f0f0; +} + +#listener{ + + position: absolute; + right:10px; + top:10px; display: flex; flex-direction: row; + justify-content:center ; + align-items: center; gap:80px; - opacity: 0; + opacity: 1; + width: 0%; + /* border: 1px solid black; */ + transition: ease-in-out 0.7s; + +} + +#stream:hover{ + background: #026aa3; + transition: ease-in-out 0.2s; +} + +#fa-keyboard{ + z-index: 0; + display: flex; + margin-left:auto; + width:0px; + justify-content:center; + align-items: center; + overflow: hidden; /* border: 1px solid black; */ transition: ease-in-out 0.7s; + background-color: #f0f0f0; + + +} +#fa-keyboard:hover{ + background: grey; + transition: ease-in-out 0.2s; } +#fa-keyboard i{ + transform: scale(1.8); +} #stream{ display: flex; align-items: center; justify-content: center; - color:rgb(58, 86, 110); + color:rgb(35 35 35); height:18px; padding: 5px; background: #0288d1; - transform: scale(2) translate(0,9px); + transform: scale(1.8); border-radius: 20px; font-size: 18px; + box-shadow: 0px 0px 10px 0px rgb(0 0 0 / 48%); } #status{ - position: absolute; - top: 10px; - right: 10px; - display: flex; - align-items: center; - justify-content: center; - color:rgb(58, 86, 110); + margin:20px 7% 0px ; + align-self: flex-end; + opacity: 0; + color:rgb(28 56 57); height:18px; - padding: 5px; + padding: 10px; background: #7c7c7c; border-radius: 20px; font-size: 18px; + transition: ease-in-out 0.7s; } + button { padding: 10px 20px; margin-top: 10px; @@ -105,8 +209,8 @@ button { } .response{ - width: inherit; - height: inherit; + width: 100%; + height: 100%; position: absolute; overflow-x: hidden; overflow-y: auto; @@ -114,13 +218,15 @@ button { transition: ease-in-out 0.7s; /* border: 1px solid black; */ } + .user{ - /* flex-grow: 1; */ - - /* border: 1px solid black; */ + display: flex; /* width: 100%; */ justify-content: start; + word-break: break-word; + /* max-width: 100%; */ + overflow-wrap: anywhere; } .user>div{ padding:10px; @@ -142,6 +248,8 @@ button { margin: 20px; /* text-align: right; */ flex-direction: column; + word-break: break-word; + overflow-wrap: anywhere; } .assistant>div{ @@ -153,6 +261,9 @@ button { margin: 20px 20px; width: 70%; } +.assistant>div:nth-child(2){ + min-height:300px; +} .image_process{ display: flex; @@ -163,11 +274,10 @@ button { background-color: gray; /* box-shadow: 0 0 10px rgba(0, 0, 0, 0.173); */ box-shadow: inset 0 0 10px rgba(0,0,0,0.1); - height: 500px; - width: 700px; + } img{ - height: inherit; - max-width: 100%; + border-radius:17px; + width: 100%; } \ No newline at end of file diff --git a/static/script.js b/static/script.js index aa02fff..989c980 100644 --- a/static/script.js +++ b/static/script.js @@ -1,14 +1,21 @@ -const startMic = document.getElementById('start-mic'); +const start = document.getElementById('start-mic'); const responseContainer = document.getElementById('message-container'); const titleContainer = document.getElementById('title-container'); -const container2 = document.getElementById('container-2'); +const sendContainer = document.getElementById('send-container'); +const listener = document.getElementById('listener'); +const writer = document.getElementById('writer'); 
+const fa_keyboard = document.getElementById('fa-keyboard'); const strm = document.getElementById('stream'); const response = document.querySelector('.response'); - +const ws_status = document.querySelector('#status'); const uname = document.querySelector('#uname'); +const send = document.querySelector('#send'); - -setTimeout(() => responseContainer.style.opacity = '1', 1000); +setTimeout(() => { + username = localStorage.getItem('uname'); + uname.value = username; + responseContainer.style.opacity = '1' +}, 1000); let recorder; @@ -20,16 +27,15 @@ let interval; let i = 0 async function start_recording() { - - startMic.disabled = true; - container2.querySelectorAll("h2")[1].innerText = "Listening ..."; - document.getElementById('status').disabled = false; + + start.disabled = true; + strm.innerHTML = ``; // Get access to the microphone try { audioStream = await navigator.mediaDevices.getUserMedia({ audio: true}); console.log('Microphone access is granted') - + strm.style.opacity = '1'; // Initialize the recorder recorder = new RecordRTC(audioStream, { @@ -44,8 +50,7 @@ async function start_recording() { const reader = new FileReader(); reader.onloadend = async function() { const audio_bytes = reader.result; - audioQueue.unshift(audio_bytes) - + audioQueue.unshift(audio_bytes); }; reader.readAsArrayBuffer(Blob); @@ -61,8 +66,9 @@ async function start_recording() { if (socket.readyState === WebSocket.OPEN) { socket.send(audioQueue.pop()); } - else { - container2.querySelectorAll("h2")[1].innerText = "Disconnected ..."; + else { + ws_status.innerText = "Disconnected"; + ws_status.style.backgroundColor = "grey"; } } else @@ -70,30 +76,57 @@ async function start_recording() { console.log('audioQueue is empty!') } - strm.onclick = stopRecording; - + strm.onclick = ()=>{ + flag = 1; + fa_keyboard.style.zIndex='1'; + stop_recording(); + console.log(flag) + } },100) + + setTimeout(async ()=>{ + + listener.style.gap = '40px'; + listener.querySelectorAll("h2").forEach(e=>{ + e.style.opacity = '1'; + }) + + }) + } catch (err){ console.log('Microphone access is denied') } } - -function stopRecording(){ - +let flag = 0 +function stop_recording(){ + try{ + recorder.stopRecording(); - - container2.querySelectorAll("h2")[1].innerText = " ... "; - strm.innerHTML = ''; + + strm.innerHTML = ''; audioStream.getTracks().forEach(track => track.stop()); clearInterval(interval); + + setTimeout(async ()=>{ + listener.style.gap = '80px'; + listener.querySelectorAll("h2").forEach(e=>{ + e.style.opacity = '0'; + }) + + }) + } + catch(e){ + + } strm.onclick = ()=>{ - container2.querySelectorAll("h2")[1].innerText = "Listening ..."; - strm.innerHTML = ``; - start_recording() + flag = 0; + fa_keyboard.style.zIndex='0'; + start_recording(); + console.log(flag) } @@ -112,6 +145,8 @@ async function connect_ws(user_id){ // Connection closed event socket.onclose = function(event) { console.log('WebSocket is closed.'); + ws_status.innerText = "Disconnected"; + ws_status.style.backgroundColor = "grey"; }; // Error event @@ -123,7 +158,8 @@ async function connect_ws(user_id){ socket.onmessage = function(event) { console.log('Message from server:', event.data); - receiveResponses(event.data) + if (flag == 0) + receiveResponses(event.data); // const messagesDiv = document.getElementById('messages'); // messagesDiv.innerHTML += `

${event.data}

`; }; @@ -133,22 +169,59 @@ async function connect_ws(user_id){ } async function start_connection(){ + try { - socket = await connect_ws('Akshat'); + socket = await connect_ws(uname.value); console.log('WebSocket connected successfully.'); // Example of sending a message through WebSocket - start_recording(); - + ws_status.innerText = "Connected"; + ws_status.style.backgroundColor = "green"; + ws_status.style.opacity = '1'; + sendContainer.style.opacity= '1'; + sendContainer.style.pointerEvents= 'visible'; + onStartup(); + // stop_recording(); } - catch (error) { - console.error('Error during WebSocket connection:', error); + console.error('Error during WebSocket connection:', error); + ws_status.innerText = "Disconnected"; + ws_status.style.backgroundColor = "grey"; + ws_status.style.opacity = '1'; + start_connection(); //if failed then keep trying connecting } + } +function closeWriter(){ + setTimeout(start_recording, 1000) + fa_keyboard.style.zIndex='0'; + writer.style.opacity = "0"; + writer.style.pointerEvents='none'; + writer.style.width = "35%"; + + fa_keyboard.style.width = '50px'; + listener.style.right = '50%'; + +} +function closeListener(){ + writer.style.opacity = "1"; + writer.style.pointerEvents='visible'; + writer.style.width = "100%"; + + fa_keyboard.style.width = '0px'; + listener.style.right = '0%'; + + stop_recording(); + onStartup(); +} -function scrollToBottom() { +function onStartup(){ + strm.onclick = closeWriter; + fa_keyboard.onclick = closeListener; +} + +function scrollToBottom(){ response.scrollTo({ top: response.scrollHeight, behavior: 'smooth' }); } @@ -158,33 +231,35 @@ function receiveResponses(message) message = JSON.parse(message) if(message.responseType == 'user') { + while_ai_prompt_generating(); response.innerHTML = response.innerHTML + `
${message.text}
`; } else if(message.responseType == 'assistant' && message.text == 'CALL DALL-E') { - e[e.length-1].innerHTML = `
`; e[e.length-1].querySelector('div').innerText = 'Generating image ...'; // response.innerHTML = response.innerHTML + `
`; } else if('image_url' in message) - { + { + after_ai_prompt_generated(); e[e.length-1].querySelector('.image_process').innerHTML = `Not found`; e[e.length-1].querySelector('.revised-prompt').innerText = message.revised_prompt } else if(message.responseType == 'assistant') { + after_ai_prompt_generated(); e[e.length-1].querySelector('div').innerText = `${message.text}`; } - else if('Recieved' in message) - { - console.log(`${message.Recieved}`); - return 0; - } - else - e[e.length-1].innerHTML =`
Content Policy Violation
`; + + else if(message.status == 'error'){ + // recorder.startRecording(); + after_ai_prompt_generated(); + e[e.length-1].innerHTML =`
Content Policy Violation
`; + } + console.table(message) scrollToBottom() @@ -194,37 +269,59 @@ function receiveResponses(message) -startMic.onclick = async function () { - +start.onclick = async function () { + + if (uname.value == '') + { + alert('Please enter username !!'); + return false; + } + else + { + localStorage.setItem('uname', uname.value); + } + titleContainer.style.opacity = '0'; response.style.opacity = '1'; - container2.style.opacity= '1'; - container2.querySelectorAll("h2")[1].innerText = "Listening ..."; - strm.innerHTML = ``; - responseContainer.style.cssText = ` - width: 90vw; - height: 80vh; + width:80%; + height: 70%; opacity: 1; `; - - - setTimeout(async ()=>{ titleContainer.style.display = 'none'; - container2.style.gap = '40px'; - container2.querySelectorAll("h2").forEach(e=>{ - e.style.opacity = '1'; - }) - - - start_connection(); - - }, 2000) + start_connection(); + }, 2000); + + +}; - -}; +function while_ai_prompt_generating(){ + if(writer.style.opacity == '0') + { + stop_recording(); + strm.onclick = null; + fa_keyboard.style.zIndex='1'; + } + +} +function after_ai_prompt_generated(){ + if(writer.style.opacity == '0') + { + start_recording(); + fa_keyboard.style.zIndex='0'; + } +} +send.onclick = ()=>{ + const message = writer.querySelector('textarea'); + flag = 0; + if (message.value != ''){ + socket.send(message.value); + message.value = ''; + } + +} diff --git a/templates/index.html b/templates/index.html index cd56a18..feacb27 100644 --- a/templates/index.html +++ b/templates/index.html @@ -10,23 +10,34 @@ +

[templates/index.html hunk: the element markup in this hunk is not recoverable; the HTML tags were stripped during extraction, leaving only bare +/- markers and text nodes. The surviving text is the "AI Assistant" heading and the "Listening ..." label. Judging from the ids referenced in static/script.js and static/mystyle.css, the added lines introduce the #uname username input, a #send-container wrapping #writer (a textarea plus the #send button) and #listener (the #stream microphone button, the #fa-keyboard toggle, and the heading text), and the #status indicator.]
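The new WebSocket contract in app.py accepts two kinds of frames on one socket: binary frames carrying a complete WAV recording, which handle_audio_new() unpacks with the wave module and feeds to audio_queue in 1024-frame chunks, and text frames whose payload is returned as the prompt so generate_ai_response() skips the Whisper transcription step. The sketch below is a minimal Python client for exercising both paths; it is not part of the patch. The route ws://localhost:8000/chat/{user_id}, the port, and the sample file name are assumptions (the mount point of the chat() endpoint is not shown in this diff), and the 16 kHz, 16-bit mono WAV format is inferred from the int16 buffers in speech_proccessing.py rather than stated anywhere.

    # Hypothetical smoke-test client for the dual bytes/text WebSocket path (not part of the patch).
    # Assumed: the chat endpoint is mounted at ws://localhost:8000/chat/{user_id}; adjust to the real route.
    import asyncio
    import json

    import websockets  # pip install websockets

    async def main():
        uri = "ws://localhost:8000/chat/demo-user"          # assumed route and port
        async with websockets.connect(uri) as ws:
            # Binary frame: a complete WAV blob, assumed 16 kHz 16-bit mono,
            # like the recordings the browser posts via RecordRTC.
            with open("sample_16k_mono.wav", "rb") as f:    # assumed sample file
                await ws.send(f.read())

            # Text frame: handle_audio_new() returns this string as the prompt,
            # so the Whisper transcription step is skipped.
            await ws.send("Draw a watercolor lighthouse at sunset")

            # The server answers with JSON: {"responseType": "user" | "assistant", ...}
            # or {"status": ...} on errors and disconnects.
            while True:
                reply = json.loads(await ws.recv())
                print(reply)
                if reply.get("responseType") == "assistant" or "status" in reply:
                    break

    asyncio.run(main())

Sending the prompt as a text frame mirrors what the #send button handler in static/script.js does (socket.send(message.value)), so typed and spoken input share one server path; a reply to the binary path only arrives after the voice-activity logic in speech_proccessing.py accumulates enough 'Speech' chunks and flushes the segment to response_queue.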