deploy

Akshat2512 committed Nov 30, 2024
1 parent 787a977 commit 6dfda83
Showing 5 changed files with 423 additions and 192 deletions.
110 changes: 57 additions & 53 deletions app.py
@@ -56,19 +56,19 @@ async def chat(websocket: WebSocket, user_id: str):

     try:
         result = await handle_audio_new(websocket, audio_queue)
 
-        if not result:
+        if result==False:
             print('Stopping background process')
             process_task.cancel()
             break
 
         # await audio_queue.put(result)
         # print(audio_queue.qsize())
-        if not response_queue.empty():
+        if not response_queue.empty() or result:
             await asyncio.sleep(0.1)
-            await generate_ai_response(response_queue, websocket, user_id, chat_history) # generates AI responses and sends them back to the client
+            await generate_ai_response(response_queue, websocket, user_id, chat_history, prompt = result) # generates AI responses and sends them back to the client
 
         if not websocket.application_state.CONNECTED:
             break
@@ -78,27 +78,31 @@ async def chat(websocket: WebSocket, user_id: str):
     except Exception as e:
 
         # print(f"Connection error: {e}")
-        await websocket.send_text('connection_closed')
+        await websocket.send_json({"status":"connection closed"})
         await websocket.close()
 
 
 
 async def handle_audio_new(websocket: WebSocket, audio_queue):
 
     try:
-        audio_data = await websocket.receive_bytes() # receives the audio stream from clients
-
-        with wave.open(io.BytesIO(audio_data), 'rb') as wav_file:
-            # print(wav_file.getframerate(), wav_file.getsampwidth(), wav_file.getnchannels(), wav_file.getnframes())
-
-            while True:
-                audio_data = wav_file.readframes(1024)
-
-                if not audio_data:
-                    break
-                await audio_queue.put(audio_data)
+        message = await websocket.receive() # receives audio bytes or a text prompt from the client
+        if "bytes" in message:
+            with wave.open(io.BytesIO(message["bytes"]), 'rb') as wav_file:
+                # print(wav_file.getframerate(), wav_file.getsampwidth(), wav_file.getnchannels(), wav_file.getnframes())
+
+                while True:
+                    audio_data = wav_file.readframes(1024)
+
+                    if not audio_data:
+                        break
+                    await audio_queue.put(audio_data)
+            return None
+        elif "text" in message:
+            prompt = message['text']
+            return prompt
+
+        return True
     except Exception as e:
         print(e)
         print("WebSocket disconnected")
@@ -107,8 +111,9 @@ async def handle_audio_new(websocket: WebSocket, audio_queue):



-async def generate_ai_response(response_queue, websocket, user_id, chat_history):
-
+async def generate_ai_response(response_queue, websocket, user_id, chat_history, prompt):
+
+    if prompt == None:
         audio_path = "backend/temp/"
         if not os.path.exists(audio_path):
             os.makedirs(audio_path)
@@ -118,43 +123,42 @@ async def generate_ai_response(response_queue, websocket, user_id, chat_history):

         if result == 'file saved':
             prompt = transcribe_audio(audio_path+f'recording_{user_id}.wav', os.getenv('OPENAI_API_KEY')) # generate text from audio using the whisper-1 model
 
-        if len(prompt) >= 2:
-
-            logger.info('Transcribing: %s', prompt)
-
-            message = {"responseType" : "user", "text" : prompt[:-1]}
-
+    if len(prompt) >= 2:
+
+        logger.info('User: %s', prompt)
+        message = {"responseType" : "user", "text" : prompt}
         # message = json.dumps(message)
         await websocket.send_json(message)
 
 
-            response = generate_response(prompt, os.getenv('OPENAI_API_KEY'), chat_history) # generate natural language using the gpt-4o model
-            await asyncio.sleep(0.1)
-
-            if "CALL DALL-E" == response:
-                message = {"responseType" : "assistant", "text": response}
-                # message = json.dumps(message)
-                await websocket.send_json(message)
-
-                print('Generating Image ...')
-
-                image = generate_image_response(prompt, os.getenv('OPENAI_API_KEY')) # generate an image from text using the DALL-E-3 model
-
-                try:
-                    message = {"responseType" : "assistant", "revised_prompt": image.revised_prompt, "image_url": image.url}
-                except Exception as e:
-                    await websocket.send_json('{"status": "error"}')
-
-                # message = json.dumps(message)
-                await websocket.send_json(message)
-                # print(message)
-            else:
-                message = {"responseType" : "assistant", "text" : response}
-                # message = json.dumps(message)
-                await websocket.send_json(message)
-
-            logger.info('GPT-4o AI: %s', response)
+        response = generate_response(prompt, os.getenv('OPENAI_API_KEY'), chat_history) # generate natural language using the gpt-4o model
+        await asyncio.sleep(0.1)
+
+        if "CALL DALL-E" == response:
+            message = {"responseType" : "assistant", "text": response}
+            # message = json.dumps(message)
+            await websocket.send_json(message)
+            await asyncio.sleep(0.1)
+
+            print('Generating Image ...')
+
+            image = generate_image_response(prompt, os.getenv('OPENAI_API_KEY')) # generate an image from text using the DALL-E-3 model
+
+            try:
+                message = {"responseType" : "assistant", "revised_prompt": image.revised_prompt, "image_url": image.url}
+            except Exception as e:
+                await websocket.send_json({"status": "error"})
+                return False
+            # message = json.dumps(message)
+            await websocket.send_json(message)
+            # print(message)
+        else:
+            message = {"responseType" : "assistant", "text" : response}
+            # message = json.dumps(message)
+            await websocket.send_json(message)
+
+        logger.info('GPT-4o AI: %s', response)
 
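Note on the new message protocol: after this commit, the same WebSocket accepts either a binary WAV clip or a plain-text prompt. handle_audio_new dispatches on whether the received message carries a "bytes" or a "text" key, and generate_ai_response only runs Whisper transcription when no text prompt was passed in. A minimal client-side sketch of that protocol, assuming the third-party websockets package and a ws://localhost:8000/chat/{user_id} route (the route path is an assumption, it is not shown in this diff):

    import asyncio
    import json

    import websockets  # third-party client library, assumed for this sketch

    async def main():
        # Route path and user id are assumptions; the real route is outside this diff.
        async with websockets.connect("ws://localhost:8000/chat/demo-user") as ws:
            # Binary frame: a complete WAV clip, surfaced server-side under "bytes".
            with open("clip.wav", "rb") as f:
                await ws.send(f.read())

            # Text frame: a typed prompt, surfaced server-side under "text".
            await ws.send("Draw a cat in a spacesuit")

            # The server replies with JSON messages shaped like
            # {"responseType": "user" | "assistant", ...}, sent via send_json.
            while True:
                print(json.loads(await ws.recv()))

    asyncio.run(main())

generate_image_response itself is not part of this diff; since the object it returns exposes .url and .revised_prompt, it plausibly wraps the DALL-E 3 images endpoint of the OpenAI v1 Python SDK. A hedged sketch of one way such a helper could look, not a reconstruction of the actual code:

    from openai import OpenAI  # openai v1 SDK, an assumption of this sketch

    def generate_image_response(prompt: str, api_key: str):
        """Hypothetical stand-in for the helper called in app.py."""
        client = OpenAI(api_key=api_key)
        result = client.images.generate(model="dall-e-3", prompt=prompt,
                                        n=1, size="1024x1024")
        # result.data[0] exposes .url and .revised_prompt, the fields app.py reads.
        return result.data[0]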
93 changes: 51 additions & 42 deletions backend/speech_proccessing.py
@@ -4,6 +4,7 @@
 # import tensorflow.lite as tflite
 import zipfile
 import time
+import asyncio
 import tflite_runtime.interpreter as tflite
 
 import logging
@@ -42,55 +43,63 @@ async def process_audio_stream(audio_queue, response_queue):

     speak = 0
     silence = 0
+    speech = 0
 
     while True:
         try:
-            audio_data = await audio_queue.get()
-
-            audio_chunk = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
-
-            audio_buffer = np.roll(audio_buffer, -len(audio_chunk))
-            audio_buffer[-len(audio_chunk):] = audio_chunk
-
-            # Set the tensor data
-            interpreter.set_tensor(waveform_input_index, audio_buffer)
-
-            # Run the model
-            interpreter.invoke()
-            scores = interpreter.get_tensor(scores_output_index)
-
-            # Get the top classification result
-            top_class_index = scores.argmax()
-            prediction = labels[top_class_index]
-            # await response_queue.put(prediction)
-            # print(response_queue.qsize())
-            # print(prediction, len(audio_data) )
-
-            logger.info("%s, %d", prediction, len(audio_data))
-
-            if( prediction == 'Speech'):
-                audio_chunks.append(audio_data)
-                # await response_queue.put(audio_data)
-                speak = speak+1
-                # silence = 0
-                # i=5
-            elif(speak < 10 and prediction !='Speech'):
-                silence = silence+1
-
-            elif(speak >= 10 and prediction !='Speech'):
-                audio_data = b''.join(audio_chunks)
-                await response_queue.put(audio_data)
-                audio_chunks = []
-                silence = 0
-                speak = 0
-
-            if(silence == 5):
-                audio_chunks = []
-                silence = 0
-                speak = 0
+            audio_data = await audio_queue.get()
+
+            audio_chunk = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
+
+            audio_thres = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) # Calculate the root mean square (RMS) as the threshold value
+            threshold = np.sqrt(np.mean(np.square(audio_thres)))
+            if(threshold >= 5000):
+                speech = 1
+
+            audio_buffer = np.roll(audio_buffer, -len(audio_chunk))
+            audio_buffer[-len(audio_chunk):] = audio_chunk
+
+            # Set the tensor data
+            interpreter.set_tensor(waveform_input_index, audio_buffer)
+
+            # Run the model
+            interpreter.invoke()
+            scores = interpreter.get_tensor(scores_output_index)
+
+            # Get the top classification result
+            top_class_index = scores.argmax()
+            prediction = labels[top_class_index]
+            # await response_queue.put(prediction)
+            # print(response_queue.qsize())
+            # print(prediction, len(audio_data) )
+
+            logger.info("%s, %d, %d", prediction, len(audio_data), threshold)
+
+            if( prediction == 'Speech' and speech == 1):
+                audio_chunks.append(audio_data)
+                # await response_queue.put(audio_data)
+                speak = speak+1
+                # silence = 0
+                # i=5
+
+            elif(speak >= 20 and prediction !='Speech'):
+                audio_data = b''.join(audio_chunks)
+                await response_queue.put(audio_data)
+                audio_chunks = []
+                silence = 0
+                speak = 0
+                speech = 0
+
+            elif(prediction !='Speech'):
+                silence = silence+1
+
+                if(silence == 5):
+                    audio_chunks = []
+                    silence = 0
+                    speak = 0
 
         except Exception as e:
             # print(audio_data)
             print(e)
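Note on the reworked voice-activity gate: a chunk is now buffered only when the tflite classifier labels the window 'Speech' and the RMS of the raw int16 samples has crossed 5000 at least once (the new speech flag), and 20 speech chunks are required before the buffer is flushed to response_queue, up from 10. A standalone sketch of the RMS arithmetic, runnable with numpy alone (the sample waveforms are invented for illustration):

    import numpy as np

    RMS_THRESHOLD = 5000.0  # the cutoff the diff applies to raw int16 samples

    def is_loud(audio_data: bytes) -> bool:
        """Return True when the chunk's root mean square crosses the threshold."""
        samples = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
        rms = np.sqrt(np.mean(np.square(samples)))
        return rms >= RMS_THRESHOLD

    # Illustrative inputs (int16 full scale is 32767): low-level noise vs. a loud tone.
    rng = np.random.default_rng(0)
    quiet = rng.normal(0, 500, 1024).astype(np.int16).tobytes()
    loud = (20000 * np.sin(np.linspace(0, 100, 1024))).astype(np.int16).tobytes()
    print(is_loud(quiet), is_loud(loud))  # expected: False True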
