deploy

Akshat2512 committed Nov 30, 2024
1 parent 787a977 commit 6dfda83
Showing 5 changed files with 423 additions and 192 deletions.
110 changes: 57 additions & 53 deletions app.py
@@ -56,19 +56,19 @@ async def chat(websocket: WebSocket, user_id: str):

     try:
         result = await handle_audio_new(websocket, audio_queue)
 
-        if not result:
+        if result==False:
             print('Stopping background process')
             process_task.cancel()
             break
 
         # await audio_queue.put(result)
         # print(audio_queue.qsize())
-        if not response_queue.empty():
+        if not response_queue.empty() or result:
             await asyncio.sleep(0.1)
-            await generate_ai_response(response_queue, websocket, user_id, chat_history) # generates AI responses and sends them back to the client
+            await generate_ai_response(response_queue, websocket, user_id, chat_history, prompt = result) # generates AI responses and sends them back to the client
 
         if not websocket.application_state.CONNECTED:
             break
@@ -78,27 +78,31 @@ async def chat(websocket: WebSocket, user_id: str):
     except Exception as e:
 
         # print(f"Connection error: {e}")
-        await websocket.send_text('connection_closed')
+        await websocket.send_json({"status":"connection closed"})
         await websocket.close()
 
 
 
 async def handle_audio_new(websocket: WebSocket, audio_queue):
 
     try:
-        audio_data = await websocket.receive_bytes() # receives the audio stream from clients
-
-        with wave.open(io.BytesIO(audio_data), 'rb') as wav_file:
-            # print(wav_file.getframerate(), wav_file.getsampwidth(), wav_file.getnchannels(), wav_file.getnframes())
-
-            while True:
-                audio_data = wav_file.readframes(1024)
-
-                if not audio_data:
-                    break
-                await audio_queue.put(audio_data)
+        message = await websocket.receive() # receives audio bytes or a text prompt from the client
+        if "bytes" in message:
+            with wave.open(io.BytesIO(message["bytes"]), 'rb') as wav_file:
+                # print(wav_file.getframerate(), wav_file.getsampwidth(), wav_file.getnchannels(), wav_file.getnframes())
+
+                while True:
+                    audio_data = wav_file.readframes(1024)
+
+                    if not audio_data:
+                        break
+                    await audio_queue.put(audio_data)
+            return None
+        elif "text" in message:
+            prompt = message['text']
+            return prompt
+
+        return True
     except Exception as e:
         print(e)
         print("WebSocket disconnected")
@@ -107,8 +111,9 @@ async def handle_audio_new(websocket: WebSocket, audio_queue):



-async def generate_ai_response(response_queue, websocket, user_id, chat_history):
-
+async def generate_ai_response(response_queue, websocket, user_id, chat_history, prompt):
+
+    if prompt == None:
         audio_path = "backend/temp/"
         if not os.path.exists(audio_path):
             os.makedirs(audio_path)
@@ -118,43 +123,42 @@ async def generate_ai_response(response_queue, websocket, user_id, chat_history):

         if result == 'file saved':
             prompt = transcribe_audio(audio_path+f'recording_{user_id}.wav', os.getenv('OPENAI_API_KEY')) # generate text from audio using the whisper-1 model
 
-        if len(prompt) >= 2:
-
-            logger.info('Transcribing: %s', prompt)
-
-            message = {"responseType" : "user", "text" : prompt[:-1]}
-
+    if len(prompt) >= 2:
+
+        logger.info('User: %s', prompt)
+        message = {"responseType" : "user", "text" : prompt}
         # message = json.dumps(message)
         await websocket.send_json(message)
 
 
-            response = generate_response(prompt, os.getenv('OPENAI_API_KEY'), chat_history) # generate natural language using the gpt-4o model
-            await asyncio.sleep(0.1)
-
-            if "CALL DALL-E" == response:
-                message = {"responseType" : "assistant", "text": response}
-                # message = json.dumps(message)
-                await websocket.send_json(message)
-
-                print('Generating Image ...')
-
-                image = generate_image_response(prompt, os.getenv('OPENAI_API_KEY')) # generate an image from text using the DALL-E-3 model
-
-                try:
-                    message = {"responseType" : "assistant", "revised_prompt": image.revised_prompt, "image_url": image.url}
-                except Exception as e:
-                    await websocket.send_json('{"status": "error"}')
-
-                # message = json.dumps(message)
-                await websocket.send_json(message)
-                # print(message)
-            else:
-                message = {"responseType" : "assistant", "text" : response}
-                # message = json.dumps(message)
-                await websocket.send_json(message)
-
-            logger.info('GPT-4o AI: %s', response)
+        response = generate_response(prompt, os.getenv('OPENAI_API_KEY'), chat_history) # generate natural language using the gpt-4o model
+        await asyncio.sleep(0.1)
+
+        if "CALL DALL-E" == response:
+            message = {"responseType" : "assistant", "text": response}
+            # message = json.dumps(message)
+            await websocket.send_json(message)
+            await asyncio.sleep(0.1)
+
+            print('Generating Image ...')
+
+            image = generate_image_response(prompt, os.getenv('OPENAI_API_KEY')) # generate an image from text using the DALL-E-3 model
+
+            try:
+                message = {"responseType" : "assistant", "revised_prompt": image.revised_prompt, "image_url": image.url}
+            except Exception as e:
+                await websocket.send_json({"status": "error"})
+                return False
+            # message = json.dumps(message)
+            await websocket.send_json(message)
+            # print(message)
+        else:
+            message = {"responseType" : "assistant", "text" : response}
+            # message = json.dumps(message)
+            await websocket.send_json(message)
+
+        logger.info('GPT-4o AI: %s', response)
 
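Note on the new message protocol: after this commit, the same WebSocket accepts either a binary WAV clip or a plain-text prompt. handle_audio_new dispatches on whether the received message carries a "bytes" or a "text" key, and generate_ai_response only runs Whisper transcription when no text prompt was passed in. A minimal client-side sketch of that protocol, assuming the third-party websockets package and a ws://localhost:8000/chat/{user_id} route (the route path is an assumption, it is not shown in this diff):

    import asyncio
    import json

    import websockets  # third-party client library, assumed for this sketch

    async def main():
        # Route path and user id are assumptions; the real route is outside this diff.
        async with websockets.connect("ws://localhost:8000/chat/demo-user") as ws:
            # Binary frame: a complete WAV clip, surfaced server-side under "bytes".
            with open("clip.wav", "rb") as f:
                await ws.send(f.read())

            # Text frame: a typed prompt, surfaced server-side under "text".
            await ws.send("Draw a cat in a spacesuit")

            # The server replies with JSON messages shaped like
            # {"responseType": "user" | "assistant", ...}, sent via send_json.
            while True:
                print(json.loads(await ws.recv()))

    asyncio.run(main())

generate_image_response itself is not part of this diff; since the object it returns exposes .url and .revised_prompt, it plausibly wraps the DALL-E 3 images endpoint of the OpenAI v1 Python SDK. A hedged sketch of one way such a helper could look, not a reconstruction of the actual code:

    from openai import OpenAI  # openai v1 SDK, an assumption of this sketch

    def generate_image_response(prompt: str, api_key: str):
        """Hypothetical stand-in for the helper called in app.py."""
        client = OpenAI(api_key=api_key)
        result = client.images.generate(model="dall-e-3", prompt=prompt,
                                        n=1, size="1024x1024")
        # result.data[0] exposes .url and .revised_prompt, the fields app.py reads.
        return result.data[0]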
93 changes: 51 additions & 42 deletions backend/speech_proccessing.py
@@ -4,6 +4,7 @@
 # import tensorflow.lite as tflite
 import zipfile
 import time
+import asyncio
 import tflite_runtime.interpreter as tflite
 
 import logging
@@ -42,55 +43,63 @@ async def process_audio_stream(audio_queue, response_queue):

     speak = 0
     silence = 0
+    speech = 0
 
     while True:
         try:
-            audio_data = await audio_queue.get()
-
-            audio_chunk = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
-
-            audio_buffer = np.roll(audio_buffer, -len(audio_chunk))
-            audio_buffer[-len(audio_chunk):] = audio_chunk
-
-            # Set the tensor data
-            interpreter.set_tensor(waveform_input_index, audio_buffer)
-
-            # Run the model
-            interpreter.invoke()
-            scores = interpreter.get_tensor(scores_output_index)
-
-            # Get the top classification result
-            top_class_index = scores.argmax()
-            prediction = labels[top_class_index]
-            # await response_queue.put(prediction)
-            # print(response_queue.qsize())
-            # print(prediction, len(audio_data) )
-
-            logger.info("%s, %d", prediction, len(audio_data))
-
-            if( prediction == 'Speech'):
-                audio_chunks.append(audio_data)
-                # await response_queue.put(audio_data)
-                speak = speak+1
-                # silence = 0
-                # i=5
-            elif(speak < 10 and prediction !='Speech'):
-                silence = silence+1
-
-            elif(speak >= 10 and prediction !='Speech'):
-                audio_data = b''.join(audio_chunks)
-                await response_queue.put(audio_data)
-                audio_chunks = []
-                silence = 0
-                speak = 0
-
-            if(silence == 5):
-                audio_chunks = []
-                silence = 0
-                speak = 0
+            audio_data = await audio_queue.get()
+
+            audio_chunk = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
+
+            audio_thres = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) # Calculate the root mean square (RMS) as the threshold value
+            threshold = np.sqrt(np.mean(np.square(audio_thres)))
+            if(threshold >= 5000):
+                speech = 1
+
+            audio_buffer = np.roll(audio_buffer, -len(audio_chunk))
+            audio_buffer[-len(audio_chunk):] = audio_chunk
+
+            # Set the tensor data
+            interpreter.set_tensor(waveform_input_index, audio_buffer)
+
+            # Run the model
+            interpreter.invoke()
+            scores = interpreter.get_tensor(scores_output_index)
+
+            # Get the top classification result
+            top_class_index = scores.argmax()
+            prediction = labels[top_class_index]
+            # await response_queue.put(prediction)
+            # print(response_queue.qsize())
+            # print(prediction, len(audio_data) )
+
+            logger.info("%s, %d, %d", prediction, len(audio_data), threshold)
+
+            if( prediction == 'Speech' and speech == 1):
+                audio_chunks.append(audio_data)
+                # await response_queue.put(audio_data)
+                speak = speak+1
+                # silence = 0
+                # i=5
+
+            elif(speak >= 20 and prediction !='Speech'):
+                audio_data = b''.join(audio_chunks)
+                await response_queue.put(audio_data)
+                audio_chunks = []
+                silence = 0
+                speak = 0
+                speech = 0
+
+            elif(prediction !='Speech'):
+                silence = silence+1
+
+                if(silence == 5):
+                    audio_chunks = []
+                    silence = 0
+                    speak = 0
 
         except Exception as e:
             # print(audio_data)
             print(e)
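Note on the reworked voice-activity gate: a chunk is now buffered only when the tflite classifier labels the window 'Speech' and the RMS of the raw int16 samples has crossed 5000 at least once (the new speech flag), and 20 speech chunks are required before the buffer is flushed to response_queue, up from 10. A standalone sketch of the RMS arithmetic, runnable with numpy alone (the sample waveforms are invented for illustration):

    import numpy as np

    RMS_THRESHOLD = 5000.0  # the cutoff the diff applies to raw int16 samples

    def is_loud(audio_data: bytes) -> bool:
        """Return True when the chunk's root mean square crosses the threshold."""
        samples = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32)
        rms = np.sqrt(np.mean(np.square(samples)))
        return rms >= RMS_THRESHOLD

    # Illustrative inputs (int16 full scale is 32767): low-level noise vs. a loud tone.
    rng = np.random.default_rng(0)
    quiet = rng.normal(0, 500, 1024).astype(np.int16).tobytes()
    loud = (20000 * np.sin(np.linspace(0, 100, 1024))).astype(np.int16).tobytes()
    print(is_loud(quiet), is_loud(loud))  # expected: False True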
