diff --git a/client/local_mic.py b/client/local_mic.py
index dee76699b..44e232daf 100644
--- a/client/local_mic.py
+++ b/client/local_mic.py
@@ -8,7 +8,7 @@ class Mic:
     prev = None

-    def __init__(self, speaker, lmd, dictd, lmd_persona, dictd_persona):
+    def __init__(self, speaker, passive_stt_engine, active_stt_engine):
         return

     def passiveListen(self, PERSONA):
diff --git a/client/main.py b/client/main.py
index 6ba364562..3fc728fb8 100644
--- a/client/main.py
+++ b/client/main.py
@@ -1,6 +1,7 @@
 import yaml
 import sys
 import speaker
+import stt
 from conversation import Conversation


@@ -21,8 +22,18 @@ def isLocal():

     profile = yaml.safe_load(open("profile.yml", "r"))

-    mic = Mic(speaker.newSpeaker(), "languagemodel.lm", "dictionary.dic",
-              "languagemodel_persona.lm", "dictionary_persona.dic")
+    try:
+        api_key = profile['keys']['GOOGLE_SPEECH']
+    except KeyError:
+        api_key = None
+
+    try:
+        stt_engine_type = profile['stt_engine']
+    except KeyError:
+        print "stt_engine not specified in profile, defaulting to PocketSphinx"
+        stt_engine_type = "sphinx"
+
+    mic = Mic(speaker.newSpeaker(), stt.PocketSphinxSTT(), stt.newSTTEngine(stt_engine_type, api_key=api_key))

     addendum = ""
     if 'first_name' in profile:
diff --git a/client/mic.py b/client/mic.py
index f4b70df78..b4ef66845 100644
--- a/client/mic.py
+++ b/client/mic.py
@@ -10,66 +10,23 @@
 import alteration


-# quirky bug where first import doesn't work
-try:
-    import pocketsphinx as ps
-except:
-    import pocketsphinx as ps
-
-
 class Mic:
     speechRec = None
     speechRec_persona = None

-    def __init__(self, speaker, lmd, dictd, lmd_persona, dictd_persona, lmd_music=None, dictd_music=None):
+    def __init__(self, speaker, passive_stt_engine, active_stt_engine):
         """
         Initiates the pocketsphinx instance.

         Arguments:
         speaker -- handles platform-independent audio output
-        lmd -- filename of the full language model
-        dictd -- filename of the full dictionary (.dic)
-        lmd_persona -- filename of the 'Persona' language model (containing, e.g., 'Jasper')
-        dictd_persona -- filename of the 'Persona' dictionary (.dic)
+        passive_stt_engine -- performs STT while Jasper is in passive listen mode
+        active_stt_engine -- performs STT while Jasper is in active listen mode
         """

         self.speaker = speaker
-        hmdir = "/usr/local/share/pocketsphinx/model/hmm/en_US/hub4wsj_sc_8k"
-
-        if lmd_music and dictd_music:
-            self.speechRec_music = ps.Decoder(hmm = hmdir, lm = lmd_music, dict = dictd_music)
-        self.speechRec_persona = ps.Decoder(
-            hmm=hmdir, lm=lmd_persona, dict=dictd_persona)
-        self.speechRec = ps.Decoder(hmm=hmdir, lm=lmd, dict=dictd)
-
-    def transcribe(self, audio_file_path, PERSONA_ONLY=False, MUSIC=False):
-        """
-        Performs TTS, transcribing an audio file and returning the result.
-
-        Arguments:
-        audio_file_path -- the path to the audio file to-be transcribed
-        PERSONA_ONLY -- if True, uses the 'Persona' language model and dictionary
-        MUSIC -- if True, uses the 'Music' language model and dictionary
-        """
-
-        wavFile = file(audio_file_path, 'rb')
-        wavFile.seek(44)
-
-        if MUSIC:
-            self.speechRec_music.decode_raw(wavFile)
-            result = self.speechRec_music.get_hyp()
-        elif PERSONA_ONLY:
-            self.speechRec_persona.decode_raw(wavFile)
-            result = self.speechRec_persona.get_hyp()
-        else:
-            self.speechRec.decode_raw(wavFile)
-            result = self.speechRec.get_hyp()
-
-        print "==================="
-        print "JASPER: " + result[0]
-        print "==================="
-
-        return result[0]
+        self.passive_stt_engine = passive_stt_engine
+        self.active_stt_engine = active_stt_engine

     def getScore(self, data):
         rms = audioop.rms(data, 2)
@@ -210,7 +167,7 @@ def passiveListen(self, PERSONA):
         write_frames.close()

         # check if PERSONA was said
-        transcribed = self.transcribe(AUDIO_FILE, PERSONA_ONLY=True)
+        transcribed = self.passive_stt_engine.transcribe(AUDIO_FILE, PERSONA_ONLY=True)

         if PERSONA in transcribed:
             return (THRESHOLD, PERSONA)
@@ -223,7 +180,7 @@ def activeListen(self, THRESHOLD=None, LISTEN=True, MUSIC=False):
         """

         AUDIO_FILE = "active.wav"
-        RATE = 16000 
+        RATE = 16000
         CHUNK = 1024
         LISTEN_TIME = 12

@@ -232,7 +189,7 @@ def activeListen(self, THRESHOLD=None, LISTEN=True, MUSIC=False):
             if not os.path.exists(AUDIO_FILE):
                 return None

-            return self.transcribe(AUDIO_FILE)
+            return self.active_stt_engine.transcribe(AUDIO_FILE)

         # check if no threshold provided
         if THRESHOLD == None:
@@ -284,10 +241,7 @@ def activeListen(self, THRESHOLD=None, LISTEN=True, MUSIC=False):
         # DO SOME AMPLIFICATION
         # os.system("sox "+AUDIO_FILE+" temp.wav vol 20dB")

-        if MUSIC:
-            return self.transcribe(AUDIO_FILE, MUSIC=True)
-
-        return self.transcribe(AUDIO_FILE)
+        return self.active_stt_engine.transcribe(AUDIO_FILE, MUSIC=MUSIC)

     def say(self, phrase, OPTIONS=" -vdefault+m3 -p 40 -s 160 --stdout > say.wav"):
         # alter phrase before speaking
diff --git a/client/populate.py b/client/populate.py
index 48b8670d1..dd5ca488e 100644
--- a/client/populate.py
+++ b/client/populate.py
@@ -4,7 +4,6 @@
 from pytz import timezone
 import feedparser

-
 def run():
     profile = {}

@@ -84,6 +83,26 @@ def verifyLocation(place):
     response = raw_input("Please choose email (E) or text message (T): ")
     profile['prefers_email'] = (response == 'E')

+    stt_engines = {
+        "sphinx": None,
+        "google": "GOOGLE_SPEECH"
+    }
+
+    response = raw_input(
+        "\nIf you would like to choose a specific STT engine, please specify which." +
+        "\nAvailable implementations: %s. (Press Enter to default to PocketSphinx): " % stt_engines.keys())
+    if response in stt_engines:
+        profile["stt_engine"] = response
+        api_key_name = stt_engines[response]
+        if api_key_name:
+            key = raw_input("\nPlease enter your API key: ")
+            profile["keys"] = {api_key_name: key}
+    else:
+        print("Unrecognized STT engine. Available implementations: %s" % stt_engines.keys())
+        profile["stt_engine"] = "sphinx"
+
+
     # write to profile
     print("Writing to profile...")
     outputFile = open("profile.yml", "w")
diff --git a/client/stt.py b/client/stt.py
new file mode 100644
index 000000000..d261464ce
--- /dev/null
+++ b/client/stt.py
@@ -0,0 +1,152 @@
+import os
+import traceback
+import json
+import urllib2
+
+"""
+The default Speech-To-Text implementation which relies on PocketSphinx.
+""" +class PocketSphinxSTT(object): + + def __init__(self, lmd = "languagemodel.lm", dictd = "dictionary.dic", + lmd_persona = "languagemodel_persona.lm", dictd_persona = "dictionary_persona.dic", + lmd_music=None, dictd_music=None, **kwargs): + """ + Initiates the pocketsphinx instance. + + Arguments: + speaker -- handles platform-independent audio output + lmd -- filename of the full language model + dictd -- filename of the full dictionary (.dic) + lmd_persona -- filename of the 'Persona' language model (containing, e.g., 'Jasper') + dictd_persona -- filename of the 'Persona' dictionary (.dic) + """ + + # quirky bug where first import doesn't work + try: + import pocketsphinx as ps + except: + import pocketsphinx as ps + + hmdir = "/usr/local/share/pocketsphinx/model/hmm/en_US/hub4wsj_sc_8k" + + if lmd_music and dictd_music: + self.speechRec_music = ps.Decoder(hmm = hmdir, lm = lmd_music, dict = dictd_music) + self.speechRec_persona = ps.Decoder( + hmm=hmdir, lm=lmd_persona, dict=dictd_persona) + self.speechRec = ps.Decoder(hmm=hmdir, lm=lmd, dict=dictd) + + def transcribe(self, audio_file_path, PERSONA_ONLY=False, MUSIC=False): + """ + Performs STT, transcribing an audio file and returning the result. + + Arguments: + audio_file_path -- the path to the audio file to-be transcribed + PERSONA_ONLY -- if True, uses the 'Persona' language model and dictionary + MUSIC -- if True, uses the 'Music' language model and dictionary + """ + + wavFile = file(audio_file_path, 'rb') + wavFile.seek(44) + + if MUSIC: + self.speechRec_music.decode_raw(wavFile) + result = self.speechRec_music.get_hyp() + elif PERSONA_ONLY: + self.speechRec_persona.decode_raw(wavFile) + result = self.speechRec_persona.get_hyp() + else: + self.speechRec.decode_raw(wavFile) + result = self.speechRec.get_hyp() + + print "===================" + print "JASPER: " + result[0] + print "===================" + + return result[0] + +""" +Speech-To-Text implementation which relies on the Google Speech API. + +This implementation requires a Google API key to be present in profile.yml + +To obtain an API key: +1. Join the Chromium Dev group: https://groups.google.com/a/chromium.org/forum/?fromgroups#!forum/chromium-dev +2. Create a project through the Google Developers console: https://console.developers.google.com/project +3. Select your project. In the sidebar, navigate to "APIs & Auth." Activate the Speech API. +4. Under "APIs & Auth," navigate to "Credentials." Create a new key for public API access. +5. Add your credentials to your profile.yml. Add an entry to the 'keys' section using the key name 'GOOGLE_SPEECH.' Sample configuration: +6. Set the value of the 'stt_engine' key in your profile.yml to 'google' + + +Excerpt from sample profile.yml: + + ... + timezone: US/Pacific + stt_engine: google + keys: + GOOGLE_SPEECH: $YOUR_KEY_HERE + +""" +class GoogleSTT(object): + + RATE = 16000 + + def __init__(self, api_key, **kwargs): + """ + Arguments: + api_key - the public api key which allows access to Google APIs + """ + self.api_key = api_key + + def transcribe(self, audio_file_path, PERSONA_ONLY=False, MUSIC=False): + """ + Performs STT via the Google Speech API, transcribing an audio file + and returning an English string. 
+
+        Arguments:
+        audio_file_path -- the path to the .wav file to be transcribed
+        """
+        url = "https://www.google.com/speech-api/v2/recognize?output=json&client=chromium&key=%s&lang=%s&maxresults=6&pfilter=2" % (self.api_key, "en-us")
+
+        wav = open(audio_file_path, 'rb')
+        data = wav.read()
+        wav.close()
+
+        try:
+            req = urllib2.Request(
+                url,
+                data=data,
+                headers={
+                    'Content-type': 'audio/l16; rate=%s' % GoogleSTT.RATE})
+            response_url = urllib2.urlopen(req)
+            response_read = response_url.read()
+            response_read = response_read.decode('utf-8')
+            decoded = json.loads(response_read.split("\n")[1])
+            print response_read
+            text = decoded['result'][0]['alternative'][0]['transcript']
+            if text:
+                print "==================="
+                print "JASPER: " + text
+                print "==================="
+            return text
+        except Exception:
+            traceback.print_exc()
+
+"""
+Returns a Speech-To-Text engine.
+
+Currently, the supported implementations are the default PocketSphinx and
+the Google Speech API.
+
+Arguments:
+engine_type - one of "sphinx" or "google"
+kwargs - keyword arguments passed to the constructor of the STT engine
+"""
+def newSTTEngine(engine_type, **kwargs):
+    t = engine_type.lower()
+    if t == "sphinx":
+        return PocketSphinxSTT(**kwargs)
+    elif t == "google":
+        return GoogleSTT(**kwargs)
+    else:
+        raise ValueError("Unsupported STT engine type: " + engine_type)
diff --git a/client/test.py b/client/test.py
index fb7d1477f..618664246 100644
--- a/client/test.py
+++ b/client/test.py
@@ -28,18 +28,17 @@ def setUp(self):
         self.jasper_clip = "../static/audio/jasper.wav"
         self.time_clip = "../static/audio/time.wav"

-        from mic import Mic
-        self.m = Mic(speaker.newSpeaker(), "languagemodel.lm", "dictionary.dic",
-                     "languagemodel_persona.lm", "dictionary_persona.dic")
+        from stt import PocketSphinxSTT
+        self.stt = PocketSphinxSTT()

     def testTranscribeJasper(self):
         """Does Jasper recognize his name (i.e., passive listen)?"""
-        transcription = self.m.transcribe(self.jasper_clip, PERSONA_ONLY=True)
+        transcription = self.stt.transcribe(self.jasper_clip, PERSONA_ONLY=True)
         self.assertTrue("JASPER" in transcription)

     def testTranscribe(self):
         """Does Jasper recognize 'time' (i.e., active listen)?"""
-        transcription = self.m.transcribe(self.time_clip)
+        transcription = self.stt.transcribe(self.time_clip)
         self.assertTrue("TIME" in transcription)
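---

Notes (illustrative sketches; not part of the patch):

The patch makes the STT engine a duck-typed dependency: Mic no longer
transcribes audio itself, it just calls transcribe() on whichever engine
objects it was handed. Anything exposing the same transcribe() signature
can serve as an engine. A minimal sketch of that contract, using a
hypothetical DummySTT class:

    class DummySTT(object):
        """Illustrates the interface an STT engine is expected to satisfy."""

        def __init__(self, **kwargs):
            # Engines accept **kwargs so that newSTTEngine() can forward
            # options such as api_key without knowing which engine uses them.
            pass

        def transcribe(self, audio_file_path, PERSONA_ONLY=False, MUSIC=False):
            # A real engine reads the recorded .wav at audio_file_path and
            # returns the transcription; callers match against uppercase
            # strings (e.g. "JASPER" in passiveListen).
            return "JASPER"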
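How the pieces are wired at startup, mirroring the new code in main.py
(the API key value is a placeholder):

    import speaker
    import stt
    from mic import Mic

    # Passive listening (keyword spotting) always uses the local
    # PocketSphinx engine; the active engine is chosen through the
    # 'stt_engine' key in profile.yml.
    passive_engine = stt.PocketSphinxSTT()
    active_engine = stt.newSTTEngine("google", api_key="$YOUR_KEY_HERE")

    mic = Mic(speaker.newSpeaker(), passive_engine, active_engine)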
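Unlike populate.py, which falls back to "sphinx" on unrecognized input,
newSTTEngine() raises, so a typo in an existing profile.yml fails fast:

    import stt

    try:
        engine = stt.newSTTEngine("dragon")  # unsupported engine name
    except ValueError as e:
        print e  # Unsupported STT engine type: dragon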