-
Notifications
You must be signed in to change notification settings - Fork 1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Modularized STT implementation #118
Changes from 6 commits
6fb17a6
d7b9149
673db3b
34f9b12
874f888
3f5c1cc
e3dcbec
86f8bf4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,8 @@ | ||
import yaml | ||
import sys | ||
import speaker | ||
import stt | ||
from stt import PocketSphinxSTT | ||
from conversation import Conversation | ||
|
||
|
||
|
@@ -21,8 +23,13 @@ def isLocal(): | |
|
||
profile = yaml.safe_load(open("profile.yml", "r")) | ||
|
||
mic = Mic(speaker.newSpeaker(), "languagemodel.lm", "dictionary.dic", | ||
"languagemodel_persona.lm", "dictionary_persona.dic") | ||
try: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rather than have it default to the Google STT if the API key is present, I'd prefer it if the desired STT was an enum in the profile itself. For example, we might add an In addition, we'd probably want to add a method to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also: can we have |
||
google_api_key = profile['google_api_key'] | ||
except KeyError: | ||
print "Google STT API Key not present in profile - defaulting to PocketSphinx..." | ||
google_api_key = None | ||
|
||
mic = Mic(speaker.newSpeaker(), PocketSphinxSTT(), stt.newSTTEngine(google_api_key)) | ||
|
||
addendum = "" | ||
if 'first_name' in profile: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,66 +10,23 @@ | |
import alteration | ||
|
||
|
||
# quirky bug where first import doesn't work | ||
try: | ||
import pocketsphinx as ps | ||
except: | ||
import pocketsphinx as ps | ||
|
||
|
||
class Mic: | ||
|
||
speechRec = None | ||
speechRec_persona = None | ||
|
||
def __init__(self, speaker, lmd, dictd, lmd_persona, dictd_persona, lmd_music=None, dictd_music=None): | ||
def __init__(self, speaker, passive_stt_engine, active_stt_engine): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sweet! This is much cleaner and clearer than before. |
||
""" | ||
Initiates the pocketsphinx instance. | ||
|
||
Arguments: | ||
speaker -- handles platform-independent audio output | ||
lmd -- filename of the full language model | ||
dictd -- filename of the full dictionary (.dic) | ||
lmd_persona -- filename of the 'Persona' language model (containing, e.g., 'Jasper') | ||
dictd_persona -- filename of the 'Persona' dictionary (.dic) | ||
passive_stt_engine -- performs STT while Jasper is in passive listen mode | ||
active_stt_engine -- performs STT while Jasper is in active listen mode | ||
""" | ||
self.speaker = speaker | ||
hmdir = "/usr/local/share/pocketsphinx/model/hmm/en_US/hub4wsj_sc_8k" | ||
|
||
if lmd_music and dictd_music: | ||
self.speechRec_music = ps.Decoder(hmm = hmdir, lm = lmd_music, dict = dictd_music) | ||
self.speechRec_persona = ps.Decoder( | ||
hmm=hmdir, lm=lmd_persona, dict=dictd_persona) | ||
self.speechRec = ps.Decoder(hmm=hmdir, lm=lmd, dict=dictd) | ||
|
||
def transcribe(self, audio_file_path, PERSONA_ONLY=False, MUSIC=False): | ||
""" | ||
Performs TTS, transcribing an audio file and returning the result. | ||
|
||
Arguments: | ||
audio_file_path -- the path to the audio file to-be transcribed | ||
PERSONA_ONLY -- if True, uses the 'Persona' language model and dictionary | ||
MUSIC -- if True, uses the 'Music' language model and dictionary | ||
""" | ||
|
||
wavFile = file(audio_file_path, 'rb') | ||
wavFile.seek(44) | ||
|
||
if MUSIC: | ||
self.speechRec_music.decode_raw(wavFile) | ||
result = self.speechRec_music.get_hyp() | ||
elif PERSONA_ONLY: | ||
self.speechRec_persona.decode_raw(wavFile) | ||
result = self.speechRec_persona.get_hyp() | ||
else: | ||
self.speechRec.decode_raw(wavFile) | ||
result = self.speechRec.get_hyp() | ||
|
||
print "===================" | ||
print "JASPER: " + result[0] | ||
print "===================" | ||
|
||
return result[0] | ||
self.passive_stt_engine = passive_stt_engine | ||
self.active_stt_engine = active_stt_engine | ||
|
||
def getScore(self, data): | ||
rms = audioop.rms(data, 2) | ||
|
@@ -210,7 +167,7 @@ def passiveListen(self, PERSONA): | |
write_frames.close() | ||
|
||
# check if PERSONA was said | ||
transcribed = self.transcribe(AUDIO_FILE, PERSONA_ONLY=True) | ||
transcribed = self.passive_stt_engine.transcribe(AUDIO_FILE, PERSONA_ONLY=True) | ||
|
||
if PERSONA in transcribed: | ||
return (THRESHOLD, PERSONA) | ||
|
@@ -223,7 +180,7 @@ def activeListen(self, THRESHOLD=None, LISTEN=True, MUSIC=False): | |
""" | ||
|
||
AUDIO_FILE = "active.wav" | ||
RATE = 16000 | ||
RATE = 44100 | ||
CHUNK = 1024 | ||
LISTEN_TIME = 12 | ||
|
||
|
@@ -232,7 +189,7 @@ def activeListen(self, THRESHOLD=None, LISTEN=True, MUSIC=False): | |
if not os.path.exists(AUDIO_FILE): | ||
return None | ||
|
||
return self.transcribe(AUDIO_FILE) | ||
return self.active_stt_engine.transcribe(AUDIO_FILE) | ||
|
||
# check if no threshold provided | ||
if THRESHOLD == None: | ||
|
@@ -285,9 +242,9 @@ def activeListen(self, THRESHOLD=None, LISTEN=True, MUSIC=False): | |
# os.system("sox "+AUDIO_FILE+" temp.wav vol 20dB") | ||
|
||
if MUSIC: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Couldn't all of this logic just be |
||
return self.transcribe(AUDIO_FILE, MUSIC=True) | ||
return self.active_stt_engine.transcribe(AUDIO_FILE, MUSIC=True) | ||
|
||
return self.transcribe(AUDIO_FILE) | ||
return self.active_stt_engine.transcribe(AUDIO_FILE) | ||
|
||
def say(self, phrase, OPTIONS=" -vdefault+m3 -p 40 -s 160 --stdout > say.wav"): | ||
# alter phrase before speaking | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -84,6 +84,9 @@ def verifyLocation(place): | |
response = raw_input("Please choose email (E) or text message (T): ") | ||
profile['prefers_email'] = (response == 'E') | ||
|
||
print ("\nIf you wish to depend on the Google Speech To Text API, please enter your API key, or leave blank to use Jasper's default speech to text implementation.") | ||
simple_request('google_api_key', 'API Key') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just noting that this will have to change too if the above comments are addressed. |
||
|
||
# write to profile | ||
print("Writing to profile...") | ||
outputFile = open("profile.yml", "w") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
import os | ||
import traceback | ||
import json | ||
import urllib2 | ||
|
||
""" | ||
The default Speech-To-Text implementation which relies on PocketSphinx. | ||
""" | ||
class PocketSphinxSTT(object): | ||
|
||
def __init__(self, lmd = "languagemodel.lm", dictd = "dictionary.dic", | ||
lmd_persona = "languagemodel_persona.lm", dictd_persona = "dictionary_persona.dic", | ||
lmd_music=None, dictd_music=None): | ||
""" | ||
Initiates the pocketsphinx instance. | ||
|
||
Arguments: | ||
speaker -- handles platform-independent audio output | ||
lmd -- filename of the full language model | ||
dictd -- filename of the full dictionary (.dic) | ||
lmd_persona -- filename of the 'Persona' language model (containing, e.g., 'Jasper') | ||
dictd_persona -- filename of the 'Persona' dictionary (.dic) | ||
""" | ||
|
||
# quirky bug where first import doesn't work | ||
try: | ||
import pocketsphinx as ps | ||
except: | ||
import pocketsphinx as ps | ||
|
||
hmdir = "/usr/local/share/pocketsphinx/model/hmm/en_US/hub4wsj_sc_8k" | ||
|
||
if lmd_music and dictd_music: | ||
self.speechRec_music = ps.Decoder(hmm = hmdir, lm = lmd_music, dict = dictd_music) | ||
self.speechRec_persona = ps.Decoder( | ||
hmm=hmdir, lm=lmd_persona, dict=dictd_persona) | ||
self.speechRec = ps.Decoder(hmm=hmdir, lm=lmd, dict=dictd) | ||
|
||
def transcribe(self, audio_file_path, PERSONA_ONLY=False, MUSIC=False): | ||
""" | ||
Performs STT, transcribing an audio file and returning the result. | ||
|
||
Arguments: | ||
audio_file_path -- the path to the audio file to-be transcribed | ||
PERSONA_ONLY -- if True, uses the 'Persona' language model and dictionary | ||
MUSIC -- if True, uses the 'Music' language model and dictionary | ||
""" | ||
|
||
wavFile = file(audio_file_path, 'rb') | ||
wavFile.seek(44) | ||
|
||
if MUSIC: | ||
self.speechRec_music.decode_raw(wavFile) | ||
result = self.speechRec_music.get_hyp() | ||
elif PERSONA_ONLY: | ||
self.speechRec_persona.decode_raw(wavFile) | ||
result = self.speechRec_persona.get_hyp() | ||
else: | ||
self.speechRec.decode_raw(wavFile) | ||
result = self.speechRec.get_hyp() | ||
|
||
print "===================" | ||
print "JASPER: " + result[0] | ||
print "===================" | ||
|
||
return result[0] | ||
|
||
""" | ||
Speech-To-Text implementation which relies on the Google Speech API. | ||
|
||
This implementation requires a Google API key to be present in profile.yml | ||
|
||
To obtain an API key: | ||
1. Join the Chromium Dev group: https://groups.google.com/a/chromium.org/forum/?fromgroups#!forum/chromium-dev | ||
2. Create a project through the Google Developers console: https://console.developers.google.com/project | ||
3. Select your project. In the sidebar, navigate to "APIs & Auth." Activate the Speech API. | ||
4. Under "APIs & Auth," navigate to "Credentials." Create a new key for public API access. | ||
5. Copy your API key and run client/populate.py. When prompted, paste this key for access to the Speech API. | ||
|
||
This implementation also requires that the avconv audio utility be present on your $PATH. On RPi, simply run: | ||
sudo apt-get install libav-tools | ||
""" | ||
class GoogleSTT(object): | ||
|
||
RATE = 44100 | ||
|
||
def __init__(self, api_key): | ||
""" | ||
Arguments: | ||
api_key - the public api key which allows access to Google APIs | ||
""" | ||
|
||
self.api_key = api_key | ||
for tool in ("avconv", "ffmpeg"): | ||
if os.system("which %s" % tool) == 0: | ||
self.audio_tool = tool | ||
break | ||
if not self.audio_tool: | ||
raise Exception("Could not find an audio tool to convert .wav files to .flac") | ||
|
||
def transcribe(self, audio_file_path): | ||
""" | ||
Performs STT via the Google Speech API, transcribing an audio file | ||
and returning an English string. | ||
audio_file_path -- the path to the audio file to-be transcribed | ||
|
||
""" | ||
AUDIO_FILE_FLAC = "active.flac" | ||
os.system("%s -y -i %s -f flac -b:a 44100 %s" % (self.audio_tool, audio_file_path, AUDIO_FILE_FLAC)) | ||
|
||
url = "https://www.google.com/speech-api/v2/recognize?output=json&client=chromium&key=%s&lang=%s&maxresults=6&pfilter=2" % (self.api_key, "en-us") | ||
flac = open(AUDIO_FILE_FLAC, 'rb') | ||
data = flac.read() | ||
flac.close() | ||
try: | ||
req = urllib2.Request( | ||
url, | ||
data=data, | ||
headers={ | ||
'Content-type': 'audio/x-flac; rate=%s' % GoogleSTT.RATE}) | ||
response_url = urllib2.urlopen(req) | ||
response_read = response_url.read() | ||
response_read = response_read.decode('utf-8') | ||
decoded = json.loads(response_read.split("\n")[1]) | ||
print response_read | ||
text = decoded['result'][0]['alternative'][0]['transcript'] | ||
if text: | ||
print "===================" | ||
print "JASPER: " + text | ||
print "===================" | ||
return text | ||
except Exception: | ||
traceback.print_exc() | ||
|
||
""" | ||
Returns a Speech-To-Text engine. | ||
|
||
If api_key is not supplied, Jasper will rely on the PocketSphinx STT engine for | ||
audio transcription. | ||
|
||
If api_key is supplied, Jasper will use the Google Speech API for transcribing | ||
audio while in the active listen phase. Jasper will continue to rely on the | ||
PocketSphinx engine during the passive listen phase, as the Google Speech API | ||
is rate limited to 50 requests/day. | ||
|
||
Arguments: | ||
api_key - if supplied, Jasper will use the Google Speech API for transcribing | ||
audio in the active listen phase. | ||
|
||
""" | ||
def newSTTEngine(api_key = None): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: throughout the codebase, we've adopted the styling that keyword arguments should:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BTW: this will also have to change if the above comments are addressed. Let's try to make it less closely tied to these specific speech engines; maybe there should be two arguments:
|
||
if api_key: | ||
return GoogleSTT(api_key) | ||
else: | ||
return PocketSphinxSTT() |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,17 +29,19 @@ def setUp(self): | |
self.time_clip = "../static/audio/time.wav" | ||
|
||
from mic import Mic | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: this line can be removed, I think? |
||
self.m = Mic(speaker.newSpeaker(), "languagemodel.lm", "dictionary.dic", | ||
"languagemodel_persona.lm", "dictionary_persona.dic") | ||
from stt import PocketSphinxSTT | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: probably don't need a newline here. |
||
self.stt = PocketSphinxSTT() | ||
|
||
def testTranscribeJasper(self): | ||
"""Does Jasper recognize his name (i.e., passive listen)?""" | ||
transcription = self.m.transcribe(self.jasper_clip, PERSONA_ONLY=True) | ||
transcription = self.stt.transcribe(self.jasper_clip, PERSONA_ONLY=True) | ||
self.assertTrue("JASPER" in transcription) | ||
|
||
def testTranscribe(self): | ||
"""Does Jasper recognize 'time' (i.e., active listen)?""" | ||
transcription = self.m.transcribe(self.time_clip) | ||
transcription = self.stt.transcribe(self.time_clip) | ||
print transcription | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: remove this line? |
||
self.assertTrue("TIME" in transcription) | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we combine these into a single import? I prefer that practice, and all it requires is that line 32 use
stt.PocketSphinxSTT()
.