
Modularized STT implementation #118

Merged: 8 commits, Aug 10, 2014
2 changes: 1 addition & 1 deletion client/local_mic.py
@@ -8,7 +8,7 @@
class Mic:
    prev = None

    def __init__(self, speaker, lmd, dictd, lmd_persona, dictd_persona):
    def __init__(self, speaker, passive_stt_engine, active_stt_engine):
        return

    def passiveListen(self, PERSONA):
11 changes: 9 additions & 2 deletions client/main.py
@@ -1,6 +1,8 @@
import yaml
import sys
import speaker
import stt

Review comment: Can we combine these into a single import? I prefer that practice, and all it requires is that line 32 use stt.PocketSphinxSTT().
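Concretely, the suggestion would drop the from-import and route everything through the module namespace, roughly like this (a sketch of the reviewer's proposal, not code in this PR):

    import stt

    # a single import: both the engine class and the factory are
    # reached through the module namespace
    mic = Mic(speaker.newSpeaker(), stt.PocketSphinxSTT(), stt.newSTTEngine(google_api_key))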

from stt import PocketSphinxSTT
from conversation import Conversation


@@ -21,8 +23,13 @@ def isLocal():

profile = yaml.safe_load(open("profile.yml", "r"))

mic = Mic(speaker.newSpeaker(), "languagemodel.lm", "dictionary.dic",
          "languagemodel_persona.lm", "dictionary_persona.dic")
try:

Review comment: Rather than have it default to the Google STT if the API key is present, I'd prefer it if the desired STT were an enum in the profile itself. For example, we might add an stt_engine key to the profile and allow it to take the values "google" and "sphinx" (it should probably default to "sphinx" if not specified, regardless of the "google_api_key" value).

In addition, we'd probably want to add a method to stt.py that allows us to generate a newSTTEngineOfType(profile['stt_engine']). That would have the benefit of removing this logic from main.py and bundling it all in stt.py.

Review comment: Also: can we have google_api_key nested under a keys dict in the profile? (This would mirror how we've presented the FB key integration in the docs.)
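Taken together with the previous comment, the proposed profile.yml shape might look like the following (a hypothetical sketch; neither key is produced by populate.py yet, and the names are the reviewers' proposals):

    stt_engine: google          # or "sphinx"; would default to "sphinx" when omitted
    keys:
        google_api_key: YOUR_API_KEY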

    google_api_key = profile['google_api_key']
except KeyError:
    print "Google STT API Key not present in profile - defaulting to PocketSphinx..."
    google_api_key = None

mic = Mic(speaker.newSpeaker(), PocketSphinxSTT(), stt.newSTTEngine(google_api_key))

addendum = ""
if 'first_name' in profile:
63 changes: 10 additions & 53 deletions client/mic.py
@@ -10,66 +10,23 @@
import alteration


# quirky bug where first import doesn't work
try:
    import pocketsphinx as ps
except:
    import pocketsphinx as ps


class Mic:

    speechRec = None
    speechRec_persona = None

    def __init__(self, speaker, lmd, dictd, lmd_persona, dictd_persona, lmd_music=None, dictd_music=None):
    def __init__(self, speaker, passive_stt_engine, active_stt_engine):

Review comment: Sweet! This is much cleaner and clearer than before.

"""
Initiates the pocketsphinx instance.

Arguments:
speaker -- handles platform-independent audio output
lmd -- filename of the full language model
dictd -- filename of the full dictionary (.dic)
lmd_persona -- filename of the 'Persona' language model (containing, e.g., 'Jasper')
dictd_persona -- filename of the 'Persona' dictionary (.dic)
passive_stt_engine -- performs STT while Jasper is in passive listen mode
acive_stt_engine -- performs STT while Jasper is in active listen mode
"""
self.speaker = speaker
hmdir = "/usr/local/share/pocketsphinx/model/hmm/en_US/hub4wsj_sc_8k"

if lmd_music and dictd_music:
self.speechRec_music = ps.Decoder(hmm = hmdir, lm = lmd_music, dict = dictd_music)
self.speechRec_persona = ps.Decoder(
hmm=hmdir, lm=lmd_persona, dict=dictd_persona)
self.speechRec = ps.Decoder(hmm=hmdir, lm=lmd, dict=dictd)

    def transcribe(self, audio_file_path, PERSONA_ONLY=False, MUSIC=False):
        """
        Performs TTS, transcribing an audio file and returning the result.

        Arguments:
        audio_file_path -- the path to the audio file to-be transcribed
        PERSONA_ONLY -- if True, uses the 'Persona' language model and dictionary
        MUSIC -- if True, uses the 'Music' language model and dictionary
        """

        wavFile = file(audio_file_path, 'rb')
        wavFile.seek(44)

        if MUSIC:
            self.speechRec_music.decode_raw(wavFile)
            result = self.speechRec_music.get_hyp()
        elif PERSONA_ONLY:
            self.speechRec_persona.decode_raw(wavFile)
            result = self.speechRec_persona.get_hyp()
        else:
            self.speechRec.decode_raw(wavFile)
            result = self.speechRec.get_hyp()

        print "==================="
        print "JASPER: " + result[0]
        print "==================="

        return result[0]
        self.passive_stt_engine = passive_stt_engine
        self.active_stt_engine = active_stt_engine

    def getScore(self, data):
        rms = audioop.rms(data, 2)
@@ -210,7 +167,7 @@ def passiveListen(self, PERSONA):
        write_frames.close()

        # check if PERSONA was said
        transcribed = self.transcribe(AUDIO_FILE, PERSONA_ONLY=True)
        transcribed = self.passive_stt_engine.transcribe(AUDIO_FILE, PERSONA_ONLY=True)

        if PERSONA in transcribed:
            return (THRESHOLD, PERSONA)
@@ -223,7 +180,7 @@ def activeListen(self, THRESHOLD=None, LISTEN=True, MUSIC=False):
"""

AUDIO_FILE = "active.wav"
RATE = 16000
RATE = 44100
CHUNK = 1024
LISTEN_TIME = 12

@@ -232,7 +189,7 @@ def activeListen(self, THRESHOLD=None, LISTEN=True, MUSIC=False):
        if not os.path.exists(AUDIO_FILE):
            return None

        return self.transcribe(AUDIO_FILE)
        return self.active_stt_engine.transcribe(AUDIO_FILE)

        # check if no threshold provided
        if THRESHOLD == None:
@@ -285,9 +242,9 @@ def activeListen(self, THRESHOLD=None, LISTEN=True, MUSIC=False):
        # os.system("sox "+AUDIO_FILE+" temp.wav vol 20dB")

        if MUSIC:

Review comment: Couldn't all of this logic just be return self.active_stt_engine.transcribe(AUDIO_FILE, MUSIC=MUSIC)? (This was an issue with the existing logic too, I now realize, but it seems like an appropriate time to clean it up.)

            return self.transcribe(AUDIO_FILE, MUSIC=True)
            return self.active_stt_engine.transcribe(AUDIO_FILE, MUSIC=True)

        return self.transcribe(AUDIO_FILE)
        return self.active_stt_engine.transcribe(AUDIO_FILE)

    def say(self, phrase, OPTIONS=" -vdefault+m3 -p 40 -s 160 --stdout > say.wav"):
        # alter phrase before speaking
3 changes: 3 additions & 0 deletions client/populate.py
@@ -84,6 +84,9 @@ def verifyLocation(place):
response = raw_input("Please choose email (E) or text message (T): ")
profile['prefers_email'] = (response == 'E')

print ("\nIf you wish to depend on the Google Speech To Text API, please enter your API key, or leave blank to use Jasper's default speech to text implementation.")
simple_request('google_api_key', 'API Key')

Review comment: Just noting that this will have to change too if the above comments are addressed.


# write to profile
print("Writing to profile...")
outputFile = open("profile.yml", "w")
155 changes: 155 additions & 0 deletions client/stt.py
@@ -0,0 +1,155 @@
import os
import traceback
import json
import urllib2

"""
The default Speech-To-Text implementation which relies on PocketSphinx.
"""
class PocketSphinxSTT(object):

    def __init__(self, lmd="languagemodel.lm", dictd="dictionary.dic",
                 lmd_persona="languagemodel_persona.lm", dictd_persona="dictionary_persona.dic",
                 lmd_music=None, dictd_music=None):
"""
Initiates the pocketsphinx instance.

Arguments:
speaker -- handles platform-independent audio output
lmd -- filename of the full language model
dictd -- filename of the full dictionary (.dic)
lmd_persona -- filename of the 'Persona' language model (containing, e.g., 'Jasper')
dictd_persona -- filename of the 'Persona' dictionary (.dic)
"""

# quirky bug where first import doesn't work
try:
import pocketsphinx as ps
except:
import pocketsphinx as ps

hmdir = "/usr/local/share/pocketsphinx/model/hmm/en_US/hub4wsj_sc_8k"

if lmd_music and dictd_music:
self.speechRec_music = ps.Decoder(hmm = hmdir, lm = lmd_music, dict = dictd_music)
self.speechRec_persona = ps.Decoder(
hmm=hmdir, lm=lmd_persona, dict=dictd_persona)
self.speechRec = ps.Decoder(hmm=hmdir, lm=lmd, dict=dictd)

    def transcribe(self, audio_file_path, PERSONA_ONLY=False, MUSIC=False):
        """
        Performs STT, transcribing an audio file and returning the result.

        Arguments:
        audio_file_path -- the path to the audio file to-be transcribed
        PERSONA_ONLY -- if True, uses the 'Persona' language model and dictionary
        MUSIC -- if True, uses the 'Music' language model and dictionary
        """

        wavFile = file(audio_file_path, 'rb')
        wavFile.seek(44)  # skip the 44-byte RIFF/WAVE header; the decoder wants raw samples

        if MUSIC:
            self.speechRec_music.decode_raw(wavFile)
            result = self.speechRec_music.get_hyp()
        elif PERSONA_ONLY:
            self.speechRec_persona.decode_raw(wavFile)
            result = self.speechRec_persona.get_hyp()
        else:
            self.speechRec.decode_raw(wavFile)
            result = self.speechRec.get_hyp()

        print "==================="
        print "JASPER: " + result[0]
        print "==================="

        return result[0]

"""
Speech-To-Text implementation which relies on the Google Speech API.

This implementation requires a Google API key to be present in profile.yml

To obtain an API key:
1. Join the Chromium Dev group: https://groups.google.com/a/chromium.org/forum/?fromgroups#!forum/chromium-dev
2. Create a project through the Google Developers console: https://console.developers.google.com/project
3. Select your project. In the sidebar, navigate to "APIs & Auth." Activate the Speech API.
4. Under "APIs & Auth," navigate to "Credentials." Create a new key for public API access.
5. Copy your API key and run client/populate.py. When prompted, paste this key for access to the Speech API.

This implementation also requires that the avconv audio utility be present on your $PATH. On RPi, simply run:
sudo apt-get install avconv
"""
class GoogleSTT(object):

    RATE = 44100

    def __init__(self, api_key):
        """
        Arguments:
        api_key -- the public api key which allows access to Google APIs
        """

        self.api_key = api_key
        self.audio_tool = None
        # prefer avconv, but fall back to ffmpeg if that's what is installed
        for tool in ("avconv", "ffmpeg"):
            if os.system("which %s" % tool) == 0:
                self.audio_tool = tool
                break
        if not self.audio_tool:
            raise Exception("Could not find an audio tool to convert .wav files to .flac")

    def transcribe(self, audio_file_path):
        """
        Performs STT via the Google Speech API, transcribing an audio file
        and returning an English string.

        Arguments:
        audio_file_path -- the path to the audio file to-be transcribed
        """

        AUDIO_FILE_FLAC = "active.flac"
        # convert the recorded wav to flac at the sample rate the API expects
        os.system("%s -y -i %s -f flac -ar 44100 %s" % (self.audio_tool, audio_file_path, AUDIO_FILE_FLAC))

        url = "https://www.google.com/speech-api/v2/recognize?output=json&client=chromium&key=%s&lang=%s&maxresults=6&pfilter=2" % (self.api_key, "en-us")
        flac = open(AUDIO_FILE_FLAC, 'rb')
        data = flac.read()
        flac.close()
        try:
            req = urllib2.Request(
                url,
                data=data,
                headers={
                    'Content-type': 'audio/x-flac; rate=%s' % GoogleSTT.RATE})
            response_url = urllib2.urlopen(req)
            response_read = response_url.read()
            response_read = response_read.decode('utf-8')
            # the v2 API streams one JSON object per line; the first line is an
            # empty result set, so the actual hypothesis sits on the second line
            decoded = json.loads(response_read.split("\n")[1])
            print response_read
            text = decoded['result'][0]['alternative'][0]['transcript']
            if text:
                print "==================="
                print "JASPER: " + text
                print "==================="
            return text
        except Exception:
            traceback.print_exc()

"""
Returns a Speech-To-Text engine.

If api_key is not supplied, Jasper will rely on the PocketSphinx STT engine for
audio transcription.

If api_key is supplied, Jasper will use the Google Speech API for transcribing
audio while in the active listen phase. Jasper will continue to rely on the
PocketSphinx engine during the passive listen phase, as the Google Speech API
is rate limited to 50 requests/day.

Arguments:
api_key - if supplied, Jasper will use the Google Speech API for transcribing
audio in the active listen phase.

"""
def newSTTEngine(api_key = None):

Review comment: Nit: throughout the codebase, we've adopted the styling that keyword arguments should:

  1. Be in all caps (e.g., API_KEY).
  2. Have no spaces between the equals signs and their values (e.g., API_KEY=None).

Review comment: BTW: this will also have to change if the above comments are addressed. Let's try to make it less closely tied to these specific speech engines; maybe there should be two arguments:

  1. type: the 'type' of STT engine (e.g., "Google" or "Sphinx").
  2. engine_args: a list of arguments that will be passed to the init function of the STT engine. For example, you would then call newSTTEngineOfType('google', ['abcdef...']), where 'abcdef...' is your API key. A sketch of that shape follows the current implementation below.

    if api_key:
        return GoogleSTT(api_key)
    else:
        return PocketSphinxSTT()
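To make the reviewer's newSTTEngineOfType proposal concrete, here is a minimal sketch of the two-argument factory (hypothetical: the function, the 'sphinx'/'google' type names, and engine_args are the reviewer's suggestions, not code in this PR):

    def newSTTEngineOfType(type, engine_args=()):
        # map a profile 'stt_engine' value onto an engine class;
        # defaulting callers to 'sphinx' would preserve the old behaviour
        engines = {'sphinx': PocketSphinxSTT, 'google': GoogleSTT}
        if type not in engines:
            raise ValueError("Unknown STT engine type: %s" % type)
        return engines[type](*engine_args)

main.py would then call newSTTEngineOfType('google', [google_api_key]) and keep no engine-selection logic of its own.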
10 changes: 6 additions & 4 deletions client/test.py
@@ -29,17 +29,19 @@ def setUp(self):
        self.time_clip = "../static/audio/time.wav"

        from mic import Mic

Review comment: Nit: this line can be removed, I think?

        self.m = Mic(speaker.newSpeaker(), "languagemodel.lm", "dictionary.dic",
                     "languagemodel_persona.lm", "dictionary_persona.dic")
        from stt import PocketSphinxSTT

Review comment: Nit: probably don't need a newline here.

        self.stt = PocketSphinxSTT()

    def testTranscribeJasper(self):
        """Does Jasper recognize his name (i.e., passive listen)?"""
        transcription = self.m.transcribe(self.jasper_clip, PERSONA_ONLY=True)
        transcription = self.stt.transcribe(self.jasper_clip, PERSONA_ONLY=True)
        self.assertTrue("JASPER" in transcription)

    def testTranscribe(self):
        """Does Jasper recognize 'time' (i.e., active listen)?"""
        transcription = self.m.transcribe(self.time_clip)
        transcription = self.stt.transcribe(self.time_clip)
        print transcription

Review comment: Nit: remove this line?

        self.assertTrue("TIME" in transcription)

