
Modularized STT implementation #118

Merged: 8 commits, Aug 10, 2014
2 changes: 1 addition & 1 deletion client/local_mic.py
@@ -8,7 +8,7 @@
class Mic:
    prev = None

    def __init__(self, speaker, lmd, dictd, lmd_persona, dictd_persona):
    def __init__(self, speaker, passive_stt_engine, active_stt_engine):
        return

    def passiveListen(self, PERSONA):
11 changes: 9 additions & 2 deletions client/main.py
@@ -1,6 +1,8 @@
import yaml
import sys
import speaker
import stt

Review comment: Can we combine these into a single import? I prefer that practice, and all it requires is that line 32 use stt.PocketSphinxSTT().
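Concretely, the suggestion would drop the from-import and route everything through the module namespace, roughly like this (a sketch of the reviewer's proposal, not code in this PR):

    import stt

    # a single import: both the engine class and the factory are
    # reached through the module namespace
    mic = Mic(speaker.newSpeaker(), stt.PocketSphinxSTT(), stt.newSTTEngine(google_api_key))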

from stt import PocketSphinxSTT
from conversation import Conversation


@@ -21,8 +23,13 @@ def isLocal():

profile = yaml.safe_load(open("profile.yml", "r"))

mic = Mic(speaker.newSpeaker(), "languagemodel.lm", "dictionary.dic",
          "languagemodel_persona.lm", "dictionary_persona.dic")
try:

Review comment: Rather than have it default to the Google STT if the API key is present, I'd prefer it if the desired STT were an enum in the profile itself. For example, we might add an stt_engine key to the profile and allow it to take the values "google" and "sphinx" (it should probably default to "sphinx" if not specified, regardless of the "google_api_key" value).

In addition, we'd probably want to add a method to stt.py that allows us to generate a newSTTEngineOfType(profile['stt_engine']). That would have the benefit of removing this logic from main.py and bundling it all in stt.py.

Review comment: Also: can we have google_api_key nested under a keys dict in the profile? (This would mirror how we've presented the FB key integration in the docs.)
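Taken together with the previous comment, the proposed profile.yml shape might look like the following (a hypothetical sketch; neither key is produced by populate.py yet, and the names are the reviewers' proposals):

    stt_engine: google          # or "sphinx"; would default to "sphinx" when omitted
    keys:
        google_api_key: YOUR_API_KEY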

    google_api_key = profile['google_api_key']
except KeyError:
    print "Google STT API Key not present in profile - defaulting to PocketSphinx..."
    google_api_key = None

mic = Mic(speaker.newSpeaker(), PocketSphinxSTT(), stt.newSTTEngine(google_api_key))

addendum = ""
if 'first_name' in profile:
63 changes: 10 additions & 53 deletions client/mic.py
@@ -10,66 +10,23 @@
import alteration


# quirky bug where first import doesn't work
try:
    import pocketsphinx as ps
except:
    import pocketsphinx as ps


class Mic:

    speechRec = None
    speechRec_persona = None

    def __init__(self, speaker, lmd, dictd, lmd_persona, dictd_persona, lmd_music=None, dictd_music=None):
    def __init__(self, speaker, passive_stt_engine, active_stt_engine):

Review comment: Sweet! This is much cleaner and clearer than before.

"""
Initiates the pocketsphinx instance.

Arguments:
speaker -- handles platform-independent audio output
lmd -- filename of the full language model
dictd -- filename of the full dictionary (.dic)
lmd_persona -- filename of the 'Persona' language model (containing, e.g., 'Jasper')
dictd_persona -- filename of the 'Persona' dictionary (.dic)
passive_stt_engine -- performs STT while Jasper is in passive listen mode
acive_stt_engine -- performs STT while Jasper is in active listen mode
"""
self.speaker = speaker
hmdir = "/usr/local/share/pocketsphinx/model/hmm/en_US/hub4wsj_sc_8k"

if lmd_music and dictd_music:
self.speechRec_music = ps.Decoder(hmm = hmdir, lm = lmd_music, dict = dictd_music)
self.speechRec_persona = ps.Decoder(
hmm=hmdir, lm=lmd_persona, dict=dictd_persona)
self.speechRec = ps.Decoder(hmm=hmdir, lm=lmd, dict=dictd)

    def transcribe(self, audio_file_path, PERSONA_ONLY=False, MUSIC=False):
        """
        Performs TTS, transcribing an audio file and returning the result.

        Arguments:
        audio_file_path -- the path to the audio file to-be transcribed
        PERSONA_ONLY -- if True, uses the 'Persona' language model and dictionary
        MUSIC -- if True, uses the 'Music' language model and dictionary
        """

        wavFile = file(audio_file_path, 'rb')
        wavFile.seek(44)

        if MUSIC:
            self.speechRec_music.decode_raw(wavFile)
            result = self.speechRec_music.get_hyp()
        elif PERSONA_ONLY:
            self.speechRec_persona.decode_raw(wavFile)
            result = self.speechRec_persona.get_hyp()
        else:
            self.speechRec.decode_raw(wavFile)
            result = self.speechRec.get_hyp()

        print "==================="
        print "JASPER: " + result[0]
        print "==================="

        return result[0]
        self.passive_stt_engine = passive_stt_engine
        self.active_stt_engine = active_stt_engine

    def getScore(self, data):
        rms = audioop.rms(data, 2)
@@ -210,7 +167,7 @@ def passiveListen(self, PERSONA):
        write_frames.close()

        # check if PERSONA was said
        transcribed = self.transcribe(AUDIO_FILE, PERSONA_ONLY=True)
        transcribed = self.passive_stt_engine.transcribe(AUDIO_FILE, PERSONA_ONLY=True)

        if PERSONA in transcribed:
            return (THRESHOLD, PERSONA)
@@ -223,7 +180,7 @@ def activeListen(self, THRESHOLD=None, LISTEN=True, MUSIC=False):
"""

AUDIO_FILE = "active.wav"
RATE = 16000
RATE = 44100
CHUNK = 1024
LISTEN_TIME = 12

@@ -232,7 +189,7 @@ def activeListen(self, THRESHOLD=None, LISTEN=True, MUSIC=False):
        if not os.path.exists(AUDIO_FILE):
            return None

        return self.transcribe(AUDIO_FILE)
        return self.active_stt_engine.transcribe(AUDIO_FILE)

        # check if no threshold provided
        if THRESHOLD == None:
@@ -285,9 +242,9 @@ def activeListen(self, THRESHOLD=None, LISTEN=True, MUSIC=False):
        # os.system("sox "+AUDIO_FILE+" temp.wav vol 20dB")

        if MUSIC:

Review comment: Couldn't all of this logic just be return self.active_stt_engine.transcribe(AUDIO_FILE, MUSIC=MUSIC)? (This was an issue with the existing logic too, I now realize, but it seems like an appropriate time to clean it up.)

            return self.transcribe(AUDIO_FILE, MUSIC=True)
            return self.active_stt_engine.transcribe(AUDIO_FILE, MUSIC=True)

        return self.transcribe(AUDIO_FILE)
        return self.active_stt_engine.transcribe(AUDIO_FILE)

    def say(self, phrase, OPTIONS=" -vdefault+m3 -p 40 -s 160 --stdout > say.wav"):
        # alter phrase before speaking
3 changes: 3 additions & 0 deletions client/populate.py
@@ -84,6 +84,9 @@ def verifyLocation(place):
response = raw_input("Please choose email (E) or text message (T): ")
profile['prefers_email'] = (response == 'E')

print ("\nIf you wish to depend on the Google Speech To Text API, please enter your API key, or leave blank to use Jasper's default speech to text implementation.")
simple_request('google_api_key', 'API Key')

Review comment: Just noting that this will have to change too if the above comments are addressed.


# write to profile
print("Writing to profile...")
outputFile = open("profile.yml", "w")
155 changes: 155 additions & 0 deletions client/stt.py
@@ -0,0 +1,155 @@
import os
import traceback
import json
import urllib2

"""
The default Speech-To-Text implementation which relies on PocketSphinx.
"""
class PocketSphinxSTT(object):

    def __init__(self, lmd="languagemodel.lm", dictd="dictionary.dic",
                 lmd_persona="languagemodel_persona.lm", dictd_persona="dictionary_persona.dic",
                 lmd_music=None, dictd_music=None):
"""
Initiates the pocketsphinx instance.

Arguments:
speaker -- handles platform-independent audio output
lmd -- filename of the full language model
dictd -- filename of the full dictionary (.dic)
lmd_persona -- filename of the 'Persona' language model (containing, e.g., 'Jasper')
dictd_persona -- filename of the 'Persona' dictionary (.dic)
"""

# quirky bug where first import doesn't work
try:
import pocketsphinx as ps
except:
import pocketsphinx as ps

hmdir = "/usr/local/share/pocketsphinx/model/hmm/en_US/hub4wsj_sc_8k"

if lmd_music and dictd_music:
self.speechRec_music = ps.Decoder(hmm = hmdir, lm = lmd_music, dict = dictd_music)
self.speechRec_persona = ps.Decoder(
hmm=hmdir, lm=lmd_persona, dict=dictd_persona)
self.speechRec = ps.Decoder(hmm=hmdir, lm=lmd, dict=dictd)

    def transcribe(self, audio_file_path, PERSONA_ONLY=False, MUSIC=False):
        """
        Performs STT, transcribing an audio file and returning the result.

        Arguments:
        audio_file_path -- the path to the audio file to-be transcribed
        PERSONA_ONLY -- if True, uses the 'Persona' language model and dictionary
        MUSIC -- if True, uses the 'Music' language model and dictionary
        """

        wavFile = file(audio_file_path, 'rb')
        wavFile.seek(44)  # skip the 44-byte RIFF/WAVE header; the decoder wants raw samples

        if MUSIC:
            self.speechRec_music.decode_raw(wavFile)
            result = self.speechRec_music.get_hyp()
        elif PERSONA_ONLY:
            self.speechRec_persona.decode_raw(wavFile)
            result = self.speechRec_persona.get_hyp()
        else:
            self.speechRec.decode_raw(wavFile)
            result = self.speechRec.get_hyp()

        print "==================="
        print "JASPER: " + result[0]
        print "==================="

        return result[0]

"""
Speech-To-Text implementation which relies on the Google Speech API.

This implementation requires a Google API key to be present in profile.yml

To obtain an API key:
1. Join the Chromium Dev group: https://groups.google.com/a/chromium.org/forum/?fromgroups#!forum/chromium-dev
2. Create a project through the Google Developers console: https://console.developers.google.com/project
3. Select your project. In the sidebar, navigate to "APIs & Auth." Activate the Speech API.
4. Under "APIs & Auth," navigate to "Credentials." Create a new key for public API access.
5. Copy your API key and run client/populate.py. When prompted, paste this key for access to the Speech API.

This implementation also requires that the avconv audio utility be present on your $PATH. On RPi, simply run:
sudo apt-get install avconv
"""
class GoogleSTT(object):

    RATE = 44100

    def __init__(self, api_key):
        """
        Arguments:
        api_key -- the public api key which allows access to Google APIs
        """

        self.api_key = api_key
        self.audio_tool = None
        # prefer avconv, but fall back to ffmpeg if that's what is installed
        for tool in ("avconv", "ffmpeg"):
            if os.system("which %s" % tool) == 0:
                self.audio_tool = tool
                break
        if not self.audio_tool:
            raise Exception("Could not find an audio tool to convert .wav files to .flac")

    def transcribe(self, audio_file_path):
        """
        Performs STT via the Google Speech API, transcribing an audio file
        and returning an English string.

        Arguments:
        audio_file_path -- the path to the audio file to-be transcribed
        """

        AUDIO_FILE_FLAC = "active.flac"
        # convert the recorded wav to flac at the sample rate the API expects
        os.system("%s -y -i %s -f flac -ar 44100 %s" % (self.audio_tool, audio_file_path, AUDIO_FILE_FLAC))

        url = "https://www.google.com/speech-api/v2/recognize?output=json&client=chromium&key=%s&lang=%s&maxresults=6&pfilter=2" % (self.api_key, "en-us")
        flac = open(AUDIO_FILE_FLAC, 'rb')
        data = flac.read()
        flac.close()
        try:
            req = urllib2.Request(
                url,
                data=data,
                headers={
                    'Content-type': 'audio/x-flac; rate=%s' % GoogleSTT.RATE})
            response_url = urllib2.urlopen(req)
            response_read = response_url.read()
            response_read = response_read.decode('utf-8')
            # the v2 API streams one JSON object per line; the first line is an
            # empty result set, so the actual hypothesis sits on the second line
            decoded = json.loads(response_read.split("\n")[1])
            print response_read
            text = decoded['result'][0]['alternative'][0]['transcript']
            if text:
                print "==================="
                print "JASPER: " + text
                print "==================="
            return text
        except Exception:
            traceback.print_exc()

"""
Returns a Speech-To-Text engine.

If api_key is not supplied, Jasper will rely on the PocketSphinx STT engine for
audio transcription.

If api_key is supplied, Jasper will use the Google Speech API for transcribing
audio while in the active listen phase. Jasper will continue to rely on the
PocketSphinx engine during the passive listen phase, as the Google Speech API
is rate limited to 50 requests/day.

Arguments:
api_key - if supplied, Jasper will use the Google Speech API for transcribing
audio in the active listen phase.

"""
def newSTTEngine(api_key = None):

Review comment: Nit: throughout the codebase, we've adopted the styling that keyword arguments should:

  1. Be in all caps (e.g., API_KEY).
  2. Have no spaces between the equals signs and their values (e.g., API_KEY=None).

Review comment: BTW: this will also have to change if the above comments are addressed. Let's try to make it less closely tied to these specific speech engines; maybe there should be two arguments:

  1. type: the 'type' of STT engine (e.g., "Google" or "Sphinx").
  2. engine_args: a list of arguments that will be passed to the init function of the STT engine. For example, you would then call newSTTEngineOfType('google', ['abcdef...']), where 'abcdef...' is your API key. A sketch of that shape follows the current implementation below.

    if api_key:
        return GoogleSTT(api_key)
    else:
        return PocketSphinxSTT()
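To make the reviewer's newSTTEngineOfType proposal concrete, here is a minimal sketch of the two-argument factory (hypothetical: the function, the 'sphinx'/'google' type names, and engine_args are the reviewer's suggestions, not code in this PR):

    def newSTTEngineOfType(type, engine_args=()):
        # map a profile 'stt_engine' value onto an engine class;
        # defaulting callers to 'sphinx' would preserve the old behaviour
        engines = {'sphinx': PocketSphinxSTT, 'google': GoogleSTT}
        if type not in engines:
            raise ValueError("Unknown STT engine type: %s" % type)
        return engines[type](*engine_args)

main.py would then call newSTTEngineOfType('google', [google_api_key]) and keep no engine-selection logic of its own.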
10 changes: 6 additions & 4 deletions client/test.py
@@ -29,17 +29,19 @@ def setUp(self):
        self.time_clip = "../static/audio/time.wav"

        from mic import Mic

Review comment: Nit: this line can be removed, I think?

        self.m = Mic(speaker.newSpeaker(), "languagemodel.lm", "dictionary.dic",
                     "languagemodel_persona.lm", "dictionary_persona.dic")
        from stt import PocketSphinxSTT

Review comment: Nit: probably don't need a newline here.

        self.stt = PocketSphinxSTT()

    def testTranscribeJasper(self):
        """Does Jasper recognize his name (i.e., passive listen)?"""
        transcription = self.m.transcribe(self.jasper_clip, PERSONA_ONLY=True)
        transcription = self.stt.transcribe(self.jasper_clip, PERSONA_ONLY=True)
        self.assertTrue("JASPER" in transcription)

    def testTranscribe(self):
        """Does Jasper recognize 'time' (i.e., active listen)?"""
        transcription = self.m.transcribe(self.time_clip)
        transcription = self.stt.transcribe(self.time_clip)
        print transcription

Review comment: Nit: remove this line?

        self.assertTrue("TIME" in transcription)

