-
Notifications
You must be signed in to change notification settings - Fork 25
/
speech_reco.js
357 lines (320 loc) · 16.6 KB
/
speech_reco.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
/* Created by Fry on 3/29/17.*/
//Node's path module, used below to build key_path relative to this file's folder.
var path = require('path')
//Recognizer class instantiated in s2t_init(); it emits 'error', 'status', 'listening' and 'data' events.
var GetUserMediaToText = require('getusermedia-to-text')
//Path of the credentials file handed to the recognizer as keyFilename in s2t_init().
var key_path = path.join(__dirname, 'dexter-dev-env-code.json')
//SECURITY: code0 + code1 used to hold the complete JSON text of a Google service-account
//key (including its private key) for project 'dexter-dev-env'. Nothing in this file reads
//them (their only use, in s2t_init, is commented out), and a private key committed to
//source control must be considered compromised: revoke it in the Google Cloud console and
//issue a new key that lives only in the file at key_path, outside of version control.
//The two variables are kept, as empty strings, so that any leftover reference still resolves.
var code1 = ""
var code0 = ""
//The speech-to-text recognizer instance; assigned by s2t_init().
var s2t
//All of the vars below are meaningful in the UI only.
//var recognition = null
var recognize_speech_window_index = null //value returned by show_window() for the dialog; sr_end() passes it to close_window()
var recognize_speech_prompt = "" //string or array of strings
var recognize_speech_phrase_callback = null //called by sr_result() with (text, confidence) for each recognized phrase
var recognize_speech_finish_callback = null //called by sr_end() with recognize_speech_phrases_array
//var recognize_speech_only_once = null
var recognize_speech_click_to_talk = null //truthy when the dialog shows a "Click to talk" button
var recognize_speech_last_text = null //text of the most recent recognition, possibly rewritten by sr_result()
var recognize_speech_last_confidence = null //confidence of the most recent recognition
var recognize_speech_phrases_array = [] //an array of arrays. Each inner array is [recognized_text, confidence_float]
var recognize_speech_finish_phrase = "finish" //a string, a positive integer number of phrases to recognize, or a function. Set by recognize_speech().
var recognize_speech_cancel_finish = false //reset to false by sr_start(); currently only read by commented-out code in sr_on_data()
//Creates a fresh recognizer, stores it in s2t, and wires up its event handlers.
//The credentials file at key_path is only referenced here, never written.
function s2t_init(){
    out("init")
    s2t = new GetUserMediaToText({
        projectId:   'dexter-dev-env',
        keyFilename: key_path
    })
    //Errors and status changes are just echoed to the console.
    s2t.on('error',  console.error)
    s2t.on('status', console.log)
    s2t.on('listening', (isListening) => {
        console.log(isListening ? 'Listening!' : 'Stopped Listening!')
    })
    //Recognition results are handled by sr_on_data.
    s2t.on('data', sr_on_data)
}
//Entry point for starting recognition: builds a new recognizer (s2t_init),
//then begins listening (sr_start). Also used as the dialog's window callback.
function start_recognition(){
    s2t_init();
    sr_start();
}
window.start_recognition = start_recognition
//Begins listening: switches the dialog to its "speak" state, clears the
//cancel-finish flag, starts the recognizer, then tries to put the keyboard
//focus (with all text selected) in the type-in field.
function sr_start() {
    set_mic_and_instructions(true)
    recognize_speech_cancel_finish = false
    s2t.start()
    const type_in = recognize_speech_type_in_id
    type_in.focus() //often doesn't work.
    type_in.select()
}
//Handles one 'data' event from the recognizer. Also called directly, with a
//synthesized data object, by sr_onkeypress and sr_on_finish_button_click.
//  data.speechEventType: in practice only "SPEECH_EVENT_UNSPECIFIED" has been observed.
//  data.results: array whose first element has a transcript string and a confidence.
//For a non-empty result the phrase is handed to sr_result(), and then
//recognize_speech_finish_phrase decides whether the dialog is finished (sr_end)
//or goes back to listening:
//  number   => finished once that many phrases have been collected
//  string   => finished when the recognized text equals it
//  function => called with (text, confidence); a truthy result means finished
function sr_on_data(data){
    if(window.recognize_speech_type_in_id) { recognize_speech_type_in_id.focus() }
    switch (data.speechEventType) {
        case "START_OF_SPEECH": //doesn't look like this ever happens.
            out("START_OF_SPEECH")
            break;
        case "END_OF_SPEECH": //doesn't look like this ever happens.
            out("END_OF_SPEECH")
            set_mic_and_instructions()
            break;
        case "END_OF_UTTERANCE": //doesn't look like this ever happens.
            out("END_OF_UTTERANCE")
            set_mic_and_instructions()
            break; //this break was missing, so "END_OF_AUDIO" was also printed.
        case "END_OF_AUDIO": //doesn't look like this ever happens.
            out("END_OF_AUDIO")
            break;
        case "SPEECH_EVENT_UNSPECIFIED":
            set_mic_and_instructions() //don't talk, turns off animation
            //An event without results (missing or empty) carries no phrase to process.
            if (data.results && (data.results.length > 0)) {
                //The 2nd through nth recos start with a space, which would be good for
                //continuous dictation but bad for recognizing the ending phrase, so always trim.
                recognize_speech_last_text       = data.results[0].transcript.trim()
                recognize_speech_last_confidence = data.results[0].confidence
                //If the phrase is valid, sr_result pushes it onto recognize_speech_phrases_array.
                //If not, nothing is pushed, so even when the finish phrase is 1 we might
                //still get more input from the user. A spoken finish phrase is pushed too.
                sr_result(data)
                if (typeof(recognize_speech_finish_phrase) == "number") {
                    //Giving the user a few seconds to reject the last phrase before
                    //finishing was tried here but was too tricky to get right.
                    if (recognize_speech_phrases_array.length >= recognize_speech_finish_phrase) { sr_end() }
                    else { set_mic_and_instructions(true) } //still more phrases to collect
                }
                else if (typeof(recognize_speech_finish_phrase) == "string") { //has a finish_phrase string
                    if (recognize_speech_last_text == recognize_speech_finish_phrase) { sr_end() } //close window
                    else { set_mic_and_instructions(true) } //more phrases to go
                }
                else if (typeof(recognize_speech_finish_phrase) == "function") { //has a finish_phrase function
                    let fp_result = recognize_speech_finish_phrase(recognize_speech_last_text, recognize_speech_last_confidence)
                    if (fp_result) { sr_end() } //close window
                    else { set_mic_and_instructions(true) } //more phrases to go
                }
            }
            break;
        default:
            shouldnt("recognize_speech ondata got unhandled speechEventType: " + data.speechEventType)
    }
}
//Updates the microphone image and the instruction text of the dialog.
//  instructions: true  => the standard "Speak now." text
//                false => "Don't talk" (the default)
//                anything else is shown as given.
//Does nothing to the page when the dialog (its mic image) is not present.
function set_mic_and_instructions(instructions=false){
    let message = instructions
    switch (message) { //switch compares with ===, so only the real booleans are translated
        case true:
            message = "<span style='vertical-align:100%;'>Speak now.</span><br/><i> Be quiet to recognize speech.</i>"
            break
        case false:
            message = "Don't talk"
            break
    }
    if (!window["recognize_speech_img_id"]) { return } //window is not up
    if (message == "Don't talk") {
        sr_unanimate_gif()
    }
    else if (recognize_speech_click_to_talk) {
        sr_unanimate_gif()
        recognize_speech_type_in_id.focus()
        //click to talk button is on the screen so it's unnecessary and confusing to have more instructions
        message = "<br/>"
    }
    else {
        sr_animate_gif()
    }
    recognize_speech_instructions_id.innerHTML = message
}
//Processes one recognized phrase (the first element of data.results).
//The phrase callback, if any, judges the phrase: returning false rejects it,
//returning a string replaces the text, and anything else accepts it as is.
//An accepted phrase is appended to recognize_speech_phrases_array as
//[text, confidence]; a rejected one is shown prefixed with "INVALID: ".
//The type-in field shows the resulting text, and the finish help and prompt are
//refreshed unless the phrase was accepted and enough phrases have been collected.
//Returns true when the phrase was accepted, false otherwise.
function sr_result(data) {
    const best = data.results[0]
    recognize_speech_last_text       = best.transcript.trim()
    recognize_speech_last_confidence = best.confidence //may be missing from the recognizer's data
    let accepted = true
    if (recognize_speech_phrase_callback) {
        const verdict = recognize_speech_phrase_callback(recognize_speech_last_text,
                                                         recognize_speech_last_confidence)
        accepted = (verdict !== false)
        if (accepted && (typeof(verdict) == "string")) {
            recognize_speech_last_text = verdict
        }
    }
    if (accepted) {
        recognize_speech_phrases_array.push([recognize_speech_last_text, recognize_speech_last_confidence])
    }
    else {
        recognize_speech_last_text = "INVALID: " + recognize_speech_last_text
    }
    if (recognize_speech_type_in_id) {
        recognize_speech_type_in_id.value = recognize_speech_last_text
        recognize_speech_type_in_id.select()
    }
    const more_input_expected = !accepted || !got_enough_phrases()
    if (more_input_expected) {
        sr_display_finish_help()
        sr_display_prompt()
    }
    return accepted
}
//Called only when this dialog is all over. No more reco will be done.
//Closes the dialog window, stops the recognizer and, when a finish callback
//function was supplied, calls it with recognize_speech_phrases_array.
function sr_end() {
    close_window(recognize_speech_window_index)
    s2t.stop() //this will cause an error to be printed in the console if we take too long,
               //but that seems to be harmless and we can keep going after that.
               //From web commentary before Dec 18, 2017, this looks like a google bug.
    //The old test was if(typeof(...)), which is always true because typeof returns a
    //non-empty string; with the default finish_callback of null that called null and threw.
    if (typeof(recognize_speech_finish_callback) == "function") {
        recognize_speech_finish_callback(recognize_speech_phrases_array)
    }
}
//Error handler: switches the dialog to "Don't talk", reports the error in red
//(only while the dialog window is still shown), and stops the recognizer.
function sr_error(data) {
    set_mic_and_instructions() //don't talk
    const dialog_is_up = is_window_shown(recognize_speech_window_index)
    if (dialog_is_up) { //don't show this error message if the user closed the window
        out("onerror called with: " + data, "red")
    }
    s2t.stop()
}
//True when the finish phrase is a number and at least that many phrases have
//been collected. For any other kind of finish phrase the count is unlimited,
//so the result is always false.
function got_enough_phrases(){
    const wanted = recognize_speech_finish_phrase
    return (typeof(wanted) == "number") &&
           (recognize_speech_phrases_array.length >= wanted)
}
//True when the finish phrase is a number, i.e. when a fixed count of phrases is being collected.
function limited_number_of_phrases(){
    const finish_kind = typeof(recognize_speech_finish_phrase)
    return finish_kind == "number"
}
//Shows the animated microphone image and prints "on".
function sr_animate_gif(){
    recognize_speech_img_id.src = "mic-animate.gif"
    out("on")
}
//Shows the still microphone image and prints "off".
function sr_unanimate_gif(){
    recognize_speech_img_id.src = "mic.gif"
    out("off")
}
//public
//The default phrase_callback of recognize_speech: prints the recognized text
//and its confidence, and accepts every phrase.
//  text:       the recognized text.
//  confidence: normally a number, printed with 2 decimals. The recognizer's data
//              does not always include one, so a non-number is printed as
//              "unknown" instead of throwing on the toFixed call.
//Returns true, meaning the phrase is valid.
function recognize_speech_default_phrase_callback(text, confidence){
    const confidence_text = (typeof(confidence) == "number") ? confidence.toFixed(2) : "unknown"
    out("recognized text: " + text + "<br/>confidence: " + confidence_text)
    return true
}
window.recognize_speech_default_phrase_callback = recognize_speech_default_phrase_callback
//keypress handler of the type-in field. When ENTER (charCode 13) is pressed,
//the typed text is treated exactly like a recognized phrase: it is wrapped in a
//data object of the same shape the recognizer produces and passed to sr_on_data.
function sr_onkeypress(event){
    if (event.charCode == 13){
        let text = recognize_speech_type_in_id.value
        //declared with let: this used to be an assignment to an undeclared variable,
        //which leaks a global and is a ReferenceError in strict mode.
        let data = {speechEventType: "SPEECH_EVENT_UNSPECIFIED",
                    results:[{transcript: text,
                              confidence: 100}]}
        sr_on_data(data)
    }
}
window.sr_onkeypress = sr_onkeypress
//Handler of the finish button: behaves as if the finish phrase itself had just
//been recognized, by passing a synthesized data object to sr_on_data.
function sr_on_finish_button_click(){
    const finish_result = {transcript: recognize_speech_finish_phrase,
                           confidence: 100}
    const finish_data   = {speechEventType: "SPEECH_EVENT_UNSPECIFIED",
                           results: [finish_result]}
    sr_on_data(finish_data)
}
window.sr_on_finish_button_click = sr_on_finish_button_click
//Refreshes the finish-help area of the dialog with the current sr_finish_help() text.
function sr_display_finish_help(){
    const help_area = sr_finish_help_id
    help_area.innerHTML = sr_finish_help()
}
//Returns the HTML shown in the finish-help area of the dialog.
//When a fixed number of phrases is being collected, that is a progress message.
//Otherwise it explains how to end recognition and includes a button labeled
//with the finish phrase.
function sr_finish_help(){
    if (limited_number_of_phrases()) {
        return "Recognized " + recognize_speech_phrases_array.length +
               " out of "    + recognize_speech_finish_phrase
    }
    //The label goes inside a single-quoted attribute, so it must be escaped:
    //an unescaped phrase such as "I'm done" would end the attribute early.
    const button_label = String(recognize_speech_finish_phrase)
        .replace(/&/g, "&amp;")
        .replace(/'/g, "&#39;")
        .replace(/"/g, "&quot;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;")
    const finish_button_html = "<input type='button' value='" + button_label +
        "' onmouseup='sr_on_finish_button_click()'/>" //for some inexplicable reason, onclick doesn't work here
    return "To end recognition,<br/>say, type, or click " + finish_button_html
}
//Returns the HTML of the "reject" button placed next to the type-in field.
//NOTE(review): the button calls sr_reject(), whose definition is currently
//commented out in this file.
function sr_reject_html(){
    const reject_button_html = "<input type='button' value='reject' onmouseup='sr_reject()'/>"
    return reject_button_html
}
/* not called
function sr_reject(){
let new_phrase = ""
let current_phrase = recognize_speech_type_in_id.value
if(recognize_speech_phrases_array.length == 0) {
new_phrase = "NOTHING TO REJECT"
}
else if (current_phrase == "NOTHING TO REJECT") {} //leave the same
else if (current_phrase.startsWith("INVALID: ")) {
new_phrase = replace_substrings(current_phrase, "INVALID: ", "REJECT: ")
}
else if (current_phrase.startsWith("REJECT: ")) {} //leave the same
else {
new_phrase = "REJECT: " + current_phrase
recognize_speech_phrases_array.pop()
}
recognize_speech_type_in_id.value = new_phrase
}
window.sr_reject = sr_reject
*/
//Returns the prompt to show for the phrase about to be recognized.
//A non-array prompt is returned as is. For an array of prompts, the one at the
//index of the number of phrases collected so far is used; once the prompts run
//out, the last one is used.
function sr_current_prompt(){
    const prompts = recognize_speech_prompt
    if (!Array.isArray(prompts)) { return prompts }
    const phrase_index = recognize_speech_phrases_array.length
    return (phrase_index >= prompts.length) ? last(prompts) : prompts[phrase_index]
}
//Refreshes the prompt area of the dialog with the current sr_current_prompt() value.
function sr_display_prompt(){
    const prompt_area = sr_prompt_id
    prompt_area.innerHTML = sr_current_prompt()
}
//public
//Shows the speech recognition dialog and records the options in the
//recognize_speech_* variables that the sr_* functions read.
//  title, width, height, x, y, background_color: passed on to show_window.
//  prompt: a string, or an array of strings (see sr_current_prompt).
//  click_to_talk: when truthy the dialog gets a "Click to talk" button;
//      when falsy recognition is started immediately.
//  phrase_callback: called with (text, confidence) for each recognized phrase.
//  finish_callback: called only when we know we're done.
//  finish_phrase: a string used as the finish button label, or an integer for
//      how many phrases to collect, or a fn passed the phrase that returns
//      true to mean done.
function recognize_speech({
    title="Recognize Speech", prompt="",
    click_to_talk=true,
    width=430, height=270, x=400, y=200,
    background_color="rgb(238, 238, 238)",
    phrase_callback=recognize_speech_default_phrase_callback,
    finish_callback=null,
    finish_phrase=1,
    } = {}) {
    const finish_count_is_invalid = (typeof(finish_phrase) == "number") && (finish_phrase < 1)
    if (finish_count_is_invalid) {
        dde_error("recognize_speech passed invalid finish_phrase of: " + finish_phrase +
                  '.<br/>It must be a positive integer or a string of a phrase that finishes input.')
    }
    //Record the options first: sr_current_prompt() and sr_finish_help(), used for
    //the content below, read these variables.
    recognize_speech_phrase_callback = phrase_callback
    recognize_speech_finish_callback = finish_callback
    recognize_speech_phrases_array   = []
    recognize_speech_finish_phrase   = finish_phrase
    recognize_speech_prompt          = prompt
    recognize_speech_click_to_talk   = click_to_talk

    const click_to_talk_html = click_to_talk ?
        "<input type='button' value='Click to talk' style='margin:10px;vertical-align:top;'/>" :
        ""
    let content = "<div id='sr_prompt_id'>"
    content += sr_current_prompt()
    content += "</div>"
    content += click_to_talk_html
    content += "<img id='recognize_speech_img_id' src='mic.gif'/> "
    content += "<span id='recognize_speech_instructions_id'><br/></span><br/> " //must use a full close tag for span here because HTML is broken
    content += "<br/>OR: type in text and hit ENTER.<br/>"
    content += "<input id='recognize_speech_type_in_id' autofocus style='width:330px;font-size:16px;margin:5px;' onkeypress='sr_onkeypress(event)'/> "
    content += sr_reject_html()
    content += "<div id=sr_finish_help_id>"
    content += sr_finish_help()
    content += "</div>"

    const window_options = {
        content: content,
        title: title,
        width: width, height: height, x: x, y: y,
        background_color: background_color,
        //callback only would ever get called if there's a click-to-talk button
        callback: "window.start_recognition" //called from sandbox initially
    }
    recognize_speech_window_index = show_window(window_options)
    if (!click_to_talk) { start_recognition() }
}
window.recognize_speech = recognize_speech