-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathstt.py
More file actions
141 lines (117 loc) · 4.48 KB
/
stt.py
File metadata and controls
141 lines (117 loc) · 4.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import pyaudio
import wave
import audioop
from collections import deque
import os
import urllib2
import urllib
import time
import math
# --- Speech recognition request settings ---------------------------------
LANG_CODE = 'en-US'  # Language code substituted into the request URL
GOOGLE_SPEECH_URL = (
    'https://www.google.com/speech-api/v1/recognize'
    '?xjerr=1&client=chromium&pfilter=2&lang=%s&maxresults=6' % LANG_CODE
)
# Command for a WAV to FLAC converter. flac is available on Linux.
FLAC_CONV = 'flac -f'

# --- Microphone stream configuration -------------------------------------
CHUNK = 1024  # Amount of audio requested from the mic on each read
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 10000

# Intensity boundary between silence and noise: a measured value lower than
# THRESHOLD is treated as silence.
THRESHOLD = 10750

# Silence limit in seconds. Once only silence has been recorded for this
# long, the recording finishes and the file is delivered.
SILENCE_LIMIT = 1

# Previous audio (in seconds) to prepend. When noise is detected, this much
# of the audio recorded just before it is prepended, which helps to avoid
# chopping off the beginning of the phrase.
PREV_AUDIO = 0.5
def audio_int(num_samples=50):
    """Measure the average audio intensity of the microphone.

    Reads ``num_samples`` chunks (``CHUNK`` each) from an input stream opened
    with the module's FORMAT/CHANNELS/RATE settings, computes one intensity
    value per chunk and averages the 20% largest values. Useful for picking
    a THRESHOLD by running it while talking and while silent.

    num_samples -- number of chunks to read from the microphone. At least
                   one value is always averaged, so small counts (< 5) no
                   longer divide by zero; 0 or less yields 0.0.

    Returns the averaged intensity as a float. Progress and the result are
    also printed to stdout.
    """
    print("Getting intensity values from mic.")
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            # NOTE(review): a sample width of 4 is passed to audioop.avg even
            # though FORMAT is paInt16; THRESHOLD is presumably calibrated
            # against this exact measure, so it is left unchanged -- confirm
            # before switching to width 2 or audioop.rms.
            values = [math.sqrt(abs(audioop.avg(stream.read(CHUNK), 4)))
                      for _ in range(num_samples)]
        finally:
            # Release the device even if a read fails.
            stream.close()
    finally:
        p.terminate()

    values.sort(reverse=True)
    # int(num_samples * 0.2) is 0 for fewer than 5 samples; keep at least one.
    top_count = max(1, int(num_samples * 0.2))
    loudest = values[:top_count]
    r = sum(loudest) / len(loudest) if loudest else 0.0

    print(" Finished ")
    # Two spaces before the value reproduce the original statement's output.
    print(" Average audio intensity is  %s" % r)
    return r
def listen_for_speech(threshold=THRESHOLD, num_phrases=-1):
    """Listen to the microphone and save each detected phrase to a WAV file.

    A "phrase" is sound surrounded by silence: recording starts when any
    intensity value in the sliding window (about SILENCE_LIMIT seconds of
    chunks) exceeds ``threshold`` and ends once the whole window is below
    it. Up to PREV_AUDIO seconds of audio captured just before the phrase
    are prepended so its beginning is not chopped off. Each finished phrase
    is written with save_speech(); this function does not itself send the
    audio to any recognition service.

    threshold   -- intensity above which a chunk counts as noise.
    num_phrases -- how many phrases to process before returning
                   (-1 for infinite).

    Returns a list with the filename returned by save_speech() for each
    phrase (only collected when ``num_phrases`` is finite, since the
    infinite mode never returns normally).
    """
    # Whole chunks per second of audio; used to size the windows below.
    rel = RATE // CHUNK
    # deque needs an integer maxlen; keep at least one slot so that noise
    # can always be detected.
    silence_len = max(1, int(SILENCE_LIMIT * rel))
    prev_len = int(PREV_AUDIO * rel)

    response = []
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("* Listening mic. ")
            audio2send = []
            slid_win = deque(maxlen=silence_len)
            # Audio from just before noise was detected, to be prepended.
            prev_audio = deque(maxlen=prev_len)
            started = False
            n = num_phrases

            while num_phrases == -1 or n > 0:
                cur_data = stream.read(CHUNK)
                # NOTE(review): width 4 is passed although FORMAT is paInt16;
                # THRESHOLD is presumably tuned to this measure -- confirm
                # before changing.
                level = math.sqrt(abs(audioop.avg(cur_data, 4)))
                slid_win.append(level)

                if any(v > threshold for v in slid_win):
                    # Debug output: the active threshold, the newest
                    # intensity value and the recording state.
                    print('t: ' + str(threshold))
                    print('x: ' + str(level))
                    print(started)
                    if not started:
                        print("Starting record of phrase")
                        started = True
                    audio2send.append(cur_data)
                elif started:
                    # The whole window is silent again: the phrase is over.
                    print("Finished")
                    filename = save_speech(list(prev_audio) + audio2send, p)
                    if num_phrases != -1:
                        response.append(filename)
                    # Reset state for the next phrase.
                    started = False
                    slid_win = deque(maxlen=silence_len)
                    prev_audio = deque(maxlen=prev_len)
                    audio2send = []
                    n -= 1
                    print("Listening ...")
                else:
                    # Still silent and not recording: keep a short history.
                    prev_audio.append(cur_data)

            print("* Done recording")
        finally:
            # Release the device on normal exit, errors and Ctrl-C alike.
            stream.close()
    finally:
        p.terminate()
    return response
def save_speech(data, p, rate=None):
    """Save microphone data to a WAV file and return its filename.

    data -- sequence of raw audio chunks as read from the input stream;
            they are concatenated in order.
    p    -- object providing ``get_sample_size(format)`` (the callers in
            this file pass their pyaudio.PyAudio instance).
    rate -- frame rate written to the WAV header. Defaults to the module's
            RATE so the header matches the rate the stream was opened with
            (a fixed 16000 here would make audio captured at a different
            RATE play back at the wrong speed).

    The file is always written to 'output.wav' in the current working
    directory, overwriting any previous recording.
    """
    if rate is None:
        rate = RATE
    filename = 'output' + '.wav'
    wf = wave.open(filename, 'wb')
    try:
        # Describe the data with the same settings used to capture it.
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(rate)
        wf.writeframes(b''.join(data))
    finally:
        # Close the file even if writing fails.
        wf.close()
    return filename
if __name__ == '__main__':
    # Listen to the mic. With the default num_phrases=-1 this keeps
    # processing phrases until the call is interrupted.
    listen_for_speech()
    # print stt_google_wav('hello.flac')  # translate audio file
    # Measure the mic levels once listening has returned.
    audio_int()