tp/stt_google.py at master · JustinHatesFun/tp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import pyaudio
import wave
import audioop
from collections import deque
import os
import urllib
import time
import math
import speech_recognition as sr
import pyaudio

LANG_CODE = 'en-US'  # Language code substituted into GOOGLE_SPEECH_URL

# Recognition endpoint with LANG_CODE filled in. In the visible part of this
# file it is only referenced from commented-out code.
GOOGLE_SPEECH_URL = 'https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&pfilter=2&lang=%s&maxresults=6' % (LANG_CODE)

FLAC_CONV = 'flac -f'  # Command used to convert WAV to FLAC. The `flac`
                       # tool is available on Linux.

# Microphone stream config.
CHUNK = 1024  # Number of frames read from the mic per call (it is passed as
              # frames_per_buffer and to stream.read()).
FORMAT = pyaudio.paInt16  # Sample format requested from PyAudio
CHANNELS = 1  # Number of input channels
RATE = 16000  # Sampling rate handed to the input stream
THRESHOLD = 2500  # The intensity threshold that separates silence from
                  # noise (an intensity lower than THRESHOLD is silence).

SILENCE_LIMIT = 1  # Silence limit in seconds. The maximum amount of seconds
                   # where only silence is recorded. When this time passes
                   # the recording finishes and the file is delivered.

PREV_AUDIO = 0.5  # Previous audio (in seconds) to prepend. When noise
                  # is detected, how much of the previously recorded audio
                  # is prepended. This helps to prevent chopping off the
                  # beginning of the phrase.


# def audio_int(num_samples=50):
#     """ Gets average audio intensity of your mic sound. You can use it to get
#         average intensities while you're talking and/or silent. The average
#         is the avg of the 20% largest intensities recorded.
#     """
#
#     print("Getting intensity values from mic.")
#     p = pyaudio.PyAudio()
#
#     stream = p.open(format=FORMAT,
#                     channels=CHANNELS,
#                     rate=RATE,
#                     input=True,
#                     frames_per_buffer=CHUNK)
#
#     values = [math.sqrt(abs(audioop.avg(stream.read(CHUNK), 4)))
#               for x in range(num_samples)]
#     values = sorted(values, reverse=True)
#     r = sum(values[:int(num_samples * 0.2)]) / int(num_samples * 0.2)
#     print(" Finished ")
#     print(" Average audio intensity is ", r)
#     stream.close()
#     p.terminate()
#     return r


def listen_for_speech(threshold=THRESHOLD, num_phrases=-1):
    """
    Wait for sound on the microphone, then hand over to recognizeSpeech().

    Buffers of CHUNK frames are read from the input stream and an intensity
    value is computed for each one. As soon as a buffer is louder than
    `threshold`, recognizeSpeech() is called and its result is returned, so
    at most one phrase is processed per call.

    threshold   -- intensity above which a buffer counts as noise rather
                   than silence (defaults to THRESHOLD).
    num_phrases -- -1 or any positive value starts listening; 0 (or any
                   other negative value) skips listening altogether.

    Returns the value produced by recognizeSpeech() for the first noisy
    buffer, or None when listening was skipped.

    The input stream and the PyAudio instance are always released, including
    when a result is returned or an exception is raised.
    """

    p = pyaudio.PyAudio()
    try:
        # Open stream
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("* Listening mic. ")
            while num_phrases == -1 or num_phrases > 0:
                cur_data = stream.read(CHUNK)
                # NOTE(review): the sample width given to audioop.avg is 4
                # although FORMAT is paInt16. THRESHOLD appears to be tuned
                # against this scale, so it is left unchanged -- confirm
                # before altering.
                intensity = math.sqrt(abs(audioop.avg(cur_data, 4)))
                # The function returns on the first loud buffer, so every
                # earlier buffer was below the threshold; checking only the
                # newest value is enough.
                if intensity > threshold:
                    print("Recognizing")
                    return recognizeSpeech()
            return None
        finally:
            stream.close()
    finally:
        p.terminate()


# def save_speech(data, p):
#     """ Saves mic data to temporary WAV file. Returns filename of saved
#         file """
#
#     filename = 'output_'+str(int(time.time()))
#     # writes data to WAV file
#     data = (b''.join(data))
#     wf = wave.open(filename + '.wav', 'wb')
#     wf.setnchannels(1)
#     wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
#     wf.setframerate(16000)  # TODO make this value a function parameter?
#     wf.writeframes(data)
#     wf.close()
#     return filename + '.wav'

def recognizeSpeech(device_index=0):
    """
    Record one phrase from a microphone and transcribe it with Google.

    device_index -- index of the input device handed to sr.Microphone.
                    Defaults to 0, the value that used to be hard-coded.

    Returns the result of Recognizer.recognize_google(), or None when the
    call raises sr.RequestError or sr.UnknownValueError. The returned value
    is also printed.
    """
    r = sr.Recognizer()
    mic = sr.Microphone(device_index=device_index)

    # Adjust the recognizer sensitivity for ambient noise, then record audio
    with mic as source:
        r.adjust_for_ambient_noise(source)
        audio = r.listen(source)

    try:
        response = r.recognize_google(audio)
    except sr.RequestError:
        # API was unreachable or unresponsive
        response = None
        print("API unavailable")
    except sr.UnknownValueError:
        # speech was unintelligible
        response = None
        print("Unable to recognize speech")

    print(response)
    return response


# def stt_google_wav(audio_fname):
#     """ Sends audio file (audio_fname) to Google's text to speech
#         service and returns service's response. We need a FLAC
#         converter if audio is not FLAC (check FLAC_CONV). """
#
#     print("Sending ", audio_fname)
#     #Convert to flac first
#     filename = audio_fname
#     del_flac = False
#     if 'flac' not in filename:
#         del_flac = True
#         print("Converting to flac")
#         print(FLAC_CONV + filename)
#         os.system(FLAC_CONV + ' ' + filename)
#         filename = filename.split('.')[0] + '.flac'
#         print(filename)
#
#     f = open(filename, 'rb')
#     flac_cont = f.read()
#     f.close()
#
#     # Headers. A common Chromium (Linux) User-Agent
#     hrs = {"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7",
#            'Content-type': 'audio/x-flac; rate=16000'}
#
#     req = urllib2.Request(GOOGLE_SPEECH_URL, data=flac_cont, headers=hrs)
#     print("Sending request to Google TTS")
#     #print "response", response
#     try:
#         p = urllib2.urlopen(req)
#         response = p.read()
#         res = eval(response)['hypotheses']
#     except:
#         print("Couldn't parse service response")
#         res = None
#
#     if del_flac:
#         os.remove(filename)  # Remove temp file
#
#     return res

# Start listening with the default arguments. The call is not wrapped in an
# `if __name__ == '__main__':` guard, so it also runs when this file is
# imported.
listen_for_speech()

# if(__name__ == '__main__'):
#       # listen to mic.
#     print(stt_google_wav('hello.flac'))  # translate audio file
#     audio_int()  # To measure your mic levels