Skip to content

Commit fd07cd7

Browse files
committed
Add Speech Streaming API.
1 parent d31f466 commit fd07cd7

20 files changed

Lines changed: 987 additions & 16 deletions

docs/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@
176176
speech-encoding
177177
speech-metadata
178178
speech-operation
179+
speech-streaming
179180
speech-sample
180181
speech-transcript
181182

docs/speech-streaming.rst

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
Speech StreamingResponseContainer
2+
=================================
3+
4+
.. automodule:: google.cloud.speech.streaming.container
5+
:members:
6+
:undoc-members:
7+
:show-inheritance:
8+
9+
Speech Streaming Request helpers
10+
================================
11+
12+
.. automodule:: google.cloud.speech.streaming.request
13+
:members:
14+
:undoc-members:
15+
:show-inheritance:
16+
17+
Speech StreamingSpeechResponse
18+
==============================
19+
20+
.. automodule:: google.cloud.speech.streaming.response
21+
:members:
22+
:undoc-members:
23+
:show-inheritance:
24+
25+
26+
27+
Speech StreamingSpeechResult
28+
============================
29+
30+
.. automodule:: google.cloud.speech.streaming.result
31+
:members:
32+
:undoc-members:
33+
:show-inheritance:

docs/speech-usage.rst

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,5 +145,82 @@ words to the vocabulary of the recognizer.
145145
transcript: Hello, this is a test
146146
confidence: 0.81
147147
148+
149+
Streaming Recognition
150+
---------------------
151+
152+
The :meth:`~google.cloud.speech.Client.stream_recognize` method converts speech
153+
data to possible text alternatives on the fly.
154+
155+
.. note::
156+
Streaming recognition requests are limited to 1 minute of audio.
157+
158+
See: https://cloud.google.com/speech/limits#content
159+
160+
>>> import io
161+
>>> from google.cloud import speech
162+
>>> from google.cloud.speech.encoding import Encoding
163+
>>> client = speech.Client()
164+
>>> with io.open('./hello.wav', 'rb') as stream:
165+
>>> sample = client.sample(stream=stream, encoding=Encoding.LINEAR16,
166+
... sample_rate=16000)
167+
>>> stream_container = client.stream_recognize(sample)
168+
>>> print(stream_container)
169+
<google.cloud.speech.streaming.container.StreamingResponseContainer object at 0x10538ee10>
170+
>>> print(stream_container.responses)
171+
{0: <google.cloud.speech.streaming.response.StreamingSpeechResponse object at 0x10f9ac9d0>}
172+
>>> print(stream_container.responses[0].results[0].alternatives[0].confidence)
173+
0.698092460632
174+
>>> print(stream_container.is_finished)
175+
True
176+
>>> print(stream_container.get_full_text())
177+
hello
178+
179+
By default the recognizer will perform continuous recognition
180+
(continuing to process audio even if the user pauses speaking) until the client
181+
closes the output stream or when the maximum time limit has been reached.
182+
183+
If you only want to recognize a single utterance you can set
184+
``single_utterance`` to ``True`` and only one result will be returned.
185+
186+
See: `Single Utterance`_
187+
188+
.. code-block:: python
189+
190+
>>> with io.open('./hello_pause_goodbye.wav', 'rb') as stream:
191+
>>> sample = client.sample(stream=stream, encoding=Encoding.LINEAR16,
192+
... sample_rate=16000)
193+
>>> stream_container = client.stream_recognize(sample,
194+
... single_utterance=True)
195+
>>> print(stream_container.get_full_text())
196+
hello
197+
198+
199+
If ``interim_results`` is set to ``True``, interim results
200+
(tentative hypotheses) may be returned as they become available.
201+
202+
.. code-block:: python
203+
204+
>>> with io.open('./hello_pause_goodbye.wav', 'rb') as stream:
205+
>>> sample = client.sample(stream=stream, encoding=Encoding.LINEAR16,
206+
... sample_rate=16000)
207+
>>> stream_container = client.stream_recognize(sample,
208+
... interim_results=True)
209+
>>> print(stream_container.get_full_text())
210+
hello
211+
212+
>>> sample = client.sample(source_uri='gs://my-bucket/recording.flac',
213+
... encoding=Encoding.FLAC,
214+
... sample_rate=44100)
215+
>>> stream_container = client.stream_recognize(sample, interim_results=True)
216+
>>> print(stream_container.responses[0].results[0].alternatives[0].transcript)
217+
how
218+
>>> print(stream_container.responses[1].results[0].alternatives[0].transcript)
219+
hello
220+
>>> print(stream_container.responses[1].results[2].is_final)
221+
True
222+
223+
224+
.. _Single Utterance: https://cloud.google.com/speech/reference/rpc/google.cloud.speech.v1beta1#streamingrecognitionconfig
148225
.. _sync_recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/syncrecognize
149226
.. _Speech Asynchronous Recognize: https://cloud.google.com/speech/reference/rest/v1beta1/speech/asyncrecognize

scripts/verify_included_modules.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
'google.cloud.pubsub.__init__',
4646
'google.cloud.resource_manager.__init__',
4747
'google.cloud.speech.__init__',
48+
'google.cloud.speech.streaming.__init__',
4849
'google.cloud.storage.__init__',
4950
'google.cloud.streaming.__init__',
5051
'google.cloud.streaming.buffered_stream',

speech/google/cloud/speech/client.py

Lines changed: 123 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,30 @@
1414

1515
"""Basic client for Google Cloud Speech API."""
1616

17+
import os
1718
from base64 import b64encode
1819

1920
from google.cloud._helpers import _to_bytes
2021
from google.cloud._helpers import _bytes_to_unicode
2122
from google.cloud import client as client_module
23+
from google.cloud.environment_vars import DISABLE_GRPC
2224
from google.cloud.speech.connection import Connection
2325
from google.cloud.speech.encoding import Encoding
2426
from google.cloud.speech.operation import Operation
27+
from google.cloud.speech.streaming.request import _make_request_stream
2528
from google.cloud.speech.sample import Sample
29+
from google.cloud.speech.streaming.container import StreamingResponseContainer
30+
31+
try:
32+
from google.cloud.gapic.speech.v1beta1.speech_api import SpeechApi
33+
except ImportError: # pragma: NO COVER
34+
_HAVE_GAX = False
35+
else:
36+
_HAVE_GAX = True
37+
38+
39+
_DISABLE_GAX = os.getenv(DISABLE_GRPC, False)
40+
_USE_GAX = _HAVE_GAX and not _DISABLE_GAX
2641

2742

2843
class Client(client_module.Client):
@@ -47,6 +62,7 @@ class Client(client_module.Client):
4762
"""
4863

4964
_connection_class = Connection
65+
_speech_api = None
5066

5167
def async_recognize(self, sample, language_code=None,
5268
max_alternatives=None, profanity_filter=None,
@@ -104,7 +120,7 @@ def async_recognize(self, sample, language_code=None,
104120
return Operation.from_api_repr(self, api_response)
105121

106122
@staticmethod
107-
def sample(content=None, source_uri=None, encoding=None,
123+
def sample(content=None, source_uri=None, stream=None, encoding=None,
108124
sample_rate=None):
109125
"""Factory: construct Sample to use when making recognize requests.
110126
@@ -118,6 +134,9 @@ def sample(content=None, source_uri=None, encoding=None,
118134
supported, which must be specified in the following
119135
format: ``gs://bucket_name/object_name``.
120136
137+
:type stream: :class:`io.BufferedReader`
138+
:param stream: File like object to read audio data from.
139+
121140
:type encoding: str
122141
:param encoding: encoding of audio data sent in all RecognitionAudio
123142
messages, can be one of: :attr:`~.Encoding.LINEAR16`,
@@ -135,7 +154,7 @@ def sample(content=None, source_uri=None, encoding=None,
135154
:rtype: :class:`~google.cloud.speech.sample.Sample`
136155
:returns: Instance of ``Sample``.
137156
"""
138-
return Sample(content=content, source_uri=source_uri,
157+
return Sample(content=content, source_uri=source_uri, stream=stream,
139158
encoding=encoding, sample_rate=sample_rate)
140159

141160
def sync_recognize(self, sample, language_code=None,
@@ -199,6 +218,108 @@ def sync_recognize(self, sample, language_code=None,
199218
else:
200219
raise ValueError('result in api should have length 1')
201220

221+
def stream_recognize(self, sample, language_code=None,
222+
max_alternatives=None, profanity_filter=None,
223+
speech_context=None, single_utterance=False,
224+
interim_results=False):
225+
"""Streaming speech recognition.
226+
227+
.. note::
228+
Streaming recognition requests are limited to 1 minute of audio.
229+
230+
See: https://cloud.google.com/speech/limits#content
231+
232+
:type sample: :class:`~google.cloud.speech.sample.Sample`
233+
:param sample: Instance of ``Sample`` containing audio information.
234+
235+
:type language_code: str
236+
:param language_code: (Optional) The language of the supplied audio as
237+
BCP-47 language tag. Example: ``'en-GB'``.
238+
If omitted, defaults to ``'en-US'``.
239+
240+
:type max_alternatives: int
241+
:param max_alternatives: (Optional) Maximum number of recognition
242+
hypotheses to be returned. The server may
243+
return fewer than maxAlternatives.
244+
Valid values are 0-30. A value of 0 or 1
245+
will return a maximum of 1. Defaults to 1
246+
247+
:type profanity_filter: bool
248+
:param profanity_filter: If True, the server will attempt to filter
249+
out profanities, replacing all but the
250+
initial character in each filtered word with
251+
asterisks, e.g. ``'f***'``. If False or
252+
omitted, profanities won't be filtered out.
253+
254+
:type speech_context: list
255+
:param speech_context: A list of strings (max 50) containing words and
256+
phrases "hints" so that the speech recognition
257+
is more likely to recognize them. This can be
258+
used to improve the accuracy for specific words
259+
and phrases. This can also be used to add new
260+
words to the vocabulary of the recognizer.
261+
262+
:type single_utterance: bool
263+
:param single_utterance: [Optional] If false or omitted, the recognizer
264+
will perform continuous recognition
265+
(continuing to process audio even if the user
266+
pauses speaking) until the client closes the
267+
output stream (gRPC API) or when the maximum
268+
time limit has been reached. Multiple
269+
SpeechRecognitionResults with the is_final
270+
flag set to true may be returned.
271+
272+
If true, the recognizer will detect a single
273+
spoken utterance. When it detects that the
274+
user has paused or stopped speaking, it will
275+
return an END_OF_UTTERANCE event and cease
276+
recognition. It will return no more than one
277+
SpeechRecognitionResult with the is_final flag
278+
set to true.
279+
280+
:type interim_results: bool
281+
:param interim_results: [Optional] If true, interim results (tentative
282+
hypotheses) may be returned as they become
283+
available (these interim results are indicated
284+
with the is_final=false flag). If false or
285+
omitted, only is_final=true result(s) are
286+
returned.
287+
288+
:rtype: :class:`~streaming.StreamingResponseContainer`
289+
:returns: An instance of ``StreamingResponseContainer``.
290+
291+
"""
292+
if not _USE_GAX:
293+
raise EnvironmentError('GRPC is required to use this API.')
294+
295+
if sample.stream.closed:
296+
raise ValueError('Stream is closed.')
297+
298+
requests = _make_request_stream(sample, language_code=language_code,
299+
max_alternatives=max_alternatives,
300+
profanity_filter=profanity_filter,
301+
speech_context=speech_context,
302+
single_utterance=single_utterance,
303+
interim_results=interim_results)
304+
305+
responses = StreamingResponseContainer()
306+
for response in self.speech_api.streaming_recognize(requests):
307+
if response:
308+
responses.add_response(response)
309+
310+
return responses
311+
312+
@property
313+
def speech_api(self):
314+
"""Instance of Speech API.
315+
316+
:rtype: :class:`google.cloud.gapic.speech.v1beta1.speech_api.SpeechApi`
317+
:returns: Instance of ``SpeechApi``.
318+
"""
319+
if not self._speech_api:
320+
self._speech_api = SpeechApi()
321+
return self._speech_api
322+
202323

203324
def _build_request_data(sample, language_code=None, max_alternatives=None,
204325
profanity_filter=None, speech_context=None):

speech/google/cloud/speech/operation.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,8 @@ def _update(self, response):
124124
results = []
125125
if raw_results:
126126
for result in raw_results[0]['alternatives']:
127-
results.append(Transcript(result))
127+
results.append(Transcript(result.get('transcript'),
128+
result.get('confidence')))
128129
if metadata:
129130
self._metadata = Metadata.from_api_repr(metadata)
130131

speech/google/cloud/speech/sample.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ class Sample(object):
3030
supported, which must be specified in the following
3131
format: ``gs://bucket_name/object_name``.
3232
33+
:type stream: :class:`io.BufferedReader`
34+
:param stream: File like object to read audio data from.
35+
3336
:type encoding: str
3437
:param encoding: encoding of audio data sent in all RecognitionAudio
3538
messages, can be one of: :attr:`~.Encoding.LINEAR16`,
@@ -47,16 +50,15 @@ class Sample(object):
4750
default_encoding = Encoding.FLAC
4851
default_sample_rate = 16000
4952

50-
def __init__(self, content=None, source_uri=None,
53+
def __init__(self, content=None, source_uri=None, stream=None,
5154
encoding=None, sample_rate=None):
52-
53-
no_source = content is None and source_uri is None
54-
both_source = content is not None and source_uri is not None
55-
if no_source or both_source:
56-
raise ValueError('Supply one of \'content\' or \'source_uri\'')
55+
if [content, source_uri, stream].count(None) != 2:
56+
raise ValueError('Supply only one of \'content\', \'source_uri\''
57+
' or stream.')
5758

5859
self._content = content
5960
self._source_uri = source_uri
61+
self._stream = stream
6062

6163
if sample_rate is not None and not 8000 <= sample_rate <= 48000:
6264
raise ValueError('The value of sample_rate must be between 8000'
@@ -68,6 +70,15 @@ def __init__(self, content=None, source_uri=None,
6870
else:
6971
raise ValueError('Invalid encoding: %s' % (encoding,))
7072

73+
@property
74+
def chunk_size(self):
75+
"""Chunk size to send over GRPC. ~100ms
76+
77+
:rtype: int
78+
:returns: Optimized chunk size.
79+
"""
80+
return int(self.sample_rate / 10)
81+
7182
@property
7283
def source_uri(self):
7384
"""Google Cloud Storage URI of audio source.
@@ -77,6 +88,15 @@ def source_uri(self):
7788
"""
7889
return self._source_uri
7990

91+
@property
92+
def stream(self):
93+
"""Stream of audio data.
94+
95+
:rtype: :class:`io.BufferedReader`
96+
:returns: File like object to read audio data from.
97+
"""
98+
return self._stream
99+
80100
@property
81101
def content(self):
82102
"""Bytes of audio content.

speech/google/cloud/speech/streaming/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)