@@ -10,11 +10,18 @@ defmodule Bumblebee.Audio do
10
10
11
11
* a 1-dimensional `Nx.Tensor` with audio samples
12
12
13
+ * an enumerable of 1-dimensional `Nx.Tensor`s, representing a
14
+ continuous stream of input. The tensors are not required to
15
+ have the same number of samples; the serving takes care of
16
+ accumulating and chunking the input as needed. This input is
17
+ only supported when chunking is enabled with the `:chunk_num_seconds`
18
+ option
19
+
13
20
* `{:file, path}` with path to an audio file (note that this
14
21
requires `ffmpeg` installed)
15
22
16
23
"""
17
- @ type audio :: Nx . t ( ) | { :file , String . t ( ) }
24
+ @ type audio :: Nx . t ( ) | Enumerable . t ( Nx . t ( ) ) | { :file , String . t ( ) }
18
25
19
26
@ type speech_to_text_whisper_input ::
20
27
audio ( ) | % { :audio => audio ( ) , optional ( :seed ) => integer ( ) | nil }
@@ -31,7 +38,11 @@ defmodule Bumblebee.Audio do
31
38
Builds serving for speech-to-text generation with Whisper models.
32
39
33
40
The serving accepts `t:speech_to_text_whisper_input/0` and returns
34
- `t:speech_to_text_whisper_output/0`. A list of inputs is also supported.
41
+ `t:speech_to_text_whisper_output/0`.
42
+
43
+ This serving always accepts a single input. A list of tensors is
44
+ interpreted as continuous chunks. To transcribe multiple inputs
45
+ concurrently, use `Nx.Serving.batched_run/2`.
35
46
36
47
## Options
37
48
@@ -48,6 +59,15 @@ defmodule Bumblebee.Audio do
48
59
in the total `:chunk_num_seconds`. Defaults to 1/6 of
49
60
`:chunk_num_seconds`
50
61
62
+ * `:client_batch_size` - the number of input chunks the client
63
+ should gather and run at once. When streaming an input that is
64
+ already available (such as a file), you can use any value without
65
+ introducing a delay. A good default is to use the same value as
66
+ `opts[:compile][:batch_size]`. When streaming an input that is
67
+ being produced live, you may want to process chunks as soon as
68
+ they are available, in which case set this option to `1`. Defaults
69
+ to `opts[:compile][:batch_size]` if present, otherwise to `1`
70
+
51
71
* `:language` - the language of the speech, when known upfront.
52
72
Should be given as ISO alpha-2 code as string. By default no
53
73
language is assumed and it is inferred from the input
0 commit comments