-
Notifications
You must be signed in to change notification settings - Fork 110
Add support for streaming speech-to-text results #242
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,27 @@ defmodule Bumblebee.Audio do | |
# TODO: remove in v0.5 | ||
@deprecated "Use Bumblebee.Audio.speech_to_text_whisper/5 instead." | ||
def speech_to_text(model_info, featurizer, tokenizer, generation_config, opts \\ []) do | ||
speech_to_text_whisper(model_info, featurizer, tokenizer, generation_config, opts) | ||
serving = speech_to_text_whisper(model_info, featurizer, tokenizer, generation_config, opts) | ||
client_postprocessing = serving.client_postprocessing | ||
|
||
Nx.Serving.client_postprocessing(serving, fn output_pair, info -> | ||
output = client_postprocessing.(output_pair, info) | ||
|
||
if is_list(output) do | ||
Enum.map(output, &speech_to_text_convert_output/1) | ||
else | ||
speech_to_text_convert_output(output) | ||
end | ||
end) | ||
end | ||
|
||
defp speech_to_text_convert_output(%{chunks: chunks}) do | ||
text = | ||
chunks | ||
|> Enum.map_join(& &1.text) | ||
|> String.trim() | ||
|
||
%{results: [%{text: text}]} | ||
end | ||
|
||
@typedoc """ | ||
|
@@ -21,15 +41,13 @@ defmodule Bumblebee.Audio do | |
|
||
""" | ||
@type speech_to_text_whisper_input :: Nx.t() | {:file, String.t()} | ||
@type speech_to_text_whisper_output :: %{results: list(speech_to_text_whisper_result())} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I removed […] |
||
@type speech_to_text_whisper_result :: %{ | ||
@type speech_to_text_whisper_output :: %{ | ||
chunks: list(speech_to_text_whisper_chunk()) | ||
} | ||
@type speech_to_text_whisper_chunk :: %{ | ||
text: String.t(), | ||
chunks: | ||
list(%{ | ||
text: String.t(), | ||
start_timestamp_seconds: number() | nil, | ||
end_timestamp_seconds: number() | nil | ||
}) | ||
start_timestamp_seconds: number() | nil, | ||
end_timestamp_seconds: number() | nil | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to wrap up our conversation, the issue with this is that, based on an option, the timestamp is either nil or number. However, callers of the code that only call this with There are a couple fixes. One is to say this is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah you are right it still depends on the option just in a different way. I'm not really a fan of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, there is still a lot for it to happen, but I thought I would start the discussion. If we keep this as is, then -1 is like the only viable option in the future if we want to keep a single type. Another option is to have a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's keep |
||
} | ||
|
||
@doc """ | ||
|
@@ -91,6 +109,12 @@ defmodule Bumblebee.Audio do | |
this option when using partitioned serving, to allocate params | ||
on each of the devices. Defaults to `false` | ||
|
||
* `:stream` - when `true`, the serving immediately returns a | ||
stream that emits chunks as they are generated. Note that | ||
when using streaming, only a single input can be given to the | ||
serving. To process a batch, call the serving with each input | ||
separately. Defaults to `false` | ||
|
||
## Examples | ||
|
||
{:ok, whisper} = Bumblebee.load_model({:hf, "openai/whisper-tiny"}) | ||
|
@@ -103,22 +127,20 @@ defmodule Bumblebee.Audio do | |
defn_options: [compiler: EXLA] | ||
) | ||
|
||
Nx.Serving.run(serving, {:file, "/path/to/audio.wav"}) | ||
output = Nx.Serving.run(serving, {:file, "/path/to/audio.wav"}) | ||
#=> %{ | ||
#=> results: [ | ||
#=> chunks: [ | ||
#=> %{ | ||
#=> chunks: [ | ||
#=> %{ | ||
#=> text: " There is a cat outside the window.", | ||
#=> start_timestamp_seconds: nil, | ||
#=> end_timestamp_seconds: nil | ||
#=> } | ||
#=> ], | ||
#=> text: "There is a cat outside the window." | ||
#=> text: " There is a cat outside the window.", | ||
#=> start_timestamp_seconds: nil, | ||
#=> end_timestamp_seconds: nil | ||
#=> } | ||
#=> ] | ||
#=> } | ||
|
||
text = output.chunks |> Enum.map_join(& &1.text) |> String.trim() | ||
#=> "There is a cat outside the window." | ||
|
||
And with timestamps: | ||
|
||
serving = | ||
|
@@ -130,26 +152,21 @@ defmodule Bumblebee.Audio do | |
|
||
Nx.Serving.run(serving, {:file, "/path/to/colouredstars_08_mathers_128kb.mp3"}) | ||
#=> %{ | ||
#=> results: [ | ||
#=> chunks: [ | ||
#=> %{ | ||
#=> text: " Such an eight of colored stars, versions of fifty isiatic love poems by Edward Powis-Mathers.", | ||
#=> start_timestamp_seconds: 0.0, | ||
#=> end_timestamp_seconds: 7.0 | ||
#=> }, | ||
#=> %{ | ||
#=> text: " This the revocs recording is in the public domain. Doubt. From the Japanese of Hori-Kawa,", | ||
#=> start_timestamp_seconds: 7.0, | ||
#=> end_timestamp_seconds: 14.0 | ||
#=> }, | ||
#=> %{ | ||
#=> chunks: [ | ||
#=> %{ | ||
#=> text: " Such an eight of colored stars, versions of fifty isiatic love poems by Edward Powis-Mathers.", | ||
#=> start_timestamp_seconds: 0.0, | ||
#=> end_timestamp_seconds: 7.0 | ||
#=> }, | ||
#=> %{ | ||
#=> text: " This the revocs recording is in the public domain. Doubt. From the Japanese of Hori-Kawa,", | ||
#=> start_timestamp_seconds: 7.0, | ||
#=> end_timestamp_seconds: 14.0 | ||
#=> }, | ||
#=> %{ | ||
#=> text: " will he be true to me that I do not know. But since the dawn, I have had as much disorder in my thoughts as in my black hair, and of doubt.", | ||
#=> start_timestamp_seconds: 14.0, | ||
#=> end_timestamp_seconds: 27.0 | ||
#=> } | ||
#=> ], | ||
#=> text: "Such an eight of colored stars, versions of fifty isiatic love poems by Edward Powis-Mathers. This the revocs recording is in the public domain. Doubt. From the Japanese of Hori-Kawa, will he be true to me that I do not know. But since the dawn, I have had as much disorder in my thoughts as in my black hair, and of doubt." | ||
#=> text: " will he be true to me that I do not know. But since the dawn, I have had as much disorder in my thoughts as in my black hair, and of doubt.", | ||
#=> start_timestamp_seconds: 14.0, | ||
#=> end_timestamp_seconds: 27.0 | ||
#=> } | ||
#=> ] | ||
#=> } | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The output format changed, so I'm wrapping to avoid a breaking change.