
Commit c81af22

Merge pull request #181 from manmohan659/stramingWorking
Streaming Change
2 parents 339bc83 + f076d26

File tree

6 files changed: +483 -228 lines changed

examples/tts-demo/package.json

Lines changed: 1 addition & 1 deletion

@@ -28,4 +28,4 @@
     "typescript-eslint": "^8.18.2",
     "vite": "^6.0.5"
   }
-}
+}

examples/tts-demo/src/App.tsx

Lines changed: 244 additions & 44 deletions
@@ -1,4 +1,4 @@
-import { useState } from 'react';
+import { useState, useRef, useEffect } from 'react';
 import { BrowserAI } from '@browserai/browserai';
 import styled from '@emotion/styled';

@@ -182,9 +182,88 @@ function App() {
   const [isLoading, setIsLoading] = useState(false);
   const [ttsAI] = useState(new BrowserAI());
   const [isModelLoaded, setIsModelLoaded] = useState(false);
-  const [audioBlob, setAudioBlob] = useState<Blob | null>(null);
   const [selectedVoice, setSelectedVoice] = useState('af_bella');
   const [speed, setSpeed] = useState(1.0);
+  const [audioBlob, setAudioBlob] = useState<Blob | null>(null);
+
+  // Audio streaming references
+  const audioContextRef = useRef<AudioContext | null>(null);
+  const nextPlayTimeRef = useRef<number>(0);
+  const isPlayingRef = useRef<boolean>(false);
+  const accumulatedAudioChunksRef = useRef<Float32Array[]>([]);
+  const sampleRateRef = useRef<number>(24000);
+
+  // Clean up audio context on unmount
+  useEffect(() => {
+    return () => {
+      if (audioContextRef.current) {
+        audioContextRef.current.close();
+      }
+    };
+  }, []);
+
+  // Create WAV header
+  const createWAVHeader = (numChannels: number, sampleRate: number, numSamples: number): ArrayBuffer => {
+    const buffer = new ArrayBuffer(44);
+    const view = new DataView(buffer);
+
+    // "RIFF" chunk descriptor
+    writeString(view, 0, 'RIFF');
+    // File size (data size + 36 bytes of header)
+    view.setUint32(4, 36 + numSamples * 2, true);
+    writeString(view, 8, 'WAVE');
+
+    // "fmt " sub-chunk
+    writeString(view, 12, 'fmt ');
+    view.setUint32(16, 16, true); // fmt chunk size
+    view.setUint16(20, 1, true); // audio format (1 for PCM)
+    view.setUint16(22, numChannels, true);
+    view.setUint32(24, sampleRate, true);
+    view.setUint32(28, sampleRate * numChannels * 2, true); // byte rate
+    view.setUint16(32, numChannels * 2, true); // block align
+    view.setUint16(34, 16, true); // bits per sample
+
+    // "data" sub-chunk
+    writeString(view, 36, 'data');
+    view.setUint32(40, numSamples * 2, true); // data size
+
+    return buffer;
+  };
+
+  // Helper function to write string to DataView
+  const writeString = (view: DataView, offset: number, string: string) => {
+    for (let i = 0; i < string.length; i++) {
+      view.setUint8(offset + i, string.charCodeAt(i));
+    }
+  };
+
+  const initializeAudioContext = () => {
+    if (!audioContextRef.current || audioContextRef.current.state === 'closed') {
+      const context = new (window.AudioContext || (window as any).webkitAudioContext)();
+      audioContextRef.current = context;
+      nextPlayTimeRef.current = context.currentTime; // Initialize play time
+      return context;
+    }
+    return audioContextRef.current;
+  };
+
+  const playAudioChunk = (context: AudioContext, chunk: Float32Array, sampleRate: number) => {
+    const buffer = context.createBuffer(1, chunk.length, sampleRate);
+    buffer.copyToChannel(chunk, 0);
+
+    const node = context.createBufferSource();
+    node.buffer = buffer;
+    node.connect(context.destination);
+
+    // Schedule playback precisely
+    const scheduledTime = Math.max(context.currentTime, nextPlayTimeRef.current);
+    node.start(scheduledTime);
+
+    // Update the time for the next chunk
+    nextPlayTimeRef.current = scheduledTime + buffer.duration;
+
+    return node;
+  };

   const loadModel = async () => {
     try {
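
The `Math.max(context.currentTime, nextPlayTimeRef.current)` line in `playAudioChunk` above is what keeps streamed playback gapless: each chunk is scheduled to start exactly where the previous one ends, even when chunks are decoded faster than real time. A minimal sketch of that bookkeeping outside React (the `schedule` helper and the 0.5 s chunk duration are illustrative, not part of the demo):

// Gapless scheduling: nextPlayTime always marks the end of the last scheduled chunk.
let nextPlayTime = 0;

function schedule(currentTime: number, chunkDuration: number): number {
  const startAt = Math.max(currentTime, nextPlayTime); // never start in the past
  nextPlayTime = startAt + chunkDuration;              // the next chunk begins here
  return startAt;
}

// Two 0.5 s chunks arriving at t = 1.0 s and t = 1.1 s:
console.log(schedule(1.0, 0.5)); // 1.0 -> plays immediately
console.log(schedule(1.1, 0.5)); // 1.5 -> queued right after the first chunk
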
@@ -204,58 +283,171 @@ function App() {
       setStatus('Please enter some text first');
       return;
     }
+    if (!isModelLoaded || isLoading) return;
+
+    setIsLoading(true);
+    setStatus('Generating speech stream...');
+
+    // Reset any previous audio state
+    accumulatedAudioChunksRef.current = [];
+    isPlayingRef.current = true;
+
+    const currentAudioContext = initializeAudioContext();
+    if (!currentAudioContext) {
+      setStatus('Failed to initialize Audio Context');
+      setIsLoading(false);
+      return;
+    }
+
+    // Ensure audio context is running (required after user interaction)
+    if (currentAudioContext.state === 'suspended') {
+      await currentAudioContext.resume();
+    }
+
+    // Reset nextPlayTime for new playback
+    nextPlayTimeRef.current = currentAudioContext.currentTime;

     try {
-      setIsLoading(true);
-      setStatus('Generating speech...');
-      const audioData = await ttsAI.textToSpeech(text, {
+      // Get language from selected voice
+      const selectedVoiceData = VOICE_OPTIONS.find(v => v.id === selectedVoice);
+      if (!selectedVoiceData) {
+        throw new Error("Selected voice data not found.");
+      }
+      const language = selectedVoiceData.language;
+
+      const result = await ttsAI.textToSpeech(text, {
         voice: selectedVoice,
-        speed: speed
+        speed: speed,
+        language: language // Pass explicit language code
       });

-      if (audioData) {
-        // Create a blob with WAV MIME type
-        const blob = new Blob([audioData], { type: 'audio/wav' });
-        setAudioBlob(blob); // Store the blob for download
-        const audioUrl = URL.createObjectURL(blob);
-
-        // Create and play audio element
-        const audio = new Audio(audioUrl);
+      // Extract stream and sampleRate from the result
+      const { stream, sampleRate } = result;
+
+      // Store sample rate for WAV generation
+      sampleRateRef.current = sampleRate;
+
+      // Reset accumulated chunks
+      accumulatedAudioChunksRef.current = [];
+
+      // Clear any previous audio blob
+      setAudioBlob(null);
+
+      setStatus('Streaming audio...');
+      let chunksProcessed = 0;
+
+      // Process each chunk from the stream
+      for await (const chunk of stream) {
+        if (!isPlayingRef.current) break; // Allow stopping

-        audio.onended = () => {
-          setStatus('Finished playing');
-          setIsLoading(false);
-          URL.revokeObjectURL(audioUrl); // Clean up
-        };
+        // Store the chunk for potential download later
+        accumulatedAudioChunksRef.current.push(chunk);

-        audio.onerror = (e) => {
-          console.error('Audio playback error:', e);
-          setStatus('Error playing audio');
-          setIsLoading(false);
-          URL.revokeObjectURL(audioUrl);
-        };
+        // Play this chunk
+        playAudioChunk(currentAudioContext, chunk, sampleRate);

-        setStatus('Playing audio...');
-        await audio.play();
+        // Update status occasionally to show progress
+        chunksProcessed++;
+        if (chunksProcessed % 10 === 0) {
+          setStatus('Streaming audio...');
+        }
       }
+
+      // Calculate when all audio will finish playing
+      const estimatedDuration = nextPlayTimeRef.current - currentAudioContext.currentTime;
+      const finishingDelay = Math.max(estimatedDuration * 1000, 100); // At least 100ms
+
+      setTimeout(() => {
+        if (isPlayingRef.current) {
+          // Create blob for download
+          if (accumulatedAudioChunksRef.current.length > 0) {
+            // Calculate total length of all chunks
+            const totalLength = accumulatedAudioChunksRef.current.reduce((total, chunk) => total + chunk.length, 0);
+
+            // Create a combined Float32Array
+            const combinedFloat32 = new Float32Array(totalLength);
+            let offset = 0;
+
+            // Copy all chunks into the combined array
+            for (const chunk of accumulatedAudioChunksRef.current) {
+              combinedFloat32.set(chunk, offset);
+              offset += chunk.length;
+            }
+
+            // Normalize if needed - skip this as chunks are already normalized
+            // const maxValue = combinedFloat32.reduce((max, val) => Math.max(max, Math.abs(val)), 0);
+            // const normalizedData = maxValue > 0 ? new Float32Array(combinedFloat32.length) : combinedFloat32;
+
+            // if (maxValue > 0) {
+            //   for (let i = 0; i < combinedFloat32.length; i++) {
+            //     normalizedData[i] = combinedFloat32[i] / maxValue;
+            //   }
+            // }
+
+            // Convert to Int16Array for WAV
+            const int16Array = new Int16Array(combinedFloat32.length);
+            const int16Factor = 0x7FFF;
+
+            for (let i = 0; i < combinedFloat32.length; i++) {
+              const s = combinedFloat32[i];
+              int16Array[i] = s < 0 ? Math.max(-0x8000, s * 0x8000) : Math.min(0x7FFF, s * int16Factor);
+            }
+
+            // Create WAV header
+            const wavHeader = createWAVHeader(1, sampleRateRef.current, int16Array.length);
+
+            // Combine header with audio data
+            const wavBytes = new Uint8Array(44 + int16Array.byteLength);
+            wavBytes.set(new Uint8Array(wavHeader), 0);
+            wavBytes.set(new Uint8Array(int16Array.buffer), 44);
+
+            // Create blob for download
+            const blob = new Blob([wavBytes], { type: 'audio/wav' });
+            setAudioBlob(blob);
+          }
+
+          console.log(`Finished playing stream (${chunksProcessed} total chunks)`);
+          setStatus('Finished playing stream');
+          setIsLoading(false);
+          isPlayingRef.current = false;
+        }
+      }, finishingDelay);
+
     } catch (error) {
-      console.error('Error in speak:', error);
-      setStatus('Error generating speech: ' + (error as Error).message);
+      console.error('Error in speech stream:', error);
+      setStatus('Error generating or playing stream: ' + (error as Error).message);
       setIsLoading(false);
+      isPlayingRef.current = false;
     }
   };

+  const stopSpeak = () => {
+    isPlayingRef.current = false;
+    setIsLoading(false);
+    setStatus('Playback stopped.');
+
+    // Reset audio context time tracking
+    if (audioContextRef.current) {
+      nextPlayTimeRef.current = audioContextRef.current.currentTime;
+    }
+  };
+
   const downloadAudio = () => {
-    if (audioBlob) {
-      const url = URL.createObjectURL(audioBlob);
-      const a = document.createElement('a');
-      a.href = url;
-      a.download = 'generated-speech.wav';
-      document.body.appendChild(a);
-      a.click();
-      document.body.removeChild(a);
-      URL.revokeObjectURL(url);
+    if (!audioBlob) {
+      setStatus('No audio data available to download');
+      return;
     }
+
+    const url = URL.createObjectURL(audioBlob);
+    const a = document.createElement('a');
+    a.href = url;
+    a.download = 'generated-speech.wav';
+    document.body.appendChild(a);
+    a.click();
+    document.body.removeChild(a);
+    URL.revokeObjectURL(url);
+
+    setStatus('Audio downloaded successfully');
   };

   return (
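
Before building the download blob, the code above maps [-1, 1] floats onto signed 16-bit PCM, scaling by 0x8000 on the negative side and 0x7FFF on the positive side so both extremes stay representable. A standalone check of that mapping (the `toInt16` helper is just the diff's expression factored out):

// Float32 -> Int16 mapping used when assembling the WAV blob.
function toInt16(s: number): number {
  return s < 0 ? Math.max(-0x8000, s * 0x8000) : Math.min(0x7FFF, s * 0x7FFF);
}

console.log(toInt16(-1.0)); // -32768
console.log(toInt16(0.0));  // 0
console.log(toInt16(1.0));  // 32767

The WAV header fields follow from the same constants: at the demo's default 24000 Hz mono, byte rate = 24000 * 1 * 2 = 48000 and block align = 1 * 2 = 2.
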
@@ -268,7 +460,7 @@ function App() {
     <Container>
      <div>
        <Title>Kokoro TTS Demo</Title>
-       <Subtitle>A lightweight, browser-based text-to-speech engine</Subtitle>
+       <Subtitle>A lightweight, browser-based text-to-speech engine with streaming</Subtitle>
      </div>

      <Button
@@ -322,15 +514,23 @@ function App() {
      <Button
        onClick={speak}
        disabled={!isModelLoaded || isLoading || !text.trim()}
-       isLoading={isLoading && isModelLoaded}
+       isLoading={isLoading}
      >
        <ButtonContent>
-         {(isLoading && isModelLoaded) && <Spinner />}
-         {isLoading ? 'Processing...' : 'Speak'}
+         {isLoading && <Spinner />}
+         {isLoading ? 'Streaming...' : 'Speak'}
        </ButtonContent>
      </Button>
-
-     {audioBlob && (
+
+     {isLoading && (
+       <Button onClick={stopSpeak}>
+         <ButtonContent>
+           Stop
+         </ButtonContent>
+       </Button>
+     )}
+
+     {audioBlob && !isLoading && (
        <Button onClick={downloadAudio}>
          <ButtonContent>
            Download Audio

src/core/llm/index.ts

Lines changed: 13 additions & 9 deletions
@@ -5,7 +5,6 @@ import { TransformersEngineWrapper } from '../../engines/transformer-engine-wrap
 import { ModelConfig, MLCConfig, TransformersConfig } from '../../config/models/types';
 import mlcModels from '../../config/models/mlc-models.json';
 import transformersModels from '../../config/models/transformers-models.json';
-import { TTSEngine } from '@/engines/tts-engine';

 // Combine model configurations
 const MODEL_CONFIG: Record<string, ModelConfig> = {
@@ -19,7 +18,6 @@ export class BrowserAI {
   private mediaRecorder: MediaRecorder | null = null;
   private audioChunks: Blob[] = [];
   private modelIdentifier: string | null = null;
-  private ttsEngine: TTSEngine | null = null;
   private customModels: Record<string, ModelConfig> = {};

   constructor() {
@@ -170,21 +168,27 @@ export class BrowserAI {
     return response as string;
   }

-  async textToSpeech(text: string, options: Record<string, unknown> = {}): Promise<ArrayBuffer> {
-    if (!this.ttsEngine) {
-      this.ttsEngine = new TTSEngine();
-      await this.ttsEngine.loadModel(MODEL_CONFIG['kokoro-tts'], {
+  async textToSpeech(text: string, options: Record<string, unknown> = {}): Promise<any> {
+    // Check if engine is already loaded
+    if (!this.engine) {
+      // Load the transformers engine if not already loaded
+      this.engine = new TransformersEngineWrapper();
+      await this.engine.loadModel(MODEL_CONFIG['kokoro-tts'], {
         quantized: true,
         device: 'webgpu',
         ...options,
       });
     }

     try {
-      const audioData = await this.ttsEngine.generateSpeech(text, options);
-      return audioData;
+      if (this.engine instanceof TransformersEngineWrapper) {
+        // Use the streaming method
+        return await this.engine.textToSpeechStream(text, options);
+      } else {
+        throw new Error('Current engine does not support text-to-speech streaming');
+      }
     } catch (error) {
-      console.error('Error generating speech:', error);
+      console.error('Error generating speech stream:', error);
       throw error;
     }
   }
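
With this change, `textToSpeech` no longer resolves to an `ArrayBuffer`; it resolves to whatever `textToSpeechStream` returns, hence the loosened `Promise<any>`. Judging from how App.tsx consumes the result, that value has the shape `{ stream, sampleRate }` with `stream` async-iterable over `Float32Array` chunks; this shape is inferred from the demo code above, not from a documented type. A minimal consumer sketch under that assumption:

import { BrowserAI } from '@browserai/browserai';

// Collect all streamed PCM chunks for a piece of text.
// The { stream, sampleRate } result shape is an assumption based on the demo.
async function collectSpeech(text: string): Promise<Float32Array[]> {
  const ai = new BrowserAI();
  const { stream, sampleRate } = await ai.textToSpeech(text, {
    voice: 'af_bella',
    speed: 1.0,
  });
  console.log('sample rate:', sampleRate); // 24000 in the demo

  const chunks: Float32Array[] = [];
  for await (const chunk of stream) {
    chunks.push(chunk); // raw PCM, ready for an AudioBuffer or WAV encoding
  }
  return chunks;
}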
