1
- import { useState } from 'react' ;
1
+ import { useState , useRef , useEffect } from 'react' ;
2
2
import { BrowserAI } from '@browserai/browserai' ;
3
3
import styled from '@emotion/styled' ;
4
4
@@ -182,9 +182,88 @@ function App() {
  // UI / model state
  const [isLoading, setIsLoading] = useState(false); // true while generating or streaming speech
  const [ttsAI] = useState(new BrowserAI()); // single BrowserAI instance, created once per mount
  const [isModelLoaded, setIsModelLoaded] = useState(false);
  const [selectedVoice, setSelectedVoice] = useState('af_bella');
  const [speed, setSpeed] = useState(1.0);
  const [audioBlob, setAudioBlob] = useState<Blob | null>(null); // finished WAV, kept for download

  // Audio streaming references (refs, not state: they are mutated inside the
  // streaming loop and must not trigger re-renders)
  const audioContextRef = useRef<AudioContext | null>(null);
  const nextPlayTimeRef = useRef<number>(0); // context time at which the next chunk should start
  const isPlayingRef = useRef<boolean>(false); // cleared to abort the streaming loop
  const accumulatedAudioChunksRef = useRef<Float32Array[]>([]); // raw chunks kept for WAV export
  const sampleRateRef = useRef<number>(24000); // default; overwritten from the TTS result

  // Clean up audio context on unmount
  useEffect(() => {
    return () => {
      if (audioContextRef.current) {
        audioContextRef.current.close();
      }
    };
  }, []);
204
+
205
+ // Create WAV header
206
+ const createWAVHeader = ( numChannels : number , sampleRate : number , numSamples : number ) : ArrayBuffer => {
207
+ const buffer = new ArrayBuffer ( 44 ) ;
208
+ const view = new DataView ( buffer ) ;
209
+
210
+ // "RIFF" chunk descriptor
211
+ writeString ( view , 0 , 'RIFF' ) ;
212
+ // File size (data size + 36 bytes of header)
213
+ view . setUint32 ( 4 , 36 + numSamples * 2 , true ) ;
214
+ writeString ( view , 8 , 'WAVE' ) ;
215
+
216
+ // "fmt " sub-chunk
217
+ writeString ( view , 12 , 'fmt ' ) ;
218
+ view . setUint32 ( 16 , 16 , true ) ; // fmt chunk size
219
+ view . setUint16 ( 20 , 1 , true ) ; // audio format (1 for PCM)
220
+ view . setUint16 ( 22 , numChannels , true ) ;
221
+ view . setUint32 ( 24 , sampleRate , true ) ;
222
+ view . setUint32 ( 28 , sampleRate * numChannels * 2 , true ) ; // byte rate
223
+ view . setUint16 ( 32 , numChannels * 2 , true ) ; // block align
224
+ view . setUint16 ( 34 , 16 , true ) ; // bits per sample
225
+
226
+ // "data" sub-chunk
227
+ writeString ( view , 36 , 'data' ) ;
228
+ view . setUint32 ( 40 , numSamples * 2 , true ) ; // data size
229
+
230
+ return buffer ;
231
+ } ;
232
+
233
+ // Helper function to write string to DataView
234
+ const writeString = ( view : DataView , offset : number , string : string ) => {
235
+ for ( let i = 0 ; i < string . length ; i ++ ) {
236
+ view . setUint8 ( offset + i , string . charCodeAt ( i ) ) ;
237
+ }
238
+ } ;
239
+
240
+ const initializeAudioContext = ( ) => {
241
+ if ( ! audioContextRef . current || audioContextRef . current . state === 'closed' ) {
242
+ const context = new ( window . AudioContext || ( window as any ) . webkitAudioContext ) ( ) ;
243
+ audioContextRef . current = context ;
244
+ nextPlayTimeRef . current = context . currentTime ; // Initialize play time
245
+ return context ;
246
+ }
247
+ return audioContextRef . current ;
248
+ } ;
249
+
250
+ const playAudioChunk = ( context : AudioContext , chunk : Float32Array , sampleRate : number ) => {
251
+ const buffer = context . createBuffer ( 1 , chunk . length , sampleRate ) ;
252
+ buffer . copyToChannel ( chunk , 0 ) ;
253
+
254
+ const node = context . createBufferSource ( ) ;
255
+ node . buffer = buffer ;
256
+ node . connect ( context . destination ) ;
257
+
258
+ // Schedule playback precisely
259
+ const scheduledTime = Math . max ( context . currentTime , nextPlayTimeRef . current ) ;
260
+ node . start ( scheduledTime ) ;
261
+
262
+ // Update the time for the next chunk
263
+ nextPlayTimeRef . current = scheduledTime + buffer . duration ;
264
+
265
+ return node ;
266
+ } ;
188
267
189
268
const loadModel = async ( ) => {
190
269
try {
@@ -204,58 +283,171 @@ function App() {
204
283
setStatus ( 'Please enter some text first' ) ;
205
284
return ;
206
285
}
286
+ if ( ! isModelLoaded || isLoading ) return ;
287
+
288
+ setIsLoading ( true ) ;
289
+ setStatus ( 'Generating speech stream...' ) ;
290
+
291
+ // Reset any previous audio state
292
+ accumulatedAudioChunksRef . current = [ ] ;
293
+ isPlayingRef . current = true ;
294
+
295
+ const currentAudioContext = initializeAudioContext ( ) ;
296
+ if ( ! currentAudioContext ) {
297
+ setStatus ( 'Failed to initialize Audio Context' ) ;
298
+ setIsLoading ( false ) ;
299
+ return ;
300
+ }
301
+
302
+ // Ensure audio context is running (required after user interaction)
303
+ if ( currentAudioContext . state === 'suspended' ) {
304
+ await currentAudioContext . resume ( ) ;
305
+ }
306
+
307
+ // Reset nextPlayTime for new playback
308
+ nextPlayTimeRef . current = currentAudioContext . currentTime ;
207
309
208
310
try {
209
- setIsLoading ( true ) ;
210
- setStatus ( 'Generating speech...' ) ;
211
- const audioData = await ttsAI . textToSpeech ( text , {
311
+ // Get language from selected voice
312
+ const selectedVoiceData = VOICE_OPTIONS . find ( v => v . id === selectedVoice ) ;
313
+ if ( ! selectedVoiceData ) {
314
+ throw new Error ( "Selected voice data not found." ) ;
315
+ }
316
+ const language = selectedVoiceData . language ;
317
+
318
+ const result = await ttsAI . textToSpeech ( text , {
212
319
voice : selectedVoice ,
213
- speed : speed
320
+ speed : speed ,
321
+ language : language // Pass explicit language code
214
322
} ) ;
215
323
216
- if ( audioData ) {
217
- // Create a blob with WAV MIME type
218
- const blob = new Blob ( [ audioData ] , { type : 'audio/wav' } ) ;
219
- setAudioBlob ( blob ) ; // Store the blob for download
220
- const audioUrl = URL . createObjectURL ( blob ) ;
221
-
222
- // Create and play audio element
223
- const audio = new Audio ( audioUrl ) ;
324
+ // Extract stream and sampleRate from the result
325
+ const { stream, sampleRate } = result ;
326
+
327
+ // Store sample rate for WAV generation
328
+ sampleRateRef . current = sampleRate ;
329
+
330
+ // Reset accumulated chunks
331
+ accumulatedAudioChunksRef . current = [ ] ;
332
+
333
+ // Clear any previous audio blob
334
+ setAudioBlob ( null ) ;
335
+
336
+ setStatus ( 'Streaming audio...' ) ;
337
+ let chunksProcessed = 0 ;
338
+
339
+ // Process each chunk from the stream
340
+ for await ( const chunk of stream ) {
341
+ if ( ! isPlayingRef . current ) break ; // Allow stopping
224
342
225
- audio . onended = ( ) => {
226
- setStatus ( 'Finished playing' ) ;
227
- setIsLoading ( false ) ;
228
- URL . revokeObjectURL ( audioUrl ) ; // Clean up
229
- } ;
343
+ // Store the chunk for potential download later
344
+ accumulatedAudioChunksRef . current . push ( chunk ) ;
230
345
231
- audio . onerror = ( e ) => {
232
- console . error ( 'Audio playback error:' , e ) ;
233
- setStatus ( 'Error playing audio' ) ;
234
- setIsLoading ( false ) ;
235
- URL . revokeObjectURL ( audioUrl ) ;
236
- } ;
346
+ // Play this chunk
347
+ playAudioChunk ( currentAudioContext , chunk , sampleRate ) ;
237
348
238
- setStatus ( 'Playing audio...' ) ;
239
- await audio . play ( ) ;
349
+ // Update status occasionally to show progress
350
+ chunksProcessed ++ ;
351
+ if ( chunksProcessed % 10 === 0 ) {
352
+ setStatus ( 'Streaming audio...' ) ;
353
+ }
240
354
}
355
+
356
+ // Calculate when all audio will finish playing
357
+ const estimatedDuration = nextPlayTimeRef . current - currentAudioContext . currentTime ;
358
+ const finishingDelay = Math . max ( estimatedDuration * 1000 , 100 ) ; // At least 100ms
359
+
360
+ setTimeout ( ( ) => {
361
+ if ( isPlayingRef . current ) {
362
+ // Create blob for download
363
+ if ( accumulatedAudioChunksRef . current . length > 0 ) {
364
+ // Calculate total length of all chunks
365
+ const totalLength = accumulatedAudioChunksRef . current . reduce ( ( total , chunk ) => total + chunk . length , 0 ) ;
366
+
367
+ // Create a combined Float32Array
368
+ const combinedFloat32 = new Float32Array ( totalLength ) ;
369
+ let offset = 0 ;
370
+
371
+ // Copy all chunks into the combined array
372
+ for ( const chunk of accumulatedAudioChunksRef . current ) {
373
+ combinedFloat32 . set ( chunk , offset ) ;
374
+ offset += chunk . length ;
375
+ }
376
+
377
+ // Normalize if needed - skip this as chunks are already normalized
378
+ // const maxValue = combinedFloat32.reduce((max, val) => Math.max(max, Math.abs(val)), 0);
379
+ // const normalizedData = maxValue > 0 ? new Float32Array(combinedFloat32.length) : combinedFloat32;
380
+
381
+ // if (maxValue > 0) {
382
+ // for (let i = 0; i < combinedFloat32.length; i++) {
383
+ // normalizedData[i] = combinedFloat32[i] / maxValue;
384
+ // }
385
+ // }
386
+
387
+ // Convert to Int16Array for WAV
388
+ const int16Array = new Int16Array ( combinedFloat32 . length ) ;
389
+ const int16Factor = 0x7FFF ;
390
+
391
+ for ( let i = 0 ; i < combinedFloat32 . length ; i ++ ) {
392
+ const s = combinedFloat32 [ i ] ;
393
+ int16Array [ i ] = s < 0 ? Math . max ( - 0x8000 , s * 0x8000 ) : Math . min ( 0x7FFF , s * int16Factor ) ;
394
+ }
395
+
396
+ // Create WAV header
397
+ const wavHeader = createWAVHeader ( 1 , sampleRateRef . current , int16Array . length ) ;
398
+
399
+ // Combine header with audio data
400
+ const wavBytes = new Uint8Array ( 44 + int16Array . byteLength ) ;
401
+ wavBytes . set ( new Uint8Array ( wavHeader ) , 0 ) ;
402
+ wavBytes . set ( new Uint8Array ( int16Array . buffer ) , 44 ) ;
403
+
404
+ // Create blob for download
405
+ const blob = new Blob ( [ wavBytes ] , { type : 'audio/wav' } ) ;
406
+ setAudioBlob ( blob ) ;
407
+ }
408
+
409
+ console . log ( `Finished playing stream (${ chunksProcessed } total chunks)` ) ;
410
+ setStatus ( 'Finished playing stream' ) ;
411
+ setIsLoading ( false ) ;
412
+ isPlayingRef . current = false ;
413
+ }
414
+ } , finishingDelay ) ;
415
+
241
416
} catch ( error ) {
242
- console . error ( 'Error in speak :' , error ) ;
243
- setStatus ( 'Error generating speech : ' + ( error as Error ) . message ) ;
417
+ console . error ( 'Error in speech stream :' , error ) ;
418
+ setStatus ( 'Error generating or playing stream : ' + ( error as Error ) . message ) ;
244
419
setIsLoading ( false ) ;
420
+ isPlayingRef . current = false ;
245
421
}
246
422
} ;
247
423
424
+ const stopSpeak = ( ) => {
425
+ isPlayingRef . current = false ;
426
+ setIsLoading ( false ) ;
427
+ setStatus ( 'Playback stopped.' ) ;
428
+
429
+ // Reset audio context time tracking
430
+ if ( audioContextRef . current ) {
431
+ nextPlayTimeRef . current = audioContextRef . current . currentTime ;
432
+ }
433
+ } ;
434
+
248
435
const downloadAudio = ( ) => {
249
- if ( audioBlob ) {
250
- const url = URL . createObjectURL ( audioBlob ) ;
251
- const a = document . createElement ( 'a' ) ;
252
- a . href = url ;
253
- a . download = 'generated-speech.wav' ;
254
- document . body . appendChild ( a ) ;
255
- a . click ( ) ;
256
- document . body . removeChild ( a ) ;
257
- URL . revokeObjectURL ( url ) ;
436
+ if ( ! audioBlob ) {
437
+ setStatus ( 'No audio data available to download' ) ;
438
+ return ;
258
439
}
440
+
441
+ const url = URL . createObjectURL ( audioBlob ) ;
442
+ const a = document . createElement ( 'a' ) ;
443
+ a . href = url ;
444
+ a . download = 'generated-speech.wav' ;
445
+ document . body . appendChild ( a ) ;
446
+ a . click ( ) ;
447
+ document . body . removeChild ( a ) ;
448
+ URL . revokeObjectURL ( url ) ;
449
+
450
+ setStatus ( 'Audio downloaded successfully' ) ;
259
451
} ;
260
452
261
453
return (
@@ -268,7 +460,7 @@ function App() {
268
460
< Container >
269
461
< div >
270
462
< Title > Kokoro TTS Demo</ Title >
271
- < Subtitle > A lightweight, browser-based text-to-speech engine</ Subtitle >
463
+ < Subtitle > A lightweight, browser-based text-to-speech engine with streaming </ Subtitle >
272
464
</ div >
273
465
274
466
< Button
@@ -322,15 +514,23 @@ function App() {
322
514
< Button
323
515
onClick = { speak }
324
516
disabled = { ! isModelLoaded || isLoading || ! text . trim ( ) }
325
- isLoading = { isLoading && isModelLoaded }
517
+ isLoading = { isLoading }
326
518
>
327
519
< ButtonContent >
328
- { ( isLoading && isModelLoaded ) && < Spinner /> }
329
- { isLoading ? 'Processing ...' : 'Speak' }
520
+ { isLoading && < Spinner /> }
521
+ { isLoading ? 'Streaming ...' : 'Speak' }
330
522
</ ButtonContent >
331
523
</ Button >
332
-
333
- { audioBlob && (
524
+
525
+ { isLoading && (
526
+ < Button onClick = { stopSpeak } >
527
+ < ButtonContent >
528
+ Stop
529
+ </ ButtonContent >
530
+ </ Button >
531
+ ) }
532
+
533
+ { audioBlob && ! isLoading && (
334
534
< Button onClick = { downloadAudio } >
335
535
< ButtonContent >
336
536
Download Audio
0 commit comments