Tabbly TTS provides a streaming Text-to-Speech API that you can plug into your LiveKit voice agents as the TTS provider. Its optimized audio delivery eliminates clicks, pops, and choppy sounds.
- Base URL: https://api.tabbly.io
- Streaming Endpoint: POST /tts/stream
- Authentication: API key via the X-API-Key header
- Response Format: HTTP streaming response with WAV-encoded audio chunks (LINEAR16, 48 kHz, mono)
- Protocol: HTTP streaming with WAV files embedded in the stream
Implement a ChunkedStream that handles the audio streaming with optimized buffering:
Copy
class TabblyChunkedStream(tts.ChunkedStream):
    """Chunked stream that synthesizes speech through the Tabbly TTS HTTP API.

    The stream POSTs the input text to ``{base_url}/tts/stream`` and forwards
    the returned audio to the ``AudioEmitter`` as 16-bit mono PCM at 48 kHz.

    The response body is treated as raw PCM that may contain one or more
    embedded WAV (RIFF/WAVE) headers. Headers are removed wherever they occur
    in the byte stream, including when a header is split across two network
    chunks, so header bytes are never played back as audio. PCM is pushed to
    the emitter in fixed 10 ms slices as soon as they are available; the
    final partial slice is trimmed to a whole number of frames.
    """

    # Audio format announced to the emitter.
    _SAMPLE_RATE = 48000
    _NUM_CHANNELS = 1
    _FRAME_SIZE = 2  # bytes per frame: 16-bit mono
    _CHUNK_SIZE = 960  # 10 ms at 48 kHz (480 frames * 2 bytes)

    # WAV parsing parameters.
    _RIFF_PREAMBLE_SIZE = 12  # "RIFF" + 4-byte size + "WAVE"
    _DATA_CHUNK_HEADER_SIZE = 8  # "data" + 4-byte size
    _FALLBACK_HEADER_SIZE = 44  # canonical WAV header length
    _MAX_HEADER_SCAN = 4096  # give up looking for "data" after this many bytes

    def __init__(
        self,
        *,
        tts: TabblyTTS,
        input_text: str,
        conn_options: Any,
    ) -> None:
        """Store the owning TTS instance, the text to speak and the options.

        Args:
            tts: The ``TabblyTTS`` instance providing ``base_url``,
                ``api_key``, ``voice_id``, ``model_id`` and ``http_client``.
            input_text: Text to synthesize.
            conn_options: Connection options forwarded to the base class.
        """
        super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
        self._tts: TabblyTTS = tts
        self._input_text = input_text
        self._conn_options = conn_options

    async def _run(self, output_emitter: tts.AudioEmitter) -> None:
        """Request synthesis and stream the resulting PCM to *output_emitter*.

        Args:
            output_emitter: Emitter that receives the PCM audio.

        Raises:
            httpx.HTTPStatusError: The API answered with an error status.
            Exception: Any other failure while streaming is logged and
                re-raised unchanged.
        """
        try:
            # NOTE(review): `_initialized` is a private emitter attribute kept
            # from the original guard; if the emitter does not expose it,
            # initialize() is always attempted and the RuntimeError check
            # below covers the already-started case.
            if not getattr(output_emitter, "_initialized", False):
                output_emitter.initialize(
                    request_id=utils.shortuuid(),
                    sample_rate=self._SAMPLE_RATE,
                    num_channels=self._NUM_CHANNELS,
                    mime_type="audio/pcm",
                )
        except RuntimeError as e:
            if "AudioEmitter already started" in str(e):
                logger.warning(
                    "AudioEmitter already initialized, continuing with existing instance"
                )
            else:
                raise

        url = f"{self._tts.base_url}/tts/stream"
        headers = {
            "Content-Type": "application/json",
            "X-API-Key": self._tts.api_key,
        }
        data = {
            "text": self._input_text,
            "voice_id": self._tts.voice_id,
            "model_id": self._tts.model_id,
        }

        raw = bytearray()  # bytes received but not yet classified
        pending = bytearray()  # header-free PCM waiting to be pushed

        try:
            async with self._tts.http_client.stream(
                "POST",
                url,
                json=data,
                headers=headers,
            ) as response:
                try:
                    response.raise_for_status()
                except httpx.HTTPStatusError:
                    # A streamed body is not loaded automatically. Read it
                    # while the connection is still open so the handler below
                    # can log `response.text` instead of hitting
                    # httpx.ResponseNotRead.
                    await response.aread()
                    raise

                async for chunk in response.aiter_bytes():
                    if not chunk:
                        continue
                    raw.extend(chunk)
                    pending.extend(self._extract_pcm(raw))
                    self._push_full_chunks(output_emitter, pending)

                # End of stream: classify whatever is left, then push the
                # remainder trimmed to a whole number of frames.
                pending.extend(self._extract_pcm(raw, final=True))
                self._push_full_chunks(output_emitter, pending)
                aligned_size = len(pending) - len(pending) % self._FRAME_SIZE
                if aligned_size > 0:
                    output_emitter.push(bytes(pending[:aligned_size]))

        except httpx.HTTPStatusError as e:
            logger.error(
                f"Tabbly TTS API error: {e.response.status_code} - {e.response.text}"
            )
            raise
        except Exception as e:
            logger.error(f"Error in Tabbly TTS synthesis: {e}")
            raise
        finally:
            # Best effort: flush even after a failure so already-pushed audio
            # is delivered; a failing flush must not hide the real error.
            try:
                output_emitter.flush()
            except Exception as e:
                logger.warning(f"Error flushing audio emitter: {e}")

    @classmethod
    def _push_full_chunks(
        cls, output_emitter: tts.AudioEmitter, pending: bytearray
    ) -> None:
        """Push every complete ``_CHUNK_SIZE`` slice of *pending* and drop it.

        Bytes beyond the last complete slice stay in *pending* for the next
        call. The consumed prefix is removed once, after all slices have been
        pushed, to avoid recopying the buffer per slice.
        """
        full = len(pending) - len(pending) % cls._CHUNK_SIZE
        for start in range(0, full, cls._CHUNK_SIZE):
            output_emitter.push(bytes(pending[start:start + cls._CHUNK_SIZE]))
        del pending[:full]

    @classmethod
    def _extract_pcm(cls, buffer: bytearray, *, final: bool = False) -> bytearray:
        """Remove and return the PCM that can be safely taken from *buffer*.

        WAV headers (``RIFF....WAVE`` up to and including the ``data`` chunk
        header) are discarded wherever they occur. Bytes that cannot be
        classified yet, such as a partial ``RIFF`` marker at the end or an
        incomplete header, are left in *buffer* until more data arrives.

        Args:
            buffer: Unclassified stream bytes; the consumed prefix is deleted
                in place.
            final: True when no more data will arrive. Undecidable trailing
                bytes are then returned as PCM and a confirmed but truncated
                header is dropped.

        Returns:
            The extracted PCM bytes (possibly empty).
        """
        pcm = bytearray()
        size = len(buffer)
        pos = 0  # everything before pos is consumed (copied or skipped)

        while pos < size:
            riff = buffer.find(b"RIFF", pos)

            if riff == -1:
                # No header candidate. Hold back a trailing "R", "RI" or
                # "RIF" in case the marker continues in the next chunk.
                held = 0
                if not final:
                    for length in (3, 2, 1):
                        if buffer.endswith(b"RIFF"[:length], pos):
                            held = length
                            break
                end = size - held
                pcm += buffer[pos:end]
                pos = end
                break

            # Everything before the candidate marker is plain PCM.
            pcm += buffer[pos:riff]
            pos = riff

            if size - riff < cls._RIFF_PREAMBLE_SIZE:
                # Too short to tell a header from PCM that contains "RIFF".
                if final:
                    pcm += buffer[pos:]
                    pos = size
                break

            if buffer[riff + 8:riff + 12] != b"WAVE":
                # "RIFF" occurred by chance inside the audio data.
                pcm += buffer[riff:riff + 4]
                pos = riff + 4
                continue

            # Confirmed WAV header: locate the start of its PCM payload.
            scan_end = min(size, riff + cls._MAX_HEADER_SCAN)
            marker = buffer.find(b"data", riff + cls._RIFF_PREAMBLE_SIZE, scan_end)
            if marker != -1:
                payload = marker + cls._DATA_CHUNK_HEADER_SIZE
                if payload > size:
                    # The 4-byte data size has not fully arrived yet.
                    if final:
                        pos = size  # truncated header: never audio
                    break
                logger.debug(
                    "Skipped %d-byte WAV header embedded in stream", payload - riff
                )
                pos = payload
                continue

            if final or size - riff >= cls._MAX_HEADER_SCAN:
                # No "data" marker: assume the canonical 44-byte header.
                pos = min(size, riff + cls._FALLBACK_HEADER_SIZE)
                continue

            # Header still incomplete; wait for more bytes.
            break

        del buffer[:pos]
        return pcm