import io

import edge_tts
import numpy as np
from pydub import AudioSegment  # Install with: pip install pydub


class TextToSpeech:
    """Turn text into speech via edge-tts and return it as a NumPy array."""

    def __init__(self, voice: str = "en-US-AriaNeural"):
        """Store the voice used for later synthesis calls.

        Args:
            voice: Voice name handed to ``edge_tts.Communicate``.
        """
        self.voice = voice

    async def synthesize(self, text: str) -> "tuple[int, np.ndarray]":
        """Synthesize *text* and return the decoded audio samples.

        The audio chunks streamed by ``edge_tts.Communicate`` are
        concatenated, decoded as MP3 with pydub, and converted to a NumPy
        array.

        Args:
            text: The text to speak.

        Returns:
            A ``(sample_rate, samples)`` tuple. ``samples`` is 1-D for
            single-channel audio and has shape ``(frames, channels)``
            otherwise.

        Raises:
            Whatever ``edge_tts`` raises while streaming, or pydub raises
            while decoding (for example when no audio data was received).
        """
        communicate = edge_tts.Communicate(text, self.voice)

        # Collect raw audio bytes; non-audio chunks (any other "type") are
        # skipped.
        audio_bytes = bytearray()
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_bytes.extend(chunk["data"])

        # Convert to numpy array using pydub.
        # NOTE(review): this decode is a synchronous call inside a coroutine;
        # pydub presumably shells out to ffmpeg for MP3 -- confirm it is
        # installed and that the brief blocking is acceptable.
        audio = AudioSegment.from_file(io.BytesIO(audio_bytes), format="mp3")
        samples = np.array(audio.get_array_of_samples())

        # get_array_of_samples() interleaves channels in one flat array;
        # split them into columns so the array agrees with frame_rate.
        if audio.channels > 1:
            samples = samples.reshape((-1, audio.channels))

        return (audio.frame_rate, samples)