import edge_tts | |
import io | |
import numpy as np | |
from pydub import AudioSegment # Install with: pip install pydub | |
class TextToSpeech:
    """Text-to-speech helper built on ``edge_tts`` that returns NumPy samples."""

    def __init__(self, voice="en-US-AriaNeural"):
        """Remember the voice to use for synthesis.

        Args:
            voice: Voice identifier passed to ``edge_tts.Communicate``.
        """
        self.voice = voice

    async def synthesize(self, text):
        """Synthesize *text* and return the decoded audio.

        Args:
            text: The text handed to ``edge_tts.Communicate``.

        Returns:
            A ``(sample_rate, samples)`` tuple. ``sample_rate`` is the frame
            rate reported by the decoder; ``samples`` is a NumPy array that is
            1-D for mono audio and shaped ``(frames, channels)`` otherwise.

        Raises:
            RuntimeError: If the stream produced no audio data.
        """
        communicate = edge_tts.Communicate(text, self.voice)
        # The stream interleaves audio chunks with other chunk types; only
        # the "audio" ones carry payload bytes.
        audio_chunks = [
            chunk["data"]
            async for chunk in communicate.stream()
            if chunk["type"] == "audio"
        ]
        return self._decode_mp3(b"".join(audio_chunks))

    @staticmethod
    def _decode_mp3(audio_bytes):
        """Decode MP3 bytes into a ``(sample_rate, samples)`` tuple.

        Raises:
            RuntimeError: If *audio_bytes* is empty.
        """
        # Fail early with a clear message instead of letting the decoder
        # choke on an empty buffer.
        if not audio_bytes:
            raise RuntimeError(
                "No audio data was received from the text-to-speech stream"
            )
        audio = AudioSegment.from_file(io.BytesIO(audio_bytes), format="mp3")
        samples = np.array(audio.get_array_of_samples())
        # pydub serializes multi-channel audio as interleaved samples
        # (L, R, L, R, ...); give each channel its own column. Mono output
        # keeps its original 1-D shape.
        if audio.channels > 1:
            samples = samples.reshape((-1, audio.channels))
        return (audio.frame_rate, samples)