gencent / app /utils /text_to_speech.py
hjaved202's picture
Upload folder using huggingface_hub
a350173 verified
raw
history blame
776 Bytes
import edge_tts
import io
import numpy as np
from pydub import AudioSegment # Install with: pip install pydub
class TextToSpeech:
    """Synthesize speech with edge-tts and return it as a numpy sample array.

    Attributes:
        voice: Name of the voice passed to ``edge_tts.Communicate``.
    """

    def __init__(self, voice="en-US-AriaNeural"):
        """Store the voice name used for every later ``synthesize`` call."""
        self.voice = voice

    @staticmethod
    def _to_ndarray(raw_samples, channels):
        """Convert pydub's flat sample sequence into a numpy array.

        pydub serializes multi-channel audio as interleaved samples
        (L, R, L, R, ...).  Mono input is returned as a 1-D array; for more
        than one channel the samples are reshaped to ``(frames, channels)``
        so each column holds one channel.
        """
        samples = np.array(raw_samples)
        if channels > 1:
            samples = samples.reshape((-1, channels))
        return samples

    async def synthesize(self, text: str):
        """Convert ``text`` to speech.

        Args:
            text: The text to speak.

        Returns:
            A ``(sample_rate, samples)`` tuple, where ``sample_rate`` is the
            decoded audio's frame rate and ``samples`` is a numpy array of
            PCM samples: 1-D for mono, ``(frames, channels)`` otherwise.

        Raises:
            RuntimeError: If the stream finished without yielding any audio.
        """
        communicate = edge_tts.Communicate(text, self.voice)

        # The stream interleaves audio chunks with other chunk types;
        # keep only the audio payload.
        encoded = io.BytesIO()
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                encoded.write(chunk["data"])

        # Fail with a clear message instead of handing an empty buffer to the
        # decoder, which would surface as an opaque decode error.
        if encoded.tell() == 0:
            raise RuntimeError(
                f"No audio was produced for voice {self.voice!r}"
            )
        encoded.seek(0)

        # NOTE: decoding is synchronous and runs on the event loop.
        audio = AudioSegment.from_file(encoded, format="mp3")
        samples = self._to_ndarray(audio.get_array_of_samples(), audio.channels)
        return (audio.frame_rate, samples)