# TTS_Gradio2 / app.py
import sys
import io, os, stat
import subprocess
import random
from zipfile import ZipFile
import uuid
import time
import torch
import torchaudio
# Download the UniDic dictionary required by MeCab (Japanese tokenization)
os.system('python -m unidic download')
# By using XTTS you agree to CPML license https://coqui.ai/cpml
os.environ["COQUI_TOS_AGREED"] = "1"
# langid is used to detect the language of the input text;
# most users expect the text to be treated as their own language
import langid
import base64
import csv
from io import StringIO
import datetime
import re
import gradio as gr
from scipy.io.wavfile import write
from pydub import AudioSegment
from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
from TTS.utils.manage import ModelManager
HF_TOKEN = os.environ.get("HF_TOKEN")
from huggingface_hub import HfApi
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline  # transformers components for sentiment analysis
# The API client can be used to restart the Space on an unrecoverable error
api = HfApi(token=HF_TOKEN)
repo_id = "coqui/xtts"
# Use a newer ffmpeg binary on Ubuntu 20.04 so the denoise filter works for microphone input
print("Export newer ffmpeg binary for denoise filter")
if not os.path.exists("ffmpeg"):
    ZipFile("ffmpeg.zip").extractall()
    print("Make ffmpeg binary executable")
    st = os.stat("ffmpeg")
    os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
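# Illustrative sketch (not called by this app): how the bundled ./ffmpeg binary
# could be invoked to denoise a microphone recording with ffmpeg's `afftdn`
# (FFT denoise) filter. The helper name and filter settings are assumptions.
def denoise_with_ffmpeg(input_path, output_path):
    """Run the local ffmpeg binary with an FFT-based denoise filter (sketch)."""
    subprocess.run(
        ["./ffmpeg", "-y", "-i", input_path, "-af", "afftdn", output_path],
        check=True,
    )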
# Load XTTS model
print("Loading TTS model...")
model = None # Initialize model to None
try:
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    ModelManager().download_model(model_name)
    model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
    print("XTTS downloaded to:", model_path)
    config = XttsConfig()
    config.load_json(os.path.join(model_path, "config.json"))
    model = Xtts.init_from_config(config)
    model.load_checkpoint(
        config,
        checkpoint_path=os.path.join(model_path, "model.pth"),
        vocab_path=os.path.join(model_path, "vocab.json"),
        eval=True,
        use_deepspeed=False,
    )
    if torch.cuda.is_available():
        model.cuda()
        print("XTTS v2 model loaded successfully to CUDA.")
    else:
        print("XTTS v2 model loaded successfully to CPU.")
except Exception as e:
    print(f"Error loading XTTS v2 model: {e}")
# Load sentiment models
print("Loading sentiment models...")
sentiment_tokenizer = None
sentiment_model = None
sentiment_analyzer = None
try:
    arabic_model_name = "aubmindlab/bert-base-arabertv02-twitter"
    sentiment_tokenizer = AutoTokenizer.from_pretrained(arabic_model_name)
    sentiment_model = AutoModelForSequenceClassification.from_pretrained("UBC-NLP/MARBERT")
    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    print("Sentiment models loaded.")
except Exception as e:
    print(f"Error loading sentiment models: {e}")
# Language detection
def detect_language_safely(text):
    try:
        # Treat any Arabic-script character as Arabic
        if any('\u0600' <= c <= '\u06FF' for c in text):
            return "ar"
        # langid.classify returns a (language, score) tuple
        return langid.classify(text)[0]
    except Exception:
        return "ar" if any('\u0600' <= c <= '\u06FF' for c in text) else "en"
# Sentiment to emotion mapping
def map_sentiment_to_emotion(sentiment, language="en"):
    if language == "ar":
        return "happy" if sentiment == "positive" else "sad" if sentiment == "negative" else "neutral"
    return "happy" if "positive" in sentiment.lower() else "sad" if "negative" in sentiment.lower() else "neutral"
# Simple Arabic sentiment analysis: keyword matching first, model fallback second
def arabic_sentiment_analysis(text):
    if sentiment_tokenizer is None or sentiment_model is None:
        return "neutral"
    pos_words = ["سعيد", "فرح", "ممتاز", "رائع", "جيد", "حب", "جميل", "نجاح", "أحسنت", "شكرا"]
    neg_words = ["حزين", "غاضب", "سيء", "فشل", "خطأ", "مشكلة", "صعب", "لا أحب", "سخيف", "مؤسف"]
    pos_count = sum(1 for word in pos_words if word in text.lower())
    neg_count = sum(1 for word in neg_words if word in text.lower())
    if pos_count > neg_count:
        return "positive"
    elif neg_count > pos_count:
        return "negative"
    else:
        try:
            inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
            outputs = sentiment_model(**inputs)
            sentiment_class = torch.argmax(outputs.logits).item()
            return ["negative", "neutral", "positive"][sentiment_class]
        except Exception as e:
            print(f"Error during Arabic sentiment analysis: {e}")
            return "neutral"
import tempfile
import numpy as np
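# Hypothetical helper (not called anywhere below): gr.Audio in numpy mode can
# return a stereo (n_samples, 2) array, while a mono reference is the safer
# input for speaker encoding. A minimal downmix sketch, assuming 2-D = stereo:
def to_mono(audio_data):
    """Average stereo channels into mono; pass 1-D audio through unchanged."""
    if audio_data.ndim == 2:
        return np.mean(audio_data, axis=1).astype(audio_data.dtype)
    return audio_data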
def tts_interface(text_input, speaker_audio):
    print("--- tts_interface function called ---")
    print(f"Text Input: {text_input}")
    print(f"Speaker Audio: {speaker_audio}")  # Log the (sampling_rate, data) tuple
    if model is None:
        print("Error: TTS model failed to load.")
        return "Error: TTS model failed to load. Check the logs."
    if speaker_audio is None:
        print("Error: Please upload a reference audio.")
        return "Error: Please upload a reference audio."
    language = detect_language_safely(text_input)
    emotion = "neutral"
    audio_output_path = "output.wav"
    print(f"Detected Language: {language}")
    if sentiment_analyzer is not None:
        try:
            print("Performing sentiment analysis...")
            if language == "en":
                sentiment_result = sentiment_analyzer(text_input)[0]
                emotion = map_sentiment_to_emotion(sentiment_result["label"])
                print(f"English Sentiment: {sentiment_result}, Emotion: {emotion}")
            else:
                sentiment_result = arabic_sentiment_analysis(text_input)
                emotion = map_sentiment_to_emotion(sentiment_result, language="ar")
                print(f"Arabic Sentiment: {sentiment_result}, Emotion: {emotion}")
        except Exception as e:
            print(f"Error during sentiment analysis: {e}")
    else:
        print("Sentiment analyzer not loaded.")
    temp_audio_file = None
    try:
        print("Attempting to generate audio using model.inference...")
        # Save the uploaded audio to a temporary file
        sampling_rate, audio_data = speaker_audio
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            temp_audio_path = tmp_file.name
        write(temp_audio_path, sampling_rate, audio_data)
        print(f"Temporary audio file saved to: {temp_audio_path}")
        temp_audio_file = temp_audio_path
        # Extract speaker embedding using the temporary file path
        try:
            (
                gpt_cond_latent,
                speaker_embedding,
            ) = model.get_conditioning_latents(
                audio_path=temp_audio_file,
                gpt_cond_len=30,
                gpt_cond_chunk_len=4,
                max_ref_length=60,
            )
            print("Speaker embedding extracted successfully.")
        except Exception as e:
            print("Speaker encoding error:", str(e))
            return f"Error during speaker encoding: {e}"
        # Perform inference
        out = model.inference(
            text=text_input,
            language=language,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            # emotion=emotion  # Emotion handling might need further investigation
        )
        # Save the generated audio at XTTS's 24 kHz output rate
        torchaudio.save(audio_output_path, torch.tensor(out["wav"]).unsqueeze(0).cpu(), 24000)
        print(f"Audio generated and saved to: {audio_output_path}")
        return audio_output_path
    except Exception as e:
        print(f"Error during TTS inference: {e}")
        return f"Error during TTS inference: {e}"
    finally:
        # Clean up the temporary audio file
        if temp_audio_file and os.path.exists(temp_audio_file):
            os.remove(temp_audio_file)
            print(f"Temporary audio file removed: {temp_audio_file}")
iface = gr.Interface(
    fn=tts_interface,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(sources=["upload"], label="Upload Reference Audio"),
    ],
    outputs=gr.Audio(label="Generated Audio", autoplay=True),
    title="XTTS v2 Text-to-Speech with Voice Cloning and Sentiment-Based Emotion",
    description="Enter text and upload a reference audio to clone the voice. The XTTS v2 model will generate speech with an emotion inferred from the sentiment of the text (English and Arabic supported).",
)
if __name__ == "__main__":
    iface.launch()