import os
import stat
import tempfile
from zipfile import ZipFile

import torch
import torchaudio

# Download the UniDic dictionary needed by MeCab (Japanese tokenization)
os.system('python -m unidic download')

# By using XTTS you agree to the CPML license: https://coqui.ai/cpml
os.environ["COQUI_TOS_AGREED"] = "1"

# langid is used to detect the language of longer text.
# Most users expect text in their own language; there is a checkbox to disable detection.
import langid

import gradio as gr
from scipy.io.wavfile import write

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
from TTS.utils.manage import ModelManager

from huggingface_hub import HfApi
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

HF_TOKEN = os.environ.get("HF_TOKEN")
# The Hub API is used to restart the Space on an unrecoverable error
api = HfApi(token=HF_TOKEN)
repo_id = "coqui/xtts"
# Use a newer ffmpeg binary on Ubuntu 20 so the denoise filter works for microphone input
print("Export newer ffmpeg binary for denoise filter")
if not os.path.exists("ffmpeg"):
    ZipFile("ffmpeg.zip").extractall()
    print("Make ffmpeg binary executable")
    st = os.stat("ffmpeg")
    os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
# Load the XTTS model
print("Loading TTS model...")
model = None  # Remains None if loading fails; checked in tts_interface
try:
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    ModelManager().download_model(model_name)
    model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
    print("XTTS downloaded to:", model_path)

    config = XttsConfig()
    config.load_json(os.path.join(model_path, "config.json"))
    model = Xtts.init_from_config(config)
    model.load_checkpoint(
        config,
        checkpoint_path=os.path.join(model_path, "model.pth"),
        vocab_path=os.path.join(model_path, "vocab.json"),
        eval=True,
        use_deepspeed=False,
    )
    if torch.cuda.is_available():
        model.cuda()
        print("XTTS v2 model loaded successfully to CUDA.")
    else:
        print("XTTS v2 model loaded successfully to CPU.")
except Exception as e:
    print(f"Error loading XTTS v2 model: {e}")
# Load sentiment models
print("Loading sentiment models...")
sentiment_tokenizer = None
sentiment_model = None
sentiment_analyzer = None
try:
    # The tokenizer must match the checkpoint it is used with, so both the
    # Arabic tokenizer and model come from UBC-NLP/MARBERT. Note: MARBERT is a
    # base checkpoint, so AutoModelForSequenceClassification attaches a freshly
    # initialized (untrained) classification head.
    arabic_model_name = "UBC-NLP/MARBERT"
    sentiment_tokenizer = AutoTokenizer.from_pretrained(arabic_model_name)
    sentiment_model = AutoModelForSequenceClassification.from_pretrained(arabic_model_name)
    sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    print("Sentiment models loaded.")
except Exception as e:
    print(f"Error loading sentiment models: {e}")
# Language detection
def detect_language_safely(text):
    try:
        # Short-circuit on Arabic script (U+0600-U+06FF) before asking langid
        if any('\u0600' <= c <= '\u06FF' for c in text):
            return "ar"
        return langid.classify(text)[0]  # langid.classify returns (language, score)
    except Exception:
        return "ar" if any('\u0600' <= c <= '\u06FF' for c in text) else "en"
# Map a sentiment label to an emotion tag
def map_sentiment_to_emotion(sentiment, language="en"):
    if language == "ar":
        return "happy" if sentiment == "positive" else "sad" if sentiment == "negative" else "neutral"
    sentiment = sentiment.lower()
    return "happy" if "positive" in sentiment else "sad" if "negative" in sentiment else "neutral"
# Simple Arabic sentiment analysis: lexicon first, model as a tiebreaker
def arabic_sentiment_analysis(text):
    if sentiment_tokenizer is None or sentiment_model is None:
        return "neutral"
    pos_words = ["سعيد", "فرح", "ممتاز", "رائع", "جيد", "حب", "جميل", "نجاح", "أحسنت", "شكرا"]
    neg_words = ["حزين", "غاضب", "سيء", "فشل", "خطأ", "مشكلة", "صعب", "لا أحب", "سخيف", "مؤسف"]
    # Substring matches; Arabic script has no letter case, so no normalization is needed
    pos_count = sum(1 for word in pos_words if word in text)
    neg_count = sum(1 for word in neg_words if word in text)
    if pos_count > neg_count:
        return "positive"
    elif neg_count > pos_count:
        return "negative"
    else:
        try:
            inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
            outputs = sentiment_model(**inputs)
            sentiment_class = torch.argmax(outputs.logits).item()
            return ["negative", "neutral", "positive"][sentiment_class]
        except Exception as e:
            print(f"Error during Arabic sentiment analysis: {e}")
            return "neutral"
def tts_interface(text_input, speaker_audio):
    print("--- tts_interface function called ---")
    print(f"Text Input: {text_input}")
    print(f"Speaker Audio: {speaker_audio}")  # (sampling_rate, numpy_array) tuple from gr.Audio
    if model is None:
        print("Error: TTS model failed to load.")
        # Raise instead of returning a string: the output component is gr.Audio,
        # which would treat a returned string as a file path
        raise gr.Error("TTS model failed to load. Check the logs.")
    if speaker_audio is None:
        print("Error: Please upload a reference audio.")
        raise gr.Error("Please upload a reference audio.")
    language = detect_language_safely(text_input)
    emotion = "neutral"
    audio_output_path = "output.wav"  # fixed name: concurrent requests would overwrite it
    print(f"Detected Language: {language}")
    if sentiment_analyzer is not None:
        try:
            print("Performing sentiment analysis...")
            if language == "en":
                sentiment_result = sentiment_analyzer(text_input)[0]
                emotion = map_sentiment_to_emotion(sentiment_result["label"])
                print(f"English Sentiment: {sentiment_result}, Emotion: {emotion}")
            else:
                sentiment_result = arabic_sentiment_analysis(text_input)
                emotion = map_sentiment_to_emotion(sentiment_result, language="ar")
                print(f"Arabic Sentiment: {sentiment_result}, Emotion: {emotion}")
        except Exception as e:
            print(f"Error during sentiment analysis: {e}")
    else:
        print("Sentiment analyzer not loaded.")
    temp_audio_file = None
    try:
        print("Attempting to generate audio using model.inference...")
        # Save the uploaded audio to a temporary file
        sampling_rate, audio_data = speaker_audio
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            temp_audio_path = tmp_file.name
        write(temp_audio_path, sampling_rate, audio_data)
        print(f"Temporary audio file saved to: {temp_audio_path}")
        temp_audio_file = temp_audio_path

        # Extract the speaker conditioning latents and embedding from the reference audio
        try:
            (
                gpt_cond_latent,
                speaker_embedding,
            ) = model.get_conditioning_latents(
                audio_path=temp_audio_file,
                gpt_cond_len=30,
                gpt_cond_chunk_len=4,
                max_ref_length=60,
            )
            print("Speaker embedding extracted successfully.")
        except Exception as e:
            print("Speaker encoding error:", str(e))
            raise gr.Error(f"Error during speaker encoding: {e}")
        # Perform inference
        out = model.inference(
            text=text_input,
            language=language,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            # emotion=emotion  # emotion handling might need further investigation
        )
        # Save the generated audio at XTTS's native 24 kHz sample rate
        torchaudio.save(audio_output_path, torch.tensor(out["wav"]).unsqueeze(0).cpu(), 24000)
        print(f"Audio generated and saved to: {audio_output_path}")
        return audio_output_path
    except gr.Error:
        # Propagate user-facing errors raised above without re-wrapping them
        raise
    except Exception as e:
        print(f"Error during TTS inference: {e}")
        raise gr.Error(f"Error during TTS inference: {e}")
    finally:
        # Clean up the temporary audio file
        if temp_audio_file:
            os.remove(temp_audio_file)
            print(f"Temporary audio file removed: {temp_audio_file}")
iface = gr.Interface(
    fn=tts_interface,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(sources=["upload"], label="Upload Reference Audio"),
    ],
    outputs=gr.Audio(label="Generated Audio", autoplay=True),
    title="XTTS v2 Text-to-Speech with Voice Cloning and Sentiment-Based Emotion",
    description="Enter text and upload a reference audio to clone the voice. The XTTS v2 model will generate speech with an emotion inferred from the sentiment of the text (English and Arabic supported).",
)
if __name__ == "__main__":
    iface.launch()
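# When running outside Spaces, a temporary public URL can be requested with:
#   iface.launch(share=True)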