import os
import stat
import tempfile
from zipfile import ZipFile

import torch
import torchaudio

# Download the UniDic dictionary that MeCab needs for Japanese text.
os.system("python -m unidic download")

# By using XTTS you agree to the CPML license: https://coqui.ai/cpml
os.environ["COQUI_TOS_AGREED"] = "1"

# langid is used to detect the language of the input text.
import langid

import gradio as gr
from scipy.io.wavfile import write

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
from TTS.utils.manage import ModelManager

from huggingface_hub import HfApi
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

HF_TOKEN = os.environ.get("HF_TOKEN")

# The API client can be used to restart the Space on an unrecoverable error.
api = HfApi(token=HF_TOKEN)
repo_id = "coqui/xtts"

# Use a newer ffmpeg binary on Ubuntu 20.04 so the denoise filter is
# available for microphone input.
print("Export newer ffmpeg binary for denoise filter")
if not os.path.exists("ffmpeg"):
    ZipFile("ffmpeg.zip").extractall()
print("Make ffmpeg binary executable")
st = os.stat("ffmpeg")
os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)

# Load the XTTS model.
print("Loading TTS model...")
model = None  # Stays None if loading fails; checked in tts_interface.
try:
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    ModelManager().download_model(model_name)
    model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
    print("XTTS downloaded to:", model_path)

    config = XttsConfig()
    config.load_json(os.path.join(model_path, "config.json"))
    model = Xtts.init_from_config(config)
    model.load_checkpoint(
        config,
        checkpoint_path=os.path.join(model_path, "model.pth"),
        vocab_path=os.path.join(model_path, "vocab.json"),
        eval=True,
        use_deepspeed=False,
    )
    if torch.cuda.is_available():
        model.cuda()
        print("XTTS v2 model loaded successfully to CUDA.")
    else:
        print("XTTS v2 model loaded successfully to CPU.")
except Exception as e:
    print(f"Error loading XTTS v2 model: {e}")

# Load the sentiment models.
print("Loading sentiment models...")
sentiment_tokenizer = None
sentiment_model = None
sentiment_analyzer = None
try:
    # The Arabic tokenizer and model must come from the same checkpoint,
    # otherwise the token ids will not match the model's vocabulary.
    arabic_model_name = "UBC-NLP/MARBERT"
    sentiment_tokenizer = AutoTokenizer.from_pretrained(arabic_model_name)
    sentiment_model = AutoModelForSequenceClassification.from_pretrained(arabic_model_name)
    sentiment_analyzer = pipeline(
        "sentiment-analysis",
        model="distilbert-base-uncased-finetuned-sst-2-english",
    )
    print("Sentiment models loaded.")
except Exception as e:
    print(f"Error loading sentiment models: {e}")


# Language detection: Arabic script short-circuits langid, which can be
# unreliable on short strings.
def detect_language_safely(text):
    try:
        if any("\u0600" <= c <= "\u06FF" for c in text):
            return "ar"
        return langid.classify(text)[0]  # classify() returns (language, score)
    except Exception:
        return "ar" if any("\u0600" <= c <= "\u06FF" for c in text) else "en"


# Map a sentiment label to one of the emotions used by this app.
def map_sentiment_to_emotion(sentiment, language="en"):
    if language == "ar":
        return "happy" if sentiment == "positive" else "sad" if sentiment == "negative" else "neutral"
    return "happy" if "positive" in sentiment.lower() else "sad" if "negative" in sentiment.lower() else "neutral"
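
# A minimal self-check for the two helpers above. It is not called by the
# app; the example strings are illustrative assumptions, not app inputs.
def _demo_language_and_emotion():
    assert detect_language_safely("مرحبا بالعالم") == "ar"  # Arabic script short-circuits langid
    assert map_sentiment_to_emotion("POSITIVE") == "happy"  # SST-2 style English label
    assert map_sentiment_to_emotion("negative", language="ar") == "sad"
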
# Simple Arabic sentiment analysis: a word-list heuristic first, with the
# MARBERT model as a tie-breaker.
def arabic_sentiment_analysis(text):
    if sentiment_tokenizer is None or sentiment_model is None:
        return "neutral"
    pos_words = ["سعيد", "فرح", "ممتاز", "رائع", "جيد", "حب", "جميل", "نجاح", "أحسنت", "شكرا"]
    neg_words = ["حزين", "غاضب", "سيء", "فشل", "خطأ", "مشكلة", "صعب", "لا أحب", "سخيف", "مؤسف"]
    pos_count = sum(1 for word in pos_words if word in text.lower())
    neg_count = sum(1 for word in neg_words if word in text.lower())
    if pos_count > neg_count:
        return "positive"
    elif neg_count > pos_count:
        return "negative"
    else:
        try:
            inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
            outputs = sentiment_model(**inputs)
            sentiment_class = torch.argmax(outputs.logits).item()
            return ["negative", "neutral", "positive"][sentiment_class]
        except Exception as e:
            print(f"Error during Arabic sentiment analysis: {e}")
            return "neutral"


def tts_interface(text_input, speaker_audio):
    print("--- tts_interface function called ---")
    print(f"Text Input: {text_input}")
    print(f"Speaker Audio: {speaker_audio}")  # Log the (sample_rate, data) tuple

    if model is None:
        print("Error: TTS model failed to load.")
        return "Error: TTS model failed to load. Check the logs."
    if speaker_audio is None:
        print("Error: Please upload a reference audio.")
        return "Error: Please upload a reference audio."

    language = detect_language_safely(text_input)
    emotion = "neutral"
    audio_output_path = "output.wav"
    print(f"Detected Language: {language}")

    if sentiment_analyzer is not None:
        try:
            print("Performing sentiment analysis...")
            if language == "en":
                sentiment_result = sentiment_analyzer(text_input)[0]
                emotion = map_sentiment_to_emotion(sentiment_result["label"])
                print(f"English Sentiment: {sentiment_result}, Emotion: {emotion}")
            else:
                sentiment_result = arabic_sentiment_analysis(text_input)
                emotion = map_sentiment_to_emotion(sentiment_result, language="ar")
                print(f"Arabic Sentiment: {sentiment_result}, Emotion: {emotion}")
        except Exception as e:
            print(f"Error during sentiment analysis: {e}")
    else:
        print("Sentiment analyzer not loaded.")

    temp_audio_file = None
    try:
        print("Attempting to generate audio using model.inference...")

        # Save the uploaded audio to a temporary file.
        sampling_rate, audio_data = speaker_audio
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            temp_audio_path = tmp_file.name
        write(temp_audio_path, sampling_rate, audio_data)
        print(f"Temporary audio file saved to: {temp_audio_path}")
        temp_audio_file = temp_audio_path

        # Extract the speaker embedding from the reference audio.
        try:
            gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
                audio_path=temp_audio_file,
                gpt_cond_len=30,
                gpt_cond_chunk_len=4,
                max_ref_length=60,
            )
            print("Speaker embedding extracted successfully.")
        except Exception as e:
            print("Speaker encoding error:", str(e))
            return f"Error during speaker encoding: {e}"

        # Perform inference.
        out = model.inference(
            text=text_input,
            language=language,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            # emotion=emotion  # XTTS v2 inference takes no emotion argument;
            # the inferred emotion is currently informational only.
        )

        # Save the generated audio (XTTS v2 outputs 24 kHz waveforms).
        torchaudio.save(audio_output_path, torch.tensor(out["wav"]).unsqueeze(0).cpu(), 24000)
        print(f"Audio generated and saved to: {audio_output_path}")
        return audio_output_path
    except Exception as e:
        print(f"Error during TTS inference: {e}")
        return f"Error during TTS inference: {e}"
    finally:
        # Clean up the temporary reference audio file.
        if temp_audio_file and os.path.exists(temp_audio_file):
            os.remove(temp_audio_file)
            print(f"Temporary audio file removed: {temp_audio_file}")
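
# A minimal programmatic usage sketch for tts_interface (not called by the
# app). "reference.wav" is a hypothetical local file; Gradio normally passes
# `speaker_audio` as a (sample_rate, numpy_array) tuple exactly like this.
def _demo_tts_interface():
    from scipy.io.wavfile import read
    sample_rate, data = read("reference.wav")  # hypothetical reference clip
    result = tts_interface("Hello there!", (sample_rate, data))
    print(result)  # "output.wav" on success, an error string otherwise
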
iface = gr.Interface(
    fn=tts_interface,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Audio(sources=["upload"], label="Upload Reference Audio"),
    ],
    outputs=gr.Audio(label="Generated Audio", autoplay=True),
    title="XTTS v2 Text-to-Speech with Voice Cloning and Sentiment-Based Emotion",
    description=(
        "Enter text and upload a reference audio clip to clone its voice. "
        "The XTTS v2 model generates speech with an emotion inferred from the "
        "sentiment of the text (English and Arabic are supported)."
    ),
)

if __name__ == "__main__":
    iface.launch()
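
# Usage sketch (assumptions: dependencies installed and ffmpeg.zip present;
# a GPU is optional):
#   $ python app.py
# Gradio prints a local URL; passing share=True to iface.launch() would also
# create a public link. Upload a short, clean reference clip; the
# max_ref_length=60 setting above caps the reference used at 60 seconds.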