Moustafa1111111111 committed on
Commit cdd1279 · 1 Parent(s): 38d1b63

Add TTS gradio

Files changed (2)
  1. app.py +123 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,123 @@
+ import gradio as gr
+ import torch
+ from TTS.api import TTS
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+ from langdetect import detect
+
+ # Allowlist XttsConfig so torch.load doesn't raise UnpicklingError
+ from torch.serialization import add_safe_globals
+ from TTS.tts.configs.xtts_config import XttsConfig
+ add_safe_globals([XttsConfig])
+
+ # ✅ Monkey-patch torch.load to always use weights_only=False
+ _original_torch_load = torch.load
+ def patched_torch_load(*args, **kwargs):
+     kwargs["weights_only"] = False
+     return _original_torch_load(*args, **kwargs)
+ torch.load = patched_torch_load
+
+ print("Loading TTS model from Hugging Face Hub...")
+ try:
+     tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda" if torch.cuda.is_available() else "cpu")
+     print("XTTS v2 model loaded successfully from Hugging Face Hub.")
+ except Exception as e:
+     print(f"Error loading XTTS v2 model from Hugging Face Hub: {e}")
+     tts = None  # Set tts to None if loading fails
+
+ print("Loading sentiment models...")
+ try:
+     arabic_model_name = "UBC-NLP/MARBERT"  # Tokenizer and model must come from the same checkpoint so the vocabularies match
+     sentiment_tokenizer = AutoTokenizer.from_pretrained(arabic_model_name)
+     sentiment_model = AutoModelForSequenceClassification.from_pretrained(arabic_model_name)
+     sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
+     print("Sentiment models loaded.")
+ except Exception as e:
+     print(f"Error loading sentiment models: {e}")
+     sentiment_tokenizer = sentiment_model = sentiment_analyzer = None  # Keep all names defined so later guards don't raise NameError
+
+ # Language detection
+ def detect_language_safely(text):
+     try:
+         if any('\u0600' <= c <= '\u06FF' for c in text):
+             return "ar"
+         return detect(text)
+     except:
+         return "ar" if any('\u0600' <= c <= '\u06FF' for c in text) else "en"
+
+ # Sentiment to emotion mapping
+ def map_sentiment_to_emotion(sentiment, language="en"):
+     if language == "ar":
+         return "happy" if sentiment == "positive" else "sad" if sentiment == "negative" else "neutral"
+     return "happy" if "positive" in sentiment.lower() else "sad" if "negative" in sentiment.lower() else "neutral"
+
+ # Simple Arabic sentiment analysis
+ def arabic_sentiment_analysis(text):
+     if sentiment_tokenizer is None or sentiment_model is None:
+         return "neutral"  # Return neutral if models failed to load
+
+     pos_words = ["سعيد", "فرح", "ممتاز", "رائع", "جيد", "حب", "جميل", "نجاح", "أحسنت", "شكرا"]
+     neg_words = ["حزين", "غاضب", "سيء", "فشل", "خطأ", "مشكلة", "صعب", "لا أحب", "سخيف", "مؤسف"]
+     pos_count = sum(1 for word in pos_words if word in text.lower())
+     neg_count = sum(1 for word in neg_words if word in text.lower())
+
+     if pos_count > neg_count:
+         return "positive"
+     elif neg_count > pos_count:
+         return "negative"
+     else:
+         try:
+             inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
+             outputs = sentiment_model(**inputs)
+             sentiment_class = torch.argmax(outputs.logits).item()
+             return ["negative", "neutral", "positive"][sentiment_class]
+         except Exception as e:
+             print(f"Error during Arabic sentiment analysis: {e}")
+             return "neutral"
+
+ def tts_interface(text_input, speaker_audio):
+     if tts is None:
+         raise gr.Error("TTS model failed to load. Check the logs.")
+     if speaker_audio is None:
+         raise gr.Error("Please upload a reference audio.")
+
+     language = detect_language_safely(text_input)
+     emotion = "neutral"
+     audio_output_path = "output.wav"
+
+     if sentiment_analyzer is not None:
+         try:
+             if language == "en":
+                 sentiment_result = sentiment_analyzer(text_input)[0]
+                 emotion = map_sentiment_to_emotion(sentiment_result["label"])
+             else:
+                 sentiment_result = arabic_sentiment_analysis(text_input)
+                 emotion = map_sentiment_to_emotion(sentiment_result, language="ar")
+         except Exception as e:
+             print(f"Error during sentiment analysis: {e}")
+             pass
+
+     try:
+         tts.tts_to_file(
+             text=text_input,
+             file_path=audio_output_path,
+             emotion=emotion,
+             speaker_wav=speaker_audio,
+             language=language
+         )
+         return audio_output_path
+     except Exception as e:
+         raise gr.Error(f"Error during TTS: {e}")
+
+ iface = gr.Interface(
+     fn=tts_interface,
+     inputs=[
+         gr.Textbox(label="Enter Text"),
+         gr.Audio(sources=["upload"], type="filepath", label="Upload Reference Audio"),  # filepath so speaker_wav receives a path
+     ],
+     outputs=gr.Audio(label="Generated Audio", autoplay=True),
+     title="XTTS v2 Text-to-Speech with Voice Cloning and Sentiment-Based Emotion",
+     description="Enter text and upload a reference audio to clone the voice. The XTTS v2 model will generate speech with an emotion inferred from the sentiment of the text (English and Arabic supported).",
+ )
+
+ if __name__ == "__main__":
+     iface.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ gradio
+ torch
+ TTS
+ transformers
+ langdetect
+ pydantic
+ accelerate  # Often needed by transformers
+ sentencepiece  # Often needed by multilingual models