podcastgen

Paused

App Files Files Community

Rausda6 committed on May 22

Commit

2114e35

verified ·

1 Parent(s): 7118f9e

Update app.py

Browse files

Files changed (1) hide show

app.py +392 -316

app.py CHANGED Viewed

@@ -9,405 +9,481 @@ import os
 import time
 import mimetypes
 import torch
 from typing import List, Dict
-from transformers import AutoTokenizer, AutoModelForCausalLM
 # Constants
 MAX_FILE_SIZE_MB = 20
-MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024  # Convert MB to bytes
 MODEL_ID = "unsloth/gemma-3-1b-pt"
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto"
-).eval()
 class PodcastGenerator:
     def __init__(self):
-        pass
-    async def generate_script(self, prompt: str, language: str,  file_obj=None, progress=None) -> Dict:
-        example = """
-{
-    "topic": "AGI",
-    "podcast": [
-        {
-            "speaker": 2,
-            "line": "So, AGI, huh? Seems like everyone's talking about it these days."
-        },
-        {
-            "speaker": 1,
-            "line": "Yeah, it's definitely having a moment, isn't it?"
-        },
-        {
-            "speaker": 2,
-            "line": "It is and for good reason, right? I mean, you've been digging into this stuff, listening to the podcasts and everything. What really stood out to you? What got you hooked?"
-        },
-        {
-            "speaker": 1,
-            "line": "I like that. It really is."
-        },
-        {
-            "speaker": 2,
-            "line": "And honestly, that's a responsibility that extends beyond just the researchers and the policymakers."
-        },
-        {
-            "speaker": 1,
-            "line": "100%"
-        },
-        {
-            "speaker": 2,
-            "line": "So to everyone listening out there I'll leave you with this. As AGI continues to develop, what role do you want to play in shaping its future?"
-        },
-        {
-            "speaker": 1,
-            "line": "That's a question worth pondering."
-        },
-        {
-            "speaker": 2,
-            "line": "It certainly is and on that note, we'll wrap up this deep dive. Thanks for listening, everyone."
-        },
-        {
-            "speaker": 1,
-            "line": "Peace."
         }
-    ]
-}
-        """
         if language == "Auto Detect":
-            language_instruction = "- The podcast MUST be in the same language as the user input."
         else:
-            language_instruction = f"- The podcast MUST be in {language} language"
-        system_prompt = f"""
-You are a professional podcast generator. Your task is to generate a professional podcast script based on the user input.
-{language_instruction}
-- The podcast should have 2 speakers.
-- The podcast should be long.
-- Do not use names for the speakers.
-- The podcast should be interesting, lively, and engaging, and hook the listener from the start.
-- The input text might be disorganized or unformatted, originating from sources like PDFs or text files. Ignore any formatting inconsistencies or irrelevant details; your task is to distill the essential points, identify key definitions, and highlight intriguing facts that would be suitable for discussion in a podcast.
-- The script must be in JSON format.
-Follow this example structure:
-{example}
-"""
-        # Construct system and user prompt
-        if prompt and file_obj:
-            user_prompt = f"Please generate a podcast script based on the uploaded file following user input:\n{prompt}"
-        elif prompt:
-            user_prompt = f"Please generate a podcast script based on the following user input:\n{prompt}"
-        else:
-            user_prompt = "Please generate a podcast script based on the uploaded file."
-        # NOTE: file_obj cannot be passed to a text-only LLM
-        if file_obj:
-            print("Warning: Uploaded file is ignored in this version because external LLM does not support file input.")
-        # Build prompt
-        full_prompt = f"""{system_prompt}
-{user_prompt}
-Return the result strictly as a JSON object in the format:
-{{
-  "topic": "{prompt}",
-  "podcast": [
-    {{ "speaker": 1, "line": "..." }},
-    {{ "speaker": 2, "line": "..." }}
-  ]
-}}
-"""
         try:
             if progress:
                 progress(0.3, "Generating podcast script...")
-            inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
-            output = model.generate(**inputs, max_new_tokens=1024)
-            text = tokenizer.decode(output[0], skip_special_tokens=True)
-        except Exception as e:
-            raise Exception(f"Failed to generate podcast script: {e}")
-        print(f"Generated podcast script:\n{text}")
-        if progress:
-            progress(0.4, "Script generated successfully!")
-        try:
-            return json.loads(text)
-        except json.JSONDecodeError:
-            raise Exception("The model did not return valid JSON. Please refine the prompt.")
-    async def _read_file_bytes(self, file_obj) -> bytes:
-        """Read file bytes from a file object"""
-        # Check file size before reading
-        if hasattr(file_obj, 'size'):
-            file_size = file_obj.size
-        else:
-            file_size = os.path.getsize(file_obj.name)
-        if file_size > MAX_FILE_SIZE_BYTES:
-            raise Exception(f"File size exceeds the {MAX_FILE_SIZE_MB}MB limit. Please upload a smaller file.")
-        if hasattr(file_obj, 'read'):
-            return file_obj.read()
-        else:
-            async with aiofiles.open(file_obj.name, 'rb') as f:
-                return await f.read()
-    def _get_mime_type(self, filename: str) -> str:
-        """Determine MIME type based on file extension"""
-        ext = os.path.splitext(filename)[1].lower()
-        if ext == '.pdf':
-            return "application/pdf"
-        elif ext == '.txt':
-            return "text/plain"
-        else:
-            # Fallback to the default mime type detector
-            mime_type, _ = mimetypes.guess_type(filename)
-            return mime_type or "application/octet-stream"
     async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
         voice = speaker1 if speaker == 1 else speaker2
         speech = edge_tts.Communicate(text, voice)
-        temp_filename = f"temp_{uuid.uuid4()}.wav"
-        try:
-            # Add timeout to TTS generation
-            await asyncio.wait_for(speech.save(temp_filename), timeout=30)  # 30 seconds timeout
-            return temp_filename
-        except asyncio.TimeoutError:
-            if os.path.exists(temp_filename):
-                os.remove(temp_filename)
-            raise Exception("Text-to-speech generation timed out. Please try with a shorter text.")
-        except Exception as e:
-            if os.path.exists(temp_filename):
-                os.remove(temp_filename)
-            raise e
     async def combine_audio_files(self, audio_files: List[str], progress=None) -> str:
         if progress:
             progress(0.9, "Combining audio files...")
-        combined_audio = AudioSegment.empty()
-        for audio_file in audio_files:
-            combined_audio += AudioSegment.from_file(audio_file)
-            os.remove(audio_file)  # Clean up temporary files
-        output_filename = f"output_{uuid.uuid4()}.wav"
-        combined_audio.export(output_filename, format="wav")
-        if progress:
-            progress(1.0, "Podcast generated successfully!")
-        return output_filename
     async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, file_obj=None, progress=None) -> str:
         try:
             if progress:
                 progress(0.1, "Starting podcast generation...")
-            # Set overall timeout for the entire process
-            return await asyncio.wait_for(
-                self._generate_podcast_internal(input_text, language, speaker1, speaker2, file_obj, progress),
-                timeout=600  # 10 minutes total timeout
-            )
-        except asyncio.TimeoutError:
-            raise Exception("The podcast generation process timed out. Please try with shorter text or try again later.")
-        except Exception as e:
-            raise Exception(f"Error generating podcast: {str(e)}")
-    async def _generate_podcast_internal(self, input_text: str, language: str, speaker1: str, speaker2: str, file_obj=None, progress=None) -> str:
-        if progress:
-            progress(0.2, "Generating podcast script...")
-        podcast_json = await self.generate_script(input_text, language,  file_obj, progress)
-        if progress:
-            progress(0.5, "Converting text to speech...")
-        # Process TTS in batches for concurrent processing
-        audio_files = []
-        total_lines = len(podcast_json['podcast'])
-        # Define batch size to control concurrency
-        batch_size = 10  # Adjust based on system resources
-        # Process in batches
-        for batch_start in range(0, total_lines, batch_size):
-            batch_end = min(batch_start + batch_size, total_lines)
-            batch = podcast_json['podcast'][batch_start:batch_end]
-            # Create tasks for concurrent processing
-            tts_tasks = []
-            for item in batch:
-                tts_task = self.tts_generate(item['line'], item['speaker'], speaker1, speaker2)
-                tts_tasks.append(tts_task)
-            try:
-                # Process batch concurrently
-                batch_results = await asyncio.gather(*tts_tasks, return_exceptions=True)
-                # Check for exceptions and handle results
-                for i, result in enumerate(batch_results):
-                    if isinstance(result, Exception):
-                        # Clean up any files already created
-                        for file in audio_files:
-                            if os.path.exists(file):
-                                os.remove(file)
-                        raise Exception(f"Error generating speech: {str(result)}")
-                    else:
-                        audio_files.append(result)
-                # Update progress
-                if progress:
-                    current_progress = 0.5 + (0.4 * (batch_end / total_lines))
-                    progress(current_progress, f"Processed {batch_end}/{total_lines} speech segments...")
-            except Exception as e:
-                # Clean up any files already created
-                for file in audio_files:
-                    if os.path.exists(file):
-                        os.remove(file)
-                raise Exception(f"Error in batch TTS generation: {str(e)}")
-        combined_audio = await self.combine_audio_files(audio_files, progress)
-        return combined_audio
-async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str,  progress=None) -> str:
-    start_time = time.time()
-    voice_names = {
-        "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
-        "Ava - English (United States)": "en-US-AvaMultilingualNeural",
-        "Brian - English (United States)": "en-US-BrianMultilingualNeural",
-        "Emma - English (United States)": "en-US-EmmaMultilingualNeural",
-        "Florian - German (Germany)": "de-DE-FlorianMultilingualNeural",
-        "Seraphina - German (Germany)": "de-DE-SeraphinaMultilingualNeural",
-        "Remy - French (France)": "fr-FR-RemyMultilingualNeural",
-        "Vivienne - French (France)": "fr-FR-VivienneMultilingualNeural"
-    }
-    speaker1 = voice_names[speaker1]
-    speaker2 = voice_names[speaker2]
     try:
         if progress:
             progress(0.05, "Processing input...")
-        api_key = ""  # No API key needed for local model
         podcast_generator = PodcastGenerator()
-        podcast = await podcast_generator.generate_podcast(input_text, language, speaker1, speaker2, input_file, progress)
         end_time = time.time()
-        print(f"Total podcast generation time: {end_time - start_time:.2f} seconds")
-        return podcast
     except Exception as e:
-        # Ensure we show a user-friendly error
         error_msg = str(e)
-        if "rate limit" in error_msg.lower():
-            raise Exception("Rate limit exceeded. Please try again later or use your own API key.")
-        elif "timeout" in error_msg.lower():
-            raise Exception("The request timed out. This could be due to server load or the length of your input. Please try again with shorter text.")
-        else:
-            raise Exception(f"Error: {error_msg}")
-# Gradio UI
 def generate_podcast_gradio(input_text, input_file, language, speaker1, speaker2):
-    progress = gr.Progress()
-    # Handle the file if uploaded
-    file_obj = None
-    if input_file is not None:
-        file_obj = input_file
-    # Use the progress function from Gradio
-    def progress_callback(value, text):
-        progress(value, text)
-    # Run the async function in the event loop
-    result = asyncio.run(process_input(
-        input_text,
-        file_obj,
-        language,
-        speaker1,
-        speaker2,
-        progress_callback
-    ))
-    return result
-def main():
-    # Define language options
     language_options = [
-        "Auto Detect",
-        "Afrikaans", "Albanian", "Amharic", "Arabic", "Armenian", "Azerbaijani",
-        "Bahasa Indonesian", "Bangla", "Basque", "Bengali", "Bosnian", "Bulgarian",
-        "Burmese", "Catalan", "Chinese Cantonese", "Chinese Mandarin",
-        "Chinese Taiwanese", "Croatian", "Czech", "Danish", "Dutch", "English",
-        "Estonian", "Filipino", "Finnish", "French", "Galician", "Georgian",
-        "German", "Greek", "Hebrew", "Hindi", "Hungarian", "Icelandic", "Irish",
-        "Italian", "Japanese", "Javanese", "Kannada", "Kazakh", "Khmer", "Korean",
-        "Lao", "Latvian", "Lithuanian", "Macedonian", "Malay", "Malayalam",
-        "Maltese", "Mongolian", "Nepali", "Norwegian Bokmål", "Pashto", "Persian",
-        "Polish", "Portuguese", "Romanian", "Russian", "Serbian", "Slovak", "Slovene", "Somali", "Spanish", "Sundanese", "Swahili",
-        "Swedish", "Tamil", "Telugu", "Thai", "Turkish", "Ukrainian", "Urdu",
-        "Uzbek", "Vietnamese", "Welsh", "Zulu"
     ]
-    # Define voice options
-    voice_options = [
-        "Andrew - English (United States)",
-        "Ava - English (United States)",
-        "Brian - English (United States)",
-        "Emma - English (United States)",
-        "Florian - German (Germany)",
-        "Seraphina - German (Germany)",
-        "Remy - French (France)",
-        "Vivienne - French (France)"
-    ]
-    # Create Gradio interface
-    with gr.Blocks(title="PodcastGen 2🎙️") as demo:
-        gr.Markdown("# PodcastGen 2🎙️")
-        gr.Markdown("Generate a 2-speaker podcast from text input or documents!")
         with gr.Row():
             with gr.Column(scale=2):
-                input_text = gr.Textbox(label="Input Text", lines=10, placeholder="Enter text for podcast generation...")
             with gr.Column(scale=1):
-                input_file = gr.File(label="Or Upload a PDF or TXT file", file_types=[".pdf", ".txt"])
         with gr.Row():
-            with gr.Column():
-                language = gr.Dropdown(label="Language", choices=language_options, value="Auto Detect")
-            with gr.Column():
-                speaker1 = gr.Dropdown(label="Speaker 1 Voice", choices=voice_options, value="Andrew - English (United States)")
-                speaker2 = gr.Dropdown(label="Speaker 2 Voice", choices=voice_options, value="Ava - English (United States)")
-        generate_btn = gr.Button("Generate Podcast", variant="primary")
-        with gr.Row():
-            output_audio = gr.Audio(label="Generated Podcast", type="filepath", format="wav")
         generate_btn.click(
             fn=generate_podcast_gradio,
             inputs=[input_text, input_file, language, speaker1, speaker2],
-            outputs=[output_audio]
         )
-    demo.launch()
 if __name__ == "__main__":
-    main()

 import time
 import mimetypes
 import torch
+import re
 from typing import List, Dict
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
 # Constants
 MAX_FILE_SIZE_MB = 20
+MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024
 MODEL_ID = "unsloth/gemma-3-1b-pt"
+# Initialize model with proper error handling
+try:
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+        device_map="auto",
+        trust_remote_code=True
+    ).eval()
+    # Configure generation parameters
+    generation_config = GenerationConfig(
+        max_new_tokens=1024,
+        temperature=0.7,
+        top_p=0.9,
+        do_sample=True,
+        pad_token_id=tokenizer.pad_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+    )
+    print(f"Model loaded successfully on device: {model.device}")
+except Exception as e:
+    print(f"Model initialization error: {e}")
+    model = None
+    tokenizer = None
+    generation_config = None
 class PodcastGenerator:
     def __init__(self):
+        self.model = model
+        self.tokenizer = tokenizer
+        self.generation_config = generation_config
+    def extract_json_from_text(self, text: str) -> Dict:
+        """Extract JSON from model output using regex patterns"""
+        # Remove the input prompt from the output
+        # Look for JSON-like structures
+        json_patterns = [
+            r'\{[^{}]*"topic"[^{}]*"podcast"[^{}]*\[.*?\]\s*\}',
+            r'\{.*?"topic".*?"podcast".*?\[.*?\].*?\}',
+        ]
+        for pattern in json_patterns:
+            matches = re.findall(pattern, text, re.DOTALL | re.IGNORECASE)
+            for match in matches:
+                try:
+                    # Clean up the match
+                    cleaned_match = match.strip()
+                    return json.loads(cleaned_match)
+                except json.JSONDecodeError:
+                    continue
+        # If no valid JSON found, create a fallback structure
+        return self.create_fallback_podcast(text)
+    def create_fallback_podcast(self, text: str) -> Dict:
+        """Create a basic podcast structure when JSON parsing fails"""
+        # Extract meaningful sentences from the text
+        sentences = [s.strip() for s in text.split('.') if len(s.strip()) > 10]
+        if not sentences:
+            sentences = ["Let's discuss this interesting topic.", "That's a great point to consider."]
+        podcast_lines = []
+        for i, sentence in enumerate(sentences[:10]):  # Limit to 10 exchanges
+            speaker = (i % 2) + 1
+            podcast_lines.append({
+                "speaker": speaker,
+                "line": sentence + "." if not sentence.endswith('.') else sentence
+            })
+        return {
+            "topic": "Generated Discussion",
+            "podcast": podcast_lines
+        }
+    async def generate_script(self, prompt: str, language: str, file_obj=None, progress=None) -> Dict:
+        if not self.model or not self.tokenizer:
+            raise Exception("Model not properly initialized. Please check model loading.")
+        example_json = {
+            "topic": "AGI",
+            "podcast": [
+                {"speaker": 1, "line": "So, AGI, huh? Seems like everyone's talking about it these days."},
+                {"speaker": 2, "line": "Yeah, it's definitely having a moment, isn't it?"},
+                {"speaker": 1, "line": "It really is. What got you hooked on this topic?"},
+                {"speaker": 2, "line": "The potential implications are fascinating and concerning at the same time."}
+            ]
         }
         if language == "Auto Detect":
+            language_instruction = "Use the same language as the input text"
         else:
+            language_instruction = f"Generate the podcast in {language} language"
+        # Simplified, more direct prompt
+        system_prompt = f"""Generate a podcast script as valid JSON. {language_instruction}.
+Requirements:
+- Exactly 2 speakers (speaker 1 and 2)
+- Natural, engaging conversation
+- JSON format only
+Example format:
+{json.dumps(example_json, indent=2)}
+Input topic: {prompt}
+Generate JSON:"""
         try:
             if progress:
                 progress(0.3, "Generating podcast script...")
+            # Tokenize with proper attention mask
+            inputs = self.tokenizer(
+                system_prompt,
+                return_tensors="pt",
+                padding=True,
+                truncation=True,
+                max_length=2048
+            )
+            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+            # Generate with timeout
+            with torch.no_grad():
+                output = self.model.generate(
+                    **inputs,
+                    generation_config=self.generation_config,
+                    pad_token_id=self.tokenizer.pad_token_id,
+                )
+            # Decode only the new tokens
+            generated_text = self.tokenizer.decode(
+                output[0][inputs['input_ids'].shape[1]:],
+                skip_special_tokens=True
+            )
+            print(f"Generated text: {generated_text[:500]}...")
+            if progress:
+                progress(0.4, "Processing generated script...")
+            # Extract JSON from the generated text
+            result = self.extract_json_from_text(generated_text)
+            if progress:
+                progress(0.5, "Script generated successfully!")
+            return result
+        except Exception as e:
+            print(f"Generation error: {e}")
+            # Return fallback podcast
+            return {
+                "topic": prompt or "Discussion",
+                "podcast": [
+                    {"speaker": 1, "line": f"Welcome to our discussion about {prompt or 'this topic'}."},
+                    {"speaker": 2, "line": "Thanks for having me. This is indeed an interesting subject."},
+                    {"speaker": 1, "line": "Let's dive into the key points and explore different perspectives."},
+                    {"speaker": 2, "line": "Absolutely. There's a lot to unpack here."},
+                    {"speaker": 1, "line": "What aspects do you find most compelling?"},
+                    {"speaker": 2, "line": "The implications and potential applications are fascinating."},
+                    {"speaker": 1, "line": "That's a great point. Thanks for the insightful discussion."},
+                    {"speaker": 2, "line": "Thank you. This has been a valuable conversation."}
+                ]
+            }
     async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
+        """Generate TTS audio with improved error handling"""
         voice = speaker1 if speaker == 1 else speaker2
         speech = edge_tts.Communicate(text, voice)
+        temp_filename = f"temp_audio_{uuid.uuid4()}.wav"
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                await asyncio.wait_for(speech.save(temp_filename), timeout=30)
+                if os.path.exists(temp_filename) and os.path.getsize(temp_filename) > 0:
+                    return temp_filename
+                else:
+                    raise Exception("Generated audio file is empty")
+            except asyncio.TimeoutError:
+                if os.path.exists(temp_filename):
+                    os.remove(temp_filename)
+                if attempt == max_retries - 1:
+                    raise Exception("TTS generation timed out after multiple attempts")
+                await asyncio.sleep(1)  # Brief delay before retry
+            except Exception as e:
+                if os.path.exists(temp_filename):
+                    os.remove(temp_filename)
+                if attempt == max_retries - 1:
+                    raise Exception(f"TTS generation failed: {str(e)}")
+                await asyncio.sleep(1)
     async def combine_audio_files(self, audio_files: List[str], progress=None) -> str:
+        """Combine audio files with silence padding"""
         if progress:
             progress(0.9, "Combining audio files...")
+        try:
+            combined_audio = AudioSegment.empty()
+            silence_padding = AudioSegment.silent(duration=500)  # 500ms silence
+            for i, audio_file in enumerate(audio_files):
+                try:
+                    audio_segment = AudioSegment.from_file(audio_file)
+                    combined_audio += audio_segment
+                    # Add silence between speakers (except for the last file)
+                    if i < len(audio_files) - 1:
+                        combined_audio += silence_padding
+                except Exception as e:
+                    print(f"Warning: Could not process audio file {audio_file}: {e}")
+                finally:
+                    # Clean up temporary file
+                    if os.path.exists(audio_file):
+                        os.remove(audio_file)
+            if len(combined_audio) == 0:
+                raise Exception("No audio content generated")
+            output_filename = f"podcast_output_{uuid.uuid4()}.wav"
+            combined_audio.export(output_filename, format="wav")
+            if progress:
+                progress(1.0, "Podcast generated successfully!")
+            return output_filename
+        except Exception as e:
+            # Clean up any remaining temp files
+            for audio_file in audio_files:
+                if os.path.exists(audio_file):
+                    os.remove(audio_file)
+            raise Exception(f"Audio combination failed: {str(e)}")
     async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, file_obj=None, progress=None) -> str:
+        """Main podcast generation pipeline with improved error handling"""
         try:
             if progress:
                 progress(0.1, "Starting podcast generation...")
+            # Generate script
+            podcast_json = await self.generate_script(input_text, language, file_obj, progress)
+            if not podcast_json.get('podcast'):
+                raise Exception("No podcast content generated")
+            if progress:
+                progress(0.5, "Converting text to speech...")
+            # Generate TTS with sequential processing to avoid overload
+            audio_files = []
+            total_lines = len(podcast_json['podcast'])
+            for i, item in enumerate(podcast_json['podcast']):
+                try:
+                    audio_file = await self.tts_generate(
+                        item['line'],
+                        item['speaker'],
+                        speaker1,
+                        speaker2
+                    )
+                    audio_files.append(audio_file)
+                    # Update progress
+                    if progress:
+                        current_progress = 0.5 + (0.4 * (i + 1) / total_lines)
+                        progress(current_progress, f"Generated speech {i + 1}/{total_lines}")
+                except Exception as e:
+                    print(f"TTS error for line {i}: {e}")
+                    # Continue with remaining lines
+                    continue
+            if not audio_files:
+                raise Exception("No audio files generated successfully")
+            # Combine audio files
+            combined_audio = await self.combine_audio_files(audio_files, progress)
+            return combined_audio
+        except Exception as e:
+            raise Exception(f"Podcast generation failed: {str(e)}")
# Voice mapping: label shown in the UI dropdowns -> edge-tts voice id.
# Insertion order is the order in which the voices are listed in the UI.
VOICE_MAPPING = {
    # English (United States)
    "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
    "Ava - English (United States)": "en-US-AvaMultilingualNeural",
    "Brian - English (United States)": "en-US-BrianMultilingualNeural",
    "Emma - English (United States)": "en-US-EmmaMultilingualNeural",
    # German (Germany)
    "Florian - German (Germany)": "de-DE-FlorianMultilingualNeural",
    "Seraphina - German (Germany)": "de-DE-SeraphinaMultilingualNeural",
    # French (France)
    "Remy - French (France)": "fr-FR-RemyMultilingualNeural",
    "Vivienne - French (France)": "fr-FR-VivienneMultilingualNeural",
}
async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str, progress=None) -> str:
    """Validate the request, generate the podcast and return the audio path.

    Args:
        input_text: Topic or source text; may be empty when a file is given.
        input_file: Uploaded file, or ``None``. It is passed through to the
            generator; no file content is read here.
        language: ``"Auto Detect"`` or a language name.
        speaker1: UI label of the first voice (looked up in VOICE_MAPPING).
        speaker2: UI label of the second voice (looked up in VOICE_MAPPING).
        progress: Optional ``progress(fraction, message)`` callback.

    Returns:
        Path of the generated audio file.

    Raises:
        Exception: ``"Generation failed: ..."`` wrapping any failure, with the
            original exception attached as its cause.
    """
    start_time = time.time()
    try:
        if progress:
            progress(0.05, "Processing input...")
        # Validate input before doing any other work
        has_text = bool(input_text and input_text.strip())
        if not has_text and input_file is None:
            raise Exception("Please provide either text input or upload a file")
        # TODO: Add file processing logic here if needed
        # Map speaker names to voice IDs (unknown labels fall back to defaults)
        speaker1_voice = VOICE_MAPPING.get(speaker1, "en-US-AndrewMultilingualNeural")
        speaker2_voice = VOICE_MAPPING.get(speaker2, "en-US-AvaMultilingualNeural")
        podcast_generator = PodcastGenerator()
        result = await podcast_generator.generate_podcast(
            input_text, language, speaker1_voice, speaker2_voice, input_file, progress
        )
        end_time = time.time()
        print(f"Total generation time: {end_time - start_time:.2f} seconds")
        return result
    except Exception as e:
        error_msg = str(e)
        print(f"Processing error: {error_msg}")
        # Keep the original exception as the cause for debugging.
        raise Exception(f"Generation failed: {error_msg}") from e
 def generate_podcast_gradio(input_text, input_file, language, speaker1, speaker2):
+    """Gradio interface function with proper error handling"""
+    try:
+        # Validate inputs
+        if not input_text and input_file is None:
+            return None
+        if input_text and len(input_text.strip()) == 0:
+            input_text = None
+        # Create a simple progress tracker
+        progress_history = []
+        def progress_callback(value, text):
+            progress_history.append(f"{value:.1%}: {text}")
+            print(f"Progress: {value:.1%} - {text}")
+        # Run the async function
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        try:
+            result = loop.run_until_complete(
+                process_input(input_text, input_file, language, speaker1, speaker2, progress_callback)
+            )
+            return result
+        finally:
+            loop.close()
+    except Exception as e:
+        print(f"Gradio function error: {e}")
+        raise gr.Error(f"Failed to generate podcast: {str(e)}")
def create_interface():
    """Build the Gradio Blocks UI and return it without launching.

    The layout has a text box and file upload, language and voice dropdowns,
    a generate button wired to ``generate_podcast_gradio``, an audio output
    and a collapsed usage-instructions section.

    Returns:
        The configured ``gr.Blocks`` instance.
    """
    language_options = [
        "Auto Detect", "English", "German", "French", "Spanish", "Italian",
        "Portuguese", "Dutch", "Russian", "Chinese", "Japanese", "Korean"
    ]
    voice_options = list(VOICE_MAPPING.keys())
    with gr.Blocks(
        title="PodcastGen 2🎙️",
        theme=gr.themes.Soft(),
        css=".gradio-container {max-width: 1200px; margin: auto;}"
    ) as demo:
        gr.Markdown("# 🎙️ PodcastGen 2")
        gr.Markdown("Generate professional 2-speaker podcasts from text input!")
        with gr.Row():
            with gr.Column(scale=2):
                input_text = gr.Textbox(
                    label="Input Text",
                    lines=8,
                    placeholder="Enter your topic or text for podcast generation...",
                    info="Describe what you want the podcast to discuss"
                )
            with gr.Column(scale=1):
                # The size hint lives in the label: the File component does
                # not accept an `info` keyword on every Gradio version.
                input_file = gr.File(
                    label=f"Upload File (Optional, max size: {MAX_FILE_SIZE_MB}MB)",
                    file_types=[".pdf", ".txt"]
                )
        with gr.Row():
            language = gr.Dropdown(
                label="Language",
                choices=language_options,
                value="Auto Detect",
                info="Select output language"
            )
            speaker1 = gr.Dropdown(
                label="Speaker 1 Voice",
                choices=voice_options,
                value="Andrew - English (United States)"
            )
            speaker2 = gr.Dropdown(
                label="Speaker 2 Voice",
                choices=voice_options,
                value="Ava - English (United States)"
            )
        generate_btn = gr.Button(
            "🎙️ Generate Podcast",
            variant="primary",
            size="lg"
        )
        output_audio = gr.Audio(
            label="Generated Podcast",
            type="filepath",
            format="wav",
            show_download_button=True
        )
        # Connect the interface
        generate_btn.click(
            fn=generate_podcast_gradio,
            inputs=[input_text, input_file, language, speaker1, speaker2],
            outputs=[output_audio],
            show_progress=True
        )
        # Add usage instructions
        with gr.Accordion("Usage Instructions", open=False):
            gr.Markdown("""
            ### How to use:
            1. **Input**: Enter your topic or text in the text box, or upload a PDF/TXT file
            2. **Language**: Choose the output language (Auto Detect recommended)
            3. **Voices**: Select different voices for Speaker 1 and Speaker 2
            4. **Generate**: Click the button and wait for processing
            ### Tips:
            - Provide clear, specific topics for better results
            - The AI will create a natural conversation between two speakers
            - Generation may take 1-3 minutes depending on text length
            """)
    return demo
 if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True,
+        share=False
+    )