Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -22,12 +22,12 @@ import traceback # For detailed error logging
# Configure logging
logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s'
)
logger = logging.getLogger(__name__)

-logger.info("--- Starting App ---")

# Login to Hugging Face Hub if token is available
HUGGINGFACE_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')
@@ -40,7 +40,7 @@ if HUGGINGFACE_TOKEN:
        logger.error(f"Failed to login to Hugging Face Hub: {e}")
        logger.error(traceback.format_exc())
else:
-    logger.warning("HUGGINGFACE_TOKEN environment variable not set.")


class ModelManager:
@@ -54,7 +54,7 @@ class ModelManager:
        return cls._instance

    def __init__(self):
-        if not hasattr(self, '_initialized') or not self._initialized:
            logger.info("Initializing ModelManager attributes.")
            self.tokenizer = None
            self.model = None
@@ -65,10 +65,9 @@ class ModelManager:
            self.last_used = time.time()
            self.llm_loading = False
            self.whisper_loading = False
-            self._initialized = True

    def _cleanup_memory(self):
-        """Utility function to force memory cleanup"""
        logger.info("Running garbage collection...")
        collected_count = gc.collect()
        logger.info(f"Garbage collected ({collected_count} objects).")
@@ -78,500 +77,262 @@ class ModelManager:
        logger.info("CUDA cache cleared.")

    def reset_llm(self):
-        [removed here: the docstrings, per-step logger.info/logger.debug calls, optional
-         "free the other model first" comments, finer-grained except clauses and multi-line
-         statements that made up the previous bodies of reset_llm, reset_whisper,
-         initialize_llm, initialize_whisper, check_llm_initialized, check_whisper_initialized,
-         reset_models, download_social_media_video, convert_video_to_audio, preprocess_audio,
-         transcribe_audio_or_video, read_document, read_url and process_social_media_url;
-         this update condenses them into the one-line forms shown in the added lines further down]

# ==============================================================
# ========= SIMPLIFIED generate_news FOR DEBUGGING =============
@@ -613,7 +374,8 @@ def generate_news(instructions, facts, size, tone, *args):
# ==============================================================

-# --- create_demo function
def create_demo():
    """Creates the Gradio interface"""
    logger.info("--- Creating Gradio interface ---")
@@ -623,55 +385,46 @@ def create_demo():
    all_inputs = []
    with gr.Row():
        with gr.Column(scale=2):
-            [removed here: per-widget logger.info("Creating ... input.") calls and the previous
-             gr.File(...) constructors for the document and audio/video uploads, which passed
-             file_types; the unchanged layout code reappears in the added lines further down]
                with gr.TabItem("π± Social Media"):
                    gr.Markdown("Add URLs to social media posts... Max 3.")
                    social_inputs = []
                    for i in range(1, 4):
@@ -683,26 +436,17 @@ def create_demo():
                        social_context_textbox = gr.Textbox(label=f"Context", placeholder="Context...")
                        social_inputs.extend([social_url_textbox, social_name_textbox, social_context_textbox])
                    all_inputs.extend(social_inputs)
-                    logger.info(f"{len(social_inputs)} social media inputs created.")

-    logger.info("Creating generate and clear buttons.")
-    generate_button = gr.Button("β¨ Generate News Article", variant="primary")
-    clear_button = gr.Button("π Clear All Inputs")
    with gr.Tabs():
        with gr.TabItem("π Generated News Article"):
-            logger.info("Creating news output textbox.")
            news_output = gr.Textbox(label="Draft News Article", lines=20, show_copy_button=True, interactive=False)
        with gr.TabItem("ποΈ Source Transcriptions & Logs"):
-            logger.info("Creating transcriptions/log output textbox.")
            transcriptions_output = gr.Textbox(label="Transcriptions and Processing Log", lines=15, show_copy_button=True, interactive=False)

    outputs_list = [news_output, transcriptions_output]
-    logger.info("Setting up event handlers.")
-    # Make sure the button calls the generate_news function (even though it is simplified for now)
    generate_button.click(fn=generate_news, inputs=all_inputs, outputs=outputs_list)
-    logger.info("Generate button click handler set.")

    def clear_all_inputs_and_outputs():
        logger.info("--- Clear All button clicked ---")
@@ -713,31 +457,21 @@ def create_demo():
            elif isinstance(input_comp, gr.File): reset_values.append(None)
            else: reset_values.append(None)
        reset_values.extend(["", ""])
-        try:
-            logger.info("Calling model reset from clear button handler.")
-            model_manager.reset_models(force=True)
-        except Exception as e:
-            logger.error(f"Error resetting models during clear operation: {e}")
-            logger.error(traceback.format_exc())
        logger.info("--- Clear All operation finished ---")
        return reset_values

    clear_button.click(fn=clear_all_inputs_and_outputs, inputs=None, outputs=all_inputs + outputs_list)

-    logger.info("--- Gradio interface creation complete ---")
    return demo


# --- main execution block remains the same ---
if __name__ == "__main__":
    logger.info("--- Running main execution block ---")
-    logger.info("Creating Gradio demo instance...")
    news_demo = create_demo()
-    logger.info("Gradio demo instance created.")
-    logger.info("Configuring Gradio queue...")
    news_demo.queue()
-    logger.info("Gradio queue configured.")
    logger.info("Launching Gradio interface...")
    try:
        news_demo.launch(server_name="0.0.0.0", server_port=7860)
@@ -745,4 +479,4 @@ if __name__ == "__main__":
    except Exception as launch_err:
        logger.error(f"!!! CRITICAL Error during Gradio launch: {launch_err}")
        logger.error(traceback.format_exc())
-    logger.info("--- Main execution block potentially finished ---")

# Configure logging
logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s'
)
logger = logging.getLogger(__name__)

+logger.info("--- Starting App ---")

# Login to Hugging Face Hub if token is available
HUGGINGFACE_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')

        logger.error(f"Failed to login to Hugging Face Hub: {e}")
        logger.error(traceback.format_exc())
else:
+    logger.warning("HUGGINGFACE_TOKEN environment variable not set.")


class ModelManager:

        return cls._instance

    def __init__(self):
+        if not hasattr(self, '_initialized') or not self._initialized:
            logger.info("Initializing ModelManager attributes.")
            self.tokenizer = None
            self.model = None

            self.last_used = time.time()
            self.llm_loading = False
            self.whisper_loading = False
+            self._initialized = True

    def _cleanup_memory(self):
        logger.info("Running garbage collection...")
        collected_count = gc.collect()
        logger.info(f"Garbage collected ({collected_count} objects).")

        logger.info("CUDA cache cleared.")

    def reset_llm(self):
        logger.info("--- Attempting to reset LLM ---")
        try:
+            if hasattr(self, 'model') and self.model is not None: del self.model; logger.info("LLM model deleted.")
+            if hasattr(self, 'tokenizer') and self.tokenizer is not None: del self.tokenizer; logger.info("LLM tokenizer deleted.")
+            if hasattr(self, 'text_pipeline') and self.text_pipeline is not None: del self.text_pipeline; logger.info("LLM pipeline deleted.")
+            self.model = None; self.tokenizer = None; self.text_pipeline = None
+            self.llm_loaded = False
            self._cleanup_memory()
            logger.info("LLM components reset successfully.")
+        except Exception as e: logger.error(f"!!! ERROR during LLM reset: {e}"); logger.error(traceback.format_exc())

    def reset_whisper(self):
        logger.info("--- Attempting to reset Whisper ---")
        try:
+            if hasattr(self, 'whisper_model') and self.whisper_model is not None: del self.whisper_model; logger.info("Whisper model deleted.")
            self.whisper_model = None
+            self.whisper_loaded = False
            self._cleanup_memory()
            logger.info("Whisper component reset successfully.")
+        except Exception as e: logger.error(f"!!! ERROR during Whisper reset: {e}"); logger.error(traceback.format_exc())

    @spaces.GPU(duration=120)
    def initialize_llm(self):
        logger.info("Attempting to initialize LLM.")
+        if self.llm_loading: logger.info("LLM initialization already in progress."); return True
+        if self.llm_loaded: logger.info("LLM already initialized."); self.last_used = time.time(); return True
        self.llm_loading = True
        logger.info("Starting LLM initialization...")
        try:
            MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
            logger.info(f"Using LLM model: {MODEL_NAME}")
            self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HUGGINGFACE_TOKEN, use_fast=True)
+            if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token
+            self.model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, token=HUGGINGFACE_TOKEN, device_map="auto", torch_dtype=torch.float16, low_cpu_mem_usage=True, offload_folder="offload", offload_state_dict=True)
+            self.text_pipeline = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer, torch_dtype=torch.float16, device_map="auto", max_length=1024)
            logger.info("LLM initialized successfully.")
+            self.last_used = time.time(); self.llm_loaded = True; self.llm_loading = False; return True
+        except Exception as e: logger.error(f"!!! ERROR during LLM initialization: {e}"); logger.error(traceback.format_exc()); self.reset_llm(); self.llm_loading = False; raise

    @spaces.GPU(duration=120)
    def initialize_whisper(self):
        logger.info("Attempting to initialize Whisper.")
+        if self.whisper_loading: logger.info("Whisper initialization already in progress."); return True
+        if self.whisper_loaded: logger.info("Whisper already initialized."); self.last_used = time.time(); return True
        self.whisper_loading = True
        logger.info("Starting Whisper initialization...")
        try:
            WHISPER_MODEL_NAME = "tiny"
+            self.whisper_model = whisper.load_model(WHISPER_MODEL_NAME, device="cuda" if torch.cuda.is_available() else "cpu", download_root="/tmp/whisper")
            logger.info(f"Whisper model '{WHISPER_MODEL_NAME}' loaded successfully.")
+            self.last_used = time.time(); self.whisper_loaded = True; self.whisper_loading = False; return True
+        except Exception as e: logger.error(f"!!! ERROR during Whisper initialization: {e}"); logger.error(traceback.format_exc()); self.reset_whisper(); self.whisper_loading = False; raise

    def check_llm_initialized(self):
        logger.info("Checking if LLM is initialized.")
        if not self.llm_loaded:
            logger.info("LLM not initialized, attempting initialization...")
+            if not self.llm_loading: self.initialize_llm(); logger.info("LLM initialization completed by check_llm_initialized.")
            else:
+                logger.info("LLM initialization already in progress. Waiting briefly.")
                time.sleep(10)
+                if not self.llm_loaded: raise RuntimeError("LLM initialization timed out or failed after waiting.")
+                else: logger.info("LLM seems initialized now after waiting.")
+        else: logger.info("LLM was already initialized.")
        self.last_used = time.time()

    def check_whisper_initialized(self):
        logger.info("Checking if Whisper is initialized.")
        if not self.whisper_loaded:
            logger.info("Whisper model not initialized, attempting initialization...")
+            if not self.whisper_loading: self.initialize_whisper(); logger.info("Whisper initialization completed by check_whisper_initialized.")
            else:
+                logger.info("Whisper initialization already in progress. Waiting briefly.")
                time.sleep(10)
+                if not self.whisper_loaded: raise RuntimeError("Whisper initialization timed out or failed after waiting.")
+                else: logger.info("Whisper seems initialized now after waiting.")
+        else: logger.info("Whisper was already initialized.")
        self.last_used = time.time()

    def reset_models(self, force=False):
+        if force: logger.info("Forcing reset of all models."); self.reset_llm(); self.reset_whisper()

# Create global model manager instance
logger.info("Creating global ModelManager instance.")
model_manager = ModelManager()
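# The hunks above rely on a singleton accessor (cls._instance / self._initialized) that sits
# outside the changed lines. A minimal sketch of the pattern they imply is shown below; the
# class name is hypothetical and this is an illustration, not the exact code in app.py:
class ModelManagerSingletonSketch:
    _instance = None

    def __new__(cls):
        # Reuse one shared instance so the models are loaded at most once per process.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance._initialized = False
        return cls._instance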
# --- Functions: download_social_media_video, convert_video_to_audio, etc. ---
+# --- Kept exactly the same as the previous full version ---
@lru_cache(maxsize=16)
def download_social_media_video(url):
+    logger.info(f"Attempting social download: {url}")
    temp_dir = tempfile.mkdtemp()
    output_template = os.path.join(temp_dir, '%(id)s.%(ext)s')
    final_audio_file_path = None
+    ydl_opts = {'format': 'bestaudio/best', 'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3', 'preferredquality': '192'}], 'outtmpl': output_template, 'quiet': True, 'no_warnings': True, 'nocheckcertificate': True, 'retries': 3, 'socket_timeout': 15, 'cachedir': False}
    try:
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl: info_dict = ydl.extract_info(url, download=True)
+        found_files = [f for f in os.listdir(temp_dir) if f.endswith('.mp3')]
+        if not found_files: raise FileNotFoundError(f"Downloaded MP3 not found in {temp_dir}")
+        final_audio_file_path = os.path.join(temp_dir, found_files[0])
        with open(final_audio_file_path, 'rb') as f: audio_content = f.read()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_output_file:
+            temp_output_file.write(audio_content); final_path_for_gradio = temp_output_file.name
+        logger.info(f"Social audio saved to: {final_path_for_gradio}")
        return final_path_for_gradio
+    except yt_dlp.utils.DownloadError as e: logger.error(f"yt-dlp error {url}: {e}"); return None
+    except Exception as e: logger.error(f"Download error {url}: {e}"); logger.error(traceback.format_exc()); return None
    finally:
        if os.path.exists(temp_dir):
+            try: import shutil; shutil.rmtree(temp_dir)
+            except Exception as cleanup_e: logger.warning(f"Cleanup failed {temp_dir}: {cleanup_e}")

def convert_video_to_audio(video_file_path):
+    logger.info(f"Converting video: {video_file_path}")
    output_file_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file: output_file_path = temp_file.name
        command = ["ffmpeg", "-i", video_file_path, "-vn", "-acodec", "libmp3lame", "-ab", "192k", "-ar", "44100", "-ac", "2", output_file_path, "-y", "-loglevel", "error"]
+        subprocess.run(command, check=True, capture_output=True, text=True, timeout=120)
+        if not os.path.exists(output_file_path) or os.path.getsize(output_file_path) == 0: raise RuntimeError("ffmpeg output empty")
+        logger.info(f"Video converted to: {output_file_path}")
        return output_file_path
+    except subprocess.CalledProcessError as e: logger.error(f"ffmpeg fail {video_file_path}: {e.stderr}"); raise RuntimeError(f"ffmpeg failed: {e.stderr}") from e
+    except subprocess.TimeoutExpired as e: logger.error(f"ffmpeg timeout {video_file_path}"); raise RuntimeError("ffmpeg timed out") from e
+    except Exception as e: logger.error(f"Video conversion error {video_file_path}: {e}"); logger.error(traceback.format_exc()); raise
+    finally:
+        if output_file_path and os.path.exists(output_file_path) and ( 'e' in locals() or (not os.path.exists(output_file_path) or os.path.getsize(output_file_path) == 0)):
            try: os.remove(output_file_path)
            except: pass

def preprocess_audio(input_audio_path):
+    logger.info(f"Preprocessing audio: {input_audio_path}")
    output_path = None
    try:
+        if not os.path.exists(input_audio_path): raise FileNotFoundError(f"Preprocessing input not found: {input_audio_path}")
        audio = AudioSegment.from_file(input_audio_path)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
+            output_path = temp_file.name; audio.export(output_path, format="mp3")
+        logger.info(f"Audio preprocessed to: {output_path}")
        return output_path
+    except FileNotFoundError as e: logger.error(f"Preprocessing file not found: {e}"); raise
+    except Exception as e: logger.error(f"Preprocessing error {input_audio_path}: {e}"); logger.error(traceback.format_exc()); raise
+    finally:
+        if 'e' in locals() and output_path and os.path.exists(output_path):
+            try: os.remove(output_path)
+            except: pass

@spaces.GPU(duration=300)
def transcribe_audio_or_video(file_input):
+    logger.info(f"--- Starting transcription: {type(file_input)} ---")
+    audio_file_to_transcribe = None; temp_files_to_clean = []; transcription = ""
    try:
+        logger.info("Checking Whisper model..."); model_manager.check_whisper_initialized()
        if file_input is None: return ""
+        if isinstance(file_input, str): input_path = file_input
+        elif hasattr(file_input, 'name') and file_input.name: input_path = file_input.name
+        else: raise TypeError("Invalid input type.")
+        if not os.path.exists(input_path): raise FileNotFoundError(f"Input not found: {input_path}")
        file_extension = os.path.splitext(input_path)[1].lower()
        if file_extension in ['.mp4', '.avi', '.mov', '.mkv', '.webm']:
            converted_audio_path = convert_video_to_audio(input_path)
            temp_files_to_clean.append(converted_audio_path); audio_file_to_process = converted_audio_path
+        elif file_extension in ['.mp3', '.wav', '.ogg', '.flac', '.m4a', '.aac']: audio_file_to_process = input_path
+        else: raise ValueError(f"Unsupported type: {file_extension}")
        try:
            preprocessed_audio_path = preprocess_audio(audio_file_to_process)
            if preprocessed_audio_path != audio_file_to_process: temp_files_to_clean.append(preprocessed_audio_path)
            audio_file_to_transcribe = preprocessed_audio_path
+        except Exception as preprocess_err: logger.warning(f"Preprocessing failed ({preprocess_err}), using original."); audio_file_to_transcribe = audio_file_to_process
+        if not os.path.exists(audio_file_to_transcribe): raise FileNotFoundError(f"File to transcribe lost: {audio_file_to_transcribe}")
+        logger.info(f"Transcribing: {audio_file_to_transcribe}")
        with torch.inference_mode():
+            use_fp16 = torch.cuda.is_available()
            result = model_manager.whisper_model.transcribe(audio_file_to_transcribe, fp16=use_fp16)
+            if not result or "text" not in result: raise RuntimeError("Transcription empty result")
+            transcription = result.get("text", "")
+            logger.info(f"Transcription success: '{transcription[:100]}...'")
+    except Exception as e: logger.error(f"!!! Transcription failed: {e}"); logger.error(traceback.format_exc()); transcription = f"Error during transcription: {e}"
    finally:
+        logger.debug(f"--- Cleaning {len(temp_files_to_clean)} temp transcription files ---")
        for temp_file in temp_files_to_clean:
            try:
+                if os.path.exists(temp_file): os.remove(temp_file)
+            except Exception as e: logger.warning(f"Cleanup failed {temp_file}: {e}")
    return transcription
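# A quick way to exercise the transcription helper outside Gradio (hypothetical smoke test,
# not part of this commit; the environment variable and sample path are assumptions):
if os.environ.get("RUN_TRANSCRIBE_SMOKE_TEST"):
    sample_path = "/tmp/sample_interview.mp3"  # any supported audio/video file
    print(transcribe_audio_or_video(sample_path)[:200])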
@lru_cache(maxsize=16)
def read_document(document_path):
+    logger.info(f"Reading document: {document_path}")
    try:
+        if not os.path.exists(document_path): raise FileNotFoundError(f"Doc not found: {document_path}")
+        ext = os.path.splitext(document_path)[1].lower(); logger.debug(f"Doc type: {ext}")
        content = ""
+        if ext == ".pdf":
            doc = fitz.open(document_path)
+            if doc.is_encrypted and not doc.authenticate(""): raise ValueError("Encrypted PDF")
            content = "\n".join([page.get_text() for page in doc]); doc.close()
+        elif ext == ".docx": doc = docx.Document(document_path); content = "\n".join([p.text for p in doc.paragraphs])
+        elif ext in (".xlsx", ".xls"):
+            xls = pd.ExcelFile(document_path); parts = []
+            for sheet in xls.sheet_names: df = pd.read_excel(xls, sheet_name=sheet); parts.append(f"--- {sheet} ---\n{df.to_string()}")
+            content = "\n\n".join(parts).strip()
+        elif ext == ".csv":
            try:
+                with open(document_path, 'rb') as f: import chardet; enc = chardet.detect(f.read())['encoding']
+                df = pd.read_csv(document_path, encoding=enc)
+            except Exception as e1:
+                logger.warning(f"CSV parse failed ({e1}), trying alternatives...")
+                try: df = pd.read_csv(document_path, sep=';', encoding=enc)
+                except Exception as e2: df = pd.read_csv(document_path, encoding='latin1') # Last resort
            content = df.to_string()
+        else: return "Unsupported file type."
+        logger.info(f"Doc read success. Length: {len(content)}")
        return content
+    except Exception as e: logger.error(f"!!! Read doc error: {e}"); logger.error(traceback.format_exc()); return f"Error reading document: {e}"

@lru_cache(maxsize=16)
def read_url(url):
+    logger.info(f"Reading URL: {url}")
    if not url or not url.strip().startswith('http'): return ""
    try:
+        headers = {'User-Agent': 'Mozilla/5.0 ...', 'Accept': 'text/html...', 'Accept-Language': 'en-US,en;q=0.9', 'Connection': 'keep-alive'}
        response = requests.get(url, headers=headers, timeout=20, allow_redirects=True)
        response.raise_for_status()
+        ct = response.headers.get('content-type', '').lower()
+        if not ('html' in ct or 'text' in ct): return f"Error: Non-text content type: {ct}"
+        enc = response.encoding if response.encoding else response.apparent_encoding
+        html = response.content.decode(enc or 'utf-8', errors='ignore')
+        soup = BeautifulSoup(html, 'html.parser')
+        for tag in soup(["script", "style", "meta", "noscript", "iframe", "header", "footer", "nav", "aside", "form", "button", "link", "head"]): tag.extract()
+        main = (soup.find("main") or soup.find("article") or soup.find("div", class_=["content", "main", "post-content", "entry-content", "article-body", "story-content"]) or soup.find("div", id=["content", "main", "article", "story"]))
+        text = main.get_text(separator='\n', strip=True) if main else soup.body.get_text(separator='\n', strip=True) if soup.body else soup.get_text(separator='\n', strip=True)
+        lines = [line.strip() for line in text.split('\n') if line.strip()]; cleaned = "\n".join(lines)
+        if not cleaned: return "Error: Could not extract text."
+        max_c = 15000; final = (cleaned[:max_c] + "... [truncated]") if len(cleaned) > max_c else cleaned
+        logger.info(f"URL read success. Length: {len(final)}")
+        return final
+    except Exception as e: logger.error(f"!!! Read URL error: {e}"); logger.error(traceback.format_exc()); return f"Error reading URL: {e}"

def process_social_media_url(url):
+    logger.info(f"--- Processing social URL: {url} ---")
    if not url or not url.strip().startswith('http'): return None
+    text = None; video = None; audio_file = None
+    try: text_res = read_url(url); text = text_res if text_res and not text_res.startswith("Error:") else None
+    except Exception as e: logger.error(f"Social text read error: {e}")
    try:
+        audio_file = download_social_media_video(url)
+        if audio_file: video_res = transcribe_audio_or_video(audio_file); video = video_res if video_res and not video_res.startswith("Error:") else None
+    except Exception as e: logger.error(f"Social audio proc error: {e}")
    finally:
+        if audio_file and os.path.exists(audio_file):
+            try: os.remove(audio_file)
+            except Exception as e: logger.warning(f"Social cleanup fail {audio_file}: {e}")
+        logger.debug(f"--- Finished social URL: {url} ---")
+    if text or video: return {"text": text or "", "video": video or ""}
+    else: return None
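# For reference, the dict returned by process_social_media_url could be folded into the
# transcriptions log roughly like this (hypothetical helper, not present in this commit):
def format_social_source_sketch(url, name="", context=""):
    data = process_social_media_url(url)
    if not data:
        return f"Social media source {url}: no usable content.\n"
    return (f"Social media source {url} ({name}, {context}):\n"
            f"Page text: {data['text'][:500]}\n"
            f"Video transcription: {data['video'][:500]}\n")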
# ==============================================================
# ========= SIMPLIFIED generate_news FOR DEBUGGING =============
# ==============================================================
|
376 |
|
377 |
+
# --- create_demo function ---
|
378 |
+
# --- MODIFIED: Removed file_types from gr.File ---
|
379 |
def create_demo():
|
380 |
"""Creates the Gradio interface"""
|
381 |
logger.info("--- Creating Gradio interface ---")
|
|
|
385 |
all_inputs = []
|
386 |
with gr.Row():
|
387 |
with gr.Column(scale=2):
|
|
|
388 |
instructions = gr.Textbox(label="Instructions for the News Article", placeholder="Enter specific instructions...", lines=2)
|
389 |
all_inputs.append(instructions)
|
|
|
390 |
facts = gr.Textbox(label="Main Facts", placeholder="Describe the most important facts...", lines=4)
|
391 |
all_inputs.append(facts)
|
392 |
with gr.Row():
|
|
|
393 |
size_slider = gr.Slider(label="Approximate Length (words)", minimum=100, maximum=700, value=250, step=50)
|
394 |
all_inputs.append(size_slider)
|
|
|
395 |
tone_dropdown = gr.Dropdown(label="Tone of the News Article", choices=["neutral", "serious", "formal", "urgent", "investigative", "human-interest", "lighthearted"], value="neutral")
|
396 |
all_inputs.append(tone_dropdown)
|
397 |
with gr.Column(scale=3):
|
398 |
with gr.Tabs():
|
399 |
with gr.TabItem("π Documents"):
|
|
|
400 |
gr.Markdown("Upload relevant documents (PDF, DOCX, XLSX, CSV). Max 5.")
|
401 |
doc_inputs = []
|
402 |
for i in range(1, 6):
|
403 |
+
# *** CHANGED: Removed file_types ***
|
404 |
+
doc_file = gr.File(label=f"Document {i}", file_count="single")
|
405 |
doc_inputs.append(doc_file)
|
406 |
all_inputs.extend(doc_inputs)
|
|
|
407 |
with gr.TabItem("π Audio/Video"):
|
|
|
408 |
gr.Markdown("Upload audio or video files... Max 5 sources.")
|
409 |
audio_video_inputs = []
|
410 |
for i in range(1, 6):
|
411 |
with gr.Group():
|
412 |
gr.Markdown(f"**Source {i}**")
|
413 |
+
# *** CHANGED: Removed file_types ***
|
414 |
+
audio_file = gr.File(label=f"Audio/Video File {i}")
|
415 |
with gr.Row():
|
416 |
speaker_name = gr.Textbox(label="Speaker Name", placeholder="Name...")
|
417 |
speaker_role = gr.Textbox(label="Role/Position", placeholder="Role...")
|
418 |
audio_video_inputs.extend([audio_file, speaker_name, speaker_role])
|
419 |
all_inputs.extend(audio_video_inputs)
|
|
|
420 |
with gr.TabItem("π URLs"):
|
|
|
421 |
gr.Markdown("Add URLs to relevant web pages... Max 5.")
|
422 |
url_inputs = []
|
423 |
for i in range(1, 6):
|
424 |
url_textbox = gr.Textbox(label=f"URL {i}", placeholder="https://...")
|
425 |
url_inputs.append(url_textbox)
|
426 |
all_inputs.extend(url_inputs)
|
|
|
427 |
with gr.TabItem("π± Social Media"):
|
|
|
428 |
gr.Markdown("Add URLs to social media posts... Max 3.")
|
429 |
social_inputs = []
|
430 |
for i in range(1, 4):
|
|
|
436 |
social_context_textbox = gr.Textbox(label=f"Context", placeholder="Context...")
|
437 |
social_inputs.extend([social_url_textbox, social_name_textbox, social_context_textbox])
|
438 |
all_inputs.extend(social_inputs)
|
|
|
439 |
|
440 |
+
generate_button = gr.Button("β¨ Generate News Article", variant="primary")
|
441 |
+
clear_button = gr.Button("π Clear All Inputs")
|
|
|
|
|
|
|
442 |
with gr.Tabs():
|
443 |
with gr.TabItem("π Generated News Article"):
|
|
|
444 |
news_output = gr.Textbox(label="Draft News Article", lines=20, show_copy_button=True, interactive=False)
|
445 |
with gr.TabItem("ποΈ Source Transcriptions & Logs"):
|
|
|
446 |
transcriptions_output = gr.Textbox(label="Transcriptions and Processing Log", lines=15, show_copy_button=True, interactive=False)
|
447 |
|
448 |
outputs_list = [news_output, transcriptions_output]
|
|
|
|
|
449 |
generate_button.click(fn=generate_news, inputs=all_inputs, outputs=outputs_list)
|
|
|
450 |
|
451 |
def clear_all_inputs_and_outputs():
|
452 |
logger.info("--- Clear All button clicked ---")
|
|
|
457 |
elif isinstance(input_comp, gr.File): reset_values.append(None)
|
458 |
else: reset_values.append(None)
|
459 |
reset_values.extend(["", ""])
|
460 |
+
try: logger.info("Calling model reset from clear button handler."); model_manager.reset_models(force=True)
|
461 |
+
except Exception as e: logger.error(f"Error resetting models during clear: {e}")
|
|
|
|
|
|
|
|
|
|
|
462 |
logger.info("--- Clear All operation finished ---")
|
463 |
return reset_values
|
464 |
|
465 |
clear_button.click(fn=clear_all_inputs_and_outputs, inputs=None, outputs=all_inputs + outputs_list)
|
466 |
+
logger.info("--- Gradio interface creation complete ---")
|
|
|
467 |
return demo
|
468 |
|
469 |
|
470 |
# --- main execution block remains the same ---
|
471 |
if __name__ == "__main__":
|
472 |
logger.info("--- Running main execution block ---")
|
|
|
473 |
news_demo = create_demo()
|
|
|
|
|
474 |
news_demo.queue()
|
|
|
475 |
logger.info("Launching Gradio interface...")
|
476 |
try:
|
477 |
news_demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
479 |
except Exception as launch_err:
|
480 |
logger.error(f"!!! CRITICAL Error during Gradio launch: {launch_err}")
|
481 |
logger.error(traceback.format_exc())
|
482 |
+
logger.info("--- Main execution block potentially finished ---")
|