CamiloVega commited on
Commit
48d2a37
verified
1 Parent(s): 76536cf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +169 -727
app.py CHANGED
@@ -54,18 +54,18 @@ class ModelManager:
54
  return cls._instance
55
 
56
  def __init__(self):
57
- if not self._initialized:
58
  logger.info("Initializing ModelManager attributes.")
59
  self.tokenizer = None
60
  self.model = None
61
  self.text_pipeline = None
62
  self.whisper_model = None
63
- # self._initialized remains False until a model is successfully loaded
64
  self.llm_loaded = False
65
  self.whisper_loaded = False
66
  self.last_used = time.time()
67
  self.llm_loading = False
68
  self.whisper_loading = False
 
69
 
70
  def _cleanup_memory(self):
71
  """Utility function to force memory cleanup"""
@@ -81,6 +81,7 @@ class ModelManager:
81
  """Explicitly resets the LLM components."""
82
  logger.info("--- Attempting to reset LLM ---")
83
  try:
 
84
  if hasattr(self, 'model') and self.model is not None:
85
  del self.model
86
  logger.info("LLM model deleted.")
@@ -91,10 +92,11 @@ class ModelManager:
91
  del self.text_pipeline
92
  logger.info("LLM pipeline deleted.")
93
 
 
94
  self.model = None
95
  self.tokenizer = None
96
  self.text_pipeline = None
97
- self.llm_loaded = False
98
  self._cleanup_memory()
99
  logger.info("LLM components reset successfully.")
100
  except Exception as e:
@@ -110,7 +112,7 @@ class ModelManager:
110
  logger.info("Whisper model deleted.")
111
 
112
  self.whisper_model = None
113
- self.whisper_loaded = False
114
  self._cleanup_memory()
115
  logger.info("Whisper component reset successfully.")
116
  except Exception as e:
@@ -130,7 +132,7 @@ class ModelManager:
130
  return True
131
 
132
  # Explicitly try to free Whisper memory before loading LLM
133
- self.reset_whisper()
134
 
135
  self.llm_loading = True
136
  logger.info("Starting LLM initialization...")
@@ -186,7 +188,7 @@ class ModelManager:
186
  return True
187
 
188
  # Explicitly try to free LLM memory before loading Whisper
189
- self.reset_llm()
190
 
191
  self.whisper_loading = True
192
  logger.info("Starting Whisper initialization...")
@@ -219,8 +221,7 @@ class ModelManager:
219
  self.initialize_llm() # This will raise error if it fails
220
  logger.info("LLM initialization completed by check_llm_initialized.")
221
  else:
222
- # This state should ideally be avoided by sequential logic, but handle anyway
223
- logger.info("LLM initialization is already in progress by another request. Waiting briefly.")
224
  time.sleep(10)
225
  if not self.llm_loaded:
226
  logger.error("LLM initialization timed out or failed after waiting.")
@@ -241,7 +242,7 @@ class ModelManager:
241
  self.initialize_whisper() # This will raise error if it fails
242
  logger.info("Whisper initialization completed by check_whisper_initialized.")
243
  else:
244
- logger.info("Whisper initialization is already in progress by another request. Waiting briefly.")
245
  time.sleep(10)
246
  if not self.whisper_loaded:
247
  logger.error("Whisper initialization timed out or failed after waiting.")
@@ -254,180 +255,118 @@ class ModelManager:
254
 
255
  def reset_models(self, force=False):
256
  """Reset models if idle or forced."""
257
- # This function now just calls the specific resets.
258
- # Idle logic could be added back if needed, but explicit resets might be better for ZeroGPU.
259
  if force:
260
  logger.info("Forcing reset of all models.")
261
  self.reset_llm()
262
  self.reset_whisper()
263
- # else: # Optional: Add idle check back if desired
264
- # current_time = time.time()
265
- # if current_time - self.last_used > 600:
266
- # logger.info("Resetting models due to inactivity.")
267
- # self.reset_llm()
268
- # self.reset_whisper()
269
 
270
 
271
- # --- Rest of the functions (download_social_media_video, convert_video_to_audio, etc.) remain the same as the previous version with detailed logging ---
272
- # --- Paste the functions from the previous answer here, starting from @lru_cache...download_social_media_video down to the end of process_social_media_url ---
 
273
 
274
- @lru_cache(maxsize=16) # Reduced cache size slightly
 
 
 
 
 
275
  def download_social_media_video(url):
276
  """Download audio from a social media video URL."""
277
  logger.info(f"Attempting to download audio from social media URL: {url}")
278
  temp_dir = tempfile.mkdtemp()
279
- # Note: Using filename from info_dict can be unreliable. Let yt-dlp decide final name.
280
  output_template = os.path.join(temp_dir, '%(id)s.%(ext)s')
281
- final_audio_file_path = None # Will store the path of the actual downloaded mp3
282
-
283
  ydl_opts = {
284
- 'format': 'bestaudio/best',
285
- 'postprocessors': [{
286
- 'key': 'FFmpegExtractAudio',
287
- 'preferredcodec': 'mp3',
288
- 'preferredquality': '192', # Standard quality
289
- }],
290
- 'outtmpl': output_template,
291
- 'quiet': True,
292
- 'no_warnings': True,
293
- 'nocheckcertificate': True, # Sometimes needed for tricky sites
294
- 'retries': 3, # Add retries
295
- 'socket_timeout': 15, # Timeout
296
- 'cachedir': False, # Avoid caching issues in temp envs
297
  }
298
  try:
299
- logger.info(f"yt-dlp options: {ydl_opts}") # Log options for debugging
300
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
301
- logger.info("Extracting info and downloading...")
302
- # Download should happen here and postprocessor rename to .mp3
303
  info_dict = ydl.extract_info(url, download=True)
304
- logger.info(f"yt-dlp extraction complete for {url}. ID: {info_dict.get('id')}")
305
-
306
- # Find the downloaded MP3 file (name might not exactly match ID if title had weird chars)
307
  found_files = [f for f in os.listdir(temp_dir) if f.endswith('.mp3')]
308
  if found_files:
309
  final_audio_file_path = os.path.join(temp_dir, found_files[0])
310
- logger.info(f"Found downloaded MP3: {final_audio_file_path}")
311
  else:
312
  logger.error(f"Could not find downloaded MP3 file in {temp_dir} for URL {url}")
313
  raise FileNotFoundError(f"Downloaded MP3 not found in {temp_dir}")
314
-
315
- # Read the file content to return, as the temp dir might be cleaned up
316
- logger.info(f"Reading content of {final_audio_file_path}")
317
- with open(final_audio_file_path, 'rb') as f:
318
- audio_content = f.read()
319
-
320
- # Save the content to a new temporary file that Gradio can handle better
321
- logger.info("Saving audio content to a new temporary file...")
322
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_output_file:
323
  temp_output_file.write(audio_content)
324
  final_path_for_gradio = temp_output_file.name
325
  logger.info(f"Audio content saved to temporary file for processing: {final_path_for_gradio}")
326
  return final_path_for_gradio
327
-
328
  except yt_dlp.utils.DownloadError as e:
329
  logger.error(f"!!! yt-dlp download error for {url}: {str(e)}")
330
- # Don't log full traceback here as DownloadError is often informative enough
331
- return None # Return None to indicate failure
332
  except Exception as e:
333
  logger.error(f"!!! Unexpected error downloading video from {url}: {str(e)}")
334
  logger.error(traceback.format_exc())
335
- return None # Return None
336
  finally:
337
- # Clean up the temporary directory and its contents
338
  if os.path.exists(temp_dir):
339
- logger.info(f"Cleaning up temporary download directory: {temp_dir}")
340
  try:
341
  import shutil
342
  shutil.rmtree(temp_dir)
343
- logger.info("Temporary download directory cleaned up.")
344
- except Exception as cleanup_e:
345
- logger.warning(f"Could not completely clean up temp download directory {temp_dir}: {cleanup_e}")
346
-
347
 
348
  def convert_video_to_audio(video_file_path):
349
  """Convert a video file to audio using ffmpeg directly."""
350
  logger.info(f"Attempting to convert video to audio: {video_file_path}")
351
- output_file_path = None # Initialize
352
  try:
353
- # Create a temporary file path for the output MP3
354
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
355
- output_file_path = temp_file.name
356
- logger.info(f"Output audio path will be: {output_file_path}")
357
-
358
- command = [
359
- "ffmpeg",
360
- "-i", video_file_path,
361
- "-vn", # No video
362
- "-acodec", "libmp3lame", # Specify MP3 codec
363
- "-ab", "192k", # Audio bitrate
364
- "-ar", "44100", # Audio sample rate
365
- "-ac", "2", # Stereo audio
366
- output_file_path,
367
- "-y", # Overwrite output file if it exists
368
- "-loglevel", "error" # Suppress verbose ffmpeg output, show only errors
369
- ]
370
- logger.info(f"Executing ffmpeg command: {' '.join(command)}")
371
-
372
- process = subprocess.run(command, check=True, capture_output=True, text=True, timeout=120) # Added timeout
373
- logger.info(f"ffmpeg conversion successful for {video_file_path}.")
374
- # Log stdout/stderr only if needed for debugging, can be verbose
375
- # logger.debug(f"ffmpeg stdout: {process.stdout}")
376
- # logger.debug(f"ffmpeg stderr: {process.stderr}")
377
-
378
-
379
- # Verify output file exists and has size
380
  if not os.path.exists(output_file_path) or os.path.getsize(output_file_path) == 0:
381
  logger.error(f"ffmpeg conversion failed: Output file '{output_file_path}' not created or is empty.")
382
  raise RuntimeError(f"ffmpeg conversion failed: Output file '{output_file_path}' not created or is empty.")
383
-
384
  logger.info(f"Video successfully converted to audio: {output_file_path}")
385
  return output_file_path
386
  except subprocess.CalledProcessError as e:
387
  logger.error(f"!!! ffmpeg command failed with exit code {e.returncode} for video: {video_file_path}")
388
  logger.error(f"ffmpeg stderr: {e.stderr}")
389
- # Clean up potentially empty/invalid output file
390
  if output_file_path and os.path.exists(output_file_path):
391
- logger.info(f"Cleaning up failed ffmpeg output file: {output_file_path}")
392
- os.remove(output_file_path)
393
  raise RuntimeError(f"ffmpeg conversion failed: {e.stderr}") from e
394
  except subprocess.TimeoutExpired as e:
395
  logger.error(f"!!! ffmpeg command timed out after {e.timeout} seconds for video: {video_file_path}")
396
  if output_file_path and os.path.exists(output_file_path):
397
- logger.info(f"Cleaning up potentially incomplete ffmpeg output file: {output_file_path}")
398
- os.remove(output_file_path)
399
  raise RuntimeError(f"ffmpeg conversion timed out after {e.timeout} seconds.") from e
400
  except Exception as e:
401
  logger.error(f"!!! Error converting video '{video_file_path}': {str(e)}")
402
  logger.error(traceback.format_exc())
403
- # Clean up potentially created output file
404
  if output_file_path and os.path.exists(output_file_path):
405
- logger.info(f"Cleaning up ffmpeg output file due to exception: {output_file_path}")
406
- os.remove(output_file_path)
407
- raise # Re-raise the exception
408
 
409
  def preprocess_audio(input_audio_path):
410
  """Preprocess the audio file (e.g., normalize volume)."""
411
  logger.info(f"Attempting to preprocess audio file: {input_audio_path}")
412
  output_path = None
413
  try:
414
- # Check if file exists before trying to load
415
  if not os.path.exists(input_audio_path):
416
  logger.error(f"Input audio file for preprocessing not found: {input_audio_path}")
417
  raise FileNotFoundError(f"Input audio file not found: {input_audio_path}")
418
-
419
- logger.info("Loading audio with pydub...")
420
  audio = AudioSegment.from_file(input_audio_path)
421
- logger.info("Audio loaded.")
422
-
423
- # Example: Normalize volume (optional, uncomment if needed)
424
- # logger.info(f"Original dBFS: {audio.dBFS}. Normalizing target: -20 dBFS.")
425
- # change_in_dBFS = -20.0 - audio.dBFS
426
- # audio = audio.apply_gain(change_in_dBFS)
427
- # logger.info("Volume normalization applied.")
428
-
429
- # Export to a new temporary file
430
- logger.info("Exporting preprocessed audio...")
431
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
432
  output_path = temp_file.name
433
  audio.export(output_path, format="mp3")
@@ -435,397 +374,208 @@ def preprocess_audio(input_audio_path):
435
  return output_path
436
  except FileNotFoundError as e:
437
  logger.error(f"!!! File not found during audio preprocessing: {e}")
438
- raise # Reraise specific error
439
  except Exception as e:
440
  logger.error(f"!!! Error preprocessing audio '{input_audio_path}': {str(e)}")
441
  logger.error(traceback.format_exc())
442
- # Clean up potentially created output file if error occurred during export
443
  if output_path and os.path.exists(output_path):
444
- logger.info(f"Cleaning up preprocessing output file due to exception: {output_path}")
445
- os.remove(output_path)
446
- raise # Re-raise the exception
447
 
448
- @spaces.GPU(duration=300) # Allow more time for transcription
449
  def transcribe_audio_or_video(file_input):
450
  """Transcribe an audio or video file (local path or Gradio File object)."""
451
  logger.info(f"--- Starting transcription process for input: {type(file_input)} ---")
452
- audio_file_to_transcribe = None
453
- original_input_path = None
454
- temp_files_to_clean = []
455
- processing_step = "Initialization"
456
- transcription = "" # Default value
457
-
458
  try:
459
  processing_step = "Whisper Model Check"
460
  logger.info("Checking/Initializing Whisper model for transcription...")
461
- # *** Crucial Change: Reset LLM before ensuring Whisper is ready ***
462
- # model_manager.reset_llm()
463
- # *** Let's try NOT resetting LLM here, maybe both can fit? Check logs if fails ***
464
- model_manager.check_whisper_initialized() # Will raise error if fails
465
  logger.info("Whisper model is ready for transcription.")
466
-
467
- if file_input is None:
468
- logger.info("No file input provided for transcription. Returning empty string.")
469
- return ""
470
-
471
- # ... (rest of the input type handling, conversion, preprocessing - same as before) ...
472
  processing_step = "Input Type Handling"
473
- if isinstance(file_input, str): # Input is a path
474
  original_input_path = file_input
475
- logger.info(f"Processing path input: {original_input_path}")
476
- if not os.path.exists(original_input_path):
477
- logger.error(f"Input file path does not exist: {original_input_path}")
478
- raise FileNotFoundError(f"Input file not found: {original_input_path}")
479
  input_path = original_input_path
480
- elif hasattr(file_input, 'name') and file_input.name: # Input is a Gradio File object
481
  original_input_path = file_input.name
482
- logger.info(f"Processing Gradio file input. Temp path: {original_input_path}")
483
- if not os.path.exists(original_input_path):
484
- logger.error(f"Gradio temporary file path does not exist: {original_input_path}")
485
- raise FileNotFoundError(f"Gradio temporary file not found: {original_input_path}")
486
- input_path = original_input_path # Gradio usually provides a temp path
487
- else:
488
- logger.error(f"Unsupported input type for transcription: {type(file_input)}")
489
- raise TypeError("Invalid input type for transcription. Expected file path or Gradio File object.")
490
-
491
- logger.info(f"Input path identified: {input_path}")
492
  file_extension = os.path.splitext(input_path)[1].lower()
493
- logger.info(f"File extension: {file_extension}")
494
-
495
  processing_step = "Video Conversion Check"
496
  if file_extension in ['.mp4', '.avi', '.mov', '.mkv', '.webm']:
497
- logger.info(f"Detected video file ({file_extension}), attempting conversion to audio...")
498
- converted_audio_path = convert_video_to_audio(input_path) # Raises error on failure
499
- logger.info(f"Video converted to audio: {converted_audio_path}")
500
- temp_files_to_clean.append(converted_audio_path)
501
- audio_file_to_process = converted_audio_path
502
- elif file_extension in ['.mp3', '.wav', '.ogg', '.flac', '.m4a', '.aac']: # Added more audio types
503
  logger.info(f"Detected audio file ({file_extension}).")
504
  audio_file_to_process = input_path
505
- else:
506
- logger.error(f"Unsupported file extension for transcription: {file_extension}")
507
- raise ValueError(f"Unsupported file type: {file_extension}")
508
-
509
  processing_step = "Audio Preprocessing"
510
  try:
511
- logger.info(f"Attempting to preprocess audio file: {audio_file_to_process}")
512
  preprocessed_audio_path = preprocess_audio(audio_file_to_process)
513
- if preprocessed_audio_path != audio_file_to_process:
514
- logger.info("Preprocessing created a new file, adding to cleanup list.")
515
- temp_files_to_clean.append(preprocessed_audio_path)
516
  audio_file_to_transcribe = preprocessed_audio_path
517
- logger.info(f"Audio preprocessing successful. File to transcribe: {audio_file_to_transcribe}")
518
  except Exception as preprocess_err:
519
- logger.warning(f"Audio preprocessing failed: {preprocess_err}. Using original/converted audio for transcription.")
520
- logger.warning(traceback.format_exc())
521
  audio_file_to_transcribe = audio_file_to_process
522
-
523
  processing_step = "Transcription Execution"
524
  logger.info(f"Starting transcription execution for: {audio_file_to_transcribe}")
525
- if not os.path.exists(audio_file_to_transcribe):
526
- logger.error(f"Audio file to transcribe not found: {audio_file_to_transcribe}")
527
- raise FileNotFoundError(f"Audio file to transcribe not found: {audio_file_to_transcribe}")
528
-
529
- logger.info("Calling Whisper model transcribe method...")
530
  with torch.inference_mode():
531
- use_fp16 = torch.cuda.is_available()
532
- logger.info(f"Using fp16 for transcription: {use_fp16}")
533
- # Add language='en' if most input is English, might improve speed/accuracy
534
- result = model_manager.whisper_model.transcribe(
535
- audio_file_to_transcribe, fp16=use_fp16 #, language="en"
536
- )
537
- logger.info("Whisper model transcribe method finished.")
538
- if not result or "text" not in result:
539
- logger.error("Transcription failed to produce results or 'text' key missing.")
540
- raise RuntimeError("Transcription failed to produce results")
541
-
542
  transcription = result.get("text", "Error: Transcription result empty")
543
- log_transcription = (transcription[:100] + '...') if len(transcription) > 100 else transcription
544
- logger.info(f"Transcription completed successfully: '{log_transcription}'")
545
-
546
  processing_step = "Success"
547
- # *** Optional: Reset Whisper immediately after use if memory is tight ***
548
- # logger.info("Resetting Whisper model after successful transcription.")
549
- # model_manager.reset_whisper()
550
-
551
- # ... (keep the except blocks same as before) ...
552
  except FileNotFoundError as e:
553
- logger.error(f"!!! File not found error during transcription (Step: {processing_step}): {e}")
554
- logger.error(traceback.format_exc())
555
- transcription = f"Error: Input file not found ({e})"
556
  except ValueError as e:
557
- logger.error(f"!!! Value error during transcription (Step: {processing_step}): {e}")
558
- logger.error(traceback.format_exc())
559
- transcription = f"Error: Unsupported file type ({e})"
560
  except TypeError as e:
561
- logger.error(f"!!! Type error during transcription setup (Step: {processing_step}): {e}")
562
- logger.error(traceback.format_exc())
563
- transcription = f"Error: Invalid input provided ({e})"
564
  except RuntimeError as e:
565
- logger.error(f"!!! Runtime error during transcription (Step: {processing_step}): {e}")
566
- logger.error(traceback.format_exc())
567
- transcription = f"Error during processing: {e}"
568
  except Exception as e:
569
- logger.error(f"!!! Unexpected error during transcription (Step: {processing_step}): {str(e)}")
570
- logger.error(traceback.format_exc())
571
- transcription = f"Error processing the file: An unexpected error occurred."
572
  finally:
573
- # Clean up temporary files
574
- logger.info(f"--- Cleaning up temporary files for transcription process ({len(temp_files_to_clean)} files) ---")
575
  for temp_file in temp_files_to_clean:
576
  try:
577
- if os.path.exists(temp_file):
578
- os.remove(temp_file)
579
- logger.info(f"Cleaned up temporary file: {temp_file}")
580
- # else:
581
- # logger.info(f"Temporary file already removed or never created: {temp_file}")
582
- except Exception as e:
583
- logger.warning(f"Could not remove temporary file {temp_file}: {str(e)}")
584
- logger.info("--- Finished transcription process cleanup ---")
585
- # Return the result (could be transcription or error message)
586
  return transcription
587
 
588
-
589
  @lru_cache(maxsize=16)
590
  def read_document(document_path):
591
  """Read the content of a document (PDF, DOCX, XLSX, CSV)."""
592
  logger.info(f"Attempting to read document: {document_path}")
593
  try:
594
- if not os.path.exists(document_path):
595
- logger.error(f"Document not found at path: {document_path}")
596
- raise FileNotFoundError(f"Document not found: {document_path}")
597
-
598
- file_extension = os.path.splitext(document_path)[1].lower()
599
- logger.info(f"Document type detected: {file_extension}")
600
-
601
  content = ""
602
  if file_extension == ".pdf":
603
- logger.info("Reading PDF document using PyMuPDF (fitz)...")
604
  doc = fitz.open(document_path)
605
- # Check for encryption first
606
  if doc.is_encrypted:
607
- logger.warning(f"PDF document {document_path} is encrypted. Attempting to decrypt with empty password.")
608
- if not doc.authenticate(""):
609
- logger.error(f"Failed to decrypt PDF {document_path} with empty password.")
610
- doc.close()
611
- raise ValueError("Encrypted PDF cannot be read without password.")
612
- content = "\n".join([page.get_text() for page in doc])
613
- doc.close()
614
- logger.info(f"PDF read successfully. Length: {len(content)} chars.")
615
  elif file_extension == ".docx":
616
- logger.info("Reading DOCX document using python-docx...")
617
- doc = docx.Document(document_path)
618
- content = "\n".join([paragraph.text for paragraph in doc.paragraphs])
619
- logger.info(f"DOCX read successfully. Length: {len(content)} chars.")
620
  elif file_extension in (".xlsx", ".xls"):
621
- logger.info("Reading Excel document using pandas...")
622
- xls = pd.ExcelFile(document_path)
623
- text_parts = []
624
  for sheet_name in xls.sheet_names:
625
- logger.info(f"Reading sheet: {sheet_name}")
626
- df = pd.read_excel(xls, sheet_name=sheet_name)
627
- text_parts.append(f"--- Sheet: {sheet_name} ---\n{df.to_string()}")
628
  content = "\n\n".join(text_parts).strip()
629
- logger.info(f"Excel read successfully. Length: {len(content)} chars.")
630
  elif file_extension == ".csv":
631
- logger.info("Reading CSV document using pandas...")
632
  try:
633
- logger.info("Attempting CSV read with comma separator...")
634
- # Try to sniff encoding
635
- with open(document_path, 'rb') as f:
636
- import chardet
637
- encoding = chardet.detect(f.read())['encoding']
638
- logger.info(f"Detected CSV encoding: {encoding}")
639
  df = pd.read_csv(document_path, encoding=encoding)
640
- except (pd.errors.ParserError, UnicodeDecodeError) as e1:
641
- logger.warning(f"Could not parse CSV {document_path} with comma/detected encoding ({e1}), trying semicolon.")
642
- try:
643
- df = pd.read_csv(document_path, sep=';', encoding=encoding)
644
  except Exception as e2:
645
- logger.error(f"Also failed with semicolon separator: {e2}. Trying latin1 encoding.")
646
- try:
647
- df = pd.read_csv(document_path, encoding='latin1')
648
- except Exception as e3:
649
- logger.error(f"Also failed with latin1: {e3}. Giving up.")
650
- raise ValueError(f"Failed to parse CSV: {e1}, {e2}, {e3}")
651
-
652
  content = df.to_string()
653
- logger.info(f"CSV read successfully. Length: {len(content)} chars.")
654
- else:
655
- logger.warning(f"Unsupported document type for reading: {file_extension}")
656
- return "Unsupported file type. Please upload a PDF, DOCX, XLSX or CSV document."
657
-
658
  return content
659
-
660
- except FileNotFoundError as e:
661
- logger.error(f"!!! File not found error while reading document: {e}")
662
- return f"Error: Document file not found at {document_path}"
663
- except ValueError as e: # Catch specific errors like encryption or CSV parsing
664
- logger.error(f"!!! Value error reading document {document_path}: {e}")
665
- logger.error(traceback.format_exc())
666
- return f"Error reading document: {e}"
667
- except Exception as e:
668
- logger.error(f"!!! Error reading document {document_path}: {str(e)}")
669
- logger.error(traceback.format_exc())
670
- return f"Error reading document: {str(e)}"
671
 
672
  @lru_cache(maxsize=16)
673
  def read_url(url):
674
  """Read the main textual content of a URL."""
675
  logger.info(f"Attempting to read URL: {url}")
676
- if not url or not url.strip().startswith('http'):
677
- logger.warning(f"Invalid or empty URL provided: '{url}'")
678
- return ""
679
-
680
  try:
681
- headers = {
682
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
683
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
684
- 'Accept-Language': 'en-US,en;q=0.9',
685
- 'Connection': 'keep-alive'
686
- }
687
- logger.info(f"Sending GET request to {url} with headers: {headers}")
688
  response = requests.get(url, headers=headers, timeout=20, allow_redirects=True)
689
- logger.info(f"Received response from {url}. Status code: {response.status_code}, Content-Type: {response.headers.get('content-type')}")
690
  response.raise_for_status()
691
-
692
  content_type = response.headers.get('content-type', '').lower()
693
- if not ('html' in content_type or 'text' in content_type):
694
- logger.warning(f"URL {url} has non-text content type: {content_type}. Skipping.")
695
- return f"Error: URL content type ({content_type}) is not text/html."
696
-
697
- # Decode content carefully
698
  detected_encoding = response.encoding if response.encoding else response.apparent_encoding
699
- logger.info(f"Decoding response content with encoding: {detected_encoding}")
700
  html_content = response.content.decode(detected_encoding or 'utf-8', errors='ignore')
701
-
702
- logger.info(f"Parsing HTML content from {url} using BeautifulSoup...")
703
  soup = BeautifulSoup(html_content, 'html.parser')
704
- logger.info("HTML parsed.")
705
-
706
- logger.info("Removing script, style, and other non-content tags...")
707
  tags_to_remove = ["script", "style", "meta", "noscript", "iframe", "header", "footer", "nav", "aside", "form", "button", "link", "head"]
708
  for tag_name in tags_to_remove:
709
- for element in soup.find_all(tag_name):
710
- element.extract()
711
- logger.info("Non-content tags removed.")
712
-
713
- logger.info("Attempting to find main content container...")
714
- main_content = (
715
- soup.find("main") or
716
- soup.find("article") or
717
- soup.find("div", class_=["content", "main", "post-content", "entry-content", "article-body", "story-content"]) or # Added more common classes
718
- soup.find("div", id=["content", "main", "article", "story"]) # Added more common IDs
719
- )
720
-
721
  text = ""
722
- if main_content:
723
- logger.info("Main content container found. Extracting text.")
724
- text = main_content.get_text(separator='\n', strip=True)
725
  else:
726
- logger.warning(f"No specific main content container found for {url}. Falling back to body text.")
727
  body = soup.find("body")
728
- if body:
729
- logger.info("Extracting text from body.")
730
- text = body.get_text(separator='\n', strip=True)
731
- else:
732
- logger.warning(f"No body tag found for {url}. Falling back to all text.")
733
- text = soup.get_text(separator='\n', strip=True)
734
-
735
- logger.info("Cleaning extracted text whitespace...")
736
- lines = [line.strip() for line in text.split('\n') if line.strip()]
737
- cleaned_text = "\n".join(lines)
738
- logger.info(f"Text cleaning complete. Initial length: {len(text)}, Cleaned length: {len(cleaned_text)}")
739
-
740
- if not cleaned_text:
741
- logger.warning(f"Could not extract meaningful text from URL: {url}")
742
- return "Error: Could not extract text content from URL."
743
-
744
  max_chars = 15000
745
- if len(cleaned_text) > max_chars:
746
- logger.info(f"URL content is long ({len(cleaned_text)} chars), truncating to {max_chars} characters.")
747
- final_text = cleaned_text[:max_chars] + "... [content truncated]"
748
- else:
749
- final_text = cleaned_text
750
-
751
- logger.info(f"Successfully read and processed URL {url}. Final text length: {len(final_text)}")
752
  return final_text
753
- except requests.exceptions.RequestException as e:
754
- logger.error(f"!!! Error fetching URL {url}: {str(e)}")
755
- return f"Error reading URL: Could not fetch content ({e})"
756
- except Exception as e:
757
- logger.error(f"!!! Error parsing URL {url}: {str(e)}")
758
- logger.error(traceback.format_exc())
759
- return f"Error reading URL: Could not parse content ({e})"
760
 
761
  def process_social_media_url(url):
762
  """Process a social media URL, attempting to get text and transcribe video/audio."""
763
  logger.info(f"--- Starting processing for social media URL: {url} ---")
764
- if not url or not url.strip().startswith('http'):
765
- logger.warning(f"Invalid or empty social media URL provided: '{url}'")
766
- return None
767
-
768
- text_content = None
769
- video_transcription = None
770
- temp_audio_file = None
771
-
772
- # 1. Try extracting text content using read_url
773
- logger.info(f"Attempting to read text content from social URL: {url}")
774
  try:
 
775
  text_content_result = read_url(url)
776
- if text_content_result and not text_content_result.startswith("Error:"):
777
- text_content = text_content_result
778
- logger.info(f"Successfully read text content from {url}. Length: {len(text_content)}")
779
- elif text_content_result:
780
- logger.warning(f"read_url returned an error for {url}: {text_content_result}")
781
- else:
782
- logger.info(f"No text content extracted by read_url for {url}.")
783
- except Exception as e:
784
- logger.error(f"!!! Exception during text content extraction from social URL {url}: {e}")
785
- logger.error(traceback.format_exc())
786
-
787
- # 2. Try downloading and transcribing potential video/audio content
788
- logger.info(f"Attempting to download audio/video content from social URL: {url}")
789
  try:
790
- temp_audio_file = download_social_media_video(url) # Returns path or None
 
791
  if temp_audio_file:
792
- logger.info(f"Audio downloaded from {url} to {temp_audio_file}. Proceeding to transcription.")
793
- transcription_result = transcribe_audio_or_video(temp_audio_file) # Handles errors internally
794
- if transcription_result and not transcription_result.startswith("Error"):
795
- video_transcription = transcription_result
796
- logger.info(f"Successfully transcribed audio from {url}. Length: {len(video_transcription)}")
797
- elif transcription_result:
798
- logger.warning(f"Transcription returned an error for audio from {url}: {transcription_result}")
799
- else:
800
- logger.warning(f"Transcription returned empty result for audio from {url}.")
801
- else:
802
- logger.info(f"No downloadable audio/video found or download failed for URL: {url}")
803
- except Exception as e:
804
- logger.error(f"!!! Exception during video/audio processing for social URL {url}: {e}")
805
- logger.error(traceback.format_exc())
806
  finally:
807
- # Clean up downloaded file if it exists
808
- if temp_audio_file and os.path.exists(temp_audio_file):
809
- logger.info(f"Cleaning up downloaded social media audio file: {temp_audio_file}")
810
- try:
811
- os.remove(temp_audio_file)
812
- logger.info("Downloaded audio file removed.")
813
- except Exception as e:
814
- logger.warning(f"Failed to cleanup downloaded audio {temp_audio_file}: {e}")
815
-
816
- # Return results
817
- logger.info(f"--- Finished processing social media URL: {url} ---")
818
- if text_content or video_transcription:
819
- return {"text": text_content or "", "video": video_transcription or ""}
820
- else:
821
- # Return None only if BOTH failed and no content was retrieved
822
- logger.info(f"No usable content retrieved for social URL: {url}")
823
- return None
824
-
825
- # Create global model manager instance
826
- logger.info("Creating global ModelManager instance.")
827
- model_manager = ModelManager()
828
-
829
 
830
  @spaces.GPU(duration=10) # Duraci贸n corta solo para prueba
831
  def generate_news(instructions, facts, size, tone, *args):
@@ -843,6 +593,7 @@ def generate_news(instructions, facts, size, tone, *args):
843
  # --- NO CONSTRUYAS EL PROMPT ---
844
  # --- NO LLAMES A text_pipeline ---
845
  pass # Simplemente no hacemos nada
 
846
 
847
  except Exception as e:
848
  total_time = time.time() - request_start_time
@@ -857,319 +608,9 @@ def generate_news(instructions, facts, size, tone, *args):
857
  # Aseg煤rate de devolver dos strings
858
  return generated_article, raw_transcriptions
859
 
860
- # --- Argument Parsing ---
861
- # (Same as before)
862
- logger.info("Parsing dynamic arguments...")
863
- num_docs = 5
864
- num_audio_sources = 5
865
- num_audio_inputs_per_source = 3
866
- num_urls = 5
867
- num_social_sources = 3
868
- num_social_inputs_per_source = 3
869
- total_expected_args = num_docs + (num_audio_sources * num_audio_inputs_per_source) + num_urls + (num_social_sources * num_social_inputs_per_source)
870
-
871
- args_list = list(args)
872
- if len(args_list) < total_expected_args:
873
- logger.warning(f"Received fewer arguments ({len(args_list)}) than expected ({total_expected_args}). Padding with None.")
874
- args_list.extend([None] * (total_expected_args - len(args_list)))
875
- elif len(args_list) > total_expected_args:
876
- logger.warning(f"Received more arguments ({len(args_list)}) than expected ({total_expected_args}). Truncating.")
877
- args_list = args_list[:total_expected_args]
878
-
879
- doc_files = args_list[0:num_docs]
880
- audio_inputs_flat = args_list[num_docs : num_docs + (num_audio_sources * num_audio_inputs_per_source)]
881
- url_inputs = args_list[num_docs + (num_audio_sources * num_audio_inputs_per_source) : num_docs + (num_audio_sources * num_audio_inputs_per_source) + num_urls]
882
- social_inputs_flat = args_list[num_docs + (num_audio_sources * num_audio_inputs_per_source) + num_urls : total_expected_args]
883
- logger.info(f"Argument parsing complete. Docs: {len(doc_files)}, Audio sets: {len(audio_inputs_flat)//3}, URLs: {len(url_inputs)}, Social sets: {len(social_inputs_flat)//3}")
884
-
885
- knowledge_base = {
886
- "instructions": instructions or "No specific instructions provided.",
887
- "facts": facts or "No specific facts provided.",
888
- "document_content": [], "audio_data": [], "url_content": [], "social_content": []
889
- }
890
-
891
- # --- Process Inputs (Documents, URLs, Collect Audio Info, Social Media) ---
892
- # (Keep the processing loops same as previous version with detailed logging)
893
- # --- Processing document inputs ---
894
- logger.info("--- Processing document inputs ---")
895
- doc_counter = 0
896
- for i, doc_file in enumerate(doc_files):
897
- if doc_file and hasattr(doc_file, 'name') and doc_file.name:
898
- doc_filename = os.path.basename(doc_file.name)
899
- logger.info(f"Attempting to read document {i+1}: {doc_filename} (Path: {doc_file.name})")
900
- try:
901
- content = read_document(doc_file.name)
902
- if content and content.startswith("Error:"):
903
- logger.warning(f"Skipping document {i+1} ({doc_filename}) due to read error: {content}")
904
- raw_transcriptions += f"[Document {i+1}: {doc_filename}] Error reading: {content}\n\n"
905
- elif content:
906
- doc_excerpt = (content[:1000] + "... [document truncated]") if len(content) > 1000 else content
907
- knowledge_base["document_content"].append(f"[Document {i+1} Source: {doc_filename}]\n{doc_excerpt}")
908
- logger.info(f"Successfully processed document {i+1}. Added excerpt.")
909
- doc_counter += 1
910
- else:
911
- logger.warning(f"Skipping document {i+1} ({doc_filename}) because content is empty.")
912
- raw_transcriptions += f"[Document {i+1}: {doc_filename}] Read successfully but content is empty.\n\n"
913
- except Exception as e:
914
- logger.error(f"!!! FAILED to process document {i+1} ({doc_filename}): {e}")
915
- logger.error(traceback.format_exc())
916
- raw_transcriptions += f"[Document {i+1}: {doc_filename}] CRITICAL Error during processing: {e}\n\n"
917
- # else: logger.info(f"Skipping document slot {i+1}: No file.")
918
- logger.info(f"--- Finished processing {doc_counter} documents. ---")
919
-
920
- # --- Processing URL inputs ---
921
- logger.info("--- Processing URL inputs ---")
922
- url_counter = 0
923
- for i, url in enumerate(url_inputs):
924
- if url and isinstance(url, str) and url.strip().startswith('http'):
925
- logger.info(f"Attempting to read URL {i+1}: {url}")
926
- try:
927
- content = read_url(url)
928
- if content and content.startswith("Error:"):
929
- logger.warning(f"Skipping URL {i+1} ({url}) due to read error: {content}")
930
- raw_transcriptions += f"[URL {i+1}: {url}] Error reading: {content}\n\n"
931
- elif content:
932
- knowledge_base["url_content"].append(f"[URL {i+1} Source: {url}]\n{content}")
933
- logger.info(f"Successfully processed URL {i+1}. Added content.")
934
- url_counter += 1
935
- else:
936
- logger.warning(f"Skipping URL {i+1} ({url}) because content is empty.")
937
- raw_transcriptions += f"[URL {i+1}: {url}] Read successfully but content is empty.\n\n"
938
- except Exception as e:
939
- logger.error(f"!!! FAILED to process URL {i+1} ({url}): {e}")
940
- logger.error(traceback.format_exc())
941
- raw_transcriptions += f"[URL {i+1}: {url}] CRITICAL Error during processing: {e}\n\n"
942
- # elif url: logger.warning(f"Skipping URL slot {i+1}: Invalid URL '{url}'.")
943
- # else: logger.info(f"Skipping URL slot {i+1}: No URL.")
944
- logger.info(f"--- Finished processing {url_counter} URLs. ---")
945
-
946
- # --- Processing audio/video inputs (collecting info) ---
947
- logger.info("--- Processing audio/video inputs (collecting info) ---")
948
- has_audio_source = False
949
- audio_counter = 0
950
- for i in range(num_audio_sources):
951
- start_idx = i * num_audio_inputs_per_source
952
- if start_idx + 2 < len(audio_inputs_flat):
953
- audio_file = audio_inputs_flat[start_idx]
954
- name = audio_inputs_flat[start_idx + 1] or f"Unnamed Audio Source {i+1}"
955
- position = audio_inputs_flat[start_idx + 2] or "Role N/A"
956
- if audio_file and hasattr(audio_file, 'name') and audio_file.name:
957
- audio_filename = os.path.basename(audio_file.name)
958
- logger.info(f"Found audio/video source {i+1}: {name} ({position}) - File: {audio_filename} (Path: {audio_file.name})")
959
- knowledge_base["audio_data"].append({"file_path": audio_file.name, "name": name, "position": position, "original_filename": audio_filename})
960
- has_audio_source = True
961
- audio_counter += 1
962
- # else: logger.info(f"Skipping audio source slot {i+1}: No file.")
963
- else: logger.warning(f"Index out of bounds for audio source {i+1}."); break
964
- logger.info(f"--- Finished collecting audio/video info. {audio_counter} sources found. Transcription needed: {has_audio_source} ---")
965
-
966
- # --- Processing social media inputs ---
967
- logger.info("--- Processing social media inputs ---")
968
- social_counter = 0
969
- for i in range(num_social_sources):
970
- start_idx = i * num_social_inputs_per_source
971
- if start_idx + 2 < len(social_inputs_flat):
972
- social_url = social_inputs_flat[start_idx]
973
- social_name = social_inputs_flat[start_idx + 1] or f"Unnamed Social Source {i+1}"
974
- social_context = social_inputs_flat[start_idx + 2] or "Context N/A"
975
- if social_url and isinstance(social_url, str) and social_url.strip().startswith('http'):
976
- logger.info(f"Attempting to process social media URL {i+1}: {social_url} ({social_name}, {social_context})")
977
- try:
978
- social_data = process_social_media_url(social_url)
979
- if social_data: # process_social_media_url now returns dict even if empty
980
- if social_data.get("text") or social_data.get("video"):
981
- logger.info(f"Successfully processed social URL {i+1}. Text: {bool(social_data.get('text'))}, Video: {bool(social_data.get('video'))}")
982
- knowledge_base["social_content"].append({"url": social_url, "name": social_name, "context": social_context, "text": social_data.get("text", ""), "video_transcription": social_data.get("video", "")})
983
- social_counter += 1
984
- else:
985
- logger.warning(f"Processed social URL {i+1} ({social_url}) but found no text or video content.")
986
- raw_transcriptions += f"[Social Media {i+1}: {social_url} ({social_name})] Processed but no content found.\n\n"
987
- # No 'else' needed as process_social_media_url handles internal errors and returns dict
988
- except Exception as e:
989
- logger.error(f"!!! FAILED to process social URL {i+1} ({social_url}): {e}")
990
- logger.error(traceback.format_exc())
991
- raw_transcriptions += f"[Social Media {i+1}: {social_url} ({social_name})] CRITICAL Error during processing: {e}\n\n"
992
- # elif social_url: logger.warning(f"Skipping social slot {i+1}: Invalid URL '{social_url}'.")
993
- # else: logger.info(f"Skipping social slot {i+1}: No URL.")
994
- else: logger.warning(f"Index out of bounds for social source {i+1}."); break
995
- logger.info(f"--- Finished processing {social_counter} social media sources. ---")
996
-
997
-
998
- # --- Transcribe Audio/Video (Conditional) ---
999
- transcriptions_for_prompt = ""
1000
- if has_audio_source:
1001
- logger.info("--- Starting Audio Transcription Phase ---")
1002
- # Whisper check/initialization happens INSIDE transcribe_audio_or_video now
1003
- for idx, data in enumerate(knowledge_base["audio_data"]):
1004
- audio_filename = data['original_filename']
1005
- logger.info(f"Attempting transcription for audio source {idx+1}: {audio_filename} ({data['name']}, {data['position']})")
1006
- try:
1007
- # transcribe_audio_or_video now includes model check and returns error string on failure
1008
- transcription = transcribe_audio_or_video(data["file_path"])
1009
- if transcription and not transcription.startswith("Error"):
1010
- logger.info(f"Transcription successful for audio {idx+1}. Length: {len(transcription)}")
1011
- quote = f'"{transcription}" - {data["name"]}, {data["position"]}'
1012
- transcriptions_for_prompt += f"{quote}\n\n"
1013
- raw_transcriptions += f'[Audio/Video {idx + 1}: {audio_filename} ({data["name"]}, {data["position"]})]\n"{transcription}"\n\n'
1014
- else:
1015
- # Log the error message returned by the function
1016
- logger.warning(f"Transcription failed or returned error for audio source {idx+1} ({audio_filename}): {transcription}")
1017
- raw_transcriptions += f'[Audio/Video {idx + 1}: {audio_filename} ({data["name"]}, {data["position"]})]\n[Transcription Error: {transcription}]\n\n'
1018
- except Exception as e:
1019
- # Catch unexpected errors during the call itself
1020
- logger.error(f"!!! CRITICAL Error during transcription call for audio source {idx+1} ({audio_filename}): {e}")
1021
- logger.error(traceback.format_exc())
1022
- raw_transcriptions += f'[Audio/Video {idx + 1}: {audio_filename} ({data["name"]}, {data["position"]})]\n[CRITICAL Error during transcription call: {e}]\n\n'
1023
- logger.info("--- Finished Audio Transcription Phase ---")
1024
- else:
1025
- logger.info("--- Skipping Audio Transcription Phase (no audio sources found) ---")
1026
-
1027
-
1028
- # --- Add Social Media Content to Prompt Data ---
1029
- # (Same as before)
1030
- logger.info("--- Adding social media content to prompt data ---")
1031
- social_content_added_to_prompt = False
1032
- for idx, data in enumerate(knowledge_base["social_content"]):
1033
- source_id_log = f'[Social Media {idx+1}: {data["url"]} ({data["name"]}, {data["context"]})]'
1034
- source_id_prompt = f'Social Media Post ({data["name"]}, {data["context"]} at {data["url"]}):'
1035
- content_added_this_source = False
1036
- if data["text"]:
1037
- text_excerpt = (data["text"][:500] + "...[text truncated]") if len(data["text"]) > 500 else data["text"]
1038
- social_text_prompt = f'{source_id_prompt}\nText Content:\n"{text_excerpt}"\n\n'
1039
- transcriptions_for_prompt += social_text_prompt
1040
- raw_transcriptions += f"{source_id_log}\nText Content:\n{data['text']}\n\n"
1041
- content_added_this_source = True; social_content_added_to_prompt = True
1042
- if data["video_transcription"]:
1043
- social_video_prompt = f'{source_id_prompt}\nVideo Transcription:\n"{data["video_transcription"]}"\n\n'
1044
- transcriptions_for_prompt += social_video_prompt
1045
- raw_transcriptions += f"{source_id_log}\nVideo Transcription:\n{data['video_transcription']}\n\n"
1046
- content_added_this_source = True; social_content_added_to_prompt = True
1047
- if content_added_this_source: logger.info(f"Added content from social source {idx+1} to prompt data.")
1048
- # else: logger.info(f"No usable content found for social source {idx+1} ({data['url']}).")
1049
- if not social_content_added_to_prompt: logger.info("No content from social media sources was added to the prompt data.")
1050
- logger.info("--- Finished adding social media content ---")
1051
-
1052
-
1053
- # --- Prepare Final Prompt ---
1054
- # (Same as before)
1055
- logger.info("--- Preparing final prompt for LLM ---")
1056
- document_summary = "\n\n".join(knowledge_base["document_content"]) if knowledge_base["document_content"] else "No document content provided or processed successfully."
1057
- url_summary = "\n\n".join(knowledge_base["url_content"]) if knowledge_base["url_content"] else "No URL content provided or processed successfully."
1058
- transcription_summary = transcriptions_for_prompt if transcriptions_for_prompt else "No usable transcriptions or social media content available."
1059
- prompt = f"""<s>[INST] You are a professional news writer... [SAME PROMPT AS BEFORE] ...Begin the article now. [/INST]\nArticle Draft:\n""" # Keep prompt structure
1060
- prompt_words = len(prompt.split()); prompt_chars = len(prompt)
1061
- logger.info(f"Generated prompt length: {prompt_words} words / {prompt_chars} characters.")
1062
- logger.debug(f"Prompt Start: {prompt[:200]}...")
1063
- logger.debug(f"...Prompt End: {prompt[-200:]}")
1064
- logger.info("--- Finished preparing final prompt ---")
1065
-
1066
-
1067
- # --- Generate News Article ---
1068
- logger.info("--- Starting LLM Generation Phase ---")
1069
- generation_start_time = time.time()
1070
-
1071
- # Ensure LLM is ready (will also reset Whisper if loaded)
1072
- logger.info("Ensuring LLM is initialized for generation...")
1073
- try:
1074
- # *** Crucial Change: Reset Whisper before ensuring LLM is ready ***
1075
- # model_manager.reset_whisper()
1076
- # *** Let's try NOT resetting whisper, check logs if fails ***
1077
- model_manager.check_llm_initialized() # Raises error if fails
1078
- logger.info("LLM confirmed ready for generation.")
1079
- except Exception as llm_init_err:
1080
- logger.error(f"!!! FATAL: LLM could not be initialized. Cannot generate article.")
1081
- logger.error(traceback.format_exc())
1082
- raise RuntimeError(f"LLM failed to initialize, cannot generate article: {llm_init_err}")
1083
-
1084
-
1085
- # Estimate max_new_tokens
1086
- # (Same as before)
1087
- estimated_tokens_per_word = 1.5
1088
- max_new_tokens = int(size * estimated_tokens_per_word + 150)
1089
- model_max_length = 2048
1090
- prompt_tokens_estimate = prompt_chars // 3
1091
- available_tokens = model_max_length - prompt_tokens_estimate - 50
1092
- max_new_tokens = min(max_new_tokens, available_tokens)
1093
- max_new_tokens = max(max_new_tokens, 100)
1094
- logger.info(f"Estimated prompt tokens: ~{prompt_tokens_estimate}. Model max length: {model_max_length}. Requesting max_new_tokens: {max_new_tokens}")
1095
-
1096
- try:
1097
- # Generate text
1098
- # (Same pipeline call as before)
1099
- logger.info("Calling LLM text generation pipeline...")
1100
- outputs = model_manager.text_pipeline(
1101
- prompt, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.7,
1102
- top_p=0.95, top_k=50, repetition_penalty=1.15,
1103
- pad_token_id=model_manager.tokenizer.eos_token_id, num_return_sequences=1
1104
- )
1105
- logger.info("LLM pipeline call finished.")
1106
-
1107
- if not outputs or not isinstance(outputs, list) or not outputs[0].get('generated_text'):
1108
- logger.error("LLM pipeline returned invalid or empty output.")
1109
- raise RuntimeError("LLM generation failed: Pipeline returned empty or invalid output.")
1110
-
1111
- full_generated_text = outputs[0]['generated_text']
1112
- logger.info(f"Raw generated text length: {len(full_generated_text)} chars.")
1113
-
1114
- # Clean output
1115
- # (Same cleaning logic as before)
1116
- logger.info("Cleaning LLM output (removing prompt)...")
1117
- inst_marker = "[/INST]"
1118
- marker_pos = full_generated_text.find(inst_marker)
1119
- if marker_pos != -1:
1120
- generated_article = full_generated_text[marker_pos + len(inst_marker):].strip()
1121
- if generated_article.startswith("Article Draft:"):
1122
- generated_article = generated_article[len("Article Draft:"):].strip()
1123
- logger.info("Prompt removed successfully using '[/INST]' marker.")
1124
- else:
1125
- generated_article = full_generated_text
1126
- logger.warning("Prompt marker '[/INST]' not found in LLM output. Returning full generated text.")
1127
-
1128
-
1129
- generation_time = time.time() - generation_start_time
1130
- logger.info(f"News generation completed in {generation_time:.2f} seconds.")
1131
- logger.info(f"Final article length: {len(generated_article)} characters.")
1132
- logger.info("--- Finished LLM Generation Phase ---")
1133
- # *** Optional: Reset LLM immediately after generation ***
1134
- # logger.info("Resetting LLM model after successful generation.")
1135
- # model_manager.reset_llm()
1136
-
1137
- # ... (keep OOM and general Exception handling for generation same as before) ...
1138
- except torch.cuda.OutOfMemoryError as oom_error:
1139
- logger.error(f"!!! CUDA Out of Memory error during LLM generation: {oom_error}")
1140
- logger.error(traceback.format_exc())
1141
- logger.info("Attempting to reset models after OOM error...")
1142
- model_manager.reset_models(force=True)
1143
- raise RuntimeError("Generation failed due to insufficient GPU memory.") from oom_error
1144
- except Exception as gen_error:
1145
- logger.error(f"!!! Error during text generation pipeline: {str(gen_error)}")
1146
- logger.error(traceback.format_exc())
1147
- raise RuntimeError(f"LLM generation failed: {gen_error}") from gen_error
1148
-
1149
- total_time = time.time() - request_start_time
1150
- logger.info(f"--- generate_news function completed successfully in {total_time:.2f} seconds. ---")
1151
- return generated_article.strip(), raw_transcriptions.strip()
1152
-
1153
- except Exception as e:
1154
- # Catch-all for any unexpected error during the entire generate_news flow
1155
- # (Same as before)
1156
- total_time = time.time() - request_start_time
1157
- logger.error(f"!!! UNHANDLED Error in generate_news function after {total_time:.2f} seconds: {str(e)}")
1158
- logger.error(traceback.format_exc())
1159
- try:
1160
- logger.info("Attempting model reset due to unhandled error in generate_news.")
1161
- model_manager.reset_models(force=True)
1162
- except Exception as reset_error:
1163
- logger.error(f"Failed to reset models after error: {str(reset_error)}")
1164
- error_message = f"Error generating the news article: An unexpected error occurred. Please check logs. ({str(e)})"
1165
- transcription_log = raw_transcriptions.strip() + f"\n\n[CRITICAL ERROR] News generation failed unexpectedly: {str(e)}"
1166
- return error_message, transcription_log
1167
- finally:
1168
- # Final cleanup/logging
1169
- logger.info("--- generate_news function finished execution (either success or error) ---")
1170
- # Force cleanup after every run attempt on ZeroGPU
1171
- logger.info("Forcing model reset at the end of generate_news call.")
1172
- model_manager.reset_models(force=True)
1173
 
1174
 
1175
  # --- create_demo function remains the same as the previous version ---
@@ -1259,6 +700,7 @@ def create_demo():
1259
 
1260
  outputs_list = [news_output, transcriptions_output]
1261
  logger.info("Setting up event handlers.")
 
1262
  generate_button.click(fn=generate_news, inputs=all_inputs, outputs=outputs_list)
1263
  logger.info("Generate button click handler set.")
1264
 
 
54
  return cls._instance
55
 
56
  def __init__(self):
57
+ if not hasattr(self, '_initialized') or not self._initialized: # Ensure init runs only once
58
  logger.info("Initializing ModelManager attributes.")
59
  self.tokenizer = None
60
  self.model = None
61
  self.text_pipeline = None
62
  self.whisper_model = None
 
63
  self.llm_loaded = False
64
  self.whisper_loaded = False
65
  self.last_used = time.time()
66
  self.llm_loading = False
67
  self.whisper_loading = False
68
+ self._initialized = True # Mark as initialized
69
 
70
  def _cleanup_memory(self):
71
  """Utility function to force memory cleanup"""
 
81
  """Explicitly resets the LLM components."""
82
  logger.info("--- Attempting to reset LLM ---")
83
  try:
84
+ # Check attributes before deleting
85
  if hasattr(self, 'model') and self.model is not None:
86
  del self.model
87
  logger.info("LLM model deleted.")
 
92
  del self.text_pipeline
93
  logger.info("LLM pipeline deleted.")
94
 
95
+ # Reset attributes
96
  self.model = None
97
  self.tokenizer = None
98
  self.text_pipeline = None
99
+ self.llm_loaded = False # Mark as not loaded
100
  self._cleanup_memory()
101
  logger.info("LLM components reset successfully.")
102
  except Exception as e:
 
112
  logger.info("Whisper model deleted.")
113
 
114
  self.whisper_model = None
115
+ self.whisper_loaded = False # Mark as not loaded
116
  self._cleanup_memory()
117
  logger.info("Whisper component reset successfully.")
118
  except Exception as e:
 
132
  return True
133
 
134
  # Explicitly try to free Whisper memory before loading LLM
135
+ # self.reset_whisper() # Optional: Uncomment if severe memory pressure
136
 
137
  self.llm_loading = True
138
  logger.info("Starting LLM initialization...")
 
188
  return True
189
 
190
  # Explicitly try to free LLM memory before loading Whisper
191
+ # self.reset_llm() # Optional: Uncomment if severe memory pressure
192
 
193
  self.whisper_loading = True
194
  logger.info("Starting Whisper initialization...")
 
221
  self.initialize_llm() # This will raise error if it fails
222
  logger.info("LLM initialization completed by check_llm_initialized.")
223
  else:
224
+ logger.info("LLM initialization is already in progress. Waiting briefly.")
 
225
  time.sleep(10)
226
  if not self.llm_loaded:
227
  logger.error("LLM initialization timed out or failed after waiting.")
 
242
  self.initialize_whisper() # This will raise error if it fails
243
  logger.info("Whisper initialization completed by check_whisper_initialized.")
244
  else:
245
+ logger.info("Whisper initialization is already in progress. Waiting briefly.")
246
  time.sleep(10)
247
  if not self.whisper_loaded:
248
  logger.error("Whisper initialization timed out or failed after waiting.")
 
255
 
256
  def reset_models(self, force=False):
257
  """Reset models if idle or forced."""
 
 
258
  if force:
259
  logger.info("Forcing reset of all models.")
260
  self.reset_llm()
261
  self.reset_whisper()
 
 
 
 
 
 
262
 
263
 
264
+ # Create global model manager instance
265
+ logger.info("Creating global ModelManager instance.")
266
+ model_manager = ModelManager()
267
 
268
+
269
+ # --- Functions: download_social_media_video, convert_video_to_audio, etc. ---
270
+ # --- These functions are kept exactly the same as the previous full version ---
271
+ # --- with detailed logging. Paste them here. ---
272
+
273
+ @lru_cache(maxsize=16)
274
  def download_social_media_video(url):
275
  """Download audio from a social media video URL."""
276
  logger.info(f"Attempting to download audio from social media URL: {url}")
277
  temp_dir = tempfile.mkdtemp()
 
278
  output_template = os.path.join(temp_dir, '%(id)s.%(ext)s')
279
+ final_audio_file_path = None
 
280
  ydl_opts = {
281
+ 'format': 'bestaudio/best', 'postprocessors': [{'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3', 'preferredquality': '192'}],
282
+ 'outtmpl': output_template, 'quiet': True, 'no_warnings': True, 'nocheckcertificate': True, 'retries': 3, 'socket_timeout': 15, 'cachedir': False
 
 
 
 
 
 
 
 
 
 
 
283
  }
284
  try:
285
+ logger.debug(f"yt-dlp options: {ydl_opts}")
286
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
287
+ logger.debug("Extracting info and downloading...")
 
288
  info_dict = ydl.extract_info(url, download=True)
289
+ logger.debug(f"yt-dlp extraction complete for {url}. ID: {info_dict.get('id')}")
 
 
290
  found_files = [f for f in os.listdir(temp_dir) if f.endswith('.mp3')]
291
  if found_files:
292
  final_audio_file_path = os.path.join(temp_dir, found_files[0])
293
+ logger.debug(f"Found downloaded MP3: {final_audio_file_path}")
294
  else:
295
  logger.error(f"Could not find downloaded MP3 file in {temp_dir} for URL {url}")
296
  raise FileNotFoundError(f"Downloaded MP3 not found in {temp_dir}")
297
+ logger.debug(f"Reading content of {final_audio_file_path}")
298
+ with open(final_audio_file_path, 'rb') as f: audio_content = f.read()
299
+ logger.debug("Saving audio content to a new temporary file...")
 
 
 
 
 
300
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_output_file:
301
  temp_output_file.write(audio_content)
302
  final_path_for_gradio = temp_output_file.name
303
  logger.info(f"Audio content saved to temporary file for processing: {final_path_for_gradio}")
304
  return final_path_for_gradio
 
305
  except yt_dlp.utils.DownloadError as e:
306
  logger.error(f"!!! yt-dlp download error for {url}: {str(e)}")
307
+ return None
 
308
  except Exception as e:
309
  logger.error(f"!!! Unexpected error downloading video from {url}: {str(e)}")
310
  logger.error(traceback.format_exc())
311
+ return None
312
  finally:
 
313
  if os.path.exists(temp_dir):
314
+ logger.debug(f"Cleaning up temporary download directory: {temp_dir}")
315
  try:
316
  import shutil
317
  shutil.rmtree(temp_dir)
318
+ except Exception as cleanup_e: logger.warning(f"Could not clean up {temp_dir}: {cleanup_e}")
 
 
 
319
 
320
  def convert_video_to_audio(video_file_path):
321
  """Convert a video file to audio using ffmpeg directly."""
322
  logger.info(f"Attempting to convert video to audio: {video_file_path}")
323
+ output_file_path = None
324
  try:
325
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file: output_file_path = temp_file.name
326
+ logger.debug(f"Output audio path will be: {output_file_path}")
327
+ command = ["ffmpeg", "-i", video_file_path, "-vn", "-acodec", "libmp3lame", "-ab", "192k", "-ar", "44100", "-ac", "2", output_file_path, "-y", "-loglevel", "error"]
328
+ logger.debug(f"Executing ffmpeg command: {' '.join(command)}")
329
+ process = subprocess.run(command, check=True, capture_output=True, text=True, timeout=120)
330
+ logger.debug(f"ffmpeg conversion successful for {video_file_path}.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
  if not os.path.exists(output_file_path) or os.path.getsize(output_file_path) == 0:
332
  logger.error(f"ffmpeg conversion failed: Output file '{output_file_path}' not created or is empty.")
333
  raise RuntimeError(f"ffmpeg conversion failed: Output file '{output_file_path}' not created or is empty.")
 
334
  logger.info(f"Video successfully converted to audio: {output_file_path}")
335
  return output_file_path
336
  except subprocess.CalledProcessError as e:
337
  logger.error(f"!!! ffmpeg command failed with exit code {e.returncode} for video: {video_file_path}")
338
  logger.error(f"ffmpeg stderr: {e.stderr}")
 
339
  if output_file_path and os.path.exists(output_file_path):
340
+ try: os.remove(output_file_path)
341
+ except: pass
342
  raise RuntimeError(f"ffmpeg conversion failed: {e.stderr}") from e
343
  except subprocess.TimeoutExpired as e:
344
  logger.error(f"!!! ffmpeg command timed out after {e.timeout} seconds for video: {video_file_path}")
345
  if output_file_path and os.path.exists(output_file_path):
346
+ try: os.remove(output_file_path)
347
+ except: pass
348
  raise RuntimeError(f"ffmpeg conversion timed out after {e.timeout} seconds.") from e
349
  except Exception as e:
350
  logger.error(f"!!! Error converting video '{video_file_path}': {str(e)}")
351
  logger.error(traceback.format_exc())
 
352
  if output_file_path and os.path.exists(output_file_path):
353
+ try: os.remove(output_file_path)
354
+ except: pass
355
+ raise
356
 
357
  def preprocess_audio(input_audio_path):
358
  """Preprocess the audio file (e.g., normalize volume)."""
359
  logger.info(f"Attempting to preprocess audio file: {input_audio_path}")
360
  output_path = None
361
  try:
 
362
  if not os.path.exists(input_audio_path):
363
  logger.error(f"Input audio file for preprocessing not found: {input_audio_path}")
364
  raise FileNotFoundError(f"Input audio file not found: {input_audio_path}")
365
+ logger.debug("Loading audio with pydub...")
 
366
  audio = AudioSegment.from_file(input_audio_path)
367
+ logger.debug("Audio loaded.")
368
+ # Optional normalization can be added here
369
+ logger.debug("Exporting preprocessed audio...")
 
 
 
 
 
 
 
370
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
371
  output_path = temp_file.name
372
  audio.export(output_path, format="mp3")
 
374
  return output_path
375
  except FileNotFoundError as e:
376
  logger.error(f"!!! File not found during audio preprocessing: {e}")
377
+ raise
378
  except Exception as e:
379
  logger.error(f"!!! Error preprocessing audio '{input_audio_path}': {str(e)}")
380
  logger.error(traceback.format_exc())
 
381
  if output_path and os.path.exists(output_path):
382
+ try: os.remove(output_path)
383
+ except: pass
384
+ raise
385
 
386
+ @spaces.GPU(duration=300)
387
  def transcribe_audio_or_video(file_input):
388
  """Transcribe an audio or video file (local path or Gradio File object)."""
389
  logger.info(f"--- Starting transcription process for input: {type(file_input)} ---")
390
+ audio_file_to_transcribe = None; original_input_path = None
391
+ temp_files_to_clean = []; processing_step = "Initialization"; transcription = ""
 
 
 
 
392
  try:
393
  processing_step = "Whisper Model Check"
394
  logger.info("Checking/Initializing Whisper model for transcription...")
395
+ model_manager.check_whisper_initialized()
 
 
 
396
  logger.info("Whisper model is ready for transcription.")
397
+ if file_input is None: return ""
 
 
 
 
 
398
  processing_step = "Input Type Handling"
399
+ if isinstance(file_input, str):
400
  original_input_path = file_input
401
+ if not os.path.exists(original_input_path): raise FileNotFoundError(f"Input file not found: {original_input_path}")
 
 
 
402
  input_path = original_input_path
403
+ elif hasattr(file_input, 'name') and file_input.name:
404
  original_input_path = file_input.name
405
+ if not os.path.exists(original_input_path): raise FileNotFoundError(f"Gradio temporary file not found: {original_input_path}")
406
+ input_path = original_input_path
407
+ else: raise TypeError("Invalid input type for transcription.")
408
+ logger.debug(f"Input path identified: {input_path}")
 
 
 
 
 
 
409
  file_extension = os.path.splitext(input_path)[1].lower()
410
+ logger.debug(f"File extension: {file_extension}")
 
411
  processing_step = "Video Conversion Check"
412
  if file_extension in ['.mp4', '.avi', '.mov', '.mkv', '.webm']:
413
+ logger.info(f"Detected video file ({file_extension}), converting...")
414
+ converted_audio_path = convert_video_to_audio(input_path)
415
+ temp_files_to_clean.append(converted_audio_path); audio_file_to_process = converted_audio_path
416
+ elif file_extension in ['.mp3', '.wav', '.ogg', '.flac', '.m4a', '.aac']:
 
 
417
  logger.info(f"Detected audio file ({file_extension}).")
418
  audio_file_to_process = input_path
419
+ else: raise ValueError(f"Unsupported file type: {file_extension}")
 
 
 
420
  processing_step = "Audio Preprocessing"
421
  try:
422
+ logger.debug(f"Attempting to preprocess audio file: {audio_file_to_process}")
423
  preprocessed_audio_path = preprocess_audio(audio_file_to_process)
424
+ if preprocessed_audio_path != audio_file_to_process: temp_files_to_clean.append(preprocessed_audio_path)
 
 
425
  audio_file_to_transcribe = preprocessed_audio_path
426
+ logger.debug(f"Using preprocessed audio: {audio_file_to_transcribe}")
427
  except Exception as preprocess_err:
428
+ logger.warning(f"Audio preprocessing failed: {preprocess_err}. Using original/converted audio.")
 
429
  audio_file_to_transcribe = audio_file_to_process
 
430
  processing_step = "Transcription Execution"
431
  logger.info(f"Starting transcription execution for: {audio_file_to_transcribe}")
432
+ if not os.path.exists(audio_file_to_transcribe): raise FileNotFoundError(f"Audio file to transcribe not found: {audio_file_to_transcribe}")
433
+ logger.debug("Calling Whisper model transcribe method...")
 
 
 
434
  with torch.inference_mode():
435
+ use_fp16 = torch.cuda.is_available(); logger.debug(f"Using fp16: {use_fp16}")
436
+ result = model_manager.whisper_model.transcribe(audio_file_to_transcribe, fp16=use_fp16)
437
+ logger.debug("Whisper transcribe method finished.")
438
+ if not result or "text" not in result: raise RuntimeError("Transcription failed to produce results")
 
 
 
 
 
 
 
439
  transcription = result.get("text", "Error: Transcription result empty")
440
+ logger.info(f"Transcription completed successfully: '{transcription[:100]}...'")
 
 
441
  processing_step = "Success"
 
 
 
 
 
442
  except FileNotFoundError as e:
443
+ logger.error(f"!!! File not found error (Step: {processing_step}): {e}"); transcription = f"Error: Input file not found ({e})"
 
 
444
  except ValueError as e:
445
+ logger.error(f"!!! Value error (Step: {processing_step}): {e}"); transcription = f"Error: Unsupported file type ({e})"
 
 
446
  except TypeError as e:
447
+ logger.error(f"!!! Type error (Step: {processing_step}): {e}"); transcription = f"Error: Invalid input provided ({e})"
 
 
448
  except RuntimeError as e:
449
+ logger.error(f"!!! Runtime error (Step: {processing_step}): {e}"); logger.error(traceback.format_exc()); transcription = f"Error during processing: {e}"
 
 
450
  except Exception as e:
451
+ logger.error(f"!!! Unexpected error (Step: {processing_step}): {str(e)}"); logger.error(traceback.format_exc()); transcription = f"Error processing the file: An unexpected error occurred."
 
 
452
  finally:
453
+ logger.debug(f"--- Cleaning up {len(temp_files_to_clean)} temp files for transcription ---")
 
454
  for temp_file in temp_files_to_clean:
455
  try:
456
+ if os.path.exists(temp_file): os.remove(temp_file); logger.debug(f"Cleaned: {temp_file}")
457
+ except Exception as e: logger.warning(f"Could not remove temp file {temp_file}: {e}")
458
+ logger.debug("--- Finished transcription cleanup ---")
 
 
 
 
 
 
459
  return transcription
460
 
 
461
@lru_cache(maxsize=16)
def read_document(document_path):
    """Read the textual content of a document (PDF, DOCX, XLSX/XLS, CSV).

    Args:
        document_path: Filesystem path to the document.

    Returns:
        The extracted text on success, or a human-readable ``"Error: ..."`` /
        ``"Unsupported file type..."`` string on failure — callers expect a
        string either way.

    Note:
        Results are cached by path via ``lru_cache``; if the file changes on
        disk, stale content is returned until the cache entry is evicted.
    """
    logger.info(f"Attempting to read document: {document_path}")
    try:
        if not os.path.exists(document_path):
            raise FileNotFoundError(f"Document not found: {document_path}")
        file_extension = os.path.splitext(document_path)[1].lower()
        logger.debug(f"Doc type: {file_extension}")

        content = ""
        if file_extension == ".pdf":
            logger.debug("Reading PDF using PyMuPDF...")
            doc = fitz.open(document_path)
            try:
                if doc.is_encrypted:
                    logger.warning(f"PDF {document_path} encrypted. Trying empty password.")
                    if not doc.authenticate(""):
                        raise ValueError("Encrypted PDF cannot be read.")
                content = "\n".join(page.get_text() for page in doc)
            finally:
                # FIX: always release the document, even when authentication
                # or text extraction raises (previously leaked on error).
                doc.close()
        elif file_extension == ".docx":
            logger.debug("Reading DOCX using python-docx...")
            doc = docx.Document(document_path)
            content = "\n".join(p.text for p in doc.paragraphs)
        elif file_extension in (".xlsx", ".xls"):
            logger.debug("Reading Excel using pandas...")
            xls = pd.ExcelFile(document_path)
            text_parts = []
            for sheet_name in xls.sheet_names:
                logger.debug(f"Reading sheet: {sheet_name}")
                df = pd.read_excel(xls, sheet_name=sheet_name)
                text_parts.append(f"--- Sheet: {sheet_name} ---\n{df.to_string()}")
            content = "\n\n".join(text_parts).strip()
        elif file_extension == ".csv":
            logger.debug("Reading CSV using pandas...")
            # FIX: bind `encoding` before the try so the semicolon fallback
            # below cannot hit a NameError when detection itself fails.
            encoding = None
            try:
                import chardet  # local import: only needed on the CSV path
                with open(document_path, 'rb') as f:
                    encoding = chardet.detect(f.read())['encoding']
                logger.debug(f"Detected CSV encoding: {encoding}")
                df = pd.read_csv(document_path, encoding=encoding)
            except (pd.errors.ParserError, UnicodeDecodeError, LookupError) as e1:
                logger.warning(f"CSV parse failed ({e1}), trying semicolon.")
                try:
                    df = pd.read_csv(document_path, sep=';', encoding=encoding)
                except Exception as e2:
                    logger.error(f"Also failed with semicolon ({e2}). Trying latin1.")
                    try:
                        df = pd.read_csv(document_path, encoding='latin1')
                    except Exception as e3:
                        raise ValueError(f"Failed to parse CSV: {e1}, {e2}, {e3}")
            content = df.to_string()
        else:
            return "Unsupported file type. Please upload a PDF, DOCX, XLSX or CSV document."
        logger.info(f"Document read successfully. Length: {len(content)} chars.")
        return content
    except FileNotFoundError as e:
        logger.error(f"!!! File not found reading doc: {e}")
        return f"Error: Document file not found: {e}"
    except ValueError as e:
        logger.error(f"!!! Value error reading doc: {e}")
        return f"Error reading document: {e}"
    except Exception as e:
        logger.error(f"!!! Error reading doc: {str(e)}")
        logger.error(traceback.format_exc())
        return f"Error reading document: {str(e)}"
 
 
 
 
 
 
 
 
 
506
 
507
@lru_cache(maxsize=16)
def read_url(url):
    """Fetch *url* and return its main textual content.

    Args:
        url: An http(s) URL.

    Returns:
        "" for a missing/non-http URL; the extracted, whitespace-cleaned text
        (truncated to 15,000 chars) on success; or an ``"Error: ..."`` string
        on fetch/parse failure. Results are cached per URL via ``lru_cache``.
    """
    logger.info(f"Attempting to read URL: {url}")
    if not url or not url.strip().startswith('http'):
        return ""
    try:
        # FIX: the previous header values contained literal "..." placeholders
        # ('Mozilla/5.0 ... Chrome/91...', 'text/html...'), which are malformed
        # User-Agent/Accept contents; send well-formed browser-like headers.
        headers = {
            'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/91.0.4472.124 Safari/537.36'),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive',
        }
        logger.debug(f"Sending GET to {url}")
        response = requests.get(url, headers=headers, timeout=20, allow_redirects=True)
        logger.debug(f"Response from {url}: {response.status_code}, CT: {response.headers.get('content-type')}")
        response.raise_for_status()

        content_type = response.headers.get('content-type', '').lower()
        if not ('html' in content_type or 'text' in content_type):
            return f"Error: URL content type ({content_type}) is not text/html."

        # Prefer the declared charset, fall back to the detected one, then UTF-8.
        detected_encoding = response.encoding if response.encoding else response.apparent_encoding
        html_content = response.content.decode(detected_encoding or 'utf-8', errors='ignore')
        logger.debug(f"Parsing HTML ({len(html_content)} bytes) from {url}...")
        soup = BeautifulSoup(html_content, 'html.parser')

        # Strip boilerplate/navigation elements before extracting text.
        tags_to_remove = ["script", "style", "meta", "noscript", "iframe", "header", "footer", "nav", "aside", "form", "button", "link", "head"]
        for tag_name in tags_to_remove:
            for element in soup.find_all(tag_name):
                element.extract()

        logger.debug("Finding main content container...")
        main_content = (soup.find("main")
                        or soup.find("article")
                        or soup.find("div", class_=["content", "main", "post-content", "entry-content", "article-body", "story-content"])
                        or soup.find("div", id=["content", "main", "article", "story"]))

        text = ""
        if main_content:
            text = main_content.get_text(separator='\n', strip=True)
        else:
            # No recognizable container: fall back to the whole body/document.
            body = soup.find("body")
            if body:
                text = body.get_text(separator='\n', strip=True)
            else:
                text = soup.get_text(separator='\n', strip=True)

        # Drop blank lines and surrounding whitespace.
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        cleaned_text = "\n".join(lines)
        if not cleaned_text:
            return "Error: Could not extract text content from URL."

        # Cap the result so downstream prompts stay bounded.
        max_chars = 15000
        final_text = (cleaned_text[:max_chars] + "... [content truncated]") if len(cleaned_text) > max_chars else cleaned_text
        logger.info(f"Successfully read URL {url}. Final length: {len(final_text)}")
        return final_text
    except requests.exceptions.RequestException as e:
        logger.error(f"!!! Error fetching URL {url}: {e}")
        return f"Error reading URL: Could not fetch content ({e})"
    except Exception as e:
        logger.error(f"!!! Error parsing URL {url}: {e}")
        logger.error(traceback.format_exc())
        return f"Error reading URL: Could not parse content ({e})"
 
 
 
 
 
543
 
544
def process_social_media_url(url):
    """Process a social media URL, attempting to get text and transcribe video/audio."""
    logger.info(f"--- Starting processing for social media URL: {url} ---")
    if not url or not url.strip().startswith('http'):
        return None

    page_text = None
    transcript = None
    downloaded_audio = None

    # Phase 1: best-effort extraction of the page's textual content.
    try:
        logger.debug(f"Attempting text read from social URL: {url}")
        fetched = read_url(url)
        if fetched and not fetched.startswith("Error:"):
            page_text = fetched
            logger.debug("Text read success.")
        elif fetched:
            logger.warning(f"read_url error for {url}: {fetched}")
        else:
            logger.debug("No text via read_url.")
    except Exception as e:
        logger.error(f"!!! Exception text reading social URL {url}: {e}")
        logger.error(traceback.format_exc())

    # Phase 2: best-effort download + transcription of any embedded audio/video.
    try:
        logger.debug(f"Attempting audio download from social URL: {url}")
        downloaded_audio = download_social_media_video(url)
        if downloaded_audio:
            logger.info(f"Audio downloaded from {url} to {downloaded_audio}. Transcribing...")
            result = transcribe_audio_or_video(downloaded_audio)
            if result and not result.startswith("Error"):
                transcript = result
                logger.info("Transcription success.")
            elif result:
                logger.warning(f"Transcription error for {url}: {result}")
            else:
                logger.warning(f"Empty transcription for {url}.")
        else:
            logger.debug("No downloadable audio found.")
    except Exception as e:
        logger.error(f"!!! Exception audio processing social URL {url}: {e}")
        logger.error(traceback.format_exc())
    finally:
        # Remove the temporary audio file regardless of transcription outcome.
        if downloaded_audio and os.path.exists(downloaded_audio):
            logger.debug(f"Cleaning up social temp audio: {downloaded_audio}")
            try:
                os.remove(downloaded_audio)
            except Exception as e:
                logger.warning(f"Failed cleanup {downloaded_audio}: {e}")

    logger.debug(f"--- Finished processing social URL: {url} ---")
    if page_text or transcript:
        return {"text": page_text or "", "video": transcript or ""}
    logger.info(f"No usable content retrieved for social URL: {url}")
    return None
575
+
576
+ # ==============================================================
577
+ # ========= SIMPLIFIED generate_news FOR DEBUGGING =============
578
+ # ==============================================================
 
 
 
 
 
 
 
 
 
 
 
579
 
580
  @spaces.GPU(duration=10) # Duraci贸n corta solo para prueba
581
  def generate_news(instructions, facts, size, tone, *args):
 
593
  # --- NO CONSTRUYAS EL PROMPT ---
594
  # --- NO LLAMES A text_pipeline ---
595
  pass # Simplemente no hacemos nada
596
+ logger.info("Simplified version: Reached end of try block.")
597
 
598
  except Exception as e:
599
  total_time = time.time() - request_start_time
 
608
  # Aseg煤rate de devolver dos strings
609
  return generated_article, raw_transcriptions
610
 
611
+ # ==============================================================
612
+ # ================= END OF SIMPLIFIED VERSION ==================
613
+ # ==============================================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
614
 
615
 
616
  # --- create_demo function remains the same as the previous version ---
 
700
 
701
  outputs_list = [news_output, transcriptions_output]
702
  logger.info("Setting up event handlers.")
703
+ # Ensure the button calls generate_news (even though it is currently the simplified version)
704
  generate_button.click(fn=generate_news, inputs=all_inputs, outputs=outputs_list)
705
  logger.info("Generate button click handler set.")
706