Spaces:

CamiloVega
/

NewsIA

Sleeping

App Files Files Community

CamiloVega committed on Nov 2, 2024

Commit

1b167bf

verified ·

1 Parent(s): e12f85e

Upload app (10).py

Browse files

Files changed (1) hide show

app (10).py +456 -0

app (10).py ADDED Viewed

	@@ -0,0 +1,456 @@

+import spaces
+import gradio as gr
+import logging
+import os
+import tempfile
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+import torch
+import whisper
+from moviepy.editor import VideoFileClip
+from pydub import AudioSegment
+import fitz  # PyMuPDF for handling PDFs
+import docx  # For handling .docx files
+import yt_dlp
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+# Get HuggingFace token from environment variable
+HUGGINGFACE_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')
+if not HUGGINGFACE_TOKEN:
+    logger.error("HUGGINGFACE_TOKEN environment variable not set")
+    raise ValueError("Please set the HUGGINGFACE_TOKEN environment variable")
+# Global variables for models
+tokenizer = None
+model = None
+news_generator = None
+whisper_model = None
+@spaces.GPU(duration=60)
+def initialize_models():
+    """Initialize models with Zero GPU optimizations"""
+    global tokenizer, model, news_generator, whisper_model
+    try:
+        logger.info("Starting model initialization...")
+        model_name = "meta-llama/Llama-2-7b-chat-hf"
+        # Load tokenizer
+        logger.info("Loading tokenizer...")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            token=HUGGINGFACE_TOKEN
+        )
+        tokenizer.pad_token = tokenizer.eos_token
+        # Load model
+        logger.info("Loading model...")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            token=HUGGINGFACE_TOKEN,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            low_cpu_mem_usage=True
+        )
+        # Create pipeline
+        logger.info("Creating pipeline...")
+        news_generator = pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            device_map="auto",
+            torch_dtype=torch.bfloat16,
+            max_length=2048,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.95,
+            repetition_penalty=1.2
+        )
+        # Load Whisper model
+        logger.info("Loading Whisper model...")
+        whisper_model = whisper.load_model("base")
+        logger.info("All models initialized successfully")
+        return True
+    except Exception as e:
+        logger.error(f"Error during model initialization: {str(e)}")
+        raise
+# Initialize the models (runs once, when the module is imported)
+initialize_models()
+def download_social_media_video(url):
+    """Download a video from social media."""
+    ydl_opts = {
+        'format': 'bestaudio/best',
+        'postprocessors': [{
+            'key': 'FFmpegExtractAudio',
+            'preferredcodec': 'mp3',
+            'preferredquality': '192',
+        }],
+        'outtmpl': '%(id)s.%(ext)s',
+    }
+    try:
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info_dict = ydl.extract_info(url, download=True)
+            audio_file = f"{info_dict['id']}.mp3"
+        logger.info(f"Video downloaded successfully: {audio_file}")
+        return audio_file
+    except Exception as e:
+        logger.error(f"Error downloading video: {str(e)}")
+        raise
+def convert_video_to_audio(video_file):
+    """Convert a video file to audio."""
+    try:
+        video = VideoFileClip(video_file)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
+            video.audio.write_audiofile(temp_file.name)
+            logger.info(f"Video converted to audio: {temp_file.name}")
+            return temp_file.name
+    except Exception as e:
+        logger.error(f"Error converting video: {str(e)}")
+        raise
+def preprocess_audio(audio_file):
+    """Preprocess the audio file to improve quality."""
+    try:
+        audio = AudioSegment.from_file(audio_file)
+        audio = audio.apply_gain(-audio.dBFS + (-20))
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
+            audio.export(temp_file.name, format="mp3")
+            logger.info(f"Audio preprocessed: {temp_file.name}")
+            return temp_file.name
+    except Exception as e:
+        logger.error(f"Error preprocessing audio: {str(e)}")
+        raise
+@spaces.GPU(duration=60)
+def transcribe_audio(file):
+    """Transcribe an audio or video file."""
+    try:
+        if isinstance(file, str) and file.startswith('http'):
+            file_path = download_social_media_video(file)
+        elif isinstance(file, str) and file.lower().endswith(('.mp4', '.avi', '.mov', '.mkv')):
+            file_path = convert_video_to_audio(file)
+        else:
+            file_path = preprocess_audio(file)
+        logger.info(f"Transcribing audio: {file_path}")
+        with torch.inference_mode():
+            result = whisper_model.transcribe(file_path)
+        transcription = result.get("text", "Error in transcription")
+        logger.info(f"Transcription completed: {transcription[:50]}...")
+        return transcription
+    except Exception as e:
+        logger.error(f"Error transcribing: {str(e)}")
+        return f"Error processing the file: {str(e)}"
+def read_document(document_path):
+    """Read the content of a document."""
+    try:
+        if document_path.endswith(".pdf"):
+            doc = fitz.open(document_path)
+            return "\n".join([page.get_text() for page in doc])
+        elif document_path.endswith(".docx"):
+            doc = docx.Document(document_path)
+            return "\n".join([paragraph.text for paragraph in doc.paragraphs])
+        elif document_path.endswith(".xlsx"):
+            return pd.read_excel(document_path).to_string()
+        elif document_path.endswith(".csv"):
+            return pd.read_csv(document_path).to_string()
+        else:
+            return "Unsupported file type. Please upload a PDF, DOCX, XLSX or CSV document."
+    except Exception as e:
+        return f"Error reading document: {str(e)}"
+def read_url(url):
+    """Read the content of a URL."""
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, 'html.parser')
+        return soup.get_text()
+    except Exception as e:
+        return f"Error reading URL: {str(e)}"
+def process_social_content(url):
+    """Process social media content."""
+    try:
+        text_content = read_url(url)
+        try:
+            video_content = transcribe_audio(url)
+        except Exception:
+            video_content = None
+        return {
+            "text": text_content,
+            "video": video_content
+        }
+    except Exception as e:
+        logger.error(f"Error processing social content: {str(e)}")
+        return None
+@spaces.GPU(duration=60)
+def generate_news(instructions, facts, size, tone, *args):
+    try:
+        # Initialize knowledge base
+        knowledge_base = {
+            "instructions": instructions,
+            "facts": facts,
+            "document_content": [],
+            "audio_data": [],
+            "url_content": [],
+            "social_content": []
+        }
+        # Parse arguments
+        num_audios = 5 * 3
+        num_social_urls = 3 * 3
+        num_urls = 5
+        audios = args[:num_audios]
+        social_urls = args[num_audios:num_audios+num_social_urls]
+        urls = args[num_audios+num_social_urls:num_audios+num_social_urls+num_urls]
+        documents = args[num_audios+num_social_urls+num_urls:]
+        # Process URLs
+        for url in urls:
+            if url:
+                knowledge_base["url_content"].append(read_url(url))
+        # Process documents
+        for document in documents:
+            if document is not None:
+                knowledge_base["document_content"].append(read_document(document.name))
+        # Process audio files
+        for i in range(0, len(audios), 3):
+            audio_file, name, position = audios[i:i+3]
+            if audio_file is not None:
+                knowledge_base["audio_data"].append({
+                    "audio": audio_file,
+                    "name": name,
+                    "position": position
+                })
+        # Process social media content
+        for i in range(0, len(social_urls), 3):
+            social_url, social_name, social_context = social_urls[i:i+3]
+            if social_url:
+                social_content = process_social_content(social_url)
+                if social_content:
+                    knowledge_base["social_content"].append({
+                        "url": social_url,
+                        "name": social_name,
+                        "context": social_context,
+                        "text": social_content["text"],
+                        "video": social_content["video"]
+                    })
+        # Build transcriptions
+        transcriptions_text = ""
+        raw_transcriptions = ""
+        for idx, data in enumerate(knowledge_base["audio_data"]):
+            if data["audio"] is not None:
+                transcription = transcribe_audio(data["audio"])
+                transcriptions_text += f'"{transcription}" - {data["name"]}, {data["position"]}\n'
+                raw_transcriptions += f'[Audio/Video {idx + 1}]: "{transcription}" - {data["name"]}, {data["position"]}\n\n'
+        for data in knowledge_base["social_content"]:
+            if data["text"]:
+                transcriptions_text += f'[Social media text]: "{data["text"][:200]}..." - {data["name"]}, {data["context"]}\n'
+                raw_transcriptions += transcriptions_text + "\n\n"
+            if data["video"]:
+                video_transcription = f'[Social media video]: "{data["video"]}" - {data["name"]}, {data["context"]}\n'
+                transcriptions_text += video_transcription
+                raw_transcriptions += video_transcription + "\n\n"
+        document_content = "\n\n".join(knowledge_base["document_content"])
+        url_content = "\n\n".join(knowledge_base["url_content"])
+        # Create prompt
+        prompt = f"""[INST] You are a professional news writer. Write a news article based on the following information:
+Instructions: {knowledge_base["instructions"]}
+Facts: {knowledge_base["facts"]}
+Additional content from documents: {document_content}
+Additional content from URLs: {url_content}
+Use these transcriptions as direct and indirect quotes:
+{transcriptions_text}
+Follow these requirements:
+- Write a title
+- Write a 15-word hook that complements the title
+- Write the body with {size} words
+- Use a {tone} tone
+- Answer the 5 Ws (Who, What, When, Where, Why) in the first paragraph
+- Use at least 80% direct quotes (in quotation marks)
+- Use proper journalistic style
+- Do not invent information
+- Be rigorous with the provided facts [/INST]"""
+        # Generate article with specific handling for Zero GPU
+        with torch.inference_mode():
+            outputs = news_generator(
+                prompt,
+                max_new_tokens=min(int(size * 2), 1024),
+                return_full_text=False,
+                pad_token_id=tokenizer.eos_token_id,
+                num_return_sequences=1,
+                do_sample=True,
+                temperature=0.7,
+                top_p=0.95,
+                repetition_penalty=1.2
+            )
+        news_article = outputs[0]['generated_text']
+        news_article = news_article.replace('[INST]', '').replace('[/INST]', '').strip()
+        return news_article, raw_transcriptions
+    except Exception as e:
+        logger.error(f"Error generating news: {str(e)}")
+        return f"Error generating the news article: {str(e)}", ""
+# Create Gradio interface
+def create_demo():
+    with gr.Blocks() as demo:
+        gr.Markdown("## Generador de noticias todo en uno")
+        # Contenedor principal con dos columnas
+        with gr.Row():
+            # Columna izquierda - Formulario principal
+            with gr.Column(scale=2):
+                instrucciones = gr.Textbox(
+                    label="Instrucciones para la noticia",
+                    lines=2
+                )
+                hechos = gr.Textbox(
+                    label="Describe los hechos de la noticia",
+                    lines=4
+                )
+                tamaño = gr.Number(
+                    label="Tamaño del cuerpo de la noticia (en palabras)",
+                    value=100
+                )
+                tono = gr.Dropdown(
+                    label="Tono de la noticia",
+                    choices=["serio", "neutral", "divertido"],
+                    value="neutral"
+                )
+            # Columna derecha - Tabs y campos
+            with gr.Column(scale=3):
+                # Lista de inputs que empezamos a construir
+                inputs_list = [instrucciones, hechos, tamaño, tono]
+                # Tabs en la parte superior
+                with gr.Tabs():
+                    # Audio/Video tabs
+                    for i in range(1, 6):
+                        with gr.TabItem(f"Audio/Video {i}"):
+                            file = gr.File(
+                                label=f"Audio/Video {i}",
+                                file_types=["audio", "video"]
+                            )
+                            nombre = gr.Textbox(
+                                label="Nombre",
+                                placeholder="Nombre del entrevistado"
+                            )
+                            cargo = gr.Textbox(
+                                label="Cargo",
+                                placeholder="Cargo o rol"
+                            )
+                            inputs_list.extend([file, nombre, cargo])
+                    # Redes Sociales tabs
+                    for i in range(1, 4):
+                        with gr.TabItem(f"Red Social {i}"):
+                            social_url = gr.Textbox(
+                                label=f"URL de red social {i}",
+                                placeholder="https://..."
+                            )
+                            social_nombre = gr.Textbox(
+                                label=f"Nombre de persona/cuenta {i}"
+                            )
+                            social_contexto = gr.Textbox(
+                                label=f"Contexto del contenido {i}",
+                                lines=2
+                            )
+                            inputs_list.extend([social_url, social_nombre, social_contexto])
+                    # URL tabs
+                    for i in range(1, 6):
+                        with gr.TabItem(f"URL {i}"):
+                            url = gr.Textbox(
+                                label=f"URL {i}",
+                                placeholder="https://..."
+                            )
+                            inputs_list.append(url)
+                    # Documento tabs
+                    for i in range(1, 6):
+                        with gr.TabItem(f"Documento {i}"):
+                            documento = gr.File(
+                                label=f"Documento {i}",
+                                file_types=["pdf", "docx", "xlsx", "csv"],
+                                file_count="single"
+                            )
+                            inputs_list.append(documento)
+        # Separador
+        gr.Markdown("---")
+        # Transcripciones
+        with gr.Row():
+            transcripciones_output = gr.Textbox(
+                label="Transcripciones",
+                lines=10,
+                show_copy_button=True
+            )
+        # Separador
+        gr.Markdown("---")
+        # Botón y output
+        with gr.Row():
+            generar = gr.Button("Generar borrador")
+        with gr.Row():
+            noticia_output = gr.Textbox(
+                label="Borrador generado",
+                lines=20,
+                show_copy_button=True
+            )
+        # Event handler
+        generar.click(
+            fn=generate_news,
+            inputs=inputs_list,
+            outputs=[noticia_output, transcripciones_output]
+        )
+    return demo
+# Launch the app
+if __name__ == "__main__":
+    demo = create_demo()
+    demo.queue()
+    demo.launch(
+        share=True,
+        server_name="0.0.0.0",
+        server_port=7860
+    )