import gradio as gr
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from outetts.v0_1.interface import InterfaceHF

# Load the OuteTTS text-to-speech interface and the BLIP captioning model (CPU).
interface = InterfaceHF("OuteAI/OuteTTS-0.1-350M")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-large"
).to("cpu")

with gr.Blocks(
    theme=gr.themes.Ocean(
        primary_hue="pink",
        neutral_hue="indigo",
        font=[gr.themes.GoogleFont("Montserrat"), "Playwrite England SemiJoine", "Quicksand"],
    )
) as demo:
    textbox = gr.Textbox(label="URL")
    with gr.Row():
        button = gr.Button("Describe", variant="primary")
        clear1 = gr.Button("Clear")
    output = gr.Textbox(label="Summary")
    with gr.Row():
        button2 = gr.Button("Read", variant="primary")
        clear2 = gr.Button("Clear")
    output2 = gr.Audio(label="Audio")

    def describir(url):
        # Fetch the image from the URL and generate a caption with BLIP.
        raw_image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
        inputs = processor(raw_image, return_tensors="pt").to("cpu")
        out = model.generate(**inputs)
        return processor.decode(out[0], skip_special_tokens=True)

    def leer(texto):
        # Synthesize speech for the caption; gr.Audio accepts a WAV file path,
        # so save the result to disk and return the path.
        output = interface.generate(
            text=texto,
            temperature=0.1,
            repetition_penalty=1.1,
            max_length=4096,
        )
        # output.play()  # also available for local playback
        output.save("output.wav")
        return "output.wav"

    button.click(describir, [textbox], output)
    button2.click(leer, [output], output2)
    # Wire the clear buttons (previously unconnected): each empties its own step.
    clear1.click(lambda: ("", ""), None, [textbox, output])
    clear2.click(lambda: None, None, output2)

demo.launch(debug=True)
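
# A minimal sketch of exercising the pipeline without the UI, assuming the models
# above have been loaded; the image URL is a hypothetical placeholder, not part of
# the original script:
#
#     caption = describir("https://example.com/photo.jpg")
#     print(caption)
#     wav_path = leer(caption)  # writes output.wav and returns its path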