import gradio as gr
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface

# Load the BLIP captioning model once at startup (CPU inference).
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to("cpu")


class Aspecto:
    # Empty placeholder class; `screen` is not used anywhere in the app.
    pass


screen = Aspecto()

with gr.Blocks(theme=gr.themes.Ocean(
    primary_hue="pink",
    neutral_hue="indigo",
    font=[gr.themes.GoogleFont("Montserrat"), "Playwrite England SemiJoine", "Quicksand"],
)) as demo:
    textbox = gr.Textbox(label="URL")
    with gr.Row():
        button = gr.Button("Describe", variant="primary")
        clear = gr.Button("Clear")
    output = gr.Textbox(label="Caption")
    with gr.Row():
        button2 = gr.Button("Read", variant="primary")
        clear2 = gr.Button("Clear")
    output2 = gr.Audio(label="Audio")

    def describir(url):
        """Download the image at `url` and return a BLIP-generated caption."""
        raw_image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
        inputs = processor(raw_image, return_tensors="pt").to("cpu")
        out = model.generate(**inputs)
        return processor.decode(out[0], skip_special_tokens=True)

    def leer(texto):
        """Synthesize `texto` with FastSpeech 2 and return audio for gr.Audio."""
        # Note: the TTS model is reloaded on every click; caching it at module
        # level would avoid the repeated download and initialization cost.
        models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
            "facebook/fastspeech2-en-ljspeech",
            arg_overrides={"vocoder": "hifigan", "fp16": False},
        )
        modelA = models[0]
        TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
        # build_generator expects a list of models, not a single model.
        generator = task.build_generator([modelA], cfg)
        sample = TTSHubInterface.get_model_input(task, texto)
        wav, rate = TTSHubInterface.get_prediction(task, modelA, generator, sample)
        # gr.Audio expects a (sample_rate, numpy array) tuple, not an IPython Audio widget.
        return rate, wav.cpu().numpy()

    button.click(describir, [textbox], output)
    clear.click(lambda: ("", ""), None, [textbox, output])
    button2.click(leer, [output], output2)
    clear2.click(lambda: None, None, output2)

demo.launch(debug=True)