import gradio as gr
import requests
import torch
import torchsde  # needed by the Stable Audio scheduler
import soundfile as sf
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from diffusers import StableAudioPipeline

# BLIP for image captioning
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to("cuda")

# Stable Audio Open for text-to-audio generation
pipe = StableAudioPipeline.from_pretrained("stabilityai/stable-audio-open-1.0", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# img_url = 'https://www.caracteristicass.de/wp-content/uploads/2023/02/imagenes-artisticas.jpg'

# Unused placeholder class
class Aspecto():
    pass

screen = Aspecto()


def describir(url):
    """Download an image from `url` and return a BLIP caption."""
    raw_image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
    inputs = processor(raw_image, return_tensors="pt").to("cuda")
    out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)


def leer(texto):
    """Turn a caption into a short audio clip with Stable Audio Open."""
    prompt = texto
    negative_prompt = "Low quality."

    # Fix the seed so generations are reproducible
    generator = torch.Generator("cuda").manual_seed(0)

    # Run the generation
    audio = pipe(
        prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=200,
        audio_end_in_s=10.0,
        num_waveforms_per_prompt=3,
        generator=generator,
    ).audios

    salida = audio[0].T.float().cpu().numpy()
    # sf.write("demo.wav", salida, pipe.vae.sampling_rate)
    # gr.Audio expects (sample_rate, data), in that order
    return (pipe.vae.sampling_rate, salida)


with gr.Blocks(theme=gr.themes.Ocean(primary_hue="pink", neutral_hue="indigo",
                                     font=[gr.themes.GoogleFont("Montserrat"),
                                           "Playwrite England SemiJoine", "Quicksand"])) as demo:
    textbox = gr.Textbox(label="Url")
    with gr.Row():
        button = gr.Button("Intro", variant="primary")   # caption the image
        button2 = gr.Button("Leer", variant="primary")   # read the caption aloud
        clear = gr.Button("Borrar")                      # clear all fields
    output = gr.Textbox(label="Resumen")
    output2 = gr.Audio(label="Audio")

    button.click(describir, [textbox], output)
    button2.click(leer, [output], output2)
    clear.click(lambda: ("", "", None), None, [textbox, output, output2])

demo.launch(debug=True)
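
# A minimal smoke test of the two stages without the UI (a sketch; it assumes
# a CUDA GPU and reuses the sample URL commented out near the top of the file):
#
#   caption = describir("https://www.caracteristicass.de/wp-content/uploads/2023/02/imagenes-artisticas.jpg")
#   rate, wave = leer(caption)
#   sf.write("demo.wav", wave, rate)  # persist the clip to disk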