import os

# Hide any GPU so that every model below runs on the CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

import gradio as gr
import requests
import soundfile as sf
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from diffusers import StableAudioPipeline
from huggingface_hub import login
from torch.nn.utils.parametrizations import weight_norm  # imported but never used

# Log in to the Hugging Face Hub; HF_TOKEN is expected as an environment
# variable (e.g. a Space secret), since stable-audio-open-1.0 is a gated model.
login(token=os.environ["HF_TOKEN"])
device = torch.device("cpu")

# BLIP model for image captioning.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to("cpu")

# Stable Audio Open pipeline for text-to-audio generation.
pipe = StableAudioPipeline.from_pretrained("stabilityai/stable-audio-open-1.0")
pipe = pipe.to("cpu")
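# Not used here, but on a machine with a GPU the usual diffusers setup would be
# roughly the following (a sketch, assuming enough VRAM for the fp16 weights):
# pipe = StableAudioPipeline.from_pretrained(
#     "stabilityai/stable-audio-open-1.0", torch_dtype=torch.float16
# ).to("cuda")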
#img_url = 'https://www.caracteristicass.de/wp-content/uploads/2023/02/imagenes-artisticas.jpg'
class Aspecto:
    pass

# 'screen' is created but never used below; it looks like a leftover placeholder.
screen = Aspecto()
with gr.Blocks(theme=gr.themes.Ocean(primary_hue="pink", neutral_hue="indigo", font=[gr.themes.GoogleFont("Montserrat"), "Playwrite England SemiJoine", "Quicksand"])) as demo:
    textbox = gr.Textbox(label="Url")
    with gr.Row():
        button = gr.Button("Describir", variant="primary")
        clear = gr.Button("Borrar")
    output = gr.Textbox(label="Resumen")
    output2 = gr.Audio(label="Audio")
    def describir(url):
        # Download the image from the URL and caption it with BLIP.
        raw_image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
        inputs = processor(raw_image, return_tensors="pt").to("cpu")
        out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)
        # Turn the caption into audio and return both values for the UI.
        return caption, leer(caption)
    def leer(texto):
        prompt = texto
        negative_prompt = "Low quality."
        # Fix the seed so the generated audio is reproducible.
        generator = torch.Generator("cpu").manual_seed(0)
        # Run the Stable Audio generation.
        audio = pipe(
            prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=200,
            audio_end_in_s=10.0,
            num_waveforms_per_prompt=3,
            generator=generator,
        ).audios
        # Keep only the first of the generated waveforms and save it as a WAV file.
        salida = audio[0].T.float().cpu().numpy()
        sf.write("demo.wav", salida, pipe.vae.sampling_rate)
        # Return the file path, which gr.Audio can play directly.
        return "demo.wav"
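    # Alternative sketch: gr.Audio also accepts a (sample_rate, numpy_array)
    # tuple, so the temporary WAV file could be skipped by returning
    # (pipe.vae.sampling_rate, salida) from leer() instead of the file path.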
    # "Describir" fills both the caption textbox and the audio player in one step.
    button.click(describir, inputs=textbox, outputs=[output, output2])
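    # The "Borrar" button is never wired up above. A minimal sketch (assuming it
    # should simply reset the three components) could be:
    # clear.click(lambda: ("", "", None), outputs=[textbox, output, output2])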
demo.launch(debug=True)