import gradio as gr
import requests
import torch
import soundfile as sf  # only needed if the generated audio is also written to disk
import torchsde  # noqa: F401 -- runtime dependency of the Stable Audio scheduler
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from diffusers import StableAudioPipeline

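# Two-stage pipeline: BLIP captions the image at a URL, then Stable Audio Open
# turns that caption into a short sound clip. Both models are loaded once at start-up.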
# Fall back to CPU (and full precision) when no GPU is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
pipe = StableAudioPipeline.from_pretrained("stabilityai/stable-audio-open-1.0", torch_dtype=dtype)
pipe = pipe.to(device)


# Example image URL to test with:
# img_url = 'https://www.caracteristicass.de/wp-content/uploads/2023/02/imagenes-artisticas.jpg'

with gr.Blocks(theme=gr.themes.Ocean(primary_hue="pink", neutral_hue="indigo", font=[gr.themes.GoogleFont("Montserrat"), "Playwrite England SemiJoine", "Quicksand"])) as demo:
    textbox = gr.Textbox(label="URL")
    with gr.Row():
        button = gr.Button("Submit", variant="primary")
        button2 = gr.Button("Read", variant="primary")
        clear = gr.Button("Clear")
    output = gr.Textbox(label="Summary")
    output2 = gr.Audio(label="Audio")

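    # Caption handler: download the image at `url` and return BLIP's caption.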
    def describir(url):
        raw_image = Image.open(requests.get(url, stream=True, timeout=30).raw).convert("RGB")
        inputs = processor(raw_image, return_tensors="pt").to(device)
        out = model.generate(**inputs)
        return processor.decode(out[0], skip_special_tokens=True)

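    # Text-to-audio handler: feed the caption to Stable Audio and return the
    # (sample_rate, numpy_array) pair that gr.Audio expects as output.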
    def leer(texto):
        prompt = texto
        negative_prompt = "Low quality."

        # Fix the seed so the same caption always produces the same audio.
        generator = torch.Generator(device).manual_seed(0)

        # Generate 10 s of audio; three candidate waveforms are produced and
        # only the first is kept.
        audio = pipe(
            prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=200,
            audio_end_in_s=10.0,
            num_waveforms_per_prompt=3,
            generator=generator,
        ).audios

        # (channels, samples) -> (samples, channels) float32 NumPy array
        salida = audio[0].T.float().cpu().numpy()
        # sf.write("demo.wav", salida, pipe.vae.sampling_rate)  # optionally save to disk
        return (pipe.vae.sampling_rate, salida)


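    # Wire the buttons: "Submit" captions the URL, "Read" sonifies the caption,
    # and "Clear" resets all three components.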
    button.click(describir, [textbox], output)
    button2.click(leer, [output], output2)
    clear.click(lambda: ("", "", None), None, [textbox, output, output2])

demo.launch(debug=True)
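
# Quick smoke test without the UI (the image URL below is a placeholder; use
# any reachable JPEG/PNG):
#   caption = describir("https://example.com/picture.jpg")
#   sr, wav = leer(caption)
#   sf.write("demo.wav", wav, sr)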