MaykaGR committed on
Commit
e97d4b7
verified
1 Parent(s): f21427e

Create app.py

Files changed (1)
  1. app.py +63 -0
app.py ADDED
@@ -0,0 +1,63 @@
import gradio as gr
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
import soundfile as sf
from diffusers import StableAudioPipeline
import torchsde
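
# Load the BLIP captioning model and the Stable Audio pipeline once at startup;
# both are moved to the GPU, so a CUDA device is required.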
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to("cuda")
pipe = StableAudioPipeline.from_pretrained("stabilityai/stable-audio-open-1.0", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# img_url = 'https://www.caracteristicass.de/wp-content/uploads/2023/02/imagenes-artisticas.jpg'


# Placeholder class; neither it nor the "screen" instance is used below.
class Aspecto():
    pass

screen = Aspecto()
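
# UI: a URL textbox, buttons to caption ("Intro"), read aloud ("Leer") and
# clear ("Borrar"), plus a caption textbox and an audio player.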
with gr.Blocks(theme=gr.themes.Ocean(primary_hue="pink", neutral_hue="indigo", font=[gr.themes.GoogleFont("Montserrat"), "Playwrite England SemiJoine", "Quicksand"])) as demo:
    textbox = gr.Textbox(label="Url")
    with gr.Row():
        button = gr.Button("Intro", variant="primary")
        button2 = gr.Button("Leer", variant="primary")
        clear = gr.Button("Borrar")
    output = gr.Textbox(label="Resumen")
    output2 = gr.Audio(label="Audio")
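
    # Caption the image behind `url` with BLIP and return the generated text.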
    def describir(url):
        raw_image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
        inputs = processor(raw_image, return_tensors="pt").to("cuda")
        out = model.generate(**inputs)
        return processor.decode(out[0], skip_special_tokens=True)
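
    # Generate a short audio clip from the caption with Stable Audio.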
    def leer(texto):
        prompt = texto
        negative_prompt = "Low quality."

        # set the seed for the generator
        generator = torch.Generator("cuda").manual_seed(0)

        # run the generation
        audio = pipe(
            prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=200,
            audio_end_in_s=10.0,
            num_waveforms_per_prompt=3,
            generator=generator,
        ).audios

        # keep the first of the three candidate waveforms, shaped (samples, channels)
        salida = audio[0].T.float().cpu().numpy()
        # sf.write("demo.wav", salida, pipe.vae.sampling_rate)
        # gr.Audio expects a (sample_rate, data) tuple, so the rate comes first
        return (pipe.vae.sampling_rate, salida)
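
    # Wire the buttons: "Intro" captions the URL, "Leer" reads the caption aloud.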
    button.click(describir, [textbox], output)
    button2.click(leer, [output], output2)
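    # Assumed handler for the "Borrar" button: reset the three fields.
    clear.click(lambda: ("", "", None), None, [textbox, output, output2])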

demo.launch(debug=True)
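
# Note: besides gradio, this app assumes requests, pillow, transformers, torch,
# soundfile, diffusers and torchsde are installed and that a CUDA GPU is available.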