# Multimodal Llama demo — Hugging Face Space (runs on Zero GPU).
# Standard library
import json
import time
from io import BytesIO
from threading import Thread

# Third-party
import gradio as gr
import requests
import spaces
import torch
from gradio import FileData
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
# Load the Llama 3.2 11B Vision-Instruct checkpoint in bfloat16 on the GPU.
ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(
    ckpt, torch_dtype=torch.bfloat16
).to("cuda")
# The processor handles both chat templating and image preprocessing.
processor = AutoProcessor.from_pretrained(ckpt)
@spaces.GPU
def bot_streaming(message, history, max_new_tokens=250):
    """Stream a model reply for a multimodal chat message.

    Args:
        message: Gradio MultimodalTextbox payload. ``message["text"]`` is a
            JSON-encoded list of chat messages (each a dict with a "role"
            and a "content" list); ``message["files"]`` holds uploaded file
            paths (unused here — images arrive as URLs inside the messages).
        history: Prior chat turns from gr.ChatInterface. Unused: the full
            conversation is expected inside ``message["text"]``.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Yields:
        The generated text accumulated so far, one chunk per decoded piece.
    """
    messages = json.loads(message["text"])

    # Collect every image referenced by the conversation, in order. Fetch or
    # decode failures are skipped (best-effort, as before), but the URL is
    # now read from the image entry itself — the old code mistakenly read
    # content[1]["url"] even when the image sat at content[0], and hid the
    # resulting error behind a bare except.
    images = []
    for msg in messages:
        content = msg.get("content")
        if not isinstance(content, list):
            continue
        for part in content:
            if not (isinstance(part, dict) and part.get("type") == "image"):
                continue
            try:
                response = requests.get(part["url"], timeout=30)
                response.raise_for_status()
                images.append(Image.open(BytesIO(response.content)).convert("RGB"))
            except (KeyError, requests.RequestException, OSError) as err:
                # OSError also covers PIL's UnidentifiedImageError.
                print(f"skipping image: {err}")

    # Render the chat template, then tokenize (with images when present —
    # the Mllama processor rejects an empty image list).
    texts = processor.apply_chat_template(messages, add_generation_prompt=True)
    if images:
        inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
    else:
        inputs = processor(text=texts, return_tensors="pt").to("cuda")

    # Run generation on a background thread so tokens can be streamed back
    # to the UI as they are produced.
    streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
    thread = Thread(
        target=model.generate,
        kwargs=dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens),
    )
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)  # small pause to smooth the UI stream
        yield buffer
# Build the chat UI: a multimodal textbox (text + image upload), canned
# examples — each paired with a max_new_tokens value that pre-fills the
# slider — and a slider passed to bot_streaming as its third argument.
demo = gr.ChatInterface(fn=bot_streaming, title="Multimodal Llama", examples=[
    [{"text": "Which era does this piece belong to? Give details about the era.", "files":["./examples/rococo.jpg"]},
        200],
    [{"text": "Where do the droughts happen according to this diagram?", "files":["./examples/weather_events.png"]},
        250],
    [{"text": "What happens when you take out white cat from this chain?", "files":["./examples/ai2d_test.jpg"]},
        250],
    [{"text": "How long does it take from invoice date to due date? Be short and concise.", "files":["./examples/invoice.png"]},
        250],
    [{"text": "Where to find this monument? Can you give me other recommendations around the area?", "files":["./examples/wat_arun.jpg"]},
        250],
    ],
    textbox=gr.MultimodalTextbox(),
    additional_inputs = [gr.Slider(
        minimum=10,
        maximum=500,
        value=250,
        step=10,
        label="Maximum number of new tokens to generate",
    )
    ],
    cache_examples=False,  # examples would otherwise trigger GPU generation at startup
    description="Try Multimodal Llama by Meta with transformers in this demo. Upload an image, and start chatting about it, or simply try one of the examples below. To learn more about Llama Vision, visit [our blog post](https://huggingface.co/blog/llama32). ",
    stop_btn="Stop Generation",
    fill_height=True,
    multimodal=True)
# Launch the app; debug=True surfaces tracebacks in the console.
demo.launch(debug=True)