Spaces:
Running
on
Zero
Running
on
Zero
File size: 6,043 Bytes
686ef17 20712dc 686ef17 689cd53 8813ed8 686ef17 bffc93c 686ef17 ffe142e 10da074 ec7a1c2 686ef17 689cd53 686ef17 689cd53 686ef17 fdd8dd8 2629137 8813ed8 fdd8dd8 2629137 f997318 0c2a731 3f9a87e 0c2a731 3f9a87e 0c2a731 3f9a87e f997318 99e424b f997318 132e348 99e424b 132e348 3f9a87e 021221d f997318 021221d 689cd53 54122f4 686ef17 54122f4 3f9a87e 686ef17 11ec7bf 686ef17 7de4d7a 686ef17 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 |
from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
from PIL import Image
import requests
import torch
from threading import Thread
import gradio as gr
from gradio import FileData
import time
import spaces
from PIL import Image
from io import BytesIO
# Hugging Face Hub id of the checkpoint used for both the model and its processor.
ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Load the weights in bfloat16, then move the whole model onto the CUDA device.
model = MllamaForConditionalGeneration.from_pretrained(
    ckpt,
    torch_dtype=torch.bfloat16,
)
model = model.to("cuda")

# The processor comes from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(ckpt)
import requests
import json
# Seconds to wait for a remote image before giving up on that attachment.
_IMAGE_FETCH_TIMEOUT = 10


def _fetch_remote_image(url):
    """Download *url* and decode it as an RGB image; return None on failure.

    Failures are logged and swallowed on purpose: a turn whose image cannot
    be loaded is still sent to the model as a text-only turn.
    """
    # SECURITY NOTE(review): the URL comes straight from the chat payload and
    # is fetched server-side without any allow-list, so this is an SSRF
    # vector. Restrict schemes/hosts before exposing this to untrusted users.
    try:
        response = requests.get(url, timeout=_IMAGE_FETCH_TIMEOUT)
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert("RGB")
    except (
        requests.RequestException,
        OSError,  # includes PIL.UnidentifiedImageError and truncated files
        ValueError,
        Image.DecompressionBombError,
    ) as err:
        print("could not load image ", url, err)
        return None


def _open_uploaded_image(item):
    """Open an uploaded file as an RGB image; return None on failure.

    *item* is either a path string or a dict carrying the path under "path".
    """
    try:
        path = item if isinstance(item, str) else item["path"]
        return Image.open(path).convert("RGB")
    except (
        OSError,
        ValueError,
        KeyError,
        TypeError,
        Image.DecompressionBombError,
    ) as err:
        print("could not load uploaded file ", item, err)
        return None


def _parse_structured_turns(text):
    """Return the list of turn dicts JSON-encoded in *text*, else None."""
    try:
        parsed = json.loads(text)
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        return None
    if isinstance(parsed, list) and all(isinstance(t, dict) for t in parsed):
        return parsed
    return None


def _conversation_from_turns(turns):
    """Build (chat messages, images) from JSON turns with optional "img_url"."""
    conversation = []
    images = []
    for turn in turns:
        print("x , ", turn)
        content = [{"type": "text", "text": turn["content"]}]
        url = turn.get("img_url")
        image = _fetch_remote_image(url) if url else None
        if image is not None:
            # One image placeholder per loaded image, so that the prompt and
            # the `images` list stay aligned.
            images.append(image)
            content.append({"type": "image"})
        conversation.append({"role": turn["role"], "content": content})
    return conversation, images


def _conversation_from_plain_message(text, files):
    """Build (chat messages, images) for a single plain-text user turn."""
    images = []
    for item in files:
        image = _open_uploaded_image(item)
        if image is not None:
            images.append(image)
    content = [{"type": "text", "text": text}]
    content.extend({"type": "image"} for _ in images)
    return [{"role": "user", "content": content}], images


@spaces.GPU(duration=100)
def bot_streaming(message, history, max_new_tokens=250):
    """Stream the model's reply to a chat message.

    Args:
        message: Dict with a "text" entry and, optionally, a "files" entry.
            "text" is normally a JSON-encoded list of turns, each an object
            with "role", "content" and an optional "img_url" whose image is
            downloaded and attached to that turn. If "text" is not such a
            list it is treated as a single plain-text user turn and the
            entries of "files" are attached as images.
        history: Previous chat turns. Only logged; the conversation sent to
            the model is built entirely from ``message``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Yields:
        The reply generated so far, growing with every streamed chunk.

    Raises:
        KeyError: If ``message`` has no "text" entry, or a JSON turn lacks
            "role" or "content".
    """
    print("message ", message)
    print("\n\n\nhostory ", history)

    text = message["text"]
    print("messages ", text)

    turns = _parse_structured_turns(text)
    if turns is None:
        # Not a JSON conversation (e.g. the UI examples): plain prompt + files.
        conversation, images = _conversation_from_plain_message(
            text, message.get("files") or []
        )
    else:
        conversation, images = _conversation_from_turns(turns)

    print("images ", images)
    print("\n\nfinal messages ", conversation)

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    print("\n\ntexts final chat text ", prompt)
    print('len of images ', len(images))

    # Only pass `images` when there is at least one, as the original code did.
    processor_kwargs = {"text": prompt, "return_tensors": "pt"}
    if images:
        processor_kwargs["images"] = images
    inputs = processor(**processor_kwargs).to("cuda")

    streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)

    # generate() runs in a background thread and feeds the streamer, so the
    # loop below can yield partial text while generation is still running.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)  # short pause between updates, kept from the original
        yield buffer
# Chat UI wired to `bot_streaming`. Each example pairs a multimodal message
# (text + files) with a value for the single additional input, the
# "max new tokens" slider.
demo = gr.ChatInterface(
    fn=bot_streaming,
    title="Multimodal Llama",
    examples=[
        [
            {
                "text": "Which era does this piece belong to? Give details about the era.",
                "files": ["./examples/rococo.jpg"],
            },
            200,
        ],
        [
            {
                "text": "Where do the droughts happen according to this diagram?",
                "files": ["./examples/weather_events.png"],
            },
            250,
        ],
        [
            {
                "text": "What happens when you take out white cat from this chain?",
                "files": ["./examples/ai2d_test.jpg"],
            },
            250,
        ],
        [
            {
                "text": "How long does it take from invoice date to due date? Be short and concise.",
                "files": ["./examples/invoice.png"],
            },
            250,
        ],
        [
            {
                "text": "Where to find this monument? Can you give me other recommendations around the area?",
                "files": ["./examples/wat_arun.jpg"],
            },
            250,
        ],
    ],
    textbox=gr.MultimodalTextbox(),
    additional_inputs=[
        gr.Slider(
            minimum=10,
            maximum=4000,
            value=250,
            step=10,
            label="Maximum number of new tokens to generate",
        ),
    ],
    cache_examples=False,
    description="Try Multimodal Llama by Meta with transformers in this demo. Upload an image, and start chatting about it, or simply try one of the examples below. To learn more about Llama Vision, visit [our blog post](https://huggingface.co/blog/llama32). ",
    stop_btn="Stop Generation",
    fill_height=True,
    multimodal=True,
)

# Start the web server; debug=True is kept from the original configuration.
demo.launch(debug=True)