AustingDong committed
Commit 035a152 · Parent(s): b9b9d9b

fixed llava

Files changed:
- app.py +4 -4
- demo/model_utils.py +3 -3
app.py
CHANGED
@@ -208,13 +208,13 @@ def model_slider_change(model_type):
         set_seed()
         model_utils = LLaVA_Utils()
         vl_gpt, tokenizer = model_utils.init_LLaVA()
-        language_model_max_layer =
-        language_model_best_layer =
+        language_model_max_layer = 32
+        language_model_best_layer = 24

         res = (
             gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="Visualization only", label="response_type"),
-            gr.Slider(minimum=1, maximum=
-            gr.Slider(minimum=1, maximum=
+            gr.Slider(minimum=1, maximum=32, value=24, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=32, value=24, step=1, label="visualization layers max"),
             gr.Dropdown(choices=["Language Model"], value="Language Model", label="focus"),
             gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
         )
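The new slider bounds track the depth of the underlying language model: llava-v1.6-mistral-7b-hf wraps Mistral-7B, whose decoder has 32 hidden layers, so the sliders now run from 1 to 32 with layer 24 as the default. A quick sanity check of that layer count (a sketch, assuming only that transformers is installed; the checkpoint name comes from demo/model_utils.py below):

from transformers import AutoConfig

# Fetch only the checkpoint's config (no weights); text_config describes
# the wrapped Mistral-7B language model.
config = AutoConfig.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
print(config.text_config.num_hidden_layers)  # expected: 32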
demo/model_utils.py
CHANGED
@@ -2,7 +2,7 @@ import torch
 import numpy as np
 import spaces
 from PIL import Image, ImageDraw, ImageFont
-from transformers import AutoConfig, AutoModelForCausalLM, LlavaForConditionalGeneration, AutoProcessor, PaliGemmaForConditionalGeneration
+from transformers import AutoConfig, AutoModelForCausalLM, LlavaForConditionalGeneration, LlavaNextForConditionalGeneration, LlavaNextProcessor, AutoProcessor, PaliGemmaForConditionalGeneration
 from transformers import CLIPProcessor, CLIPModel
 from janus.models import MultiModalityCausalLM, VLChatProcessor

@@ -123,13 +123,13 @@ class LLaVA_Utils(Model_Utils):
         model_path = "llava-hf/llava-v1.6-mistral-7b-hf"
         config = AutoConfig.from_pretrained(model_path)

-        self.vl_gpt =
+        self.vl_gpt = LlavaNextForConditionalGeneration.from_pretrained(model_path,
                                                                         low_cpu_mem_usage=True,
                                                                         attn_implementation = 'eager',
                                                                         output_attentions=True
                                                                         )
         self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt)
-        self.processor =
+        self.processor = LlavaNextProcessor.from_pretrained(model_path)
         self.tokenizer = self.processor.tokenizer

         return self.vl_gpt, self.tokenizer
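For reference, a minimal standalone version of the fixed loading path (a sketch that mirrors the diff above: llava-v1.6 checkpoints are LLaVA-NeXT models, so they need the LlavaNext* classes rather than the plain Llava* ones):

from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor

model_path = "llava-hf/llava-v1.6-mistral-7b-hf"

# 'eager' attention plus output_attentions=True keeps per-layer attention
# weights accessible for the demo's saliency visualizations.
vl_gpt = LlavaNextForConditionalGeneration.from_pretrained(
    model_path,
    low_cpu_mem_usage=True,
    attn_implementation="eager",
    output_attentions=True,
)
processor = LlavaNextProcessor.from_pretrained(model_path)
tokenizer = processor.tokenizer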