Ligeng-Zhu committed (verified) · Commit 942e894 · Parent: 4ba795c

Upload files with `vila-upload`.

Upload README.md
Upload auto_processor.py
Upload tokenizer_utils.py
Upload mm_utils.py
Upload modeling_vila.py
Upload builder.py
Upload llm/tokenizer_config.json

README.md CHANGED
@@ -67,7 +67,7 @@ model.eval()
 gpt_conv = [{
     "role": "user",
     "content": [
-        {"type": "image", "path": "demo_images/demo_img_1.png"},
+        {"type": "image", "path": "https://nvlabs.github.io/VILA/asset/example.jpg"},
         {"type": "text", "text": "Describe this image."}
     ]
 }]
@@ -106,14 +106,14 @@ model.eval()
 gpt_conv1 = [{
     "role": "user",
     "content": [
-        {"type": "image", "path": "demo_images/demo_img_1.png"},
+        {"type": "image", "path": "https://nvlabs.github.io/VILA/asset/example.jpg"},
         {"type": "text", "text": "Describe this image."}
     ]
 }]
 gpt_conv2 = [{
     "role": "user",
     "content": [
-        {"type": "image", "path": "demo_images/demo_img_2.png"},
+        {"type": "image", "path": "https://nvlabs.github.io/VILA/asset/example_vqa.jpg"},
         {"type": "text", "text": "Describe this image for me. Provide a detailed description of the image."}
     ]
 }]
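
The only change in these README hunks is the demo image `path`, which now points at a hosted URL instead of a bundled file. A hedged sketch of the forms that field accepts once this commit's `fetch_image_url_or_fpath` (added in auto_processor.py below) is in place; the `file://` path here is purely illustrative:

# Hedged sketch of accepted "path" values for an image content entry.
image_entry_variants = [
    {"type": "image", "path": "https://nvlabs.github.io/VILA/asset/example.jpg"},  # http(s) URL, downloaded to a temp file
    {"type": "image", "path": "file:///tmp/example.jpg"},                          # file:// URI (illustrative local path)
    {"type": "image", "path": "demo_images/demo_img_1.png"},                       # plain local path, as before
]

gpt_conv = [{
    "role": "user",
    "content": [image_entry_variants[0], {"type": "text", "text": "Describe this image."}],
}]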
auto_processor.py CHANGED
@@ -3,8 +3,11 @@ import os
 import os.path as osp
 import warnings
 from collections import defaultdict
-from typing import List, Union
+from io import BytesIO
+from typing import List, Optional, Union
 
+import PIL.Image
+import requests
 import torch
 from transformers import AutoConfig, AutoImageProcessor, AutoModel, AutoProcessor, AutoTokenizer
 from transformers.feature_extraction_utils import BatchFeature
@@ -18,35 +21,73 @@ from .media import Image, Video, extract_media
 from .mm_utils import process_image, process_images
 from .tokenizer_utils import tokenize_conversation
 
+
+def to_rgb(pil_image: PIL.Image.Image) -> PIL.Image.Image:
+    if pil_image.mode == "RGBA":
+        white_background = PIL.Image.new("RGB", pil_image.size, (255, 255, 255))
+        white_background.paste(pil_image, mask=pil_image.split()[3])  # Use alpha channel as mask
+        return white_background
+    else:
+        return pil_image.convert("RGB")
+
+
+def fetch_image(ele: dict[str, str | PIL.Image.Image], size_factor=None) -> PIL.Image.Image:
+    if "image" in ele:
+        image = ele["image"]
+    else:
+        image = ele["image_url"]
+    image_obj = None
+    if isinstance(image, PIL.Image.Image):
+        image_obj = image
+    elif image.startswith("http://") or image.startswith("https://"):
+        response = requests.get(image, stream=True)
+        image_obj = PIL.Image.open(BytesIO(response.content))
+    elif image.startswith("file://"):
+        image_obj = PIL.Image.open(image[7:])
+    elif image.startswith("data:image"):
+        if "base64," in image:
+            _, base64_data = image.split("base64,", 1)
+            data = base64.b64decode(base64_data)
+            image_obj = PIL.Image.open(BytesIO(data))
+    else:
+        image_obj = PIL.Image.open(image)
+    if image_obj is None:
+        raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
+    image = to_rgb(image_obj)
+
+    return image
+
+
 def fetch_image_url_or_fpath(url_or_fpath):
     if url_or_fpath.startswith("http") or url_or_fpath.startswith("https"):
         import tempfile
+
         import requests
 
         # Download the image to a temporary file
         temp_dir = tempfile.mkdtemp()
         temp_file = os.path.join(temp_dir, os.path.basename(url_or_fpath))
 
         response = requests.get(url_or_fpath, stream=True)
         response.raise_for_status()
 
         with open(temp_file, "wb") as f:
             for chunk in response.iter_content(chunk_size=8192):
                 f.write(chunk)
 
         return temp_file
     elif url_or_fpath.startswith("file://"):
         fpath = url_or_fpath.replace("file://", "")
         assert osp.exists(fpath), f"File {fpath} does not exist"
         return fpath
     elif osp.exists(url_or_fpath):
-        assert osp.isfile(url_or_fpath), f"File {url_or_fpath} is not a file"
+        assert osp.isfile(url_or_fpath), f"File {url_or_fpath} does not exist"
         return url_or_fpath
     else:
         raise ValueError(f"Unsupported image path: {url_or_fpath}")
 
 
-def __pad_fn(input_ids_list, padding_value=0, target_len=None, padding_side="left"):
+def pad_fn(input_ids_list: List[torch.Tensor], padding_value=0, target_len=None, padding_side="left") -> torch.Tensor:
     # tensor shape is (batch_size, seq_len)
     max_len = max([ids.shape[1] for ids in input_ids_list])
     if target_len is not None:
@@ -66,6 +107,36 @@ def __pad_fn(input_ids_list, padding_value=0, target_len=None, padding_side="lef
     return torch.cat(new_input_ids_list, dim=0)
 
 
+def extract_value_from_conv(chat):
+    value = []
+    if isinstance(chat["content"], str):
+        # vila_chat["value"].append(chat["content"])
+        value.append(chat["content"])
+        return value
+
+    # otherwise, it's a list of content
+    for content in chat["content"]:
+        if content["type"] == "image":
+            if "path" in content:
+                # VILA style, can be either filepath or http url
+                value.append(Image(fetch_image_url_or_fpath(content["path"])))
+            elif "image" in content:
+                # Qwen style
+                value.append(Image(fetch_image_url_or_fpath(content["image"])))
+            elif "image_pil" in content:
+                # Qwen style
+                assert isinstance(content["image_pil"], PIL.Image.Image), f"Type of {media_key} must be PIL.Image.Image"
+                value.append(content["image_pil"])
+            else:
+                raise ValueError(f"Type = `image` , but no `path` or `image` in | {content=}, {conversation=}")
+        elif content["type"] == "text":
+            value.append(content["text"])
+        # NOTE(ligeng): video supports are needed here
+        else:
+            raise ValueError(f"Unsupported content type: {content['type']}")
+    return value
+
+
 class VILAProcessorKwargs(ProcessingKwargs, total=False):
     _defaults = {
         "text_kwargs": {
@@ -74,8 +145,6 @@ class VILAProcessorKwargs(ProcessingKwargs, total=False):
     }
 
 
-
-
 class VILAProcessor(ProcessorMixin):
     # attributes = ["image_processor", "tokenizer"]
     attributes = []
@@ -84,25 +153,96 @@ class VILAProcessor(ProcessorMixin):
     # image_processor_class = "VILAImageProcessor"
     # tokenizer_class = ("VILATokenizer", "VILATokenizerFast")
 
-    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, config=None, **kwargs):
-        # self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
-        # self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, config=None, padding_side="left", **kwargs):
        self.image_token = MEDIA_TOKENS["image"]
         self.video_token = MEDIA_TOKENS["video"]
         self.config = config
         self.image_processor = image_processor
         self.tokenizer = tokenizer
+        self.padding_side = padding_side
+
+        # This is a special setting for Qwen.
+        # self.pad_token_id = tokenizer.pad_token_id
+        self.pad_token_id = self.tokenizer("<|endoftext|>").input_ids[0]  # 151643
+        self.eos_token_id = self.tokenizer.eos_token_id
 
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
 
+    @staticmethod
+    def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[dict]:
+        """
+        referernce from qwen_vl_utils
+        """
+        vision_infos = []
+        if isinstance(conversations[0], dict):
+            conversations = [conversations]
+        for conversation in conversations:
+            for message in conversation:
+                if isinstance(message["content"], list):
+                    for ele in message["content"]:
+                        if (
+                            "image" in ele
+                            or "image_url" in ele
+                            or "video" in ele
+                            or ele["type"] in ("image", "image_url", "video")
+                        ):
+                            vision_infos.append(ele)
+        return vision_infos
+
+    @staticmethod
+    def process_vision_info(
+        conversations: list[dict] | list[list[dict]],
+        return_video_kwargs: bool = False,
+    ) -> tuple[list[PIL.Image.Image] | None, list[torch.Tensor | list[PIL.Image.Image]] | None, Optional[dict]]:
+        """
+        referernce from qwen_vl_utils
+        NVILA does not depend on the function, but the interface is the same.
+        """
+        vision_infos = extract_vision_info(conversations)
+        ## Read images or videos
+        image_inputs = []
+        video_inputs = []
+        video_sample_fps_list = []
+        for vision_info in vision_infos:
+            if "image" in vision_info or "image_url" in vision_info:
+                image_inputs.append(fetch_image(vision_info))
+            elif "video" in vision_info:
+                video_input, video_sample_fps = fetch_video(vision_info, return_video_sample_fps=True)
+                video_sample_fps_list.append(video_sample_fps)
+                video_inputs.append(video_input)
+            else:
+                raise ValueError("image, image_url or video should in content.")
+        if len(image_inputs) == 0:
+            image_inputs = None
+        if len(video_inputs) == 0:
+            video_inputs = None
+        if return_video_kwargs:
+            return image_inputs, video_inputs, {"fps": video_sample_fps_list}
+        return image_inputs, video_inputs
+
+    @staticmethod
+    def move_data_to_device(cls, prompt_inputs):
+        def _move_data_to_device(item):
+            # wrap function grpo trainer _prepare_input
+            kwargs = {"device": cls.args.device}
+            if cls.is_deepspeed_enabled and (torch.is_floating_point(item) or torch.is_complex(item)):
+                kwargs.update({"dtype": cls.accelerator.state.deepspeed_plugin.hf_ds_config.dtype()})
+            return item.to(**kwargs)
+
+        prompt_inputs.input_ids = _move_data_to_device(prompt_inputs.input_ids)
+        prompt_inputs.attention_mask = _move_data_to_device(prompt_inputs.attention_mask)
+        if "image" in prompt_inputs.media:
+            prompt_inputs.media["image"] = [_move_data_to_device(img) for img in prompt_inputs.media["image"]]
+        return prompt_inputs
+
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        padding_side = kwargs.get("padding_side", "left")
         if os.path.isdir(pretrained_model_name_or_path):
             pretrained_model_name_or_path = pretrained_model_name_or_path
         else:
             print(f"pretrained_model_name_or_path {pretrained_model_name_or_path} is not a directory, downloading")
             from huggingface_hub import snapshot_download
-
             pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)
 
         image_processor = AutoImageProcessor.from_pretrained(
@@ -112,43 +252,64 @@ class VILAProcessor(ProcessorMixin):
             osp.join(pretrained_model_name_or_path, "llm"), trust_remote_code=True
         )
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
-        return cls(image_processor=image_processor, tokenizer=tokenizer, config=config)
+        return cls(image_processor=image_processor, tokenizer=tokenizer, config=config, padding_side=padding_side)
 
     def __repr__(self):
-        return (
-            f"VILAProcessor(image_processor={self.image_processor}, tokenizer={self.tokenizer}, config={self.config})"
-        )
+        # NOTE(ligeng): hard coded image_processor to avoid serialization error. Dirty fix
+        return f"VILAProcessor(image_processor=SigLip, tokenizer={self.tokenizer}, config={self.config})"
 
     def __call__(
         self,
-        conversation,
-        images: ImageInput = None,
-        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
-        videos: VideoInput = None,
+        conversation=None,
         **kwargs: Unpack[VILAProcessorKwargs],
     ) -> BatchFeature:
-        if images is not None:
-            warnings.warn("images is not supported in __call__")
+        """
+        The `conv` will be look like
+        [
+            {
+                'from': 'human',
+                'value': [
+                    <transformers_modules.NVILA-Lite-2B-hf-preview.media.Image object at 0x154e68e4c460>,
+                    'What are the common elements in these pictures?'
+                ]
+            }
+        ]
+        and `conversation` will be a list of such `conv`s
+        """
+        if kwargs.get("text", None) is not None:
+            conversation = kwargs.get("text")
+        assert conversation is not None, "`conversation` or `text` is required"
+        padding_side = kwargs.get("padding_side", self.padding_side)
 
-        input_ids = []
+        input_ids_list = []
+        attention_mask = []
         media = defaultdict(list)
         media_config = defaultdict(dict)
         for conv in conversation:
-            feat = self.__single_call__(conv, images, text, videos, **kwargs)
-            input_ids.append(feat.input_ids)
+            feat = self.__single_call__(conv, **kwargs)
+            input_ids_list.append(feat.input_ids)
+            attention_mask.append(feat.attention_mask)
             for name in feat.media:
                 media[name] += feat.media[name]
             for name in feat.media_config:
                 media_config[name].update(feat.media_config[name])
 
+        # pad the input_ids to batchfy
+        input_ids = pad_fn(
+            input_ids_list,
+            padding_value=self.pad_token_id,
+            padding_side=padding_side,
+        )
+        # ignore the pad token in the attention mask
+        attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
+        attention_mask[input_ids == self.pad_token_id] = False
+        # print("[DEBUGAAA]", self.pad_token_id, self.tokenizer.pad_token_id); exit(0)
+        input_texts = self.tokenizer.batch_decode(input_ids)
         return BatchFeature(
             data={
-                # "input_ids": torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id),
-                "input_ids": __pad_fn(
-                    input_ids,
-                    padding_value=self.tokenizer.pad_token_id,
-                    padding_side="left",
-                ),
+                "input_texts": input_texts,
+                "input_ids": input_ids,
+                "attention_mask": attention_mask,
                 "media": media,
                 "media_config": media_config,
             }
@@ -174,6 +335,8 @@
         self.config.image_processor = self.image_processor
         if self.config.image_aspect_ratio == "dynamic":
             images = process_image(media["image"][0], self.config, None, enable_dynamic_res=True).half()
+            # print("DEBUG", len(images)); input()
+            # NOTE: this only works for images appears at the first conversation
             conversation[0]["value"] = conversation[0]["value"].replace(
                 DEFAULT_IMAGE_TOKEN, f"{DEFAULT_IMAGE_TOKEN}\n" * images.shape[0]
             )
@@ -195,9 +358,18 @@
                 ]
             else:
                 raise ValueError(f"Unsupported media type: {name}")
-        input_ids = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True).cuda().unsqueeze(0)
-        # Set up the generation config
-        return BatchFeature(data={"input_ids": input_ids, "media": media, "media_config": media_config})
+
+        inputs = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True, return_ids_only=False)
+        input_ids = inputs.input_ids[0].unsqueeze(0)#.cuda()
+        attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
+        return BatchFeature(
+            data={
+                "input_ids": input_ids,
+                "attention_mask": attention_mask,
+                "media": media,
+                "media_config": media_config,
+            }
+        )
 
     def batch_decode(self, *args, **kwargs):
         """
@@ -235,39 +407,26 @@ class VILAProcessor(ProcessorMixin):
         image_processor_input_names = self.image_processor.model_input_names
         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
 
-    # inputs = processor(conversation=llavaconv, padding=True, return_tensors="pt")
-    def apply_chat_template(self, conversation, add_generation_prompt=True, **kwargs):
+    def convert_gpt_conv_to_vila_conv(self, conversation):
         vila_conv = []
         for chat in conversation:
             vila_chat = {"from": "", "value": []}
-            if chat["role"] == "user":
+            if chat["role"] in ("user", "system"):
                 # user allows to input image and text
-                vila_chat["from"] = "human"
-                for content in chat["content"]:
-                    if content["type"] == "image":
-                        if "path" in content:
-                            # VILA style
-                            media_key = "path"
-                        elif "image" in content:
-                            # Qwen style
-                            media_key = "image"
-                        else:
-                            raise ValueError(f"Unsupported content type `image`: {content}, `image` and `path` are required")
-                        vila_chat["value"].append(Image(fetch_image_url_or_fpath(content[media_key])))
-                    elif content["type"] == "text":
-                        vila_chat["value"].append(content["text"])
-                    # NOTE(ligeng): video supports are needed here
-                    else:
-                        raise ValueError(f"Unsupported content type: {content['type']}")
+                vila_chat["from"] = "human" if chat["role"] == "user" else "system"
+                vila_chat["value"] = extract_value_from_conv(chat)
             elif chat["role"] == "assistant":
                 vila_chat["from"] = "gpt"
-                for content in chat["content"]:
-                    assert content["type"] == "text", f"Unsupported content type: {content['type']}"
-                    vila_chat["value"].append(content["text"])
+                vila_chat["value"] = extract_value_from_conv(chat)
+            else:
+                raise ValueError(f"Unsupported role: {chat['role']} in chat {chat}")
             vila_conv.append(vila_chat)
 
         return vila_conv
 
+    def apply_chat_template(self, conversation, add_generation_prompt=True, **kwargs):
+        return self.convert_gpt_conv_to_vila_conv(conversation)
+
 
 if __name__ == "__main__":
     # gpt style: user, assistant
@@ -302,7 +461,6 @@ if __name__ == "__main__":
     # print(model.config)
     # print(model.tokenizer)
    # print(res)
-    # exit(0)
 
     processor = VILAProcessor(
         config=model.config,
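
A usage sketch of the reworked batched path above: `apply_chat_template` converts GPT-style messages into VILA's `{"from", "value"}` form, and `__call__` tokenizes each conversation, batches the media, and left-pads `input_ids` with `pad_fn`. The checkpoint id and loading calls are assumptions that follow the README's setup, which is not shown in these hunks:

# Hedged sketch; model_path is illustrative and loading mirrors the README.
from transformers import AutoModel, AutoProcessor

model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"  # assumed repo id
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

conv1 = [{"role": "user", "content": [
    {"type": "image", "path": "https://nvlabs.github.io/VILA/asset/example.jpg"},
    {"type": "text", "text": "Describe this image."},
]}]
conv2 = [{"role": "user", "content": [
    {"type": "image", "path": "https://nvlabs.github.io/VILA/asset/example_vqa.jpg"},
    {"type": "text", "text": "Describe this image for me. Provide a detailed description of the image."},
]}]

# Each apply_chat_template() call returns one VILA-style conversation; __call__ takes a
# list of them and left-pads input_ids with the Qwen <|endoftext|> id (see pad_fn above).
batch = processor([processor.apply_chat_template(c) for c in (conv1, conv2)])
print(batch.input_ids.shape, batch.attention_mask.shape)  # both (2, padded_len)

output_ids = model.generate(
    input_ids=batch.input_ids,
    media=batch.media,
    media_config=batch.media_config,
    attention_mask=batch.attention_mask,
    max_new_tokens=64,
)
print(processor.tokenizer.batch_decode(output_ids, skip_special_tokens=True))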
builder.py CHANGED
@@ -34,6 +34,7 @@ from transformers import (
     PreTrainedTokenizer,
 )
 from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
+
 # from .conversation import *
 from .conversation import SeparatorStyle, default_conversation
 
@@ -203,6 +204,7 @@ def build_llm_and_tokenizer(
         )
     else:
         if is_deepspeed_zero3_enabled():
+            # NOTE: found by wei, need to pop out device_map when using zero3
             kwargs.pop("device_map")
         llm = AutoModelForCausalLM.from_pretrained(
             model_name_or_path, config=llm_cfg, torch_dtype=eval(config.model_dtype), *args, **kwargs
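
The added note documents an existing guard rather than new behaviour: under DeepSpeed ZeRO-3 the engine manages parameter placement itself, so an explicit `device_map` must not be forwarded to `from_pretrained`. A minimal standalone illustration of the pattern (not the repo's exact code path):

from transformers import AutoModelForCausalLM
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled

def load_llm(model_name_or_path, **kwargs):
    # ZeRO-3 shards and places parameters itself; passing device_map alongside it
    # conflicts with that, so drop the key before calling from_pretrained.
    if is_deepspeed_zero3_enabled():
        kwargs.pop("device_map", None)
    return AutoModelForCausalLM.from_pretrained(model_name_or_path, **kwargs)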
llm/tokenizer_config.json CHANGED
@@ -78,7 +78,7 @@
   "legacy": false,
   "model_max_length": 4096,
   "pad_token": "[PAD]",
-  "padding_side": "right",
+  "padding_side": "left",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null
mm_utils.py CHANGED
@@ -521,8 +521,11 @@ def process_images(images, image_processor, model_cfg, enable_dynamic_res=False,
     return new_images
 
 
-def tokenizer_image_token(prompt, tokenizer, return_tensors=None):
-    return tokenizer(prompt, return_tensors=return_tensors).input_ids[0]
+def tokenizer_image_token(prompt, tokenizer, return_tensors=None, return_ids=True):
+    if return_ids:
+        return tokenizer(prompt, return_tensors=return_tensors).input_ids[0]
+    else:
+        return tokenizer(prompt, return_tensors=return_tensors)
 
 
 def is_gemma_tokenizer(tokenizer):
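
The new `return_ids` flag is what lets `tokenize_conversation(..., return_ids_only=False)` in tokenizer_utils.py retrieve a full encoding (including `attention_mask`) instead of a bare id tensor. A hedged sketch of the two modes; the stand-in tokenizer id and the import path are assumptions that depend on how this repo's remote code is loaded:

from transformers import AutoTokenizer

from mm_utils import tokenizer_image_token  # import path depends on how the remote code is loaded

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")  # stand-in tokenizer

ids = tokenizer_image_token("describe the image", tok, return_tensors="pt")                    # 1-D ids, old behaviour
enc = tokenizer_image_token("describe the image", tok, return_tensors="pt", return_ids=False)  # full BatchEncoding
print(ids.shape, enc.input_ids.shape, enc.attention_mask.shape)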
modeling_vila.py CHANGED
@@ -201,17 +201,19 @@ class VILAPretrainedModel(PreTrainedModel):
         else:
             raise ValueError("`llm_cfg` `mm_projector_cfg` `vision_tower_cfg` not found in the config.")
 
-        # loading on cpu by default
-        device_map = kwargs.get("device_map", "cpu")
+        # loading on auto by default
+        device_map = kwargs.get("device_map", "auto")
         self.mm_projector = build_mm_projector(mm_projector_cfg, config)
         self.vision_tower = build_vision_tower(vision_tower_cfg, config)
-        if "auto" in device_map or "cuda" in device_map:
+        if device_map in ["auto", "cuda"]:
             self.mm_projector = self.mm_projector.cuda()
             self.vision_tower = self.vision_tower.cuda()
         # set device_map auto can autoamtically shard llm to different devices
         self.llm, self.tokenizer = self.init_llm(llm_cfg, config, device_map=device_map)
 
-        # NOTE(ligeng): need to add other decoders from config
+        # NOTE(ligeng): hard code to set padding_side to left
+        self.tokenizer.padding_side = "left"
+        # TODO(ligeng): need to add other decoders from config
         self.encoders = {"image": BasicImageEncoder(self), "video": BasicVideoEncoder(self)}
 
         self.post_config()
@@ -418,6 +420,7 @@ class VILAPretrainedModel(PreTrainedModel):
         weights_only: bool = True,
         **kwargs,
     ):
+        # print("DEBUG2", kwargs); input()
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
         return cls._from_config(config, **kwargs)
 
@@ -428,6 +431,12 @@ class VILAPretrainedModel(PreTrainedModel):
         # print("DEBUG", len(self.tokenizer.added_tokens_encoder.keys()), self.tokenizer.added_tokens_encoder.keys())
         NUM_EXTRA_TOKENS = len(self.tokenizer.added_tokens_encoder.keys())
 
+        self.pad_token_list = (
+            self.tokenizer.pad_token_id,
+            self.tokenizer.eos_token_id,
+            self.tokenizer.tokenize("<|endoftext|>")[0],  # for qwen
+        )
+
         # TODO: SENTINEL_TOKEN is not added, need to check with Zhijian
         self.vocab_size = self.tokenizer.vocab_size + NUM_EXTRA_TOKENS
         # XGrammar tokenizer and grammar compiler
@@ -444,6 +453,10 @@ class VILAPretrainedModel(PreTrainedModel):
             self.vision_tower = self.vision_tower.to(torch.float16)
         ######################################################################
         self.training = self.llm.training
+        if self.training:
+            self.train()
+        else:
+            self.eval()
         ## configuration
         if getattr(self.config, "llm_cfg", None) is None:
             self.config.llm_cfg = self.llm.config
@@ -589,10 +602,6 @@ class VILAForCasualLM(VILAPretrainedModel):
         return image_features
 
     def train(self, mode: bool = True):
-        if mode:
-            self.tokenizer.padding_side = "right"
-        else:
-            self.tokenizer.padding_side = "left"
         super().train(mode)
         return self
 
@@ -650,11 +659,10 @@ class VILAForCasualLM(VILAPretrainedModel):
                     name = media_tokens[input_ids[k][pos].item()]
                     input = media_embeds[name].popleft()
                     label = torch.full([input.shape[0]], IGNORE_INDEX, device=labels[k].device, dtype=labels[k].dtype)
-                    # print(f"{self.tokenizer.padding_side} [media] {k=} {pos=}, {self.tokenizer.batch_decode(input_ids[k][pos:pos+1])}"); python_input()
-                elif input_ids[k][pos].item() in (self.tokenizer.pad_token_id, self.tokenizer.eos_token_id):
+                elif input_ids[k][pos].item() in self.pad_token_list:
+                    # skip pad tokens
                     end = pos + 1
                     pos = end
-                    # print(f"[skip PAD/EOS] {k=} {pos=}, {self.tokenizer.batch_decode(input_ids[k][pos:end])}"); python_input()
                     continue
                 else:
                     end = pos
@@ -662,7 +670,6 @@ class VILAForCasualLM(VILAPretrainedModel):
                         end += 1
                     input = text_embeds[k][pos:end]
                     label = labels[k][pos:end]
-                    # print(f"[text] {k=} {pos=}, {self.tokenizer.batch_decode(input_ids[k][pos:end])}"); python_input()
 
                 inputs_mk.append(input)
                 labels_mk.append(label)
@@ -1018,6 +1025,7 @@ class VILAForCasualLM(VILAPretrainedModel):
         media: Optional[Dict[str, List[torch.Tensor]]] = None,
         images: Optional[torch.FloatTensor] = None,
         media_config: Optional[List] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[List[torch.FloatTensor]] = None,
@@ -1074,21 +1082,62 @@ class VILAForCasualLM(VILAPretrainedModel):
 
         return outputs
 
-    @torch.inference_mode()
+    # @torch.inference_mode()
     def generate(
         self,
         input_ids: Optional[torch.FloatTensor] = None,
         media: Optional[Dict[str, List[torch.Tensor]]] = None,
         media_config: Dict[str, Dict[str, Any]] = None,
         attention_mask: Optional[torch.LongTensor] = None,
+        return_output_ids_only: bool = False,
         **generation_kwargs,
-    ):
+    ) -> torch.LongTensor:
+        model_training_status = False
         if self.training:
             warnings.warn(
-                "Model is in training mode, using default padding strategy to right. This is not recommended for generation."
+                "Model is in training mode, using default padding strategy to right. This is not recommended for generation. We implicitly set the model to evaluation mode and restore the model training status after generation."
             )
+            self.eval()
+            model_training_status = True
+        """
+        input_tokens: <image> describe the image
+        media: [Tensor(1, 3, 384, 384), ]
+        ----------->
+        input_tokens: 36000 001 002 003 004
+        input_emds: <media emd> 001 002 003 004
+        """
+        # NOTE: hard code to move to GPU
+        input_ids = input_ids.cuda()
+        media = {k: [v.cuda() for v in media[k]] for k in media}
+        if attention_mask is not None:
+            attention_mask = attention_mask.cuda()
+
+        # TODO: there is still a padding left vs right issue unsovled here.
+        # print("prev args:",input_ids.shape, media, media_config, None, attention_mask)
         inputs_embeds, _, attention_mask = self._embed(input_ids, media, media_config, None, attention_mask)
-        return self.llm.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs)
+        # print("inputs_embeds", inputs_embeds.shape, inputs_embeds.mean(), inputs_embeds.std())
+        # print("attention_mask", attention_mask.shape, attention_mask)
+        output_ids = self.llm.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs)
+        # print("output_ids", self.tokenizer.batch_decode(output_ids))
+        # input("wait for debug")
+
+        if return_output_ids_only:
+            return_value = output_ids
+        else:
+            # by default, return the input_ids and output_ids concatenated to keep consistency with the community VLMs like qwen
+            generation_config = generation_kwargs.get("generation_config", None)
+            if generation_config is not None:
+                num_generations = generation_config.num_return_sequences
+                repeat_input_ids = input_ids.repeat_interleave(num_generations, dim=0)
+                return_value = torch.cat([repeat_input_ids, output_ids], dim=-1)
+            else:
+                return_value = torch.cat([input_ids, output_ids], dim=-1)
+
+        if model_training_status:
+            # restore the model training status
+            self.train()
+
+        return return_value
 
     @torch.inference_mode()
     def generate_content(
@@ -1101,10 +1150,7 @@ class VILAForCasualLM(VILAPretrainedModel):
         conversation = [{"from": "human", "value": prompt}]
 
         # Convert response format to logits processor
-        if response_format:
-            xgr_logits_processor = self.get_xgr_logits_processor(response_format)
-        else:
-            xgr_logits_processor = None
+        xgr_logits_processor = None
 
         # Extract media from the conversation
 
@@ -1173,7 +1219,7 @@
             raise ValueError(f"Unsupported media type: {name}")
 
         # Tokenize the conversation
-        input_ids = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True).cuda().unsqueeze(0)
+        input_ids = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True).unsqueeze(0).cuda()
 
         # Set up the generation config
         generation_config = generation_config or self.default_generation_config
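
The reworked `generate` above now moves inputs to the GPU itself and, by default, returns the prompt ids concatenated with the completion ids (Qwen-style); `return_output_ids_only=True` restores completion-only output. A hedged sketch reusing the `model`, `processor`, and `batch` objects from the processor example earlier in this commit:

# Completion-only ids:
out = model.generate(
    input_ids=batch.input_ids,
    media=batch.media,
    media_config=batch.media_config,
    attention_mask=batch.attention_mask,
    return_output_ids_only=True,
    max_new_tokens=64,
)
print(processor.tokenizer.batch_decode(out, skip_special_tokens=True))

# Default behaviour prepends the (left-padded) prompt ids, so slice them off by
# length if only the completion text is wanted:
full = model.generate(
    input_ids=batch.input_ids,
    media=batch.media,
    media_config=batch.media_config,
    attention_mask=batch.attention_mask,
    max_new_tokens=64,
)
completion_ids = full[:, batch.input_ids.shape[1]:]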
tokenizer_utils.py CHANGED
@@ -68,13 +68,16 @@ def tokenize_conversation_legacy(
     return tokenizer_image_token(conv.get_prompt(), tokenizer, return_tensors="pt")
 
 
+# NOTE(ligeng): add a return typing to help code analyze
 def tokenize_conversation(
     messages: Sequence[Dict[str, str]],
     tokenizer: transformers.PreTrainedTokenizer,
     add_generation_prompt: bool = False,
     overrides: Optional[Dict[str, str]] = None,
     no_system_prompt: bool = False,
+    return_ids_only=True,
 ) -> torch.Tensor:
+    # print("messages", messages); input()
     # Normalize the conversation before tokenization
     for message in messages:
         message["value"] = message["value"].strip()
@@ -95,6 +98,10 @@ def tokenize_conversation(
             message["role"] = "user"
         elif m["from"] == "gpt":
             message["role"] = "assistant"
+        elif m["from"] == "system":
+            message["role"] = "system"
+            if no_system_prompt:
+                raise ValueError("System prompt is not allowed when no_system_prompt is True.")
         else:
             raise ValueError(f"Unexpected sender '{m['from']}' in conversation entry.")
 
@@ -111,7 +118,7 @@ def tokenize_conversation(
         add_generation_prompt=add_generation_prompt,
         tokenize=False,
     )
-    return tokenizer_image_token(text, tokenizer, return_tensors="pt")
+    return tokenizer_image_token(text, tokenizer, return_tensors="pt", return_ids=return_ids_only)
 
 
 def _maybe_add_sentinel_token(tokenizer: transformers.PreTrainedTokenizer) -> None:
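
`tokenize_conversation` now understands a `system` turn and can hand back the full encoding via `return_ids_only=False`, which it forwards to `tokenizer_image_token` above. A hedged sketch; the checkpoint path is a placeholder and the import path depends on how this repo's remote code is loaded:

from transformers import AutoTokenizer

from tokenizer_utils import tokenize_conversation  # import path depends on how the remote code is loaded

# from_pretrained() in auto_processor.py loads the chat tokenizer from the checkpoint's llm/ folder.
tok = AutoTokenizer.from_pretrained("path/to/NVILA-checkpoint/llm")  # placeholder path

messages = [
    {"from": "system", "value": "You are a concise assistant."},
    {"from": "human", "value": "Describe the image."},
]
ids = tokenize_conversation(messages, tok, add_generation_prompt=True)                         # 1-D id tensor
enc = tokenize_conversation(messages, tok, add_generation_prompt=True, return_ids_only=False)  # full BatchEncoding
print(ids.shape, enc.input_ids.shape, enc.attention_mask.shape)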