Ligeng-Zhu committed
Commit 30c1060 · verified · 1 Parent(s): 942e894

Upload files with `vila-upload`.


Upload modeling_vila.py
Upload auto_processor.py

Files changed (2)
  1. auto_processor.py +8 -3
  2. modeling_vila.py +2 -19
auto_processor.py CHANGED
@@ -305,15 +305,20 @@ class VILAProcessor(ProcessorMixin):
         attention_mask[input_ids == self.pad_token_id] = False
         # print("[DEBUGAAA]", self.pad_token_id, self.tokenizer.pad_token_id); exit(0)
         input_texts = self.tokenizer.batch_decode(input_ids)
-        return BatchFeature(
+        bdata = BatchFeature(
             data={
-                "input_texts": input_texts,
+                # "input_texts": input_texts,
                 "input_ids": input_ids,
                 "attention_mask": attention_mask,
                 "media": media,
                 "media_config": media_config,
             }
         )
+        # NOTE: hard coded to cuda
+        # bdata.input_ids = bdata.input_ids.cuda()
+        # bdata.attention_mask = bdata.attention_mask.cuda()
+        # bdata.media["image"] = [img.cuda() for img in bdata.media["image"]]
+        return bdata
 
     def __single_call__(
         self,
@@ -360,7 +365,7 @@ class VILAProcessor(ProcessorMixin):
             raise ValueError(f"Unsupported media type: {name}")
 
         inputs = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True, return_ids_only=False)
-        input_ids = inputs.input_ids[0].unsqueeze(0)#.cuda()
+        input_ids = inputs.input_ids[0].unsqueeze(0).cuda()
         attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
         return BatchFeature(
             data={
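Note on the processor change above: `__call__` now builds the BatchFeature as `bdata` and no longer returns `input_texts`; the hard-coded `.cuda()` transfers are kept only as comments, while `__single_call__` still moves `input_ids` to CUDA directly. Below is a minimal caller-side sketch of the same device transfer, assuming `media["image"]` is a list of tensors as in the commented-out lines; the helper name is hypothetical and not part of this commit.

```python
import torch
from transformers.feature_extraction_utils import BatchFeature


def move_to_device(bdata: BatchFeature, device: str = "cuda") -> BatchFeature:
    # Hypothetical helper mirroring the commented-out .cuda() calls above:
    # move the top-level tensors first, then the nested list of image
    # tensors, since media["image"] is not a top-level tensor entry.
    bdata["input_ids"] = bdata["input_ids"].to(device)
    bdata["attention_mask"] = bdata["attention_mask"].to(device)
    media = bdata.get("media", {})
    if isinstance(media, dict) and "image" in media:
        media["image"] = [img.to(device) for img in media["image"]]
    return bdata
```

Keeping the transfer on the caller side avoids assuming a CUDA device inside the processor itself.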
modeling_vila.py CHANGED
@@ -1082,7 +1082,7 @@ class VILAForCasualLM(VILAPretrainedModel):
 
         return outputs
 
-    # @torch.inference_mode()
+    @torch.inference_mode()
     def generate(
         self,
         input_ids: Optional[torch.FloatTensor] = None,
@@ -1092,18 +1092,11 @@ class VILAForCasualLM(VILAPretrainedModel):
         return_output_ids_only: bool = False,
         **generation_kwargs,
     ) -> torch.LongTensor:
-        model_training_status = False
-        if self.training:
-            warnings.warn(
-                "Model is in training mode, using default padding strategy to right. This is not recommended for generation. We implicitly set the model to evaluation mode and restore the model training status after generation."
-            )
-            self.eval()
-            model_training_status = True
         """
        input_tokens: <image> describe the image
        media: [Tensor(1, 3, 384, 384), ]
        ----------->
-       input_tokens: 36000 001 002 003 004
+       input_tokens: 36000 001 002 003 004
        input_emds: <media emd> 001 002 003 004
        """
         # NOTE: hard code to move to GPU
@@ -1112,14 +1105,8 @@ class VILAForCasualLM(VILAPretrainedModel):
         if attention_mask is not None:
             attention_mask = attention_mask.cuda()
 
-        # TODO: there is still a padding left vs right issue unsovled here.
-        # print("prev args:",input_ids.shape, media, media_config, None, attention_mask)
         inputs_embeds, _, attention_mask = self._embed(input_ids, media, media_config, None, attention_mask)
-        # print("inputs_embeds", inputs_embeds.shape, inputs_embeds.mean(), inputs_embeds.std())
-        # print("attention_mask", attention_mask.shape, attention_mask)
         output_ids = self.llm.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs)
-        # print("output_ids", self.tokenizer.batch_decode(output_ids))
-        # input("wait for debug")
 
         if return_output_ids_only:
             return_value = output_ids
@@ -1133,10 +1120,6 @@ class VILAForCasualLM(VILAPretrainedModel):
         else:
             return_value = torch.cat([input_ids, output_ids], dim=-1)
 
-        if model_training_status:
-            # restore the model training status
-            self.train()
-
         return return_value
 
     @torch.inference_mode()
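Note on the modeling change above: `generate()` is wrapped in `@torch.inference_mode()` again, and the train/eval save-and-restore bookkeeping plus the debug prints are dropped. A small, non-VILA sketch of what the decorator does and does not cover: it disables autograd for the duration of the call, but it does not toggle a module's train/eval state, so putting the model in eval mode remains the caller's responsibility.

```python
import torch
import torch.nn as nn

# Minimal sketch (plain nn.Linear, not the VILA model): inference_mode()
# stops autograd from recording the forward pass, which is what the restored
# decorator provides, but it leaves module.training untouched.
layer = nn.Linear(4, 4)

with torch.inference_mode():
    out = layer(torch.randn(1, 4))

print(out.requires_grad)  # False: no autograd graph was recorded
print(layer.training)     # True: train/eval mode is left unchanged
```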