Upload files with `vila-upload`.
- auto_processor.py +8 -3
- modeling_vila.py +2 -19
auto_processor.py (CHANGED)

@@ -305,15 +305,20 @@ class VILAProcessor(ProcessorMixin):
         attention_mask[input_ids == self.pad_token_id] = False
         # print("[DEBUGAAA]", self.pad_token_id, self.tokenizer.pad_token_id); exit(0)
         input_texts = self.tokenizer.batch_decode(input_ids)
-        return BatchFeature(
+        bdata = BatchFeature(
             data={
-                "input_texts": input_texts,
+                # "input_texts": input_texts,
                 "input_ids": input_ids,
                 "attention_mask": attention_mask,
                 "media": media,
                 "media_config": media_config,
             }
         )
+        # NOTE: hard coded to cuda
+        # bdata.input_ids = bdata.input_ids.cuda()
+        # bdata.attention_mask = bdata.attention_mask.cuda()
+        # bdata.media["image"] = [img.cuda() for img in bdata.media["image"]]
+        return bdata

     def __single_call__(
         self,
@@ -360,7 +365,7 @@ class VILAProcessor(ProcessorMixin):
                 raise ValueError(f"Unsupported media type: {name}")

         inputs = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True, return_ids_only=False)
-        input_ids = inputs.input_ids[0].unsqueeze(0)
+        input_ids = inputs.input_ids[0].unsqueeze(0).cuda()
         attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
         return BatchFeature(
             data={
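For reference, a minimal usage sketch of the patched processor. The checkpoint path, the call pattern, and the conversation layout below are placeholders rather than part of this commit; what the diff above establishes is the set of keys in the returned BatchFeature (with "input_texts" now dropped) and that __single_call__ places input_ids on CUDA.

# Minimal sketch, assuming a CUDA machine and a VILA checkpoint that ships this remote code.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "path/to/vila-checkpoint",  # hypothetical repo id or local path
    trust_remote_code=True,
)

# Assumed conversation format; "<image>" marks where the media is spliced in.
conversation = [{"from": "human", "value": "<image> describe the image"}]

out = processor(conversation)   # assumed call pattern; returns a BatchFeature
print(sorted(out.keys()))       # attention_mask, input_ids, media, media_config
print(out["input_ids"].device)  # expected cuda:0, since __single_call__ now calls .unsqueeze(0).cuda()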
modeling_vila.py (CHANGED)

@@ -1082,7 +1082,7 @@ class VILAForCasualLM(VILAPretrainedModel):

         return outputs

-
+    @torch.inference_mode()
     def generate(
         self,
         input_ids: Optional[torch.FloatTensor] = None,
@@ -1092,18 +1092,11 @@ class VILAForCasualLM(VILAPretrainedModel):
         return_output_ids_only: bool = False,
         **generation_kwargs,
     ) -> torch.LongTensor:
-        model_training_status = False
-        if self.training:
-            warnings.warn(
-                "Model is in training mode, using default padding strategy to right. This is not recommended for generation. We implicitly set the model to evaluation mode and restore the model training status after generation."
-            )
-            self.eval()
-            model_training_status = True
         """
         input_tokens: <image> describe the image
         media: [Tensor(1, 3, 384, 384), ]
         ----------->
-        input_tokens:
+        input_tokens: 36000 001 002 003 004
         input_emds: <media emd> 001 002 003 004
         """
         # NOTE: hard code to move to GPU
@@ -1112,14 +1105,8 @@ class VILAForCasualLM(VILAPretrainedModel):
         if attention_mask is not None:
             attention_mask = attention_mask.cuda()

-        # TODO: there is still a padding left vs right issue unsovled here.
-        # print("prev args:",input_ids.shape, media, media_config, None, attention_mask)
         inputs_embeds, _, attention_mask = self._embed(input_ids, media, media_config, None, attention_mask)
-        # print("inputs_embeds", inputs_embeds.shape, inputs_embeds.mean(), inputs_embeds.std())
-        # print("attention_mask", attention_mask.shape, attention_mask)
         output_ids = self.llm.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs)
-        # print("output_ids", self.tokenizer.batch_decode(output_ids))
-        # input("wait for debug")

         if return_output_ids_only:
             return_value = output_ids
@@ -1133,10 +1120,6 @@ class VILAForCasualLM(VILAPretrainedModel):
         else:
             return_value = torch.cat([input_ids, output_ids], dim=-1)

-        if model_training_status:
-            # restore the model training status
-            self.train()
-
         return return_value

     @torch.inference_mode()
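A corresponding sketch of the reworked generate() path. The checkpoint path, the AutoModel entry point, and max_new_tokens are assumptions for illustration; the commit itself only establishes that generate() now runs under @torch.inference_mode() (so the eval()/train() bookkeeping could be dropped) and that input_ids and attention_mask are still moved to .cuda() inside the method before _embed() and self.llm.generate() are called.

# Sketch, assuming the same checkpoint and conversation format as in the processor example above.
import torch
from transformers import AutoModel, AutoProcessor

name = "path/to/vila-checkpoint"  # hypothetical repo id or local path
model = AutoModel.from_pretrained(name, trust_remote_code=True, torch_dtype=torch.float16).cuda()
processor = AutoProcessor.from_pretrained(name, trust_remote_code=True)

conversation = [{"from": "human", "value": "<image> describe the image"}]  # assumed format
inputs = processor(conversation)  # BatchFeature with input_ids / attention_mask / media / media_config

# Keyword names follow the parameters used inside generate(); per the else branch
# in the diff, the prompt ids are concatenated with the generated ids when
# return_output_ids_only is left at its default.
output_ids = model.generate(
    input_ids=inputs["input_ids"],
    media=inputs["media"],
    media_config=inputs["media_config"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=64,  # assumed kwarg, forwarded via **generation_kwargs
)
print(processor.tokenizer.batch_decode(output_ids))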