Ligeng-Zhu committed (verified) · Commit 942e894 · Parent: 4ba795c

Upload files with `vila-upload`.

Upload README.md
Upload auto_processor.py
Upload tokenizer_utils.py
Upload mm_utils.py
Upload modeling_vila.py
Upload builder.py
Upload llm/tokenizer_config.json

README.md CHANGED
@@ -67,7 +67,7 @@ model.eval()
 gpt_conv = [{
     "role": "user",
     "content": [
-        {"type": "image", "path": "demo_images/demo_img_1.png"},
+        {"type": "image", "path": "https://nvlabs.github.io/VILA/asset/example.jpg"},
         {"type": "text", "text": "Describe this image."}
     ]
 }]
@@ -106,14 +106,14 @@ model.eval()
 gpt_conv1 = [{
     "role": "user",
     "content": [
-        {"type": "image", "path": "demo_images/demo_img_1.png"},
+        {"type": "image", "path": "https://nvlabs.github.io/VILA/asset/example.jpg"},
         {"type": "text", "text": "Describe this image."}
     ]
 }]
 gpt_conv2 = [{
     "role": "user",
     "content": [
-        {"type": "image", "path": "demo_images/demo_img_2.png"},
+        {"type": "image", "path": "https://nvlabs.github.io/VILA/asset/example_vqa.jpg"},
         {"type": "text", "text": "Describe this image for me. Provide a detailed description of the image."}
     ]
 }]
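
The only change in these README hunks is the demo image `path`, which now points at a hosted URL instead of a bundled file. A hedged sketch of the forms that field accepts once this commit's `fetch_image_url_or_fpath` (added in auto_processor.py below) is in place; the `file://` path here is purely illustrative:

# Hedged sketch of accepted "path" values for an image content entry.
image_entry_variants = [
    {"type": "image", "path": "https://nvlabs.github.io/VILA/asset/example.jpg"},  # http(s) URL, downloaded to a temp file
    {"type": "image", "path": "file:///tmp/example.jpg"},                          # file:// URI (illustrative local path)
    {"type": "image", "path": "demo_images/demo_img_1.png"},                       # plain local path, as before
]

gpt_conv = [{
    "role": "user",
    "content": [image_entry_variants[0], {"type": "text", "text": "Describe this image."}],
}]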
auto_processor.py CHANGED
@@ -3,8 +3,11 @@ import os
 import os.path as osp
 import warnings
 from collections import defaultdict
-from typing import List, Union
+from io import BytesIO
+from typing import List, Optional, Union
 
+import PIL.Image
+import requests
 import torch
 from transformers import AutoConfig, AutoImageProcessor, AutoModel, AutoProcessor, AutoTokenizer
 from transformers.feature_extraction_utils import BatchFeature
@@ -18,35 +21,73 @@ from .media import Image, Video, extract_media
 from .mm_utils import process_image, process_images
 from .tokenizer_utils import tokenize_conversation
 
+
+def to_rgb(pil_image: PIL.Image.Image) -> PIL.Image.Image:
+    if pil_image.mode == "RGBA":
+        white_background = PIL.Image.new("RGB", pil_image.size, (255, 255, 255))
+        white_background.paste(pil_image, mask=pil_image.split()[3])  # Use alpha channel as mask
+        return white_background
+    else:
+        return pil_image.convert("RGB")
+
+
+def fetch_image(ele: dict[str, str | PIL.Image.Image], size_factor=None) -> PIL.Image.Image:
+    if "image" in ele:
+        image = ele["image"]
+    else:
+        image = ele["image_url"]
+    image_obj = None
+    if isinstance(image, PIL.Image.Image):
+        image_obj = image
+    elif image.startswith("http://") or image.startswith("https://"):
+        response = requests.get(image, stream=True)
+        image_obj = PIL.Image.open(BytesIO(response.content))
+    elif image.startswith("file://"):
+        image_obj = PIL.Image.open(image[7:])
+    elif image.startswith("data:image"):
+        if "base64," in image:
+            _, base64_data = image.split("base64,", 1)
+            data = base64.b64decode(base64_data)
+            image_obj = PIL.Image.open(BytesIO(data))
+    else:
+        image_obj = PIL.Image.open(image)
+    if image_obj is None:
+        raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
+    image = to_rgb(image_obj)
+
+    return image
+
+
 def fetch_image_url_or_fpath(url_or_fpath):
     if url_or_fpath.startswith("http") or url_or_fpath.startswith("https"):
         import tempfile
+
         import requests
 
         # Download the image to a temporary file
         temp_dir = tempfile.mkdtemp()
         temp_file = os.path.join(temp_dir, os.path.basename(url_or_fpath))
 
         response = requests.get(url_or_fpath, stream=True)
         response.raise_for_status()
 
         with open(temp_file, "wb") as f:
             for chunk in response.iter_content(chunk_size=8192):
                 f.write(chunk)
 
         return temp_file
     elif url_or_fpath.startswith("file://"):
         fpath = url_or_fpath.replace("file://", "")
         assert osp.exists(fpath), f"File {fpath} does not exist"
         return fpath
     elif osp.exists(url_or_fpath):
-        assert osp.isfile(url_or_fpath), f"File {url_or_fpath} is not a file"
+        assert osp.isfile(url_or_fpath), f"File {url_or_fpath} does not exist"
         return url_or_fpath
     else:
         raise ValueError(f"Unsupported image path: {url_or_fpath}")
 
 
-def __pad_fn(input_ids_list, padding_value=0, target_len=None, padding_side="left"):
+def pad_fn(input_ids_list: List[torch.Tensor], padding_value=0, target_len=None, padding_side="left") -> torch.Tensor:
     # tensor shape is (batch_size, seq_len)
     max_len = max([ids.shape[1] for ids in input_ids_list])
     if target_len is not None:
@@ -66,6 +107,36 @@ def __pad_fn(input_ids_list, padding_value=0, target_len=None, padding_side="lef
     return torch.cat(new_input_ids_list, dim=0)
 
 
+def extract_value_from_conv(chat):
+    value = []
+    if isinstance(chat["content"], str):
+        # vila_chat["value"].append(chat["content"])
+        value.append(chat["content"])
+        return value
+
+    # otherwise, it's a list of content
+    for content in chat["content"]:
+        if content["type"] == "image":
+            if "path" in content:
+                # VILA style, can be either filepath or http url
+                value.append(Image(fetch_image_url_or_fpath(content["path"])))
+            elif "image" in content:
+                # Qwen style
+                value.append(Image(fetch_image_url_or_fpath(content["image"])))
+            elif "image_pil" in content:
+                # Qwen style
+                assert isinstance(content["image_pil"], PIL.Image.Image), f"Type of {media_key} must be PIL.Image.Image"
+                value.append(content["image_pil"])
+            else:
+                raise ValueError(f"Type = `image` , but no `path` or `image` in | {content=}, {conversation=}")
+        elif content["type"] == "text":
+            value.append(content["text"])
+        # NOTE(ligeng): video supports are needed here
+        else:
+            raise ValueError(f"Unsupported content type: {content['type']}")
+    return value
+
+
 class VILAProcessorKwargs(ProcessingKwargs, total=False):
     _defaults = {
         "text_kwargs": {
@@ -74,8 +145,6 @@ class VILAProcessorKwargs(ProcessingKwargs, total=False):
     }
 
 
-
-
 class VILAProcessor(ProcessorMixin):
     # attributes = ["image_processor", "tokenizer"]
     attributes = []
@@ -84,25 +153,96 @@ class VILAProcessor(ProcessorMixin):
     # image_processor_class = "VILAImageProcessor"
     # tokenizer_class = ("VILATokenizer", "VILATokenizerFast")
 
-    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, config=None, **kwargs):
-        # self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
-        # self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, config=None, padding_side="left", **kwargs):
        self.image_token = MEDIA_TOKENS["image"]
         self.video_token = MEDIA_TOKENS["video"]
         self.config = config
         self.image_processor = image_processor
         self.tokenizer = tokenizer
+        self.padding_side = padding_side
+
+        # This is a special setting for Qwen.
+        # self.pad_token_id = tokenizer.pad_token_id
+        self.pad_token_id = self.tokenizer("<|endoftext|>").input_ids[0]  # 151643
+        self.eos_token_id = self.tokenizer.eos_token_id
 
         super().__init__(image_processor, tokenizer, chat_template=chat_template)
 
+    @staticmethod
+    def extract_vision_info(conversations: list[dict] | list[list[dict]]) -> list[dict]:
+        """
+        referernce from qwen_vl_utils
+        """
+        vision_infos = []
+        if isinstance(conversations[0], dict):
+            conversations = [conversations]
+        for conversation in conversations:
+            for message in conversation:
+                if isinstance(message["content"], list):
+                    for ele in message["content"]:
+                        if (
+                            "image" in ele
+                            or "image_url" in ele
+                            or "video" in ele
+                            or ele["type"] in ("image", "image_url", "video")
+                        ):
+                            vision_infos.append(ele)
+        return vision_infos
+
+    @staticmethod
+    def process_vision_info(
+        conversations: list[dict] | list[list[dict]],
+        return_video_kwargs: bool = False,
+    ) -> tuple[list[PIL.Image.Image] | None, list[torch.Tensor | list[PIL.Image.Image]] | None, Optional[dict]]:
+        """
+        referernce from qwen_vl_utils
+        NVILA does not depend on the function, but the interface is the same.
+        """
+        vision_infos = extract_vision_info(conversations)
+        ## Read images or videos
+        image_inputs = []
+        video_inputs = []
+        video_sample_fps_list = []
+        for vision_info in vision_infos:
+            if "image" in vision_info or "image_url" in vision_info:
+                image_inputs.append(fetch_image(vision_info))
+            elif "video" in vision_info:
+                video_input, video_sample_fps = fetch_video(vision_info, return_video_sample_fps=True)
+                video_sample_fps_list.append(video_sample_fps)
+                video_inputs.append(video_input)
+            else:
+                raise ValueError("image, image_url or video should in content.")
+        if len(image_inputs) == 0:
+            image_inputs = None
+        if len(video_inputs) == 0:
+            video_inputs = None
+        if return_video_kwargs:
+            return image_inputs, video_inputs, {"fps": video_sample_fps_list}
+        return image_inputs, video_inputs
+
+    @staticmethod
+    def move_data_to_device(cls, prompt_inputs):
+        def _move_data_to_device(item):
+            # wrap function grpo trainer _prepare_input
+            kwargs = {"device": cls.args.device}
+            if cls.is_deepspeed_enabled and (torch.is_floating_point(item) or torch.is_complex(item)):
+                kwargs.update({"dtype": cls.accelerator.state.deepspeed_plugin.hf_ds_config.dtype()})
+            return item.to(**kwargs)
+
+        prompt_inputs.input_ids = _move_data_to_device(prompt_inputs.input_ids)
+        prompt_inputs.attention_mask = _move_data_to_device(prompt_inputs.attention_mask)
+        if "image" in prompt_inputs.media:
+            prompt_inputs.media["image"] = [_move_data_to_device(img) for img in prompt_inputs.media["image"]]
+        return prompt_inputs
+
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        padding_side = kwargs.get("padding_side", "left")
         if os.path.isdir(pretrained_model_name_or_path):
             pretrained_model_name_or_path = pretrained_model_name_or_path
         else:
             print(f"pretrained_model_name_or_path {pretrained_model_name_or_path} is not a directory, downloading")
             from huggingface_hub import snapshot_download
-
             pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)
 
         image_processor = AutoImageProcessor.from_pretrained(
@@ -112,43 +252,64 @@ class VILAProcessor(ProcessorMixin):
             osp.join(pretrained_model_name_or_path, "llm"), trust_remote_code=True
         )
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
-        return cls(image_processor=image_processor, tokenizer=tokenizer, config=config)
+        return cls(image_processor=image_processor, tokenizer=tokenizer, config=config, padding_side=padding_side)
 
     def __repr__(self):
-        return (
-            f"VILAProcessor(image_processor={self.image_processor}, tokenizer={self.tokenizer}, config={self.config})"
-        )
+        # NOTE(ligeng): hard coded image_processor to avoid serialization error. Dirty fix
+        return f"VILAProcessor(image_processor=SigLip, tokenizer={self.tokenizer}, config={self.config})"
 
     def __call__(
         self,
-        conversation,
-        images: ImageInput = None,
-        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
-        videos: VideoInput = None,
+        conversation=None,
         **kwargs: Unpack[VILAProcessorKwargs],
     ) -> BatchFeature:
-        if images is not None:
-            warnings.warn("images is not supported in __call__")
+        """
+        The `conv` will be look like
+        [
+            {
+                'from': 'human',
+                'value': [
+                    <transformers_modules.NVILA-Lite-2B-hf-preview.media.Image object at 0x154e68e4c460>,
+                    'What are the common elements in these pictures?'
+                ]
+            }
+        ]
+        and `conversation` will be a list of such `conv`s
+        """
+        if kwargs.get("text", None) is not None:
+            conversation = kwargs.get("text")
+        assert conversation is not None, "`conversation` or `text` is required"
+        padding_side = kwargs.get("padding_side", self.padding_side)
 
-        input_ids = []
+        input_ids_list = []
+        attention_mask = []
         media = defaultdict(list)
         media_config = defaultdict(dict)
         for conv in conversation:
-            feat = self.__single_call__(conv, images, text, videos, **kwargs)
-            input_ids.append(feat.input_ids)
+            feat = self.__single_call__(conv, **kwargs)
+            input_ids_list.append(feat.input_ids)
+            attention_mask.append(feat.attention_mask)
             for name in feat.media:
                 media[name] += feat.media[name]
             for name in feat.media_config:
                 media_config[name].update(feat.media_config[name])
 
+        # pad the input_ids to batchfy
+        input_ids = pad_fn(
+            input_ids_list,
+            padding_value=self.pad_token_id,
+            padding_side=padding_side,
+        )
+        # ignore the pad token in the attention mask
+        attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
+        attention_mask[input_ids == self.pad_token_id] = False
+        # print("[DEBUGAAA]", self.pad_token_id, self.tokenizer.pad_token_id); exit(0)
+        input_texts = self.tokenizer.batch_decode(input_ids)
         return BatchFeature(
             data={
-                # "input_ids": torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id),
-                "input_ids": __pad_fn(
-                    input_ids,
-                    padding_value=self.tokenizer.pad_token_id,
-                    padding_side="left",
-                ),
+                "input_texts": input_texts,
+                "input_ids": input_ids,
+                "attention_mask": attention_mask,
                 "media": media,
                 "media_config": media_config,
             }
@@ -174,6 +335,8 @@
         self.config.image_processor = self.image_processor
         if self.config.image_aspect_ratio == "dynamic":
             images = process_image(media["image"][0], self.config, None, enable_dynamic_res=True).half()
+            # print("DEBUG", len(images)); input()
+            # NOTE: this only works for images appears at the first conversation
             conversation[0]["value"] = conversation[0]["value"].replace(
                 DEFAULT_IMAGE_TOKEN, f"{DEFAULT_IMAGE_TOKEN}\n" * images.shape[0]
             )
@@ -195,9 +358,18 @@
                 ]
             else:
                 raise ValueError(f"Unsupported media type: {name}")
-        input_ids = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True).cuda().unsqueeze(0)
-        # Set up the generation config
-        return BatchFeature(data={"input_ids": input_ids, "media": media, "media_config": media_config})
+
+        inputs = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True, return_ids_only=False)
+        input_ids = inputs.input_ids[0].unsqueeze(0)#.cuda()
+        attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
+        return BatchFeature(
+            data={
+                "input_ids": input_ids,
+                "attention_mask": attention_mask,
+                "media": media,
+                "media_config": media_config,
+            }
+        )
 
     def batch_decode(self, *args, **kwargs):
         """
@@ -235,39 +407,26 @@ class VILAProcessor(ProcessorMixin):
         image_processor_input_names = self.image_processor.model_input_names
         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
 
-    # inputs = processor(conversation=llavaconv, padding=True, return_tensors="pt")
-    def apply_chat_template(self, conversation, add_generation_prompt=True, **kwargs):
+    def convert_gpt_conv_to_vila_conv(self, conversation):
         vila_conv = []
         for chat in conversation:
             vila_chat = {"from": "", "value": []}
-            if chat["role"] == "user":
+            if chat["role"] in ("user", "system"):
                 # user allows to input image and text
-                vila_chat["from"] = "human"
-                for content in chat["content"]:
-                    if content["type"] == "image":
-                        if "path" in content:
-                            # VILA style
-                            media_key = "path"
-                        elif "image" in content:
-                            # Qwen style
-                            media_key = "image"
-                        else:
-                            raise ValueError(f"Unsupported content type `image`: {content}, `image` and `path` are required")
-                        vila_chat["value"].append(Image(fetch_image_url_or_fpath(content[media_key])))
-                    elif content["type"] == "text":
-                        vila_chat["value"].append(content["text"])
-                    # NOTE(ligeng): video supports are needed here
-                    else:
-                        raise ValueError(f"Unsupported content type: {content['type']}")
+                vila_chat["from"] = "human" if chat["role"] == "user" else "system"
+                vila_chat["value"] = extract_value_from_conv(chat)
             elif chat["role"] == "assistant":
                 vila_chat["from"] = "gpt"
-                for content in chat["content"]:
-                    assert content["type"] == "text", f"Unsupported content type: {content['type']}"
-                    vila_chat["value"].append(content["text"])
+                vila_chat["value"] = extract_value_from_conv(chat)
+            else:
+                raise ValueError(f"Unsupported role: {chat['role']} in chat {chat}")
             vila_conv.append(vila_chat)
 
         return vila_conv
 
+    def apply_chat_template(self, conversation, add_generation_prompt=True, **kwargs):
+        return self.convert_gpt_conv_to_vila_conv(conversation)
+
 
 if __name__ == "__main__":
     # gpt style: user, assistant
@@ -302,7 +461,6 @@ if __name__ == "__main__":
     # print(model.config)
     # print(model.tokenizer)
    # print(res)
-    # exit(0)
 
     processor = VILAProcessor(
         config=model.config,
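
A usage sketch of the reworked batched path above: `apply_chat_template` converts GPT-style messages into VILA's `{"from", "value"}` form, and `__call__` tokenizes each conversation, batches the media, and left-pads `input_ids` with `pad_fn`. The checkpoint id and loading calls are assumptions that follow the README's setup, which is not shown in these hunks:

# Hedged sketch; model_path is illustrative and loading mirrors the README.
from transformers import AutoModel, AutoProcessor

model_path = "Efficient-Large-Model/NVILA-Lite-2B-hf-preview"  # assumed repo id
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, device_map="auto")
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

conv1 = [{"role": "user", "content": [
    {"type": "image", "path": "https://nvlabs.github.io/VILA/asset/example.jpg"},
    {"type": "text", "text": "Describe this image."},
]}]
conv2 = [{"role": "user", "content": [
    {"type": "image", "path": "https://nvlabs.github.io/VILA/asset/example_vqa.jpg"},
    {"type": "text", "text": "Describe this image for me. Provide a detailed description of the image."},
]}]

# Each apply_chat_template() call returns one VILA-style conversation; __call__ takes a
# list of them and left-pads input_ids with the Qwen <|endoftext|> id (see pad_fn above).
batch = processor([processor.apply_chat_template(c) for c in (conv1, conv2)])
print(batch.input_ids.shape, batch.attention_mask.shape)  # both (2, padded_len)

output_ids = model.generate(
    input_ids=batch.input_ids,
    media=batch.media,
    media_config=batch.media_config,
    attention_mask=batch.attention_mask,
    max_new_tokens=64,
)
print(processor.tokenizer.batch_decode(output_ids, skip_special_tokens=True))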
builder.py CHANGED
@@ -34,6 +34,7 @@ from transformers import (
     PreTrainedTokenizer,
 )
 from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
+
 # from .conversation import *
 from .conversation import SeparatorStyle, default_conversation
 
@@ -203,6 +204,7 @@ def build_llm_and_tokenizer(
         )
     else:
         if is_deepspeed_zero3_enabled():
+            # NOTE: found by wei, need to pop out device_map when using zero3
             kwargs.pop("device_map")
         llm = AutoModelForCausalLM.from_pretrained(
             model_name_or_path, config=llm_cfg, torch_dtype=eval(config.model_dtype), *args, **kwargs
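
The added note documents an existing guard rather than new behaviour: under DeepSpeed ZeRO-3 the engine manages parameter placement itself, so an explicit `device_map` must not be forwarded to `from_pretrained`. A minimal standalone illustration of the pattern (not the repo's exact code path):

from transformers import AutoModelForCausalLM
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled

def load_llm(model_name_or_path, **kwargs):
    # ZeRO-3 shards and places parameters itself; passing device_map alongside it
    # conflicts with that, so drop the key before calling from_pretrained.
    if is_deepspeed_zero3_enabled():
        kwargs.pop("device_map", None)
    return AutoModelForCausalLM.from_pretrained(model_name_or_path, **kwargs)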
llm/tokenizer_config.json CHANGED
@@ -78,7 +78,7 @@
   "legacy": false,
   "model_max_length": 4096,
   "pad_token": "[PAD]",
-  "padding_side": "right",
+  "padding_side": "left",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null
mm_utils.py CHANGED
@@ -521,8 +521,11 @@ def process_images(images, image_processor, model_cfg, enable_dynamic_res=False,
     return new_images
 
 
-def tokenizer_image_token(prompt, tokenizer, return_tensors=None):
-    return tokenizer(prompt, return_tensors=return_tensors).input_ids[0]
+def tokenizer_image_token(prompt, tokenizer, return_tensors=None, return_ids=True):
+    if return_ids:
+        return tokenizer(prompt, return_tensors=return_tensors).input_ids[0]
+    else:
+        return tokenizer(prompt, return_tensors=return_tensors)
 
 
 def is_gemma_tokenizer(tokenizer):
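
The new `return_ids` flag is what lets `tokenize_conversation(..., return_ids_only=False)` in tokenizer_utils.py retrieve a full encoding (including `attention_mask`) instead of a bare id tensor. A hedged sketch of the two modes; the stand-in tokenizer id and the import path are assumptions that depend on how this repo's remote code is loaded:

from transformers import AutoTokenizer

from mm_utils import tokenizer_image_token  # import path depends on how the remote code is loaded

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")  # stand-in tokenizer

ids = tokenizer_image_token("describe the image", tok, return_tensors="pt")                    # 1-D ids, old behaviour
enc = tokenizer_image_token("describe the image", tok, return_tensors="pt", return_ids=False)  # full BatchEncoding
print(ids.shape, enc.input_ids.shape, enc.attention_mask.shape)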
modeling_vila.py CHANGED
@@ -201,17 +201,19 @@ class VILAPretrainedModel(PreTrainedModel):
         else:
             raise ValueError("`llm_cfg` `mm_projector_cfg` `vision_tower_cfg` not found in the config.")
 
-        # loading on cpu by default
-        device_map = kwargs.get("device_map", "cpu")
+        # loading on auto by default
+        device_map = kwargs.get("device_map", "auto")
         self.mm_projector = build_mm_projector(mm_projector_cfg, config)
         self.vision_tower = build_vision_tower(vision_tower_cfg, config)
-        if "auto" in device_map or "cuda" in device_map:
+        if device_map in ["auto", "cuda"]:
             self.mm_projector = self.mm_projector.cuda()
             self.vision_tower = self.vision_tower.cuda()
         # set device_map auto can autoamtically shard llm to different devices
         self.llm, self.tokenizer = self.init_llm(llm_cfg, config, device_map=device_map)
 
-        # NOTE(ligeng): need to add other decoders from config
+        # NOTE(ligeng): hard code to set padding_side to left
+        self.tokenizer.padding_side = "left"
+        # TODO(ligeng): need to add other decoders from config
         self.encoders = {"image": BasicImageEncoder(self), "video": BasicVideoEncoder(self)}
 
         self.post_config()
@@ -418,6 +420,7 @@ class VILAPretrainedModel(PreTrainedModel):
         weights_only: bool = True,
         **kwargs,
     ):
+        # print("DEBUG2", kwargs); input()
         config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
         return cls._from_config(config, **kwargs)
 
@@ -428,6 +431,12 @@ class VILAPretrainedModel(PreTrainedModel):
         # print("DEBUG", len(self.tokenizer.added_tokens_encoder.keys()), self.tokenizer.added_tokens_encoder.keys())
         NUM_EXTRA_TOKENS = len(self.tokenizer.added_tokens_encoder.keys())
 
+        self.pad_token_list = (
+            self.tokenizer.pad_token_id,
+            self.tokenizer.eos_token_id,
+            self.tokenizer.tokenize("<|endoftext|>")[0],  # for qwen
+        )
+
         # TODO: SENTINEL_TOKEN is not added, need to check with Zhijian
         self.vocab_size = self.tokenizer.vocab_size + NUM_EXTRA_TOKENS
         # XGrammar tokenizer and grammar compiler
@@ -444,6 +453,10 @@ class VILAPretrainedModel(PreTrainedModel):
             self.vision_tower = self.vision_tower.to(torch.float16)
         ######################################################################
         self.training = self.llm.training
+        if self.training:
+            self.train()
+        else:
+            self.eval()
         ## configuration
         if getattr(self.config, "llm_cfg", None) is None:
             self.config.llm_cfg = self.llm.config
@@ -589,10 +602,6 @@ class VILAForCasualLM(VILAPretrainedModel):
         return image_features
 
     def train(self, mode: bool = True):
-        if mode:
-            self.tokenizer.padding_side = "right"
-        else:
-            self.tokenizer.padding_side = "left"
         super().train(mode)
         return self
 
@@ -650,11 +659,10 @@ class VILAForCasualLM(VILAPretrainedModel):
                     name = media_tokens[input_ids[k][pos].item()]
                     input = media_embeds[name].popleft()
                     label = torch.full([input.shape[0]], IGNORE_INDEX, device=labels[k].device, dtype=labels[k].dtype)
-                    # print(f"{self.tokenizer.padding_side} [media] {k=} {pos=}, {self.tokenizer.batch_decode(input_ids[k][pos:pos+1])}"); python_input()
-                elif input_ids[k][pos].item() in (self.tokenizer.pad_token_id, self.tokenizer.eos_token_id):
+                elif input_ids[k][pos].item() in self.pad_token_list:
+                    # skip pad tokens
                     end = pos + 1
                     pos = end
-                    # print(f"[skip PAD/EOS] {k=} {pos=}, {self.tokenizer.batch_decode(input_ids[k][pos:end])}"); python_input()
                     continue
                 else:
                     end = pos
@@ -662,7 +670,6 @@ class VILAForCasualLM(VILAPretrainedModel):
                         end += 1
                     input = text_embeds[k][pos:end]
                     label = labels[k][pos:end]
-                    # print(f"[text] {k=} {pos=}, {self.tokenizer.batch_decode(input_ids[k][pos:end])}"); python_input()
 
                 inputs_mk.append(input)
                 labels_mk.append(label)
@@ -1018,6 +1025,7 @@ class VILAForCasualLM(VILAPretrainedModel):
         media: Optional[Dict[str, List[torch.Tensor]]] = None,
         images: Optional[torch.FloatTensor] = None,
         media_config: Optional[List] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[List[torch.FloatTensor]] = None,
@@ -1074,21 +1082,62 @@ class VILAForCasualLM(VILAPretrainedModel):
 
         return outputs
 
-    @torch.inference_mode()
+    # @torch.inference_mode()
     def generate(
         self,
         input_ids: Optional[torch.FloatTensor] = None,
         media: Optional[Dict[str, List[torch.Tensor]]] = None,
         media_config: Dict[str, Dict[str, Any]] = None,
         attention_mask: Optional[torch.LongTensor] = None,
+        return_output_ids_only: bool = False,
         **generation_kwargs,
-    ):
+    ) -> torch.LongTensor:
+        model_training_status = False
         if self.training:
             warnings.warn(
-                "Model is in training mode, using default padding strategy to right. This is not recommended for generation."
+                "Model is in training mode, using default padding strategy to right. This is not recommended for generation. We implicitly set the model to evaluation mode and restore the model training status after generation."
             )
+            self.eval()
+            model_training_status = True
+        """
+        input_tokens: <image> describe the image
+        media: [Tensor(1, 3, 384, 384), ]
+        ----------->
+        input_tokens: 36000 001 002 003 004
+        input_emds: <media emd> 001 002 003 004
+        """
+        # NOTE: hard code to move to GPU
+        input_ids = input_ids.cuda()
+        media = {k: [v.cuda() for v in media[k]] for k in media}
+        if attention_mask is not None:
+            attention_mask = attention_mask.cuda()
+
+        # TODO: there is still a padding left vs right issue unsovled here.
+        # print("prev args:",input_ids.shape, media, media_config, None, attention_mask)
         inputs_embeds, _, attention_mask = self._embed(input_ids, media, media_config, None, attention_mask)
-        return self.llm.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs)
+        # print("inputs_embeds", inputs_embeds.shape, inputs_embeds.mean(), inputs_embeds.std())
+        # print("attention_mask", attention_mask.shape, attention_mask)
+        output_ids = self.llm.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generation_kwargs)
+        # print("output_ids", self.tokenizer.batch_decode(output_ids))
+        # input("wait for debug")
+
+        if return_output_ids_only:
+            return_value = output_ids
+        else:
+            # by default, return the input_ids and output_ids concatenated to keep consistency with the community VLMs like qwen
+            generation_config = generation_kwargs.get("generation_config", None)
+            if generation_config is not None:
+                num_generations = generation_config.num_return_sequences
+                repeat_input_ids = input_ids.repeat_interleave(num_generations, dim=0)
+                return_value = torch.cat([repeat_input_ids, output_ids], dim=-1)
+            else:
+                return_value = torch.cat([input_ids, output_ids], dim=-1)
+
+        if model_training_status:
+            # restore the model training status
+            self.train()
+
+        return return_value
 
     @torch.inference_mode()
     def generate_content(
@@ -1101,10 +1150,7 @@ class VILAForCasualLM(VILAPretrainedModel):
         conversation = [{"from": "human", "value": prompt}]
 
         # Convert response format to logits processor
-        if response_format:
-            xgr_logits_processor = self.get_xgr_logits_processor(response_format)
-        else:
-            xgr_logits_processor = None
+        xgr_logits_processor = None
 
         # Extract media from the conversation
 
@@ -1173,7 +1219,7 @@
             raise ValueError(f"Unsupported media type: {name}")
 
         # Tokenize the conversation
-        input_ids = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True).cuda().unsqueeze(0)
+        input_ids = tokenize_conversation(conversation, self.tokenizer, add_generation_prompt=True).unsqueeze(0).cuda()
 
         # Set up the generation config
         generation_config = generation_config or self.default_generation_config
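
The reworked `generate` above now moves inputs to the GPU itself and, by default, returns the prompt ids concatenated with the completion ids (Qwen-style); `return_output_ids_only=True` restores completion-only output. A hedged sketch reusing the `model`, `processor`, and `batch` objects from the processor example earlier in this commit:

# Completion-only ids:
out = model.generate(
    input_ids=batch.input_ids,
    media=batch.media,
    media_config=batch.media_config,
    attention_mask=batch.attention_mask,
    return_output_ids_only=True,
    max_new_tokens=64,
)
print(processor.tokenizer.batch_decode(out, skip_special_tokens=True))

# Default behaviour prepends the (left-padded) prompt ids, so slice them off by
# length if only the completion text is wanted:
full = model.generate(
    input_ids=batch.input_ids,
    media=batch.media,
    media_config=batch.media_config,
    attention_mask=batch.attention_mask,
    max_new_tokens=64,
)
completion_ids = full[:, batch.input_ids.shape[1]:]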
tokenizer_utils.py CHANGED
@@ -68,13 +68,16 @@ def tokenize_conversation_legacy(
     return tokenizer_image_token(conv.get_prompt(), tokenizer, return_tensors="pt")
 
 
+# NOTE(ligeng): add a return typing to help code analyze
 def tokenize_conversation(
     messages: Sequence[Dict[str, str]],
     tokenizer: transformers.PreTrainedTokenizer,
     add_generation_prompt: bool = False,
     overrides: Optional[Dict[str, str]] = None,
     no_system_prompt: bool = False,
+    return_ids_only=True,
 ) -> torch.Tensor:
+    # print("messages", messages); input()
     # Normalize the conversation before tokenization
     for message in messages:
         message["value"] = message["value"].strip()
@@ -95,6 +98,10 @@ def tokenize_conversation(
             message["role"] = "user"
         elif m["from"] == "gpt":
             message["role"] = "assistant"
+        elif m["from"] == "system":
+            message["role"] = "system"
+            if no_system_prompt:
+                raise ValueError("System prompt is not allowed when no_system_prompt is True.")
         else:
             raise ValueError(f"Unexpected sender '{m['from']}' in conversation entry.")
 
@@ -111,7 +118,7 @@ def tokenize_conversation(
         add_generation_prompt=add_generation_prompt,
         tokenize=False,
     )
-    return tokenizer_image_token(text, tokenizer, return_tensors="pt")
+    return tokenizer_image_token(text, tokenizer, return_tensors="pt", return_ids=return_ids_only)
 
 
 def _maybe_add_sentinel_token(tokenizer: transformers.PreTrainedTokenizer) -> None:
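
`tokenize_conversation` now understands a `system` turn and can hand back the full encoding via `return_ids_only=False`, which it forwards to `tokenizer_image_token` above. A hedged sketch; the checkpoint path is a placeholder and the import path depends on how this repo's remote code is loaded:

from transformers import AutoTokenizer

from tokenizer_utils import tokenize_conversation  # import path depends on how the remote code is loaded

# from_pretrained() in auto_processor.py loads the chat tokenizer from the checkpoint's llm/ folder.
tok = AutoTokenizer.from_pretrained("path/to/NVILA-checkpoint/llm")  # placeholder path

messages = [
    {"from": "system", "value": "You are a concise assistant."},
    {"from": "human", "value": "Describe the image."},
]
ids = tokenize_conversation(messages, tok, add_generation_prompt=True)                         # 1-D id tensor
enc = tokenize_conversation(messages, tok, add_generation_prompt=True, return_ids_only=False)  # full BatchEncoding
print(ids.shape, enc.input_ids.shape, enc.attention_mask.shape)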