Ligeng-Zhu committed
Commit 4ba795c · verified · 1 parent: bfbdff2

Upload files with `vila-upload`.


Upload builder.py
Upload auto_processor.py

Files changed (2)
  1. auto_processor.py +42 -5
  2. builder.py +3 -1
auto_processor.py CHANGED

@@ -18,8 +18,35 @@ from .media import Image, Video, extract_media
 from .mm_utils import process_image, process_images
 from .tokenizer_utils import tokenize_conversation
 
-
-def vila_pad_fn(input_ids_list, padding_value=0, target_len=None, padding_side="left"):
+def fetch_image_url_or_fpath(url_or_fpath):
+    if url_or_fpath.startswith("http") or url_or_fpath.startswith("https"):
+        import tempfile
+        import requests
+
+        # Download the image to a temporary file
+        temp_dir = tempfile.mkdtemp()
+        temp_file = os.path.join(temp_dir, os.path.basename(url_or_fpath))
+
+        response = requests.get(url_or_fpath, stream=True)
+        response.raise_for_status()
+
+        with open(temp_file, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+
+        return temp_file
+    elif url_or_fpath.startswith("file://"):
+        fpath = url_or_fpath.replace("file://", "")
+        assert osp.exists(fpath), f"File {fpath} does not exist"
+        return fpath
+    elif osp.exists(url_or_fpath):
+        assert osp.isfile(url_or_fpath), f"File {url_or_fpath} is not a file"
+        return url_or_fpath
+    else:
+        raise ValueError(f"Unsupported image path: {url_or_fpath}")
+
+
+def __pad_fn(input_ids_list, padding_value=0, target_len=None, padding_side="left"):
     # tensor shape is (batch_size, seq_len)
     max_len = max([ids.shape[1] for ids in input_ids_list])
     if target_len is not None:
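For reference, the new `fetch_image_url_or_fpath` helper above normalizes the three image sources the processor will accept. A quick illustrative sketch of its behavior (URLs and paths are placeholders; the helper relies on the module's existing `os`/`osp` imports):

```python
# Placeholder inputs; each call returns a local file path.
fetch_image_url_or_fpath("https://example.com/demo.jpg")  # streamed via requests into a temp file
fetch_image_url_or_fpath("file:///data/demo.jpg")         # "file://" prefix stripped, existing path returned
fetch_image_url_or_fpath("/data/demo.jpg")                 # plain existing file path passed through
# Any other input raises ValueError("Unsupported image path: ...").
```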
@@ -47,6 +74,8 @@ class VILAProcessorKwargs(ProcessingKwargs, total=False):
     }
 
 
+
+
 class VILAProcessor(ProcessorMixin):
     # attributes = ["image_processor", "tokenizer"]
     attributes = []
@@ -115,7 +144,7 @@ class VILAProcessor(ProcessorMixin):
         return BatchFeature(
             data={
                 # "input_ids": torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id),
-                "input_ids": vila_pad_fn(
+                "input_ids": __pad_fn(
                     input_ids,
                     padding_value=self.tokenizer.pad_token_id,
                     padding_side="left",
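The call site above now uses the renamed `__pad_fn`, which appears to left-pad each `input_ids` tensor to the batch maximum with the tokenizer's pad token. Its body is only partially shown in this diff, so the following is a minimal sketch under assumptions: `pad_left` is a hypothetical name, only the left-padding branch used at the call site is shown, and the `target_len` handling is guessed.

```python
import torch

def pad_left(input_ids_list, padding_value=0, target_len=None):
    # Each tensor is (1, seq_len); pad on the left so the sequences end-align.
    max_len = max(ids.shape[1] for ids in input_ids_list)
    if target_len is not None:
        max_len = max(max_len, target_len)  # assumption: target_len acts as a floor
    padded = [
        torch.nn.functional.pad(ids, (max_len - ids.shape[1], 0), value=padding_value)
        for ids in input_ids_list
    ]
    return torch.cat(padded, dim=0)  # (batch_size, max_len)
```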
@@ -216,9 +245,18 @@ class VILAProcessor(ProcessorMixin):
                 vila_chat["from"] = "human"
                 for content in chat["content"]:
                     if content["type"] == "image":
-                        vila_chat["value"].append(Image(content["path"]))
+                        if "path" in content:
+                            # VILA style
+                            media_key = "path"
+                        elif "image" in content:
+                            # Qwen style
+                            media_key = "image"
+                        else:
+                            raise ValueError(f"Unsupported content type `image`: {content}, `image` and `path` are required")
+                        vila_chat["value"].append(Image(fetch_image_url_or_fpath(content[media_key])))
                     elif content["type"] == "text":
                         vila_chat["value"].append(content["text"])
+                        # NOTE(ligeng): video supports are needed here
                     else:
                         raise ValueError(f"Unsupported content type: {content['type']}")
             elif chat["role"] == "assistant":
@@ -228,7 +266,6 @@ class VILAProcessor(ProcessorMixin):
                     vila_chat["value"].append(content["text"])
             vila_conv.append(vila_chat)
 
-        # return self(vila_conv)
         return vila_conv
 
 
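With the parsing change above, an image entry in the chat content may carry either the VILA-style `path` key or the Qwen-style `image` key, and its value can be a local path, a `file://` URI, or an http(s) URL, since it is routed through `fetch_image_url_or_fpath`. A sketch of a conversation dict the converter now accepts (file name and URL are placeholders; the processor call that consumes it is not part of this diff):

```python
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "path": "demo.jpg"},                       # VILA style
            {"type": "image", "image": "https://example.com/demo.jpg"},  # Qwen style, URL is downloaded first
            {"type": "text", "text": "Compare the two images."},
        ],
    },
]
```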
 
builder.py CHANGED
@@ -33,7 +33,7 @@ from transformers import (
     PreTrainedModel,
     PreTrainedTokenizer,
 )
-
+from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 # from .conversation import *
 from .conversation import SeparatorStyle, default_conversation
 
@@ -202,6 +202,8 @@ def build_llm_and_tokenizer(
             fp8_model_name_or_path, config=llm_cfg, torch_dtype=eval(config.model_dtype), *args, **kwargs
         )
     else:
+        if is_deepspeed_zero3_enabled():
+            kwargs.pop("device_map")
         llm = AutoModelForCausalLM.from_pretrained(
             model_name_or_path, config=llm_cfg, torch_dtype=eval(config.model_dtype), *args, **kwargs
         )
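The new guard in `build_llm_and_tokenizer` drops `device_map` before calling `from_pretrained` when DeepSpeed ZeRO-3 is active, because ZeRO-3 shards parameters itself and transformers rejects Accelerate-style `device_map` dispatch in that mode. A standalone sketch of the same pattern (`load_llm` is an illustrative wrapper, not repo code; it pops with a default so a missing `device_map` key does not raise, unlike the unconditional `pop` in the diff):

```python
from transformers import AutoModelForCausalLM
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled

def load_llm(model_name_or_path, **kwargs):
    # Under ZeRO-3, weights are partitioned across ranks at load time, so any
    # device_map passed by the caller must be removed before from_pretrained.
    if is_deepspeed_zero3_enabled():
        kwargs.pop("device_map", None)
    return AutoModelForCausalLM.from_pretrained(model_name_or_path, **kwargs)
```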