Alibaba-NLP
/

gme-Qwen2-VL-2B-Instruct

@@ -19,11 +19,11 @@ from transformers import (
     AutoProcessor,
     PreTrainedModel,
     Qwen2VLConfig,
-    Qwen2VLModel,
 )
 import os
-# Define a config class for our model.
 class GmeQwen2VLConfig(Qwen2VLConfig):
     model_type: str = "gme_qwen2_vl"
@@ -39,11 +39,8 @@ class GmeQwen2VLConfig(Qwen2VLConfig):
         self.min_image_tokens = min_image_tokens
         self.max_image_tokens = max_image_tokens
         self.max_length = max_length
-        self.device = device
-AutoConfig.register("gme_qwen2_vl", GmeQwen2VLConfig)
-# Define the model class so that it can be loaded by AutoModel.from_pretrained.
 class GmeQwen2VLForVision2Seq(PreTrainedModel):
     config_class = GmeQwen2VLConfig
     base_model_prefix: str = "base"
@@ -51,29 +48,21 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
     def __init__(self, config: GmeQwen2VLConfig, **kwargs: Any) -> None:
         super().__init__(config)
         model_name: str = getattr(config, "_name_or_path", "Alibaba-NLP/gme-Qwen2-VL-2B-Instruct")
-        # Load the underlying vision-to-sequence model.
-        self.base = Qwen2VLModel.from_pretrained(
-            model_name, trust_remote_code=True, **kwargs
-        )
         self.normalize: bool = True
-        self.device: str = config.device
         min_pixels: int = config.min_image_tokens * 28 * 28
         max_pixels: int = config.max_image_tokens * 28 * 28
         self.max_length: int = config.max_length
         self.processor = AutoProcessor.from_pretrained(
             model_name, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs
         )
-        self.processor.tokenizer.padding_side = "right"
         self.defualt_instruction: str = "You are a helpful assistant."
         self.sep: str = " "
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> GmeQwen2VLForVision2Seq:
-        config = kwargs.pop("config", GmeQwen2VLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs))
-        return cls(config, **kwargs)
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -82,9 +71,11 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
         past_key_values: Optional[List[torch.FloatTensor]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         pixel_values: Optional[torch.Tensor] = None,
         image_grid_thw: Optional[torch.LongTensor] = None,
         pooling_mask: Optional[torch.LongTensor] = None,
-        **kwargs: Any,
     ) -> torch.Tensor:
         if inputs_embeds is None:
             inputs_embeds = self.base.model.embed_tokens(input_ids)
@@ -93,6 +84,11 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
                 image_embeds = self.base.visual(pixel_values, grid_thw=image_grid_thw).to(inputs_embeds.device)
                 image_mask = input_ids == self.base.config.image_token_id
                 inputs_embeds[image_mask] = image_embeds
             if attention_mask is not None:
                 attention_mask = attention_mask.to(inputs_embeds.device)
@@ -105,48 +101,37 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
         )
         pooling_mask = attention_mask if pooling_mask is None else pooling_mask
-        left_padding: bool = (pooling_mask[:, -1].sum() == pooling_mask.shape[0])
         if left_padding:
             embeddings = outputs.last_hidden_state[:, -1]
         else:
             sequence_lengths = pooling_mask.sum(dim=1) - 1
             batch_size = outputs.last_hidden_state.shape[0]
-            embeddings = outputs.last_hidden_state[
-                torch.arange(batch_size, device=outputs.last_hidden_state.device),
-                sequence_lengths,
-            ]
         if self.normalize:
             embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
         return embeddings.contiguous()
-    def embed(
-        self,
-        texts: List[str],
-        images: List[Image.Image],
-        is_query: bool = True,
-        instruction: Optional[str] = None,
-        **kwargs: Any,
-    ) -> torch.Tensor:
         self.base.to(self.device)
-        input_texts: List[str] = []
-        input_images: List[Image.Image] = []
         for t, i in zip(texts, images):
             if not is_query or instruction is None:
                 instruction = self.defualt_instruction
-            input_str: str = ""
             if i is None:
                 input_images = None  # All examples in the same batch are consistent
             else:
-                input_str += "<|vision_start|><|image_pad|><|vision_end|>"
                 i = fetch_image(i)
                 input_images.append(i)
             if t is not None:
                 input_str += t
-            msg: str = (
-                f"<|im_start|>system\n{instruction}<|im_end|>\n"
-                f"<|im_start|>user\n{input_str}<|im_end|>\n"
-                f"<|im_start|>assistant\n<|endoftext|>"
-            )
             input_texts.append(msg)
         inputs = self.processor(
@@ -155,22 +140,22 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
             padding=True,
             truncation=True,
             max_length=self.max_length,
-            return_tensors="pt",
         )
-        inputs = {k: v.to(self.device) for k, v in inputs.items()}
         with torch.no_grad():
             embeddings = self.forward(**inputs)
         return embeddings
-    def encode(self, sentences: List[str], **kwargs: Any) -> torch.Tensor:
-        # When no images are provided, we pass a list of Nones.
-        return self.embed(texts=sentences, images=[None] * len(sentences), **kwargs)
-    def encode_queries(self, queries: List[str], **kwargs: Any) -> torch.Tensor:
-        return self.encode(queries, **kwargs)
-    def encode_corpus(self, corpus: Union[Dict[str, List[str]], List[Dict[str, str]]], **kwargs: Any) -> torch.Tensor:
-        if isinstance(corpus, dict):
             sentences = [
                 (corpus["title"][i] + self.sep + corpus["text"][i]).strip()
                 if "title" in corpus
@@ -182,49 +167,56 @@ class GmeQwen2VLForVision2Seq(PreTrainedModel):
                 (doc["title"] + self.sep + doc["text"]).strip() if "title" in doc else doc["text"].strip()
                 for doc in corpus
             ]
-        return self.encode(sentences, is_query=False, **kwargs)
-    def get_image_embeddings(self, images: Union[List[Image.Image], DataLoader], **kwargs: Any) -> torch.Tensor:
         return self.get_fused_embeddings(images=images, **kwargs)
-    def get_text_embeddings(self, texts: List[str], **kwargs: Any) -> torch.Tensor:
         return self.get_fused_embeddings(texts=texts, **kwargs)
-    def get_fused_embeddings(
-        self,
-        texts: Optional[List[str]] = None,
-        images: Optional[Union[List[Image.Image], DataLoader]] = None,
-        **kwargs: Any,
-    ) -> torch.Tensor:
         if isinstance(images, DataLoader):
             image_loader = images
             batch_size = image_loader.batch_size
             image_loader.dataset.transform = None
         else:
-            batch_size = kwargs.pop("batch_size", 32)
             if images is None:
-                # If texts are provided without images, create dummy image batches.
-                image_loader = [None] * ((len(texts) + batch_size - 1) // batch_size)
             else:
-                image_loader = images
-        n_batch: int = (len(texts) // batch_size + int(len(texts) % batch_size > 0)) if texts is not None else len(image_loader)
-        all_embeddings: List[torch.Tensor] = []
         none_batch = [None] * batch_size
-        show_progress_bar: bool = kwargs.pop("show_progress_bar", True)
-        pbar = tqdm(total=n_batch, disable=not show_progress_bar, mininterval=1, miniters=10, desc="encode")
         for n, img_batch in zip(range(0, n_batch * batch_size, batch_size), image_loader):
-            text_batch: List[Optional[str]] = none_batch if texts is None else texts[n: n + batch_size]
             img_batch = none_batch if img_batch is None else img_batch
             embeddings = self.embed(texts=text_batch, images=img_batch, **kwargs)
             pbar.update(1)
             all_embeddings.append(embeddings.cpu())
         pbar.close()
-        return torch.cat(all_embeddings, dim=0)
-from transformers import AutoModelForVision2Seq
-AutoModelForVision2Seq.register(GmeQwen2VLConfig, GmeQwen2VLForVision2Seq)
 # Utility functions (copied from your vision processing code)
 IMAGE_FACTOR: int = 28
@@ -309,43 +301,3 @@ def fetch_image(image: Union[str, Image.Image], size_factor: int = IMAGE_FACTOR)
     )
     image = image.resize((resized_width, resized_height))
     return image
-# # For backward compatibility, you can add a from_pretrained classmethod.
-# @classmethod
-# def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> GmeQwen2VLForVision2Seq:
-#     config = GmeQwen2VLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-#     return cls(config, **kwargs)
-# # Monkey-patch the from_pretrained method to our class so that
-# # one can load the model with AutoModel.from_pretrained.
-# GmeQwen2VLForVision2Seq.from_pretrained = from_pretrained.__get__(GmeQwen2VLForVision2Seq)
-if __name__ == "__main__":
-    texts = [
-        "What kind of car is this?",
-        "The Tesla Cybertruck is a battery electric pickup truck built by Tesla, Inc. since 2023.",
-    ]
-    images = [
-        "https://en.wikipedia.org/wiki/File:Tesla_Cybertruck_damaged_window.jpg",
-        "https://en.wikipedia.org/wiki/File:2024_Tesla_Cybertruck_Foundation_Series,_front_left_(Greenwich).jpg",
-    ]
-    # You can now load your model with AutoModel as long as your repository's config JSON has the "architectures" field set.
-    model = AutoModel.from_pretrained("Alibaba-NLP/gme-Qwen2-VL-2B-Instruct")
-    # Alternatively, load it directly via our class:
-    # model = GmeQwen2VLForVision2Seq.from_pretrained("Alibaba-NLP/gme-Qwen2-VL-2B-Instruct")
-    # Single-modal embedding examples:
-    e_text = model.get_text_embeddings(texts=texts)
-    e_image = model.get_image_embeddings(images=images)
-    print("Text-Image similarity:", (e_text * e_image).sum(-1))
-    # Example with different instruction:
-    e_query = model.get_text_embeddings(texts=texts, instruction="Find an image that matches the given text.")
-    e_corpus = model.get_image_embeddings(images=images, is_query=False)
-    print("Query-Corpus similarity:", (e_query * e_corpus).sum(-1))
-    # Fused-modal embedding:
-    e_fused = model.get_fused_embeddings(texts=texts, images=images)
-    print("Fused-modal similarity:", (e_fused[0] * e_fused[1]).sum())

     AutoProcessor,
     PreTrainedModel,
     Qwen2VLConfig,
+    Qwen2VLForConditionalGeneration,
 )
 import os
+from collections.abc import Iterable
 class GmeQwen2VLConfig(Qwen2VLConfig):
     model_type: str = "gme_qwen2_vl"
         self.min_image_tokens = min_image_tokens
         self.max_image_tokens = max_image_tokens
         self.max_length = max_length
 class GmeQwen2VLForVision2Seq(PreTrainedModel):
     config_class = GmeQwen2VLConfig
     base_model_prefix: str = "base"
     def __init__(self, config: GmeQwen2VLConfig, **kwargs: Any) -> None:  # Build the wrapped Qwen2-VL model and its processor from `config`.
         super().__init__(config)
         model_name: str = getattr(config, "_name_or_path", "Alibaba-NLP/gme-Qwen2-VL-2B-Instruct")  # hub id is the fallback when the config has no _name_or_path attribute
+        self.base = Qwen2VLForConditionalGeneration(config)  # instantiated from the config alone; no from_pretrained call on this line
         self.normalize: bool = True  # forward() L2-normalizes the pooled embeddings when this is True
         min_pixels: int = config.min_image_tokens * 28 * 28  # token budget scaled by 28 * 28 (presumably pixels per image token - confirm)
         max_pixels: int = config.max_image_tokens * 28 * 28
         self.max_length: int = config.max_length  # passed as max_length to the processor call in embed()
         self.processor = AutoProcessor.from_pretrained(
             model_name, min_pixels=min_pixels, max_pixels=max_pixels, **kwargs  # constructor kwargs are forwarded to the processor loader
         )
+        self.processor.tokenizer.padding_side = 'right'  # pad on the right
         self.defualt_instruction: str = "You are a helpful assistant."  # NOTE: name is misspelled ("defualt"); embed() reads it under this exact spelling
         self.sep: str = " "  # separator placed between title and text in encode_corpus()
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[List[torch.FloatTensor]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         pixel_values: Optional[torch.Tensor] = None,
+        # pixel_values_videos: Optional[torch.FloatTensor] = None,
         image_grid_thw: Optional[torch.LongTensor] = None,
+        # video_grid_thw: Optional[torch.LongTensor] = None,
         pooling_mask: Optional[torch.LongTensor] = None,
+        **kwargs
     ) -> torch.Tensor:
         if inputs_embeds is None:
             inputs_embeds = self.base.model.embed_tokens(input_ids)
                 image_embeds = self.base.visual(pixel_values, grid_thw=image_grid_thw).to(inputs_embeds.device)
                 image_mask = input_ids == self.base.config.image_token_id
                 inputs_embeds[image_mask] = image_embeds
+            # if pixel_values_videos is not None:
+            #     pixel_values_videos = pixel_values_videos.type(self.base.visual.get_dtype())
+            #     video_embeds = self.base.visual(pixel_values_videos, grid_thw=video_grid_thw).to(inputs_embeds.device)
+            #     video_mask = input_ids == self.base.config.video_token_id
+            #     inputs_embeds[video_mask] = video_embeds
             if attention_mask is not None:
                 attention_mask = attention_mask.to(inputs_embeds.device)
         )
         pooling_mask = attention_mask if pooling_mask is None else pooling_mask
+        left_padding = (pooling_mask[:, -1].sum() == pooling_mask.shape[0])  # TODO
         if left_padding:
             embeddings = outputs.last_hidden_state[:, -1]
         else:
             sequence_lengths = pooling_mask.sum(dim=1) - 1
             batch_size = outputs.last_hidden_state.shape[0]
+            embeddings = outputs.last_hidden_state[torch.arange(
+                batch_size, device=outputs.last_hidden_state.device
+            ), sequence_lengths]
         if self.normalize:
             embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
         return embeddings.contiguous()
+    def embed(self, texts: list[str], images: list[Image.Image], is_query=True, instruction=None, **kwargs):
         self.base.to(self.device)
+        # Inputs must be batched
+        input_texts, input_images = list(), list()
         for t, i in zip(texts, images):
             if not is_query or instruction is None:
                 instruction = self.defualt_instruction
+            input_str = ''
             if i is None:
                 input_images = None  # All examples in the same batch are consistent
             else:
+                input_str += '<|vision_start|><|image_pad|><|vision_end|>'
                 i = fetch_image(i)
                 input_images.append(i)
             if t is not None:
                 input_str += t
+            msg = f'<|im_start|>system\n{instruction}<|im_end|>\n<|im_start|>user\n{input_str}<|im_end|>\n<|im_start|>assistant\n<|endoftext|>'
             input_texts.append(msg)
         inputs = self.processor(
             padding=True,
             truncation=True,
             max_length=self.max_length,
+            return_tensors='pt'
         )
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}  # TODO
         with torch.no_grad():
             embeddings = self.forward(**inputs)
         return embeddings
+    def encode(self, sentences: list[str], *, prompt_name=None, **kwargs):  # Embed text-only inputs; thin wrapper over get_fused_embeddings().
+        return self.get_fused_embeddings(texts=sentences, prompt_name=prompt_name, **kwargs)  # NOTE(review): prompt_name travels on via kwargs to embed(), whose visible body never reads it - confirm
+    def encode_queries(self, queries: List[str], **kwargs):  # Embed query strings; delegates unchanged to encode().
+        embeddings = self.encode(queries, **kwargs)
+        return embeddings
+    def encode_corpus(self, corpus: List[Dict[str, str]], **kwargs):
+        if type(corpus) is dict:
             sentences = [
                 (corpus["title"][i] + self.sep + corpus["text"][i]).strip()
                 if "title" in corpus
                 (doc["title"] + self.sep + doc["text"]).strip() if "title" in doc else doc["text"].strip()
                 for doc in corpus
             ]
+        embeddings = self.encode(sentences, is_query=False, **kwargs)
+        return embeddings
+    def get_image_embeddings(self, images: list[Image.Image] | DataLoader, **kwargs):  # Image-only embeddings: texts stays None inside get_fused_embeddings().
         return self.get_fused_embeddings(images=images, **kwargs)
+    def get_text_embeddings(self, texts: list[str], **kwargs):  # Text-only embeddings: images stays None inside get_fused_embeddings().
         return self.get_fused_embeddings(texts=texts, **kwargs)
+    def get_fused_embeddings(self, texts: list[str] = None, images: list[Image.Image] | DataLoader = None, **kwargs):  # Run texts and/or images through embed() batch by batch; returns the per-batch results concatenated on CPU.
         if isinstance(images, DataLoader):
             image_loader = images
             batch_size = image_loader.batch_size  # a caller-supplied loader dictates the batch size
             image_loader.dataset.transform = None  # NOTE: mutates the caller's dataset in place
         else:
+            batch_size = kwargs.pop('batch_size', 32)  # popped so it is not forwarded to embed()
             if images is None:
+                image_loader = None
             else:
+                image_loader = DataLoader(
+                    images,
+                    batch_size=batch_size,
+                    shuffle=False,  # keep image order aligned with `texts`
+                    collate_fn=custom_collate_fn,  # identity collate: batches are passed through unchanged
+                    num_workers=min(math.floor(os.cpu_count() / 2), 8),  # NOTE(review): os.cpu_count() can return None, which would raise here; `math` import is not visible in this view
+                )
+        if texts is None:
+            assert image_loader is not None  # at least one of texts/images must be given (check is skipped under python -O)
+            n_batch = len(image_loader)
+        else:
+            n_batch = len(texts) // batch_size + int(len(texts) % batch_size > 0)  # ceil(len(texts) / batch_size)
+            image_loader = image_loader or [None] * n_batch  # text-only: one None placeholder per batch
+        all_embeddings = list()
         none_batch = [None] * batch_size  # may be longer than a final partial batch; embed() zips texts with images, so extras are dropped
+        show_progress_bar = kwargs.pop('show_progress_bar', True)  # popped so it is not forwarded to embed()
+        pbar = tqdm(total=n_batch, disable=not show_progress_bar, mininterval=1, miniters=10, desc='encode')
         for n, img_batch in zip(range(0, n_batch * batch_size, batch_size), image_loader):  # n is the start offset of this batch within `texts`
+            text_batch = none_batch if texts is None else texts[n: n+batch_size]
             img_batch = none_batch if img_batch is None else img_batch
             embeddings = self.embed(texts=text_batch, images=img_batch, **kwargs)  # remaining kwargs (e.g. is_query, instruction) go to embed()
             pbar.update(1)
             all_embeddings.append(embeddings.cpu())  # move each batch to CPU before accumulating
         pbar.close()
+        all_embeddings = torch.cat(all_embeddings, dim=0)  # stack the batches along dim 0
+        return all_embeddings
+def custom_collate_fn(batch):  # Identity collate function: returns the batch it is given, unchanged.
+    return batch
 # Utility functions (copied from your vision processing code)
 IMAGE_FACTOR: int = 28
     )
     image = image.resize((resized_width, resized_height))
     return image