Ligeng-Zhu committed
Commit 4ba795c · verified · 1 parent: bfbdff2

Upload files with `vila-upload`.


Upload builder.py
Upload auto_processor.py

Files changed (2)
  1. auto_processor.py +42 -5
  2. builder.py +3 -1
auto_processor.py CHANGED

@@ -18,8 +18,35 @@ from .media import Image, Video, extract_media
 from .mm_utils import process_image, process_images
 from .tokenizer_utils import tokenize_conversation
 
-
-def vila_pad_fn(input_ids_list, padding_value=0, target_len=None, padding_side="left"):
+def fetch_image_url_or_fpath(url_or_fpath):
+    if url_or_fpath.startswith("http") or url_or_fpath.startswith("https"):
+        import tempfile
+        import requests
+
+        # Download the image to a temporary file
+        temp_dir = tempfile.mkdtemp()
+        temp_file = os.path.join(temp_dir, os.path.basename(url_or_fpath))
+
+        response = requests.get(url_or_fpath, stream=True)
+        response.raise_for_status()
+
+        with open(temp_file, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+
+        return temp_file
+    elif url_or_fpath.startswith("file://"):
+        fpath = url_or_fpath.replace("file://", "")
+        assert osp.exists(fpath), f"File {fpath} does not exist"
+        return fpath
+    elif osp.exists(url_or_fpath):
+        assert osp.isfile(url_or_fpath), f"File {url_or_fpath} is not a file"
+        return url_or_fpath
+    else:
+        raise ValueError(f"Unsupported image path: {url_or_fpath}")
+
+
+def __pad_fn(input_ids_list, padding_value=0, target_len=None, padding_side="left"):
     # tensor shape is (batch_size, seq_len)
     max_len = max([ids.shape[1] for ids in input_ids_list])
     if target_len is not None:
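For reference, the new `fetch_image_url_or_fpath` helper above normalizes the three image sources the processor will accept. A quick illustrative sketch of its behavior (URLs and paths are placeholders; the helper relies on the module's existing `os`/`osp` imports):

```python
# Placeholder inputs; each call returns a local file path.
fetch_image_url_or_fpath("https://example.com/demo.jpg")  # streamed via requests into a temp file
fetch_image_url_or_fpath("file:///data/demo.jpg")         # "file://" prefix stripped, existing path returned
fetch_image_url_or_fpath("/data/demo.jpg")                 # plain existing file path passed through
# Any other input raises ValueError("Unsupported image path: ...").
```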
@@ -47,6 +74,8 @@ class VILAProcessorKwargs(ProcessingKwargs, total=False):
     }
 
 
+
+
 class VILAProcessor(ProcessorMixin):
     # attributes = ["image_processor", "tokenizer"]
     attributes = []
@@ -115,7 +144,7 @@ class VILAProcessor(ProcessorMixin):
         return BatchFeature(
             data={
                 # "input_ids": torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id),
-                "input_ids": vila_pad_fn(
+                "input_ids": __pad_fn(
                     input_ids,
                     padding_value=self.tokenizer.pad_token_id,
                     padding_side="left",
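The call site above now uses the renamed `__pad_fn`, which appears to left-pad each `input_ids` tensor to the batch maximum with the tokenizer's pad token. Its body is only partially shown in this diff, so the following is a minimal sketch under assumptions: `pad_left` is a hypothetical name, only the left-padding branch used at the call site is shown, and the `target_len` handling is guessed.

```python
import torch

def pad_left(input_ids_list, padding_value=0, target_len=None):
    # Each tensor is (1, seq_len); pad on the left so the sequences end-align.
    max_len = max(ids.shape[1] for ids in input_ids_list)
    if target_len is not None:
        max_len = max(max_len, target_len)  # assumption: target_len acts as a floor
    padded = [
        torch.nn.functional.pad(ids, (max_len - ids.shape[1], 0), value=padding_value)
        for ids in input_ids_list
    ]
    return torch.cat(padded, dim=0)  # (batch_size, max_len)
```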
@@ -216,9 +245,18 @@ class VILAProcessor(ProcessorMixin):
                 vila_chat["from"] = "human"
                 for content in chat["content"]:
                     if content["type"] == "image":
-                        vila_chat["value"].append(Image(content["path"]))
+                        if "path" in content:
+                            # VILA style
+                            media_key = "path"
+                        elif "image" in content:
+                            # Qwen style
+                            media_key = "image"
+                        else:
+                            raise ValueError(f"Unsupported content type `image`: {content}, `image` and `path` are required")
+                        vila_chat["value"].append(Image(fetch_image_url_or_fpath(content[media_key])))
                     elif content["type"] == "text":
                         vila_chat["value"].append(content["text"])
+                        # NOTE(ligeng): video supports are needed here
                     else:
                         raise ValueError(f"Unsupported content type: {content['type']}")
             elif chat["role"] == "assistant":
@@ -228,7 +266,6 @@ class VILAProcessor(ProcessorMixin):
                     vila_chat["value"].append(content["text"])
             vila_conv.append(vila_chat)
 
-        # return self(vila_conv)
         return vila_conv
 
 
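With the parsing change above, an image entry in the chat content may carry either the VILA-style `path` key or the Qwen-style `image` key, and its value can be a local path, a `file://` URI, or an http(s) URL, since it is routed through `fetch_image_url_or_fpath`. A sketch of a conversation dict the converter now accepts (file name and URL are placeholders; the processor call that consumes it is not part of this diff):

```python
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image", "path": "demo.jpg"},                       # VILA style
            {"type": "image", "image": "https://example.com/demo.jpg"},  # Qwen style, URL is downloaded first
            {"type": "text", "text": "Compare the two images."},
        ],
    },
]
```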
 
builder.py CHANGED
@@ -33,7 +33,7 @@ from transformers import (
     PreTrainedModel,
     PreTrainedTokenizer,
 )
-
+from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 # from .conversation import *
 from .conversation import SeparatorStyle, default_conversation
 
@@ -202,6 +202,8 @@ def build_llm_and_tokenizer(
             fp8_model_name_or_path, config=llm_cfg, torch_dtype=eval(config.model_dtype), *args, **kwargs
         )
     else:
+        if is_deepspeed_zero3_enabled():
+            kwargs.pop("device_map")
         llm = AutoModelForCausalLM.from_pretrained(
             model_name_or_path, config=llm_cfg, torch_dtype=eval(config.model_dtype), *args, **kwargs
         )
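The new guard in `build_llm_and_tokenizer` drops `device_map` before calling `from_pretrained` when DeepSpeed ZeRO-3 is active, because ZeRO-3 shards parameters itself and transformers rejects Accelerate-style `device_map` dispatch in that mode. A standalone sketch of the same pattern (`load_llm` is an illustrative wrapper, not repo code; it pops with a default so a missing `device_map` key does not raise, unlike the unconditional `pop` in the diff):

```python
from transformers import AutoModelForCausalLM
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled

def load_llm(model_name_or_path, **kwargs):
    # Under ZeRO-3, weights are partitioned across ranks at load time, so any
    # device_map passed by the caller must be removed before from_pretrained.
    if is_deepspeed_zero3_enabled():
        kwargs.pop("device_map", None)
    return AutoModelForCausalLM.from_pretrained(model_name_or_path, **kwargs)
```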