Ligeng-Zhu committed on
Commit
3383eeb
·
verified ·
1 Parent(s): 679ac79

Upload files with `vila-upload`.

Browse files
Files changed (1) hide show
  1. auto_processor.py +41 -5
auto_processor.py CHANGED
@@ -18,8 +18,35 @@ from .media import Image, Video, extract_media
18
  from .mm_utils import process_image, process_images
19
  from .tokenizer_utils import tokenize_conversation
20
 
21
-
22
- def vila_pad_fn(input_ids_list, padding_value=0, target_len=None, padding_side="left"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  # tensor shape is (batch_size, seq_len)
24
  max_len = max([ids.shape[1] for ids in input_ids_list])
25
  if target_len is not None:
@@ -47,6 +74,8 @@ class VILAProcessorKwargs(ProcessingKwargs, total=False):
47
  }
48
 
49
 
 
 
50
  class VILAProcessor(ProcessorMixin):
51
  # attributes = ["image_processor", "tokenizer"]
52
  attributes = []
@@ -115,7 +144,7 @@ class VILAProcessor(ProcessorMixin):
115
  return BatchFeature(
116
  data={
117
  # "input_ids": torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id),
118
- "input_ids": vila_pad_fn(
119
  input_ids,
120
  padding_value=self.tokenizer.pad_token_id,
121
  padding_side="left",
@@ -216,9 +245,17 @@ class VILAProcessor(ProcessorMixin):
216
  vila_chat["from"] = "human"
217
  for content in chat["content"]:
218
  if content["type"] == "image":
219
- vila_chat["value"].append(Image(content["path"]))
 
 
 
 
 
 
 
220
  elif content["type"] == "text":
221
  vila_chat["value"].append(content["text"])
 
222
  else:
223
  raise ValueError(f"Unsupported content type: {content['type']}")
224
  elif chat["role"] == "assistant":
@@ -228,7 +265,6 @@ class VILAProcessor(ProcessorMixin):
228
  vila_chat["value"].append(content["text"])
229
  vila_conv.append(vila_chat)
230
 
231
- # return self(vila_conv)
232
  return vila_conv
233
 
234
 
 
18
  from .mm_utils import process_image, process_images
19
  from .tokenizer_utils import tokenize_conversation
20
 
21
def fetch_image_url_or_fpath(url_or_fpath):
    """Resolve an image reference to a path on the local filesystem.

    Three forms of reference are accepted:

    * ``http://`` / ``https://`` URL -- the content is downloaded into a
      freshly created temporary directory and the path of the downloaded
      file is returned. The temporary directory is NOT removed by this
      function; the caller owns it.
    * ``file://`` URI -- the scheme prefix is stripped and the remaining
      path is returned if it exists.
    * Plain filesystem path -- returned unchanged if it names a regular file.

    Args:
        url_or_fpath: URL, ``file://`` URI, or local path of the image.

    Returns:
        A local filesystem path (str) pointing at the image.

    Raises:
        FileNotFoundError: a ``file://`` URI points at a missing path.
        ValueError: the path exists but is not a regular file, or the
            reference is neither a supported URL nor an existing path.
        requests.HTTPError: the download returned an error status.
    """
    # Imported locally (like ``tempfile``/``requests`` below) so the helper
    # does not depend on which ``os``/``osp`` aliases the module defines.
    import os

    # Match the full scheme, not just the "http" prefix, so that a local
    # file whose name merely starts with "http" is not treated as a URL.
    if url_or_fpath.startswith(("http://", "https://")):
        import tempfile
        from urllib.parse import urlparse

        import requests

        # Derive the filename from the URL *path* only: the query string and
        # fragment must not leak into the filename, and a URL ending in "/"
        # has an empty basename, so fall back to a fixed name.
        filename = os.path.basename(urlparse(url_or_fpath).path) or "image"

        # A timeout prevents an unresponsive server from hanging the caller;
        # the context manager releases the streamed connection.
        with requests.get(url_or_fpath, stream=True, timeout=60) as response:
            response.raise_for_status()

            # Create the directory only after the request succeeded, so failed
            # downloads do not leave empty temporary directories behind.
            temp_dir = tempfile.mkdtemp()
            temp_file = os.path.join(temp_dir, filename)
            with open(temp_file, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        return temp_file

    if url_or_fpath.startswith("file://"):
        # Strip only the leading scheme; ``str.replace`` would also remove
        # any later occurrence of "file://" inside the path.
        fpath = url_or_fpath[len("file://"):]
        # Explicit raise instead of ``assert``: asserts vanish under ``-O``.
        if not os.path.exists(fpath):
            raise FileNotFoundError(f"File {fpath} does not exist")
        return fpath

    if os.path.exists(url_or_fpath):
        if not os.path.isfile(url_or_fpath):
            raise ValueError(f"File {url_or_fpath} is not a file")
        return url_or_fpath

    raise ValueError(f"Unsupported image path: {url_or_fpath}")
47
+
48
+
49
+ def __pad_fn(input_ids_list, padding_value=0, target_len=None, padding_side="left"):
50
  # tensor shape is (batch_size, seq_len)
51
  max_len = max([ids.shape[1] for ids in input_ids_list])
52
  if target_len is not None:
 
74
  }
75
 
76
 
77
+
78
+
79
  class VILAProcessor(ProcessorMixin):
80
  # attributes = ["image_processor", "tokenizer"]
81
  attributes = []
 
144
  return BatchFeature(
145
  data={
146
  # "input_ids": torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id),
147
+ "input_ids": __pad_fn(
148
  input_ids,
149
  padding_value=self.tokenizer.pad_token_id,
150
  padding_side="left",
 
245
  vila_chat["from"] = "human"
246
  for content in chat["content"]:
247
  if content["type"] == "image":
248
+ if "path" in content:
249
+ # VILA style
250
+ vila_chat["value"].append(Image(fetch_image_url_or_fpath(content["path"])))
251
+ elif "image" in content:
252
+ # Qwen style
253
+ vila_chat["value"].append(Image(fetch_image_url_or_fpath(content["image"])))
254
+ else:
255
+ raise ValueError(f"Unsupported content type `image`: {content}, `image` and `path` are required")
256
  elif content["type"] == "text":
257
  vila_chat["value"].append(content["text"])
258
+ # NOTE(ligeng): video supports are needed here
259
  else:
260
  raise ValueError(f"Unsupported content type: {content['type']}")
261
  elif chat["role"] == "assistant":
 
265
  vila_chat["value"].append(content["text"])
266
  vila_conv.append(vila_chat)
267
 
 
268
  return vila_conv
269
 
270