Upload files with `vila-upload`.
Changes to `auto_processor.py` (+41 -5):
```diff
@@ -18,8 +18,35 @@ from .media import Image, Video, extract_media
 from .mm_utils import process_image, process_images
 from .tokenizer_utils import tokenize_conversation
 
-
-
+def fetch_image_url_or_fpath(url_or_fpath):
+    if url_or_fpath.startswith("http") or url_or_fpath.startswith("https"):
+        import tempfile
+        import requests
+
+        # Download the image to a temporary file
+        temp_dir = tempfile.mkdtemp()
+        temp_file = os.path.join(temp_dir, os.path.basename(url_or_fpath))
+
+        response = requests.get(url_or_fpath, stream=True)
+        response.raise_for_status()
+
+        with open(temp_file, "wb") as f:
+            for chunk in response.iter_content(chunk_size=8192):
+                f.write(chunk)
+
+        return temp_file
+    elif url_or_fpath.startswith("file://"):
+        fpath = url_or_fpath.replace("file://", "")
+        assert osp.exists(fpath), f"File {fpath} does not exist"
+        return fpath
+    elif osp.exists(url_or_fpath):
+        assert osp.isfile(url_or_fpath), f"File {url_or_fpath} is not a file"
+        return url_or_fpath
+    else:
+        raise ValueError(f"Unsupported image path: {url_or_fpath}")
+
+
+def __pad_fn(input_ids_list, padding_value=0, target_len=None, padding_side="left"):
     # tensor shape is (batch_size, seq_len)
     max_len = max([ids.shape[1] for ids in input_ids_list])
     if target_len is not None:
```
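The new helper normalizes three accepted input forms to a local file path. A minimal usage sketch, assuming the function is importable from this module; the URLs and paths below are placeholders, and the remote case needs `requests` installed and a reachable URL:

```python
# All three forms return a local path to an image file.
p1 = fetch_image_url_or_fpath("https://example.com/cat.png")  # downloaded to a temp dir
p2 = fetch_image_url_or_fpath("file:///tmp/cat.png")          # "file://" prefix stripped
p3 = fetch_image_url_or_fpath("/tmp/cat.png")                 # returned unchanged if it exists
```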
```diff
@@ -47,6 +74,8 @@ class VILAProcessorKwargs(ProcessingKwargs, total=False):
 }
 
 
+
+
 class VILAProcessor(ProcessorMixin):
     # attributes = ["image_processor", "tokenizer"]
     attributes = []
@@ -115,7 +144,7 @@ class VILAProcessor(ProcessorMixin):
         return BatchFeature(
             data={
                 # "input_ids": torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id),
-                "input_ids":
+                "input_ids": __pad_fn(
                     input_ids,
                     padding_value=self.tokenizer.pad_token_id,
                     padding_side="left",
```
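The change routes `input_ids` through the new `__pad_fn`, padding with the tokenizer's pad token on the left. Left padding is the usual choice for batched decoder-only generation: it keeps every prompt right-aligned, so generated tokens continue directly from the last real token of each sequence. A minimal sketch of the idea, under the shape assumption in the code comment (each entry is `(1, seq_len)`); this re-implements the behavior for illustration, not VILA's exact function:

```python
import torch

def left_pad(input_ids_list, padding_value=0):
    # Each entry has shape (1, seq_len); pad all to the longest sequence.
    max_len = max(ids.shape[1] for ids in input_ids_list)
    rows = []
    for ids in input_ids_list:
        pad = torch.full((1, max_len - ids.shape[1]), padding_value, dtype=ids.dtype)
        rows.append(torch.cat([pad, ids], dim=1))  # pad on the left
    return torch.cat(rows, dim=0)  # shape (batch_size, max_len)

a = torch.tensor([[5, 6, 7]])  # seq_len 3
b = torch.tensor([[8, 9]])     # seq_len 2
print(left_pad([a, b]))
# tensor([[5, 6, 7],
#         [0, 8, 9]])
```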
```diff
@@ -216,9 +245,17 @@ class VILAProcessor(ProcessorMixin):
                 vila_chat["from"] = "human"
                 for content in chat["content"]:
                     if content["type"] == "image":
-
+                        if "path" in content:
+                            # VILA style
+                            vila_chat["value"].append(Image(fetch_image_url_or_fpath(content["path"])))
+                        elif "image" in content:
+                            # Qwen style
+                            vila_chat["value"].append(Image(fetch_image_url_or_fpath(content["image"])))
+                        else:
+                            raise ValueError(f"Unsupported content type `image`: {content}, `image` and `path` are required")
                     elif content["type"] == "text":
                         vila_chat["value"].append(content["text"])
+                        # NOTE(ligeng): video supports are needed here
                     else:
                         raise ValueError(f"Unsupported content type: {content['type']}")
                 elif chat["role"] == "assistant":
```
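The converter now accepts an image entry keyed either by `path` (VILA style) or by `image` (Qwen style). A sketch of the two accepted message shapes; field values are placeholders, and the `role`/`content` layout follows the HF chat format implied by the loop above:

```python
# VILA style: image entry keyed by "path".
vila_msg = {
    "role": "user",
    "content": [
        {"type": "image", "path": "/tmp/cat.png"},
        {"type": "text", "text": "What is in this image?"},
    ],
}

# Qwen style: the same entry keyed by "image"; URLs also work,
# since both keys pass through fetch_image_url_or_fpath.
qwen_msg = {
    "role": "user",
    "content": [
        {"type": "image", "image": "https://example.com/cat.png"},
        {"type": "text", "text": "What is in this image?"},
    ],
}
```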
```diff
@@ -228,7 +265,6 @@ class VILAProcessor(ProcessorMixin):
                 vila_chat["value"].append(content["text"])
             vila_conv.append(vila_chat)
 
-        # return self(vila_conv)
         return vila_conv
 
 
```
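After conversion, `vila_conv` is a list of turns, each with a `from` tag and a `value` list mixing `Image` objects and plain strings. A sketch of the result for a one-round chat; the assistant-side `from` tag is not visible in this diff, so `"gpt"` here is an assumption:

```python
[
    {"from": "human", "value": [Image("/tmp/cat.png"), "What is in this image?"]},
    {"from": "gpt", "value": ["A cat sitting on a windowsill."]},  # "gpt" tag assumed
]
```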