Commit
·
6a479f7
1
Parent(s):
ce5bd59
Upload processor
Browse files- processor.py +6 -21
processor.py
CHANGED
@@ -1,10 +1,11 @@
|
|
1 |
from itertools import chain
|
2 |
from transformers import GitProcessor
|
3 |
|
|
|
4 |
class GIAProcessor(GitProcessor):
|
5 |
-
def __init__(self, image_processor, tokenizer):
|
6 |
super().__init__(image_processor, tokenizer)
|
7 |
-
self.
|
8 |
|
9 |
def _cut_text(self, examples):
|
10 |
results = {
|
@@ -13,28 +14,12 @@ class GIAProcessor(GitProcessor):
|
|
13 |
}
|
14 |
for i in range(len(examples["input_ids"])):
|
15 |
_input_size = len(examples["input_ids"][i])
|
16 |
-
for j in range(max(1, _input_size // self.
|
17 |
-
results["input_ids"].append(examples["input_ids"][i][j*self.
|
18 |
-
results["attention_mask"].append(examples["attention_mask"][i][j * self.
|
19 |
|
20 |
return results
|
21 |
|
22 |
-
# def _group_texts(self, examples):
|
23 |
-
# # Concatenate all texts.
|
24 |
-
# concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
25 |
-
# total_length = len(concatenated_examples[list(examples.keys())[0]])
|
26 |
-
# # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
|
27 |
-
# # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
|
28 |
-
# if total_length > self._block_size:
|
29 |
-
# total_length = (total_length // self._block_size) * self._block_size
|
30 |
-
#
|
31 |
-
# # Split by chunks of max_len.
|
32 |
-
# result = {
|
33 |
-
# k: [t[i: i + self._block_size] for i in range(0, total_length, self._block_size)]
|
34 |
-
# for k, t in concatenated_examples.items()
|
35 |
-
# }
|
36 |
-
# return result
|
37 |
-
|
38 |
def __call__(self, examples, return_tensors=None, **kwargs):
|
39 |
if "text" in examples and not "images" in examples:
|
40 |
encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)
|
|
|
1 |
from itertools import chain
|
2 |
from transformers import GitProcessor
|
3 |
|
4 |
+
|
5 |
class GIAProcessor(GitProcessor):
|
6 |
+
def __init__(self, image_processor, tokenizer, max_input_size):
|
7 |
super().__init__(image_processor, tokenizer)
|
8 |
+
self._max_input_size = max_input_size
|
9 |
|
10 |
def _cut_text(self, examples):
|
11 |
results = {
|
|
|
14 |
}
|
15 |
for i in range(len(examples["input_ids"])):
|
16 |
_input_size = len(examples["input_ids"][i])
|
17 |
+
for j in range(max(1, _input_size // self._max_input_size)):
|
18 |
+
results["input_ids"].append(examples["input_ids"][i][j*self._max_input_size:(j + 1) * self._max_input_size])
|
19 |
+
results["attention_mask"].append(examples["attention_mask"][i][j * self._max_input_size:(j + 1) * self._max_input_size])
|
20 |
|
21 |
return results
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
def __call__(self, examples, return_tensors=None, **kwargs):
|
24 |
if "text" in examples and not "images" in examples:
|
25 |
encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)
|