ClementRomac HF Staff committed on
Commit
6a479f7
·
1 Parent(s): ce5bd59

Upload processor

Browse files
Files changed (1) hide show
  1. processor.py +6 -21
processor.py CHANGED
@@ -1,10 +1,11 @@
1
  from itertools import chain
2
  from transformers import GitProcessor
3
 
 
4
  class GIAProcessor(GitProcessor):
5
- def __init__(self, image_processor, tokenizer):
6
  super().__init__(image_processor, tokenizer)
7
- self._block_size = 1024
8
 
9
  def _cut_text(self, examples):
10
  results = {
@@ -13,28 +14,12 @@ class GIAProcessor(GitProcessor):
13
  }
14
  for i in range(len(examples["input_ids"])):
15
  _input_size = len(examples["input_ids"][i])
16
- for j in range(max(1, _input_size // self._block_size)):
17
- results["input_ids"].append(examples["input_ids"][i][j*self._block_size:(j+1)*self._block_size])
18
- results["attention_mask"].append(examples["attention_mask"][i][j * self._block_size:(j + 1) * self._block_size])
19
 
20
  return results
21
 
22
- # def _group_texts(self, examples):
23
- # # Concatenate all texts.
24
- # concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
25
- # total_length = len(concatenated_examples[list(examples.keys())[0]])
26
- # # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict.
27
- # # We could add padding if the model supported it instead of this drop, you can customize this part to your needs.
28
- # if total_length > self._block_size:
29
- # total_length = (total_length // self._block_size) * self._block_size
30
- #
31
- # # Split by chunks of max_len.
32
- # result = {
33
- # k: [t[i: i + self._block_size] for i in range(0, total_length, self._block_size)]
34
- # for k, t in concatenated_examples.items()
35
- # }
36
- # return result
37
-
38
  def __call__(self, examples, return_tensors=None, **kwargs):
39
  if "text" in examples and not "images" in examples:
40
  encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)
 
1
  from itertools import chain
2
  from transformers import GitProcessor
3
 
4
+
5
  class GIAProcessor(GitProcessor):
6
+ def __init__(self, image_processor, tokenizer, max_input_size):
7
  super().__init__(image_processor, tokenizer)
8
+ self._max_input_size = max_input_size
9
 
10
  def _cut_text(self, examples):
11
  results = {
 
14
  }
15
  for i in range(len(examples["input_ids"])):
16
  _input_size = len(examples["input_ids"][i])
17
+ for j in range(max(1, _input_size // self._max_input_size)):
18
+ results["input_ids"].append(examples["input_ids"][i][j*self._max_input_size:(j + 1) * self._max_input_size])
19
+ results["attention_mask"].append(examples["attention_mask"][i][j * self._max_input_size:(j + 1) * self._max_input_size])
20
 
21
  return results
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  def __call__(self, examples, return_tensors=None, **kwargs):
24
  if "text" in examples and not "images" in examples:
25
  encoded_text = self.tokenizer(examples["text"], return_tensors=return_tensors)