Unlike other data collators, this one needs to apply different padding methods to input_values and labels, since the audio inputs and the tokenized transcriptions have different lengths:
import torch

from dataclasses import dataclass
from typing import Dict, List, Union

from transformers import AutoProcessor


@dataclass
class DataCollatorCTCWithPadding:
    processor: AutoProcessor
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"][0]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")
        labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt")

        # replace padding with -100 so padded label positions are ignored by the loss
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels

        return batch

Now instantiate your DataCollatorCTCWithPadding:

data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")
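
As a quick sanity check (a made-up illustration, not part of the guide), you can run the collator on two hand-crafted examples and confirm that the inputs are padded to the longest audio while padded label positions become -100; the sample values and token ids below are arbitrary:

# two fake preprocessed examples of different lengths; values and ids are arbitrary
features = [
    {"input_values": [[0.1] * 16000], "labels": [5, 12, 7]},
    {"input_values": [[0.2] * 24000], "labels": [9, 3]},
]

batch = data_collator(features)
print(batch["input_values"].shape)  # torch.Size([2, 24000]): padded to the longest audio
print(batch["labels"])              # the shorter label row ends in -100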

Evaluate

Including a metric during training is often helpful for evaluating your model's performance.
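
For speech recognition, the word error rate (WER) is the standard metric. As a minimal sketch (assuming the 🤗 Evaluate library and its jiwer dependency are installed), you can load and try it like this:

import evaluate

wer = evaluate.load("wer")

# WER counts word-level substitutions, insertions, and deletions against
# the reference; one error out of two reference words gives 0.5
print(wer.compute(predictions=["hello wrld"], references=["hello world"]))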