```py
cppe5["train"] = cppe5["train"].with_transform(transform_aug_ann)
cppe5["train"][15]
```

```
{'pixel_values': tensor([[[ 0.9132,  0.9132,  0.9132,  ..., -1.9809, -1.9809, -1.9809],
          [ 0.9132,  0.9132,  0.9132,  ..., -1.9809, -1.9809, -1.9809],
          [ 0.9132,  0.9132,  0.9132,  ..., -1.9638, -1.9638, -1.9638],
          ...,
          [-1.5699, -1.5699, -1.5699,  ..., -1.9980, -1.9980, -1.9980],
          [-1.5528, -1.5528, -1.5528,  ..., -1.9980, -1.9809, -1.9809],
          [-1.5528, -1.5528, -1.5528,  ..., -1.9980, -1.9809, -1.9809]],

         [[ 1.3081,  1.3081,  1.3081,  ..., -1.8431, -1.8431, -1.8431],
          [ 1.3081,  1.3081,  1.3081,  ..., -1.8431, -1.8431, -1.8431],
          [ 1.3081,  1.3081,  1.3081,  ..., -1.8256, -1.8256, -1.8256],
          ...,
          [-1.3179, -1.3179, -1.3179,  ..., -1.8606, -1.8606, -1.8606],
          [-1.3004, -1.3004, -1.3004,  ..., -1.8606, -1.8431, -1.8431],
          [-1.3004, -1.3004, -1.3004,  ..., -1.8606, -1.8431, -1.8431]],

         [[ 1.4200,  1.4200,  1.4200,  ..., -1.6476, -1.6476, -1.6476],
          [ 1.4200,  1.4200,  1.4200,  ..., -1.6476, -1.6476, -1.6476],
          [ 1.4200,  1.4200,  1.4200,  ..., -1.6302, -1.6302, -1.6302],
          ...,
          [-1.0201, -1.0201, -1.0201,  ..., -1.5604, -1.5604, -1.5604],
          [-1.0027, -1.0027, -1.0027,  ..., -1.5604, -1.5430, -1.5430],
          [-1.0027, -1.0027, -1.0027,  ..., -1.5604, -1.5430, -1.5430]]]),
 'pixel_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]),
 'labels': {'size': tensor([800, 800]), 'image_id': tensor([756]), 'class_labels': tensor([4]), 'boxes': tensor([[0.7340, 0.6986, 0.3414, 0.5944]]), 'area': tensor([519544.4375]), 'iscrowd': tensor([0]), 'orig_size': tensor([480, 480])}}
```

You have successfully augmented the individual images and prepared their annotations.