chunhuizng committed
Commit 917b255 · verified · Parent(s): 392ead7

Create audio_only_processor.py

Files changed (1)
  1. audio_only_processor.py +84 -0
audio_only_processor.py ADDED
@@ -0,0 +1,84 @@
+ # audio_only_processor.py
+
+ import numpy as np
+ from typing import List, Optional, Union
+ from transformers import WhisperFeatureExtractor, Qwen2TokenizerFast
+ from transformers.processing_utils import ProcessorMixin
+ from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput
+ from transformers.feature_extraction_utils import BatchFeature
+
+
+ class AudioOnlyProcessor(ProcessorMixin):
+     """
+     A processor class for AudioOnlyThinker. Handles text and audio input only (no image/video support).
+     """
+
+     feature_extractor_class = "WhisperFeatureExtractor"
+     tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
+     model_input_names = ["input_features", "attention_mask", "input_ids", "feature_attention_mask"]
+
+     def __init__(self, feature_extractor=None, tokenizer=None, chat_template=None):
+         self.audio_token = "<|AUDIO|>"
+         self.audio_bos_token = "<|audio_bos|>"
+         self.audio_eos_token = "<|audio_eos|>"
+         self.tokenizer = tokenizer
+         self.feature_extractor = feature_extractor
+         self.current_processor = self.tokenizer
+         self.chat_template = chat_template
+
+     def __call__(
+         self,
+         text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
+         audios: Union[np.ndarray, List[np.ndarray]],
+         sampling_rate: Optional[int] = 16000,
+         padding: Union[bool, str, PaddingStrategy] = False,
+         **kwargs,
+     ) -> BatchFeature:
+         if not isinstance(text, list):
+             text = [text]
+
+         audios_inputs = self.feature_extractor(
+             audios, sampling_rate=sampling_rate, return_attention_mask=True, padding="max_length", **kwargs
+         )
+         # Rename the audio mask so it does not collide with the tokenizer's attention_mask.
+         audios_inputs["feature_attention_mask"] = audios_inputs.pop("attention_mask")
+
+         # Each step halves the sequence: mel frames -> post-conv frames -> audio token count.
+         # np.asarray keeps the arithmetic working whether the mask is a list, NumPy array, or tensor.
+         feature_lengths = np.asarray(audios_inputs["feature_attention_mask"]).sum(-1)
+         input_lengths = (feature_lengths - 1) // 2 + 1
+         audio_lengths = (input_lengths - 2) // 2 + 1
+
+         # Expand each <|AUDIO|> into per-frame tokens (the placeholder avoids re-matching the inserted token)
+         for i in range(len(text)):
+             text[i] = text[i].replace(
+                 self.audio_token,
+                 "<|audio_placeholder|>" * audio_lengths[i],  # one audio clip per text entry
+                 1,
+             )
+             text[i] = text[i].replace("<|audio_placeholder|>", self.audio_token)
+
+         text_inputs = self.tokenizer(text, padding=padding, return_tensors=kwargs.get("return_tensors", None))
+
+         return BatchFeature(data={**text_inputs, **audios_inputs}, tensor_type=kwargs.get("return_tensors"))
+
+     def apply_chat_template(self, conversations, chat_template=None, **kwargs):
+         if isinstance(conversations[0], dict):
+             conversations = [conversations]
+         return self.tokenizer.apply_chat_template(conversations, chat_template=chat_template, **kwargs)
+
+     def batch_decode(self, *args, **kwargs):
+         return self.tokenizer.batch_decode(*args, **kwargs)
+
+     def decode(self, *args, **kwargs):
+         return self.tokenizer.decode(*args, **kwargs)
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+         tokenizer = Qwen2TokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
+         feature_extractor = WhisperFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
+         return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
+
+     def save_pretrained(self, save_directory):
+         self.tokenizer.save_pretrained(save_directory)
+         self.feature_extractor.save_pretrained(save_directory)
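
For reference, a minimal usage sketch (not part of this commit). The checkpoint path is hypothetical, and it assumes a repo that ships a Whisper feature-extractor config together with a Qwen2 tokenizer whose vocabulary already defines the <|AUDIO|>, <|audio_bos|>, and <|audio_eos|> special tokens:

import numpy as np

from audio_only_processor import AudioOnlyProcessor

# Hypothetical checkpoint path.
processor = AudioOnlyProcessor.from_pretrained("path/to/audio-only-thinker")

# One second of silence at 16 kHz stands in for a real clip.
waveform = np.zeros(16000, dtype=np.float32)
prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>Transcribe the audio."

batch = processor(text=prompt, audios=waveform, padding=True, return_tensors="pt")

# 1 s of audio -> 100 mel frames; (100 - 1) // 2 + 1 = 50 post-conv frames;
# (50 - 2) // 2 + 1 = 25, so <|AUDIO|> is expanded to 25 audio positions.
print(batch["input_ids"].shape)       # token ids, including the 25 audio positions
print(batch["input_features"].shape)  # (1, n_mels, 3000) mel features, padded to max_length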