File size: 8,269 Bytes
a3e4a80 56ed6b6 cec9fd5 ee6fe1d 56ed6b6 2072c9a 56ed6b6 2072c9a 56ed6b6 2072c9a 56ed6b6 ee6fe1d 56ed6b6 ee6fe1d 56ed6b6 2072c9a 56ed6b6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 |
"""Processor class for MERaLiON."""
from typing import List, Optional, Union
import numpy as np
from transformers.feature_extraction_utils import BatchFeature
from transformers.processing_utils import ProcessorMixin
from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput
# copied from transformers.models.qwen2_audio.processing_qwen2_audio.Qwen2AudioProcessor
class MERaLiONProcessor(ProcessorMixin):
    r"""
    Constructs a MERaLiON processor which wraps a whisper feature extractor and a gemma tokenizer into a single
    processor.

    [`MERaLiONProcessor`] offers all the functionalities of [`WhisperFeatureExtractor`] and [`GemmaTokenizer`]. See the
    [`~MERaLiONProcessor.__call__`] and [`~MERaLiONProcessor.decode`] for more information.

    Args:
        feature_extractor ([`WhisperFeatureExtractor`], *optional*):
            The feature extractor is a required input.
        tokenizer ([`GemmaTokenizer`], *optional*):
            The tokenizer is a required input.
        fixed_speech_embeds_length (`int`, *optional*, defaults to 100):
            Number of times each speech placeholder token found in the text is repeated before tokenization.
        speech_token_index (`int`, *optional*, defaults to 255999):
            Key into the tokenizer's `added_tokens_decoder` used to look up the speech placeholder token.
        time_duration_limit (`int`, *optional*, defaults to -1):
            Default maximum audio duration in seconds. Values `<= 0` disable truncation.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Default value of the `do_normalize` flag forwarded to the feature extractor.
    """

    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "WhisperFeatureExtractor"
    tokenizer_class = "AutoTokenizer"
    valid_kwargs = [
        "fixed_speech_embeds_length",
        "speech_token_index",
        "time_duration_limit",
        "do_normalize",
    ]

    def __init__(
        self,
        feature_extractor=None,
        tokenizer=None,
        fixed_speech_embeds_length=100,
        speech_token_index=255999,
        time_duration_limit=-1,
        do_normalize=True,
    ):
        self.fixed_speech_embeds_length = fixed_speech_embeds_length
        self.speech_token_index = speech_token_index
        self.time_duration_limit = time_duration_limit
        self.do_normalize = do_normalize

        super().__init__(feature_extractor, tokenizer)

        # `self.tokenizer` is only read after the base initializer has run.
        self.speech_token = self.tokenizer.added_tokens_decoder[self.speech_token_index].content

    def _process_text(self, text):
        """
        Repeat every speech placeholder token in `text` `fixed_speech_embeds_length` times.

        Args:
            text (`str`, `List[str]` or `Tuple[str]`):
                A single prompt or a batch of prompts.

        Returns:
            `str` for a single prompt, otherwise a `list` with one expanded string per input item.
        """
        expanded = self.speech_token * self.fixed_speech_embeds_length
        if isinstance(text, (list, tuple)):
            return [item.replace(self.speech_token, expanded) for item in text]
        return text.replace(self.speech_token, expanded)

    def _slice_audios(self, audios, time_duration_limit, sampling_rate):
        """
        Truncate audio(s) to at most `time_duration_limit` seconds.

        Args:
            audios (`np.ndarray`, `List[np.ndarray]`, `List[float]`):
                The audio or batch of audios to truncate. Arrays are truncated along their last axis.
            time_duration_limit (`int` or `float`):
                Maximum duration in seconds. Values `<= 0` disable truncation.
            sampling_rate (`int`):
                Number of samples per second, used to convert the limit into a sample count.

        Returns:
            The truncated audio(s). Inputs of an unrecognized type are returned unchanged.
        """
        if time_duration_limit <= 0:
            return audios

        # Slice bounds must be integers, even when the limit is fractional.
        slice_length = int(time_duration_limit * sampling_rate)

        if isinstance(audios, np.ndarray):
            if audios.ndim == 0:
                return audios
            # Covers the 1-D (single audio) and 2-D (batch) layouts alike.
            return audios[..., :slice_length]

        if isinstance(audios, (list, tuple)):
            # A flat sequence of numbers is one audio given as raw samples.
            if audios and isinstance(audios[0], (int, float, np.number)):
                return audios[:slice_length]
            return [audio[:slice_length] for audio in audios]

        # Unknown container: hand it over untouched rather than returning None.
        return audios

    def __call__(
        self,
        text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
        audios: Optional[Union[np.ndarray, List[np.ndarray]]] = None,
        padding: Union[bool, str, PaddingStrategy] = True,
        sampling_rate: Optional[int] = None,
        time_duration_limit: Optional[int] = None,
        do_normalize: Optional[bool] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequence(s) and audio(s). This method forwards the `text`
        and `kwargs` arguments to the tokenizer's `__call__` to encode the text. To prepare the audio(s), this method
        forwards the `audios` and `kwargs` arguments to the feature extractor's `__call__` if `audios` is not `None`.
        Please refer to the docstring of the above two methods for more information.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            audios (`np.ndarray`, `List[np.ndarray]`, *optional*):
                The audio or batch of audios to be prepared. Each audio can be a NumPy array.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:
                - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a
                  single sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
                  lengths).
            sampling_rate (`int`, *optional*):
                The sampling rate of the audio in hertz (Hz). Defaults to the feature extractor's `sampling_rate`.
            time_duration_limit (`int`, *optional*):
                The max input time duration in seconds. Defaults to the processor's `time_duration_limit`.
            do_normalize (`bool`, *optional*):
                Whether or not to zero-mean unit-variance normalize the input. Defaults to the processor's
                `do_normalize`.

        Returns:
            [`BatchFeature`]: Holds `input_ids` and `attention_mask` from the tokenizer. When `audios` is given it
            also holds the feature extractor outputs, with its `attention_mask` stored as `feature_attention_mask`.

        Raises:
            ValueError: If `text` is `None`.
        """
        if text is None:
            raise ValueError("You need to specify a `text` input to process.")

        # Per-call arguments win; otherwise fall back to the configured defaults.
        if sampling_rate is None:
            sampling_rate = self.feature_extractor.sampling_rate
        if time_duration_limit is None:
            time_duration_limit = self.time_duration_limit
        if do_normalize is None:
            do_normalize = self.do_normalize

        inputs_dict = {}

        text = self._process_text(text)
        text_input = self.tokenizer(
            text=text,
            return_tensors="pt",
            add_special_tokens=False,
            return_attention_mask=True,
            padding=padding,
            **kwargs,
        )
        inputs_dict["input_ids"] = text_input.input_ids
        inputs_dict["attention_mask"] = text_input.attention_mask

        if audios is not None:
            audios = self._slice_audios(audios, time_duration_limit, sampling_rate)
            audio_inputs = self.feature_extractor(
                audios,
                sampling_rate=sampling_rate,
                return_tensors="pt",
                return_attention_mask=True,
                padding="max_length",
                # Use the resolved per-call value, not unconditionally the instance default.
                do_normalize=do_normalize,
                **kwargs,
            )
            # Rename attention_mask so it does not overwrite the tokenizer's mask.
            audio_inputs["feature_attention_mask"] = audio_inputs.pop("attention_mask")
            inputs_dict.update(audio_inputs)

        return BatchFeature(data=inputs_dict)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's `batch_decode`. Please refer to the docstring of
        that method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's `decode`. Please refer to the docstring of that
        method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """
        Input names of the tokenizer and the feature extractor plus `feature_attention_mask`, with duplicates removed
        and first-seen order kept.
        """
        tokenizer_input_names = self.tokenizer.model_input_names
        feature_extractor_input_names = self.feature_extractor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names + ["feature_attention_mask"]))