import copy
import warnings
from decimal import Decimal, ROUND_HALF_UP
from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn.utils.rnn import pad_sequence

from transformers.generation.configuration_utils import GenerationConfig, GenerationMode
from transformers.generation.logits_process import (
    LogitsProcessorList,
    SuppressTokensAtBeginLogitsProcessor,
    SuppressTokensLogitsProcessor,
    WhisperNoSpeechDetection,
)
from transformers.generation.stopping_criteria import (
    StoppingCriteriaList,
)
from transformers.generation.utils import (
    BeamScorer,
    GenerateBeamDecoderOnlyOutput,
    GenerateBeamEncoderDecoderOutput,
    GenerateBeamOutput,
    GenerateDecoderOnlyOutput,
    GenerateEncoderDecoderOutput,
    GenerateNonBeamOutput,
    _split_model_inputs,
    stack_model_outputs,
)
from transformers.modeling_outputs import BaseModelOutput
from transformers.models.whisper.generation_whisper import _get_attr_from_logit_processors, _pad_to_max_length
from transformers.models.whisper.modeling_whisper import (
    WhisperForConditionalGeneration,
)
from transformers.models.whisper.tokenization_whisper import TASK_IDS, TO_LANGUAGE_CODE
from transformers.utils import logging

from .decoding import CTCRescorerLogitsProcessor, LogSoftmaxProcessor
from .utils import WhisperTimeStampLogitsProcessorCustom

logging.set_verbosity_debug()
logger = logging.get_logger("transformers")


class DiCoWGenerationMixin(WhisperForConditionalGeneration):
    def _prepare_encoder_decoder_kwargs_for_generation(
            self,
            inputs_tensor: torch.Tensor,
            model_kwargs,
            model_input_name,
            generation_config,
    ) -> Dict[str, Any]:
        # self.encoder_output_lens = self._get_feat_extract_output_lengths(
        #     model_kwargs['attention_mask_enc'].sum(dim=1)
        # ).int()
        generation_config.output_hidden_states = True  # pylint: disable=no-member
        model_kwargs = super()._prepare_encoder_decoder_kwargs_for_generation(
            inputs_tensor, model_kwargs, model_input_name, generation_config
        )
        self.encoder_logits = model_kwargs["encoder_outputs"].logits
        return model_kwargs

    @staticmethod
    def _expand_inputs_for_generation(
            expand_size: int = 1,
            is_encoder_decoder: bool = False,
            input_ids: Optional[torch.LongTensor] = None,
            **model_kwargs,
    ) -> Tuple[torch.LongTensor, Dict[str, Any]]:
        """Expands tensors from [batch_size, ...]
to [batch_size * expand_size, ...]""" def _expand_dict_for_generation(dict_to_expand): for key in dict_to_expand: if dict_to_expand[key] is not None and isinstance(dict_to_expand[key], torch.Tensor) and key != "loss": dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0) return dict_to_expand if input_ids is not None: input_ids = input_ids.repeat_interleave(expand_size, dim=0) model_kwargs = _expand_dict_for_generation(model_kwargs) if is_encoder_decoder: if model_kwargs.get("encoder_outputs") is None: raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.") model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"]) if "hidden_states" in model_kwargs["encoder_outputs"]: model_kwargs["encoder_outputs"]["hidden_states"] = tuple( hidden_state.repeat_interleave(expand_size, dim=0) for hidden_state in model_kwargs["encoder_outputs"]["hidden_states"] ) return input_ids, model_kwargs def generate( self, input_features: Optional[torch.Tensor] = None, generation_config: Optional[GenerationConfig] = None, logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, synced_gpus: bool = False, return_timestamps: Optional[bool] = None, task: Optional[str] = None, language: Optional[str] = None, is_multilingual: Optional[bool] = None, prompt_ids: Optional[torch.Tensor] = None, prompt_condition_type: Optional[str] = None, # first-segment, all-segments condition_on_prev_tokens: Optional[bool] = None, temperature: Optional[Union[float, Tuple[float, ...]]] = None, compression_ratio_threshold: Optional[float] = None, logprob_threshold: Optional[float] = None, no_speech_threshold: Optional[float] = None, num_segment_frames: Optional[int] = None, attention_mask: Optional[torch.Tensor] = None, time_precision: float = 0.02, return_token_timestamps: Optional[bool] = None, return_segments: bool = False, return_dict_in_generate: Optional[bool] = None, assistant_model: Optional["PreTrainedModel"] = None, **kwargs, ): if condition_on_prev_tokens: raise NotImplementedError("Current version does not support conditioning") gen_c, _ = self._prepare_generation_config(generation_config, **kwargs) gen_mode = gen_c.get_generation_mode(assistant_model) if gen_mode not in [GenerationMode.GREEDY_SEARCH, GenerationMode.BEAM_SEARCH]: raise ValueError( f"Provided generation mode {gen_mode} is not supported" f" for WhisperForConditionalGeneration with joint CTC decoding") if "stno_mask" in kwargs: self.stno_mask = kwargs["stno_mask"] if "encoder_outputs" in kwargs: self.encoder_logits = kwargs["encoder_outputs"].logits # pylint: disable=no-member # 0. deprecate old inputs if "inputs" in kwargs: input_features = kwargs.pop("inputs") warnings.warn( "The input name `inputs` is deprecated. Please make sure to use `input_features` instead.", FutureWarning, ) # 1. prepare generation config generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs) # 2. 
set global generate variables input_stride = self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0] num_segment_frames = input_stride * self.config.max_source_positions batch_size, total_input_frames = self._retrieve_total_input_frames( input_features=input_features, input_stride=input_stride, kwargs=kwargs ) is_shortform = total_input_frames <= num_segment_frames if is_shortform: # warn user of ignored inputs self._maybe_warn_unused_inputs( condition_on_prev_tokens=condition_on_prev_tokens, temperature=temperature, compression_ratio_threshold=compression_ratio_threshold, logprob_threshold=logprob_threshold, no_speech_threshold=no_speech_threshold, total_input_frames=total_input_frames, ) # 3. Make sure generation config is correctly set # Make sure the generation config is correctly set depending on whether timestamps are to be returned or not self._set_return_outputs( return_dict_in_generate=return_dict_in_generate, return_token_timestamps=return_token_timestamps, is_shortform=is_shortform, logprob_threshold=logprob_threshold, generation_config=generation_config, ) self._set_return_timestamps( return_timestamps=return_timestamps, is_shortform=is_shortform, generation_config=generation_config ) self._set_language_and_task( language=language, task=task, is_multilingual=is_multilingual, generation_config=generation_config ) self._set_num_frames( return_token_timestamps=return_token_timestamps, generation_config=generation_config, kwargs=kwargs ) self._set_thresholds_and_condition( generation_config=generation_config, logprob_threshold=logprob_threshold, compression_ratio_threshold=compression_ratio_threshold, no_speech_threshold=no_speech_threshold, condition_on_prev_tokens=condition_on_prev_tokens, ) self._set_prompt_condition_type( generation_config=generation_config, prompt_condition_type=prompt_condition_type, ) # pass self.config for backward compatibility init_tokens = self._retrieve_init_tokens( input_features, batch_size=batch_size, generation_config=generation_config, config=self.config, num_segment_frames=num_segment_frames, kwargs=kwargs, ) # passing `decoder_input_ids` is deprecated - the only exception is for assisted generation # where the input ids are handled explicitly by the generate method self._check_decoder_input_ids(kwargs=kwargs) # 3. Retrieve logits processors device = kwargs["encoder_outputs"][0].device if "encoder_outputs" in kwargs else input_features.device begin_index = init_tokens.shape[1] logits_processor = self._retrieve_logit_processors( generation_config=generation_config, logits_processor=logits_processor, begin_index=begin_index, # begin index is index of first generated decoder token is_shortform=is_shortform, num_beams=kwargs.get("num_beams", 1), device=device, ) # 5. 
If we're in shortform mode, simple generate the whole input at once and return the output if is_shortform: if temperature is not None: generation_config.temperature = temperature decoder_input_ids = kwargs.pop("decoder_input_ids", None) if decoder_input_ids is None: decoder_input_ids = init_tokens if prompt_ids is not None: decoder_input_ids = torch.cat( [prompt_ids[None].repeat(decoder_input_ids.shape[0], 1), decoder_input_ids], dim=-1 ) max_new_tokens = generation_config.max_new_tokens if generation_config.max_new_tokens is not None else 0 if max_new_tokens + decoder_input_ids.shape[-1] > self.config.max_target_positions: raise ValueError( f"The length of `decoder_input_ids` equal `prompt_ids` plus special start tokens is {decoder_input_ids.shape[-1]}, and the `max_new_tokens` " f"is {max_new_tokens}. Thus, the combined length of " f"`decoder_input_ids` and `max_new_tokens` is: {max_new_tokens + decoder_input_ids.shape[-1]}. This exceeds the " f"`max_target_positions` of the Whisper model: {self.config.max_target_positions}. " "You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, " f"so that their combined length is less than {self.config.max_target_positions}." ) outputs = super().generate( input_features, generation_config=generation_config, logits_processor=logits_processor, stopping_criteria=stopping_criteria, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, synced_gpus=synced_gpus, decoder_input_ids=decoder_input_ids, **kwargs, ) if generation_config.return_token_timestamps and hasattr(generation_config, "alignment_heads"): outputs["token_timestamps"] = self._extract_token_timestamps( outputs, generation_config.alignment_heads, num_frames=generation_config.num_frames ) # print("\n".join(self.tokenizer.batch_decode(outputs,skip_special_tokens=True, decode_with_timestamps=True))) return outputs # 6. Else we're in longform mode which is more complex. # We need to chunk the audio input depending on when the model generates timestamp tokens # 6.1 Set and retrieve global longform generation variables self._set_condition_on_prev_tokens( condition_on_prev_tokens=condition_on_prev_tokens, generation_config=generation_config ) timestamp_begin = generation_config.no_timestamps_token_id + 1 temperatures = [temperature] if not isinstance(temperature, (list, tuple)) else temperature temperature = temperatures[0] batch_size = input_features.shape[0] max_frames, seek = self._retrieve_max_frames_and_seek( batch_size=batch_size, attention_mask=attention_mask, total_input_frames=total_input_frames ) # 6.2 Preppare running variables, list for generation cur_bsz = batch_size current_segments = self._prepare_segments( prompt_ids=prompt_ids, batch_size=batch_size, generation_config=generation_config, ) batch_idx_map = list(range(batch_size)) do_condition_on_prev_tokens = [condition_on_prev_tokens for _ in range(batch_size)] # 6.2 Transcribe audio until we reach the end of all input audios while (seek < max_frames).any(): # 6.3 NOTE: When in longform transcription mode and batch size > 1 we need to dynamically reduce the batch size during the loop # in case one audio finished earlier than another one. 
Thus, we need to keep a table of "previous-index-2-current-index" in order # to know which original audio is being decoded # Set updated index map, duration of previously decoded chunks and number of max frames of current decoding chunk input_features, cur_bsz, batch_idx_map = self._maybe_reduce_batch( input_features=input_features, seek=seek, max_frames=max_frames, cur_bsz=cur_bsz, batch_idx_map=batch_idx_map, ) time_offset = seek * time_precision / input_stride seek_num_frames = (max_frames - seek).clamp(max=num_segment_frames) # 6.4 cut out next 30s segment from input features segment_input = self._get_input_segment( input_features=input_features, seek=seek, seek_num_frames=seek_num_frames, num_segment_frames=num_segment_frames, cur_bsz=cur_bsz, batch_idx_map=batch_idx_map, ) # 6.5 prepare decoder input ids suppress_tokens = _get_attr_from_logit_processors( logits_processor, SuppressTokensLogitsProcessor, "suppress_tokens" ) decoder_input_ids, kwargs = self._prepare_decoder_input_ids( cur_bsz=cur_bsz, init_tokens=init_tokens, current_segments=current_segments, batch_idx_map=batch_idx_map, do_condition_on_prev_tokens=do_condition_on_prev_tokens, prompt_ids=prompt_ids, generation_config=generation_config, config=self.config, device=segment_input.device, suppress_tokens=suppress_tokens, kwargs=kwargs, ) # 6.6 set max new tokens or max length self._set_max_new_tokens_and_length( config=self.config, decoder_input_ids=decoder_input_ids, generation_config=generation_config, ) # 6.7 Set current `begin_index` for all logit processors for proc in logits_processor: if hasattr(proc, "set_begin_index"): proc.set_begin_index(decoder_input_ids.shape[-1]) # 6.8 Run generate with fallback seek_sequences, seek_outputs, should_skip, do_condition_on_prev_tokens = self.generate_with_fallback( segment_input=segment_input, decoder_input_ids=decoder_input_ids, cur_bsz=cur_bsz, batch_idx_map=batch_idx_map, seek=seek, num_segment_frames=num_segment_frames, max_frames=max_frames, temperatures=temperatures, generation_config=generation_config, logits_processor=logits_processor, stopping_criteria=stopping_criteria, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, synced_gpus=synced_gpus, return_token_timestamps=return_token_timestamps, do_condition_on_prev_tokens=do_condition_on_prev_tokens, kwargs=kwargs, ) # 6.9 In every generated sequence, split by timestamp tokens and extract segments if self.config.mt_num_speakers ==1: for i, seek_sequence in enumerate(seek_sequences): prev_i = batch_idx_map[i] if should_skip[i]: seek[prev_i] += seek_num_frames[prev_i] continue segments, segment_offset = self._retrieve_segment( seek_sequence=seek_sequence, seek_outputs=seek_outputs, time_offset=time_offset, timestamp_begin=timestamp_begin, seek_num_frames=seek_num_frames, time_precision=time_precision, input_stride=input_stride, prev_idx=prev_i, idx=i, return_token_timestamps=return_token_timestamps, ) current_segments[prev_i] += segments seek[prev_i] += segment_offset else: # We have to make sure all speakers are synchronized thus we have to find minumum of seeks that each instance like for j, seek_seqs in enumerate([seek_sequences[i*self.config.mt_num_speakers:(i+1)*self.config.mt_num_speakers] for i in range(len(seek_sequences)//self.config.mt_num_speakers)]): indexes = [j*self.config.mt_num_speakers + i for i in range(self.config.mt_num_speakers)] prev_ids = [batch_idx_map[i] for i in indexes] if all([should_skip[i] for i in indexes]): for i, prev_i in zip(indexes, prev_ids): seek[prev_i] += seek_num_frames[prev_i] 
                        continue

                    segments, segment_offset = self._retrieve_segment_mt(
                        seek_sequences=seek_seqs,
                        seek_outputs=seek_outputs,
                        time_offset=time_offset,
                        timestamp_begin=timestamp_begin,
                        seek_num_frames=seek_num_frames,
                        time_precision=time_precision,
                        input_stride=input_stride,
                        prev_ids=prev_ids,
                        ids=indexes,
                        return_token_timestamps=return_token_timestamps,
                    )

                    for prev_i, i in zip(prev_ids, range(self.config.mt_num_speakers)):
                        current_segments[prev_i] += segments[i]
                        seek[prev_i] += segment_offset[i]

        # 7. Once all segments are added to the list of all segments, called `current_segments`, we extract the predicted
        # output tokens from the list of dicts. If we use batch size > 1, we make sure to pad the output
        final_segments = (
            [x[1:] for x in current_segments]
            if (prompt_ids is not None and generation_config.prompt_condition_type == "first-segment")
            else current_segments
        )
        sequences = _pad_to_max_length(
            final_segments, generation_config.pad_token_id, device=self.device, padding="right"
        )

        # 8. If we return all segments, the predicted output sequences are put under `"sequences"`.
        output = {"sequences": sequences, "segments": final_segments}
        self.encoder_logits = None
        if isinstance(output, dict):
            output = self._fix_timestamps_from_segmentation(output)
        return output

    @staticmethod
    def _find_common_seek(sequences, seeks):
        """
        Finds the minimum seek that does not overlap with segments of the other sequences; if that fails,
        it falls back to candidate seeks taken from the segment boundaries of all sequences (segment starts,
        plus segment ends that lie before the minimum seek).

        Assumes:
        - 'seeks' is a list with one seek time per sequence,
        - each seek time is in timestamp * 100 format (e.g., 125.5s -> 12550).
        """

        def is_valid_seek(seek_time, exclude_seq_idx):
            for idx, seq in enumerate(sequences):
                if idx == exclude_seq_idx:
                    continue
                for segment in seq:
                    start = getattr(segment, 'start', segment['start'])
                    end = getattr(segment, 'end', segment['end'])
                    if seek_time < start:
                        break  # Segments are sorted, so no later segment can contain seek_time
                    if start < seek_time < end:
                        return False
            return True

        # Step 1: Find the minimum seek
        # if all seek values are the same, return it immediately
        seeks = [s if isinstance(s, int) else s.item() for s in seeks]
        if len(set(seeks)) == 1:
            return seeks[0]

        min_seek_val = min(seeks)
        min_seek_idx = seeks.index(min_seek_val)
        min_seek_real = min_seek_val / 100

        if is_valid_seek(min_seek_real, min_seek_idx):
            return min_seek_val

        # Step 2: Try fallback seeks collected from all sequences: every segment start, plus every segment end
        # that lies before the minimum seek
        fallback_seeks = set()
        for idx, seq in enumerate(sequences):
            for segment in seq:
                start = getattr(segment, 'start', segment['start'])
                if isinstance(start, torch.Tensor):
                    start = start.item()
                candidate = round(start, 2)
                fallback_seeks.add((candidate, idx, True))

                end = getattr(segment, 'end', segment['end'])
                if isinstance(end, torch.Tensor):
                    end = end.item()
                if end < min_seek_real:
                    candidate = round(end, 2)
                    fallback_seeks.add((candidate, idx, True))

        valid_fallbacks = [
            (int(s * 100), idx, is_start)
            for s, idx, is_start in fallback_seeks
            if is_valid_seek(s, min_seek_idx)
        ]

        if valid_fallbacks:
            # return the latest valid candidate seek value (in timestamp * 100 format)
            return max(valid_fallbacks)[0]

        # Step 3: Nothing valid
        return 0

    @staticmethod
    def remove_segments_after_seek(sequences, seek, eps=100):
        """
        Keep only the segments that finish before (or within `eps` of) the given seek timestamp.

        Args:
            sequences: List of lists, each containing segments (dict or object with 'start' and 'end').
            seek: Integer seek timestamp (e.g., timestamp * 100).
            eps: Tolerance, in the same timestamp * 100 units.

        Returns:
            A new list of lists containing only the kept segments; the input is not modified.
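
        Illustrative example (hypothetical values; a sketch of the intended behaviour, not a verified doctest):

            >>> seqs = [[{"start": 0.0, "end": 4.5}, {"start": 5.0, "end": 12.0}]]
            >>> DiCoWGenerationMixin.remove_segments_after_seek(seqs, seek=500)
            [[{'start': 0.0, 'end': 4.5}]]

        With the default eps=100, only segments whose end * 100 <= 500 + 100 (i.e. end <= 6.0 s) are kept.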
""" return [[seg for seg in seq if (getattr(seg, 'end', seg['end']) * 100 <= seek +eps)] for seq in sequences] @staticmethod def _retrieve_segment_wo_seek( seek_sequence, seek_outputs, time_offset, timestamp_begin, seek_num_frames, time_precision, input_stride, prev_idx, idx, return_token_timestamps, ): # find the predicted "end of segment" predictions of Whisper # "end of segment" predictions occur whenever Whisper predicts a timestamp token timestamp_tokens: torch.Tensor = seek_sequence.ge(timestamp_begin) single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True] timestamp_segment_indices = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] timestamp_segment_indices.add_(1) token_timestamps = seek_outputs[idx]["token_timestamps"] if return_token_timestamps else [] # If whisper predicted a "end of segment" via a timestep token, let's go ever each # "end of segment" prediction and slice the decoding into segments accordingly if len(timestamp_segment_indices) > 0: # if the output contains two consecutive timestamp tokens slices = timestamp_segment_indices.tolist() segments = [] if single_timestamp_ending: slices.append(len(seek_sequence)) last_slice = 0 # Add each segment to list of all segments for current_slice in slices: sliced_tokens = seek_sequence[last_slice:current_slice] start_timestamp_pos = sliced_tokens[0].item() - timestamp_begin end_timestamp_pos = sliced_tokens[-1].item() - timestamp_begin segments.append( { "start": time_offset[prev_idx] + start_timestamp_pos * time_precision, "end": time_offset[prev_idx] + end_timestamp_pos * time_precision, "tokens": sliced_tokens, "result": seek_outputs[idx], } ) if return_token_timestamps: segments[-1]["token_timestamps"] = ( token_timestamps[last_slice:current_slice] + time_offset[prev_idx] ) last_slice = current_slice if not single_timestamp_ending: # generate all predictions after the last predicted "end of segment" and seek by 30s sliced_tokens = seek_sequence[last_slice:] start_timestamp_pos = sliced_tokens[0].item() - timestamp_begin end_timestamp_pos = seek_num_frames[prev_idx] // 2 segments.append( { "start": time_offset[prev_idx] + start_timestamp_pos * time_precision, "end": time_offset[prev_idx] + end_timestamp_pos * time_precision, "tokens": sliced_tokens, "result": seek_outputs[idx], } ) segment_offset = seek_num_frames[prev_idx] else: # If whisper does not predict any "end of segment" token, then # the whole decoding is considered a segment and we add it to the list of segments timestamps = seek_sequence[timestamp_tokens.nonzero().flatten()] start_timestamp_pos = 0.0 last_timestamp_pos = seek_num_frames[prev_idx] // 2 if timestamps.numel() > 1: start_timestamp_pos = timestamps[-2].item() - timestamp_begin last_timestamp_pos = timestamps[-1].item() - timestamp_begin elif timestamps.numel() == 1: # no consecutive timestamps but it has a timestamp; use the last one. 
start_timestamp_pos = timestamps[-1].item() - timestamp_begin segments = [ { "start": time_offset[prev_idx] + start_timestamp_pos * time_precision, "end": time_offset[prev_idx] + last_timestamp_pos * time_precision, "tokens": seek_sequence, "result": seek_outputs[idx], } ] segment_offset = seek_num_frames[prev_idx] return segments, segment_offset def _retrieve_segment_mt( self, seek_sequences, seek_outputs, time_offset, timestamp_begin, seek_num_frames, time_precision, input_stride, prev_ids, ids, return_token_timestamps, ): sequences, seeks = [], [] for sequence, prev_id, idx in zip(seek_sequences, prev_ids, ids): seq, seek = self._retrieve_segment( seek_sequence=sequence, seek_outputs=seek_outputs, time_offset=time_offset, timestamp_begin=timestamp_begin, seek_num_frames=seek_num_frames, time_precision=time_precision, input_stride=input_stride, prev_idx=prev_id, idx=idx, return_token_timestamps=return_token_timestamps, ) sequences.append(seq) seeks.append(seek +int(time_offset[prev_id] * 100)) # best_seek = self._find_common_seek(sequences, seeks) best_seek = seeks[0] # print(f"Best seek {best_seek}") if best_seek - (min(time_offset[prev_ids]) *100) < 100: # we cannot rollback, we have to decode segments as they are sequences, seeks = [], [] for sequence, prev_id, idx in zip(seek_sequences, prev_ids, ids): seq, seek = self._retrieve_segment_wo_seek( seek_sequence=sequence, seek_outputs=seek_outputs, time_offset=time_offset, timestamp_begin=timestamp_begin, seek_num_frames=seek_num_frames, time_precision=time_precision, input_stride=input_stride, prev_idx=prev_id, idx=idx, return_token_timestamps=return_token_timestamps, ) sequences.append(seq) seeks.append(seek) return sequences, seeks seqs_new = self.remove_segments_after_seek(sequences, best_seek) seeks = [best_seek - int(min(time_offset[prev_ids]) * 100) for _ in seeks] return seqs_new, seeks def _beam_search( self, input_ids: torch.LongTensor, beam_scorer: BeamScorer, logits_processor: LogitsProcessorList, stopping_criteria: StoppingCriteriaList, generation_config: GenerationConfig, synced_gpus: bool, logits_warper: Optional[LogitsProcessorList] = None, **model_kwargs, ) -> Union[GenerateBeamOutput, torch.LongTensor]: r""" Generates sequences of token ids for models with a language modeling head using **beam search decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. Parameters: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. beam_scorer (`BeamScorer`): An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. logits_processor (`LogitsProcessorList`): An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] used to modify the prediction scores of the language modeling head applied at each generation step. stopping_criteria (`StoppingCriteriaList`: An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] used to tell if the generation loop should stop. generation_config ([`~generation.GenerationConfig`]): The generation configuration to be used as parametrization of the decoding method. synced_gpus (`bool`): Whether to continue running the while loop until max_length (needed for ZeRO stage 3) logits_warper (`LogitsProcessorList`, *optional*): An instance of [`LogitsProcessorList`]. 
List of instances of class derived from [`LogitsWarper`] used to warp the prediction score distribution of the language modeling head applied before multinomial sampling at each generation step. Only required with sampling strategies (i.e. `do_sample` is set in `generation_config`) model_kwargs: Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is an encoder-decoder model the kwargs should include `encoder_outputs`. Return: [`generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. """ # init values pad_token_id = generation_config.pad_token_id eos_token_id = generation_config.eos_token_id output_attentions = generation_config.output_attentions output_hidden_states = generation_config.output_hidden_states output_scores = generation_config.output_scores output_logits = generation_config.output_logits return_dict_in_generate = generation_config.return_dict_in_generate sequential = generation_config.low_memory do_sample = generation_config.do_sample if do_sample is True and not isinstance(logits_warper, LogitsProcessorList): raise ValueError( "`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is " f"{logits_warper})." ) batch_size = len(beam_scorer._beam_hyps) num_beams = beam_scorer.num_beams batch_beam_size, cur_len = input_ids.shape model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) if num_beams * batch_size != batch_beam_size: raise ValueError( f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." ) # init attention / hidden states / scores tuples scores = () if (return_dict_in_generate and output_scores) else None raw_logits = () if (return_dict_in_generate and output_logits) else None beam_indices = ( tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None ) decoder_attentions = () if (return_dict_in_generate and output_attentions) else None cross_attentions = () if (return_dict_in_generate and output_attentions) else None decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None # if model is an encoder-decoder, retrieve encoder attention weights and hidden states if return_dict_in_generate and self.config.is_encoder_decoder: encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None encoder_hidden_states = ( model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None ) # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens # of the first beam are considered to avoid sampling the exact same tokens across all beams. 
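        # Illustrative sketch (assumed values, not from the original code): with batch_size=2 and num_beams=3,
        # the initialisation below produces
        #     beam_scores = [0.0, -1e9, -1e9, 0.0, -1e9, -1e9]
        # so in the first step only continuations of beam 0 of each batch item can survive the top-k selection.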
beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) beam_scores[:, 1:] = -1e9 beam_scores = beam_scores.view((batch_size * num_beams,)) this_peer_finished = False decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) # if sequential is True, split the input to batches of batch_size and run sequentially if sequential: if any( model_name in self.__class__.__name__.lower() for model_name in [ "fsmt", "reformer", "bloom", "ctrl", "gpt_bigcode", "transo_xl", "xlnet", "cpm", "jamba", ] ): raise RuntimeError( f"Currently generation for {self.__class__.__name__} is not supported " f"for `low_memory beam_search`. Please open an issue on GitHub if you need this feature." ) inputs_per_sub_batches = _split_model_inputs( model_inputs, split_size=batch_size, full_batch_size=batch_beam_size ) outputs_per_sub_batch = [ self( **inputs_per_sub_batch, return_dict=True, output_attentions=output_attentions, output_hidden_states=output_hidden_states, ) for inputs_per_sub_batch in inputs_per_sub_batches ] outputs = stack_model_outputs(outputs_per_sub_batch) else: # Unchanged original behavior outputs = self( **model_inputs, return_dict=True, output_attentions=output_attentions, output_hidden_states=output_hidden_states, ) if synced_gpus and this_peer_finished: cur_len = cur_len + 1 continue # don't waste resources running the code we don't need next_token_logits = outputs.logits[:, -1, :] next_token_scores = nn.functional.log_softmax( next_token_logits, dim=-1 ) # (batch_size * num_beams, vocab_size) next_token_scores_processed = logits_processor(input_ids, next_token_scores) if do_sample: next_token_scores_processed = logits_warper(input_ids, next_token_scores_processed) next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as( next_token_scores_processed ) # Store scores, attentions and hidden_states when required if return_dict_in_generate: if output_scores: scores += (next_token_scores_processed,) if output_logits: raw_logits += (next_token_logits,) if output_attentions: decoder_attentions += ( (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) ) if self.config.is_encoder_decoder: cross_attentions += (outputs.cross_attentions,) if output_hidden_states: decoder_hidden_states += ( (outputs.decoder_hidden_states,) if self.config.is_encoder_decoder else (outputs.hidden_states,) ) # reshape for beam search vocab_size = next_token_scores.shape[-1] next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) # Beam token selection: pick 1 + eos_token_id.shape[0] next tokens for each beam so we have at least 1 # non eos token per beam. 
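            # Worked example (assumed values, for illustration only): with a single EOS token and num_beams=4,
            # n_tokens_to_keep = max(2, 1 + 1) * 4 = 8, so even if every beam proposes EOS there are still
            # enough non-EOS candidates left to refill all 4 beams.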
n_eos_tokens = eos_token_id.shape[0] if eos_token_id is not None else 0 n_tokens_to_keep = max(2, 1 + n_eos_tokens) * num_beams if do_sample: probs = nn.functional.softmax(next_token_scores, dim=-1) next_tokens = torch.multinomial(probs, num_samples=n_tokens_to_keep) next_token_scores = torch.gather(next_token_scores, -1, next_tokens) next_token_scores, _indices = torch.sort(next_token_scores, descending=True, dim=1) next_tokens = torch.gather(next_tokens, -1, _indices) else: next_token_scores, next_tokens = torch.topk( next_token_scores, n_tokens_to_keep, dim=1, largest=True, sorted=True ) next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor") next_tokens = next_tokens % vocab_size # stateless beam_outputs = beam_scorer.process( input_ids, next_token_scores, next_tokens, next_indices, pad_token_id=pad_token_id, eos_token_id=eos_token_id, beam_indices=beam_indices, decoder_prompt_len=decoder_prompt_len, ) beam_scores = beam_outputs["next_beam_scores"] beam_next_tokens = beam_outputs["next_beam_tokens"] beam_idx = beam_outputs["next_beam_indices"] # Based on the beam idx and next tokens reshuffle the ctc prev states and scores if hasattr(self, "ctc_rescorer"): self.ctc_rescorer.update_state(beam_next_tokens, beam_idx) input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) model_kwargs = self._update_model_kwargs_for_generation( outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder, ) if model_kwargs.get("past_key_values", None) is not None: model_kwargs["past_key_values"] = self._temporary_reorder_cache( model_kwargs["past_key_values"], beam_idx ) if return_dict_in_generate and output_scores: beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices)))) # increase cur_len cur_len = cur_len + 1 if beam_scorer.is_done or all(stopping_criteria(input_ids, scores)): this_peer_finished = True sequence_outputs = beam_scorer.finalize( input_ids, beam_scores, next_tokens, next_indices, pad_token_id=pad_token_id, eos_token_id=eos_token_id, max_length=stopping_criteria.max_length, beam_indices=beam_indices, decoder_prompt_len=decoder_prompt_len, ) if return_dict_in_generate: if not output_scores: sequence_outputs["sequence_scores"] = None if self.config.is_encoder_decoder: return GenerateBeamEncoderDecoderOutput( sequences=sequence_outputs["sequences"], sequences_scores=sequence_outputs["sequence_scores"], scores=scores, logits=raw_logits, beam_indices=sequence_outputs["beam_indices"], encoder_attentions=encoder_attentions, encoder_hidden_states=encoder_hidden_states, decoder_attentions=decoder_attentions, cross_attentions=cross_attentions, decoder_hidden_states=decoder_hidden_states, past_key_values=model_kwargs.get("past_key_values"), ) else: return GenerateBeamDecoderOnlyOutput( sequences=sequence_outputs["sequences"], sequences_scores=sequence_outputs["sequence_scores"], scores=scores, logits=raw_logits, beam_indices=sequence_outputs["beam_indices"], attentions=decoder_attentions, hidden_states=decoder_hidden_states, past_key_values=model_kwargs.get("past_key_values"), ) else: return sequence_outputs["sequences"] def _sample( self, input_ids: torch.LongTensor, logits_processor: LogitsProcessorList, stopping_criteria: StoppingCriteriaList, generation_config: GenerationConfig, synced_gpus: bool, streamer: Optional["BaseStreamer"], logits_warper: Optional[LogitsProcessorList] = None, **model_kwargs, ) -> Union[GenerateNonBeamOutput, torch.LongTensor]: r""" Generates sequences of token ids for 
models with a language modeling head using **multinomial sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. Parameters: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. logits_processor (`LogitsProcessorList`): An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] used to modify the prediction scores of the language modeling head applied at each generation step. stopping_criteria (`StoppingCriteriaList`): An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] used to tell if the generation loop should stop. generation_config ([`~generation.GenerationConfig`]): The generation configuration to be used as parametrization of the decoding method. synced_gpus (`bool`): Whether to continue running the while loop until max_length (needed for ZeRO stage 3) streamer (`BaseStreamer`, *optional*): Streamer object that will be used to stream the generated sequences. Generated tokens are passed through `streamer.put(token_ids)` and the streamer is responsible for any further processing. logits_warper (`LogitsProcessorList`, *optional*): An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used to warp the prediction score distribution of the language modeling head applied before multinomial sampling at each generation step. Only required with sampling strategies (i.e. `do_sample` is set in `generation_config`) model_kwargs: Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is an encoder-decoder model the kwargs should include `encoder_outputs`. Return: [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. """ # init values pad_token_id = generation_config.pad_token_id output_attentions = generation_config.output_attentions output_hidden_states = generation_config.output_hidden_states output_scores = generation_config.output_scores output_logits = generation_config.output_logits return_dict_in_generate = generation_config.return_dict_in_generate has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria) do_sample = generation_config.do_sample if do_sample is True and not isinstance(logits_warper, LogitsProcessorList): raise ValueError( "`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is " f"{logits_warper})." 
) # init attention / hidden states / scores tuples scores = () if (return_dict_in_generate and output_scores) else None raw_logits = () if (return_dict_in_generate and output_logits) else None decoder_attentions = () if (return_dict_in_generate and output_attentions) else None cross_attentions = () if (return_dict_in_generate and output_attentions) else None decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None # if model is an encoder-decoder, retrieve encoder attention weights and hidden states if return_dict_in_generate and self.config.is_encoder_decoder: encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None encoder_hidden_states = ( model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None ) # keep track of which sequences are already finished batch_size = input_ids.shape[0] this_peer_finished = False unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): # prepare model inputs model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) # forward pass to get next token outputs = self( **model_inputs, return_dict=True, output_attentions=output_attentions, output_hidden_states=output_hidden_states, ) if synced_gpus and this_peer_finished: continue # don't waste resources running the code we don't need next_token_logits = outputs.logits[:, -1, :] # pre-process distribution next_token_scores = logits_processor(input_ids, next_token_logits) if do_sample: next_token_scores = logits_warper(input_ids, next_token_scores) # Store scores, attentions and hidden_states when required if return_dict_in_generate: if output_scores: scores += (next_token_scores,) if output_logits: raw_logits += (next_token_logits,) if output_attentions: decoder_attentions += ( (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) ) if self.config.is_encoder_decoder: cross_attentions += (outputs.cross_attentions,) if output_hidden_states: decoder_hidden_states += ( (outputs.decoder_hidden_states,) if self.config.is_encoder_decoder else (outputs.hidden_states,) ) # token selection if do_sample: probs = nn.functional.softmax(next_token_scores, dim=-1) next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) else: next_tokens = torch.argmax(next_token_scores, dim=-1) # finished sentences should have their next token be a padding token if has_eos_stopping_criteria: next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) # Based on the next tokens select the ctc prev states and scores if hasattr(self, "ctc_rescorer"): self.ctc_rescorer.update_state(next_tokens, torch.arange(next_tokens.shape[0])) # update generated ids, model inputs, and length for next step input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) if streamer is not None: streamer.put(next_tokens.cpu()) model_kwargs = self._update_model_kwargs_for_generation( outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder, ) unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) this_peer_finished = unfinished_sequences.max() == 0 if streamer is not None: streamer.end() if return_dict_in_generate: if self.config.is_encoder_decoder: return GenerateEncoderDecoderOutput( sequences=input_ids, scores=scores, 
logits=raw_logits, encoder_attentions=encoder_attentions, encoder_hidden_states=encoder_hidden_states, decoder_attentions=decoder_attentions, cross_attentions=cross_attentions, decoder_hidden_states=decoder_hidden_states, past_key_values=model_kwargs.get("past_key_values"), ) else: return GenerateDecoderOnlyOutput( sequences=input_ids, scores=scores, logits=raw_logits, attentions=decoder_attentions, hidden_states=decoder_hidden_states, past_key_values=model_kwargs.get("past_key_values"), ) else: return input_ids def prepare_kwargs_for_generate(self, segment_input, cur_bsz, batch_idx_map, seek, num_segment_frames, max_frames, kwargs): kwargs["attention_mask_enc"] = torch.ones(cur_bsz, segment_input.size(-1), device=segment_input.device) seek_vad = seek // 2 num_frames_vad = num_segment_frames // 2 max_frames_vad = max_frames // 2 seek_num_frames = (max_frames_vad - seek_vad).clamp(max=num_frames_vad) stno_masks = [] for i in range(cur_bsz): prev_i = batch_idx_map[i] segment_input_slice = kwargs["stno_mask"][prev_i: prev_i + 1, :, seek_vad[prev_i]: seek_vad[prev_i] + seek_num_frames[prev_i]] if segment_input_slice.shape[-1] < num_frames_vad: orig_len = segment_input_slice.shape[-1] # pad to 3000 if necessary segment_input_slice = torch.nn.functional.pad( segment_input_slice, pad=(0, num_frames_vad - orig_len) ) # set corresponding padding tokens to 1 in vad mask representing silence segment_input_slice[0, 0, orig_len:] = 1.0 stno_masks.append(segment_input_slice) kwargs["stno_mask"] = torch.cat(stno_masks, dim=0) self.stno_mask_seek = kwargs["stno_mask"] if "per_group_sizes" in kwargs: group_sizes = kwargs["per_group_sizes"].clone() group_sizes[:] = 0 cummulative_group_sizes = ( kwargs["per_group_sizes"].max().repeat(kwargs["per_group_sizes"].shape[0])).cumsum(dim=0) for i in batch_idx_map: group_idx = (cummulative_group_sizes > i).nonzero().min() group_sizes[group_idx] += 1 kwargs["per_group_sizes"] = group_sizes if self.vad_seek_callback is not None: self.vad_seek_callback(kwargs["stno_mask"]) return kwargs def generate_with_fallback( self, segment_input, decoder_input_ids, cur_bsz, batch_idx_map, seek, num_segment_frames, max_frames, temperatures, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, return_token_timestamps, do_condition_on_prev_tokens, kwargs, ): kwargs = copy.copy(kwargs) kwargs = self.prepare_kwargs_for_generate(segment_input, cur_bsz, batch_idx_map, seek, num_segment_frames, max_frames, kwargs) seek_sequences, seek_outputs, should_skip, do_condition_on_prev_tokens = super().generate_with_fallback( segment_input, decoder_input_ids, cur_bsz, batch_idx_map, seek, num_segment_frames, max_frames, temperatures, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, return_token_timestamps, do_condition_on_prev_tokens, kwargs, ) self.stno_mask_seek =None # for i, seq in enumerate(seek_outputs): # print(f"Sequence {i}: {self.tokenizer.decode(seq, decode_with_timestamps=True)}") # print("-"*50) return seek_sequences, seek_outputs, should_skip, do_condition_on_prev_tokens def _retrieve_init_tokens(self, input_features, batch_size, generation_config, config, num_segment_frames, kwargs): def replace_or_add(lst: List[int], num: int, itr: Iterator[int]): """short function to replace num with a itr in lst""" found = any(i in lst for i in itr) if found: lst = [num if i in itr else i for i in lst] else: lst.append(num) return lst def language_to_id(language: str) -> int: language = language.lower() if 
language in generation_config.lang_to_id.keys(): language_token = language elif language in TO_LANGUAGE_CODE.keys(): language_token = f"<|{TO_LANGUAGE_CODE[language]}|>" elif language in TO_LANGUAGE_CODE.values(): language_token = f"<|{language}|>" else: is_language_code = len(language) == 2 raise ValueError( f"Unsupported language: {language}. Language should be one of:" f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}." ) if language_token not in generation_config.lang_to_id: raise ValueError( f"{language_token} is not supported by this specific model as it is not in the `generation_config.lang_to_id`." "(You should just add it to the generation config)" ) return generation_config.lang_to_id[language_token] task = getattr(generation_config, "task", None) language = getattr(generation_config, "language", None) forced_decoder_ids = generation_config.forced_decoder_ids if forced_decoder_ids is not None: if language is None and task is None and forced_decoder_ids[0][1] is None: logger.warning_once( "Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English." "This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`." ) elif hasattr(config, "forced_decoder_ids") and config.forced_decoder_ids is not None: forced_decoder_ids = config.forced_decoder_ids elif forced_decoder_ids is not None and language is not None: logger.info( f"You have passed language={language}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. `forced_decoder_ids` will be ignored in favor of language={language}." ) forced_decoder_ids = None init_tokens = [generation_config.decoder_start_token_id] # Update init_tokens with languages lang_ids = None if forced_decoder_ids is not None: return forced_decoder_ids # from v4.39 the forced decoder ids are always None in favour of decoder input ids generation_config.forced_decoder_ids = None is_lang_id_undefined = len(init_tokens) <= 1 or (len(init_tokens) > 1 and init_tokens[1] is None) # Make sure language is a list of strings of the correct length if isinstance(language, (list, tuple)): if any(l is None for l in language): raise TypeError( "Expected `language` to be `None`, a single string (e.g. `'en'`), or a list of strings with length equal to the batch size (e.g. `('en', 'fr')` for a batch size of 2). Got a list containing `None`." ) if len(language) != batch_size: raise ValueError( "When passing a list of languages, the length of the list must match the batch size. " f"Expected length of {batch_size}, but got {len(language)} languages." 
) languages = language elif language is None: # Language will be detected for each item in batch languages = [None] * batch_size else: languages = [language] # Use a length-1 list now, broadcast later # Separate init_tokens for each language init_tokens = [copy.copy(init_tokens) for _ in languages] if language is not None and lang_ids is not None: lang_ids = [language_to_id(l) for l in languages] elif hasattr(generation_config, "lang_to_id") and is_lang_id_undefined: # language is not defined or intentially set to `None` to trigger language detection lang_ids = self.detect_language( input_features=input_features, encoder_outputs=kwargs.get("encoder_outputs", None), generation_config=generation_config, num_segment_frames=num_segment_frames, ).tolist() if lang_ids is not None: # append or replace lang_ids to init_tokens for i in range(len(init_tokens)): if len(init_tokens[i]) > 1: init_tokens[i][1] = lang_ids[i] else: init_tokens[i].append(lang_ids[i]) del languages # Update init_tokens with task for i in range(len(init_tokens)): if task is not None: if task in TASK_IDS: init_tokens[i].append(generation_config.task_to_id[generation_config.task]) task_id = generation_config.task_to_id[generation_config.task] # if task is defined it'll overwrite task ids that might have already been defined via the generation_config replace_or_add(init_tokens[i], task_id, generation_config.task_to_id.values()) else: raise ValueError(f"The `{task}`task is not supported. The task should be one of `{TASK_IDS}`") elif language is not None and hasattr(generation_config, "task_to_id"): # if language is defined, but no task id is in `init_tokens`, default to transcribe if not any(ti in init_tokens[i] for ti in generation_config.task_to_id.values()): init_tokens[i].append(generation_config.task_to_id["transcribe"]) # let's make sure we don't pass `None` tokens as prompt tokens init_tokens[i] = [t for t in init_tokens[i] if t is not None] return torch.as_tensor(init_tokens, dtype=torch.long, device=self.device).expand(batch_size, -1) def detect_language( self, input_features: Optional[torch.FloatTensor] = None, encoder_outputs: Optional[Union[torch.FloatTensor, BaseModelOutput]] = None, generation_config: Optional[GenerationConfig] = None, num_segment_frames: int = 3000, ) -> torch.Tensor: """ Detects language from log-mel input features or encoder_outputs Parameters: input_features (`torch.Tensor` of shape `(batch_size, feature_size, sequence_length)`, *optional*): Float values of log-mel features extracted from the raw speech waveform. The raw speech waveform can be obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] for details. encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. generation_config (`~generation.GenerationConfig`, *optional*): The generation configuration to be used as base parametrization for the generation call. 
`**kwargs` passed to generate matching the attributes of `generation_config` will override them. If `generation_config` is not provided, the default will be used, which had the following loading priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s default values, whose documentation should be checked to parameterize generation. num_segment_frames (`int`, defaults to 3000): The number of log-mel frames the model expects Return: A `torch.LongTensor` representing the detected language ids. """ if input_features is None and encoder_outputs is None: raise ValueError("You have to specify either `input_features` or `encoder_outputs`") elif input_features is not None and encoder_outputs is not None: raise ValueError("Make sure to specificy only one of `input_features` or `encoder_outputs` - not both!") elif input_features is not None: inputs = {"input_features": input_features[:, :, :num_segment_frames]} batch_size = input_features.shape[0] elif encoder_outputs is not None: inputs = {"encoder_outputs": encoder_outputs} batch_size = ( encoder_outputs[0].shape[0] if isinstance(encoder_outputs, BaseModelOutput) else encoder_outputs[0] ) generation_config = generation_config or self.generation_config decoder_input_ids = ( torch.ones((batch_size, 1), device=self.device, dtype=torch.long) * generation_config.decoder_start_token_id ) with torch.no_grad(): logits = self(**inputs, decoder_input_ids=decoder_input_ids, stno_mask=self.stno_mask_seek if self.stno_mask_seek is not None else self.stno_mask[:, :, :num_segment_frames // 2]).logits[ :, -1] non_lang_mask = torch.ones_like(logits[0], dtype=torch.bool) non_lang_mask[list(generation_config.lang_to_id.values())] = False logits[:, non_lang_mask] = -np.inf lang_ids = logits.argmax(-1) return lang_ids def _get_logits_processor( self, generation_config: GenerationConfig, input_ids_seq_length: int, encoder_input_ids: torch.LongTensor, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]], logits_processor: Optional[LogitsProcessorList], device: str = None, model_kwargs: Optional[Dict[str, Any]] = None, negative_prompt_ids: Optional[torch.Tensor] = None, negative_prompt_attention_mask: Optional[torch.Tensor] = None, ) -> LogitsProcessorList: # pylint: disable=no-member gen_config_copy = copy.deepcopy(generation_config) gen_config_copy.forced_decoder_ids = None processors = super()._get_logits_processor( gen_config_copy, input_ids_seq_length, encoder_input_ids, prefix_allowed_tokens_fn, logits_processor, device, model_kwargs, negative_prompt_ids, negative_prompt_attention_mask, ) if hasattr(generation_config, "ctc_weight") and generation_config.ctc_weight > 0: enc_logits = self.encoder_logits if generation_config.num_beams <= 1: processors.append(LogSoftmaxProcessor()) else: enc_logits = enc_logits.repeat_interleave(generation_config.num_beams, dim=0) self.ctc_rescorer = CTCRescorerLogitsProcessor( enc_logits, torch.full((enc_logits.shape[0],), fill_value=enc_logits.shape[1], device=enc_logits.device), enc_logits.shape[-1] - 1, generation_config.pad_token_id.item(), generation_config.eos_token_id.item(), generation_config.decoder_start_token_id.item(), self.tokenizer, generation_config.ctc_margin, generation_config.ctc_weight, generation_config.num_beams, False, ) processors.append(self.ctc_rescorer) return processors def _retrieve_logit_processors(self, generation_config, logits_processor, begin_index, 
is_shortform, num_beams, device): if generation_config.return_timestamps is True: timestamp_processor = WhisperTimeStampLogitsProcessorCustom(generation_config, begin_index=begin_index) logits_processor = ( [timestamp_processor] if logits_processor is None else [timestamp_processor] + logits_processor ) if generation_config.suppress_tokens is not None: suppress_tokens_processor = SuppressTokensLogitsProcessor(generation_config.suppress_tokens, device=device) logits_processor = ( [suppress_tokens_processor] if logits_processor is None else [suppress_tokens_processor] + logits_processor ) generation_config.suppress_tokens = None if generation_config.begin_suppress_tokens is not None: begin_suppress_processor = SuppressTokensAtBeginLogitsProcessor( generation_config.begin_suppress_tokens, begin_index=begin_index, device=device ) logits_processor = ( [begin_suppress_processor] if logits_processor is None else [begin_suppress_processor] + logits_processor ) generation_config.begin_suppress_tokens = None if generation_config.no_speech_threshold is not None and not is_shortform: no_speech_detector = WhisperNoSpeechDetection( no_speech_token=generation_config.no_timestamps_token_id - 1, begin_index=begin_index, scores_is_logprobs=num_beams > 1, ) logits_processor = ( [no_speech_detector] if logits_processor is None else [no_speech_detector] + logits_processor ) no_speech_detector.set_model(self) return logits_processor @staticmethod def round_to_nearest_0_02(x): d = Decimal(str(x)) # Use str(x) to preserve input precision step = Decimal('0.02') # Divide, round, multiply back rounded = (d / step).to_integral_value(rounding=ROUND_HALF_UP) * step return rounded def _fix_timestamps_from_segmentation(self, sequences): """ Adjusts token sequences with global timestamps to fit within Whisper's 0–30s timestamp token range. This function modifies the input sequences by inserting appropriate timestamp tokens and offset corrections to ensure the decoded token order is correct, without splitting any segment. It aligns all timestamps to 0.02-second precision, inserts placeholder segments to bridge time gaps between 30-second windows, and maintains segment continuity during encoding. Args: sequences (dict): A dictionary containing: - 'segments': A list of segment lists, each segment being a dict with 'start', 'end', and 'tokens'. - 'sequences': A tensor used to determine device for padding. Returns: torch.Tensor: A batch of padded token sequences with corrected timestamp alignment. 
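
        Illustrative example (hypothetical segment, shown only as a sketch of the re-encoding): a segment with
        global timestamps 31.00-33.50 s cannot be emitted directly, since Whisper timestamp tokens only cover
        <|0.00|> ... <|30.00|>. The method therefore emits a dummy block "<|0.00|><|30.00|>" for the skipped
        first window and then encodes the segment as "<|1.00|> ...tokens... <|3.50|>" relative to the new window.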
""" # Get the token ID for the "<|0.00|>" timestamp used to detect dummy segments first_timestamp_token = self.tokenizer.get_vocab()["<|0.00|>"] results = [] # Filter out segments that are either empty or consist only of the "<|0.00|>" token for idx, sequence_segs in enumerate(sequences['segments']): sequences['segments'][idx] = [ seg for seg in sequence_segs if len(seg['tokens']) > 0 and (len(seg['tokens']) != 1 or seg['tokens'][0] != first_timestamp_token) ] # Iterate over each group of segments (e.g., one per utterance) for idx, sequence_segs in enumerate(sequences['segments']): result = [] prev_segment_end_time = None correction = Decimal(0.0) for i, seg in enumerate(sequence_segs): # Round start and end times to nearest 0.02 seconds start_time = self.round_to_nearest_0_02(seg['start'].item()) end_time = self.round_to_nearest_0_02(seg['end'].item()) tokens = seg['tokens'] # Determine which 30s window this segment falls into current_block = (start_time + correction) // 30 if prev_segment_end_time is not None: # If not the first segment, calculate difference in 30s windows prev_block = prev_segment_end_time // 30 num_dummies = current_block - prev_block - 1 # Insert (30, [], 30) marker if we're moving to a new block if current_block > prev_block: result.append((30, [], 30)) # Insert dummy segments to bridge skipped 30s blocks for _ in range(int(num_dummies)): result.append((0, [], 30)) else: # For the first segment, add dummy blocks if it starts after 30s for _ in range(int(start_time // 30)): result.append((0, [], 30)) # Determine whether segment fits in one block or wraps to the next if (start_time + correction) // 30 == (end_time + correction) // 30: # Segment fits within a single 30s window result.append(((start_time + correction) % 30, tokens, (end_time + correction) % 30)) else: # Segment would wrap across a 30s boundary new_seg_start = (correction + start_time) % 30 new_seg_end = end_time - start_time if new_seg_end >= new_seg_start: # Seek back to the beginning of the segment window result.append((new_seg_start, [], new_seg_start)) result.append((0, tokens, new_seg_end)) # Apply correction to align future timestamps to new 30s block correction = self.round_to_nearest_0_02(-(start_time % 30)) else: # Otherwise, just insert with adjusted times result.append((new_seg_start, tokens, new_seg_end)) correction = self.round_to_nearest_0_02(30 - (start_time % 30)) # print(f'Processed segment {i}, result: {self.tokenizer.decode(self.tokenizer("".join([f"<|{seg[0]:.2f}|>{self.tokenizer.decode(seg[1])}<|{seg[2]:.2f}|>" for seg in result]))["input_ids"], decode_with_timestamps=True)[-250:]}') # Update the previous segment's end time for next iteration prev_segment_end_time = end_time + correction # Convert result segments into a token sequence with proper timestamp formatting encoded = self.tokenizer( "".join([f"<|{seg[0]:.2f}|>{self.tokenizer.decode(seg[1])}<|{seg[2]:.2f}|>" for seg in result]) )['input_ids'] results.append(encoded) # Pad all sequences to the same length for batching sequences = pad_sequence( [torch.tensor(res, device=sequences['sequences'].device) for res in results], batch_first=True, padding_value=self.tokenizer.pad_token_id ) return sequences @staticmethod def _retrieve_segment( seek_sequence, seek_outputs, time_offset, timestamp_begin, seek_num_frames, time_precision, input_stride, prev_idx, idx, return_token_timestamps, ): # find the predicted "end of segment" predictions of Whisper # "end of segment" predictions occur whenever Whisper predicts a timestamp token 
    @staticmethod
    def _retrieve_segment(
        seek_sequence,
        seek_outputs,
        time_offset,
        timestamp_begin,
        seek_num_frames,
        time_precision,
        input_stride,
        prev_idx,
        idx,
        return_token_timestamps,
    ):
        # find the predicted "end of segment" predictions of Whisper
        # "end of segment" predictions occur whenever Whisper predicts a timestamp token
        timestamp_tokens: torch.Tensor = seek_sequence.ge(timestamp_begin)
        single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True]
        timestamp_segment_indices = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0]
        timestamp_segment_indices.add_(1)
        token_timestamps = seek_outputs[idx]["token_timestamps"] if return_token_timestamps else []

        # If Whisper predicted an "end of segment" via a timestamp token, let's go over each
        # "end of segment" prediction and slice the decoding into segments accordingly
        if len(timestamp_segment_indices) > 0:
            # if the output contains two consecutive timestamp tokens
            slices = timestamp_segment_indices.tolist()
            segments = []
            if single_timestamp_ending:
                slices.append(len(seek_sequence))

            last_slice = 0
            # Add each segment to the list of all segments
            for current_slice in slices:
                sliced_tokens = seek_sequence[last_slice:current_slice]
                start_timestamp_pos = sliced_tokens[0].item() - timestamp_begin
                end_timestamp_pos = sliced_tokens[-1].item() - timestamp_begin
                segments.append(
                    {
                        "start": time_offset[prev_idx] + start_timestamp_pos * time_precision,
                        "end": time_offset[prev_idx] + end_timestamp_pos * time_precision,
                        "tokens": sliced_tokens,
                        "result": seek_outputs[idx],
                    }
                )
                if return_token_timestamps:
                    segments[-1]["token_timestamps"] = (
                        token_timestamps[last_slice:current_slice] + time_offset[prev_idx]
                    )
                last_slice = current_slice

            if single_timestamp_ending:
                # a single timestamp at the end means there is no speech after the last timestamp.
                segment_offset = seek_num_frames[prev_idx]
            else:
                # otherwise, ignore the unfinished segment and seek to the last timestamp
                # here we throw away all predictions after the last predicted "end of segment"
                # since we are cutting right in the middle of an audio
                last_timestamp_pos = seek_sequence[last_slice - 1].item() - timestamp_begin
                segment_offset = last_timestamp_pos * input_stride
        else:
            # If Whisper does not predict any "end of segment" token, then
            # the whole decoding is considered a segment and we add it to the list of segments
            timestamps = seek_sequence[timestamp_tokens.nonzero().flatten()]
            start_timestamp_pos = 0.0
            last_timestamp_pos = seek_num_frames[prev_idx] // 2
            skip = False
            segment_offset = seek_num_frames[prev_idx]
            if timestamps.numel() > 1:
                start_timestamp_pos = timestamps[-2].item() - timestamp_begin
                last_timestamp_pos = timestamps[-1].item() - timestamp_begin
            elif timestamps.numel() == 1:
                # no consecutive timestamps, but it has a timestamp; use the last one.
                start_timestamp_pos = timestamps[-1].item() - timestamp_begin
                if start_timestamp_pos > 200:
                    # segment does not fit into the decoding window, so we need to roll back
                    segment_offset = start_timestamp_pos * input_stride - 100  # the timestamp might be inaccurate
                    skip = True
            else:
                # empty sequence, or sequence w/o timestamps
                skip = True

            if skip:
                segments = []
            else:
                segments = [
                    {
                        "start": time_offset[prev_idx] + start_timestamp_pos * time_precision,
                        "end": time_offset[prev_idx] + last_timestamp_pos * time_precision,
                        "tokens": seek_sequence,
                        "result": seek_outputs[idx],
                    }
                ]
                if return_token_timestamps:
                    segments[-1]["token_timestamps"] = token_timestamps + time_offset[prev_idx]
                segment_offset = seek_num_frames[prev_idx]

        if segment_offset <= 0:
            msg = f"Timestamps: {timestamps}, Segments: {segments}"
            raise ValueError(f"Segment offset: {segment_offset} <= 0. This should not happen!\n{msg}")

        return segments, segment_offset
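
    # Segment boundaries above are recovered from timestamp-token positions:
    # `token_id - timestamp_begin` gives the timestamp index, and multiplying by
    # `time_precision` (0.02 s) or `input_stride` converts it to seconds or input
    # frames. Illustrative numbers only (actual ids depend on the tokenizer):
    #
    #     timestamp_begin = 50365                          # assumed id of "<|0.00|>"
    #     token_id = 50515                                 # timestamp index 150
    #     seconds = (token_id - timestamp_begin) * 0.02    # -> 3.0 s into the window
    #     frames = (token_id - timestamp_begin) * 2        # with input_stride == 2
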
    def _postprocess_outputs(self, seek_outputs, decoder_input_ids, return_token_timestamps, generation_config):
        # remove all previously passed decoder input ids
        if isinstance(seek_outputs, torch.Tensor):
            seek_outputs = seek_outputs[:, decoder_input_ids.shape[-1]:]
            seek_outputs = torch.hstack((
                seek_outputs,
                torch.full(
                    (seek_outputs.shape[0], 1),
                    fill_value=generation_config.pad_token_id,
                    dtype=seek_outputs.dtype,
                    device=seek_outputs.device,
                ),
            ))
            # first_eos = (seek_outputs == generation_config.eos_token_id).int().argmax(dim=1)
            # biggest_timestamp = generation_config.no_timestamps_token_id + 1 + 30 * 50
            # empty_transcriptions = first_eos == 0
            # seek_outputs[empty_transcriptions, 0] = generation_config.no_timestamps_token_id + 1  # 0.00 timestamp
            # seek_outputs[empty_transcriptions, 1] = biggest_timestamp  # 30.00 timestamp
            # seek_outputs[empty_transcriptions, 2] = generation_config.eos_token_id  # eos token
            return seek_outputs, seek_outputs

        if return_token_timestamps and hasattr(generation_config, "alignment_heads"):
            num_frames = getattr(generation_config, "num_frames", None)
            seek_outputs["token_timestamps"] = self._extract_token_timestamps(
                seek_outputs, generation_config.alignment_heads, num_frames=num_frames
            )
            seek_outputs["token_timestamps"] = seek_outputs["token_timestamps"][:, decoder_input_ids.shape[-1]:]

        seek_outputs["sequences"] = seek_outputs["sequences"][:, decoder_input_ids.shape[-1]:]

        def split_by_batch_index(values, key, batch_idx):
            if key == "scores":
                return [v[batch_idx].cpu() for v in values]
            elif key == "past_key_values":
                # we don't save `past_key_values` as this is too costly
                return None
            elif isinstance(values[batch_idx], tuple) and torch.is_tensor(values[batch_idx][0]):
                return tuple(tuple(w[batch_idx][None].cpu() for w in v) for v in values)
            return values[batch_idx].cpu()

        sequence_tokens = seek_outputs["sequences"]
        seek_outputs = [
            {k: split_by_batch_index(v, k, i) for k, v in seek_outputs.items()}
            for i in range(sequence_tokens.shape[0])
        ]

        return sequence_tokens, seek_outputs
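
    # The per-item split performed by `split_by_batch_index` turns one batched
    # generate() output dict into a list of per-sample dicts. A toy sketch of the
    # same reshaping (shapes illustrative, only "sequences"/"scores" keys assumed):
    #
    #     batched = {"sequences": torch.zeros(2, 7, dtype=torch.long),
    #                "scores": [torch.zeros(2, 51866) for _ in range(6)]}
    #     per_item = [{k: (v[i] if k == "sequences" else [s[i] for s in v]) for k, v in batched.items()}
    #                 for i in range(2)]
    #     # per_item[0]["sequences"].shape == (7,), len(per_item[0]["scores"]) == 6
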