# DiCoW_v3_MLC / generation.py
import copy
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Iterator
import warnings
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from decimal import Decimal, ROUND_HALF_UP
from transformers.generation.configuration_utils import GenerationConfig, GenerationMode
from transformers.generation.logits_process import (
    LogitsProcessorList,
    SuppressTokensAtBeginLogitsProcessor,
    SuppressTokensLogitsProcessor,
    WhisperNoSpeechDetection,
)
from transformers.generation.stopping_criteria import (
StoppingCriteriaList,
)
from transformers.generation.utils import GenerateBeamOutput, BeamScorer, GenerateBeamDecoderOnlyOutput, \
stack_model_outputs, GenerateBeamEncoderDecoderOutput, _split_model_inputs, GenerateNonBeamOutput, \
GenerateEncoderDecoderOutput, GenerateDecoderOnlyOutput
from transformers.modeling_outputs import BaseModelOutput
from transformers.models.whisper.modeling_whisper import (
WhisperForConditionalGeneration,
)
from transformers.models.whisper.generation_whisper import _get_attr_from_logit_processors, _pad_to_max_length
from transformers.models.whisper.tokenization_whisper import TASK_IDS, TO_LANGUAGE_CODE
from transformers.utils import logging
from .utils import WhisperTimeStampLogitsProcessorCustom
from .decoding import CTCRescorerLogitsProcessor, LogSoftmaxProcessor
logging.set_verbosity_debug()
logger = logging.get_logger("transformers")
class DiCoWGenerationMixin(WhisperForConditionalGeneration):
def _prepare_encoder_decoder_kwargs_for_generation(
self, inputs_tensor: torch.Tensor, model_kwargs, model_input_name, generation_config,
) -> Dict[str, Any]:
# self.encoder_output_lens = self._get_feat_extract_output_lengths(
# model_kwargs['attention_mask_enc'].sum(dim=1)
# ).int()
generation_config.output_hidden_states = True
        # pylint: disable=no-member
model_kwargs = super()._prepare_encoder_decoder_kwargs_for_generation(
inputs_tensor, model_kwargs, model_input_name, generation_config
)
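        # Cache the encoder CTC logits so that `_get_logits_processor` can later build the
        # CTC rescoring processor from them during joint CTC/attention decoding.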
self.encoder_logits = model_kwargs["encoder_outputs"].logits
return model_kwargs
@staticmethod
def _expand_inputs_for_generation(
expand_size: int = 1,
is_encoder_decoder: bool = False,
input_ids: Optional[torch.LongTensor] = None,
**model_kwargs,
) -> Tuple[torch.LongTensor, Dict[str, Any]]:
"""Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]"""
def _expand_dict_for_generation(dict_to_expand):
for key in dict_to_expand:
if dict_to_expand[key] is not None and isinstance(dict_to_expand[key], torch.Tensor) and key != "loss":
dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
return dict_to_expand
if input_ids is not None:
input_ids = input_ids.repeat_interleave(expand_size, dim=0)
model_kwargs = _expand_dict_for_generation(model_kwargs)
if is_encoder_decoder:
if model_kwargs.get("encoder_outputs") is None:
raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
if "hidden_states" in model_kwargs["encoder_outputs"]:
model_kwargs["encoder_outputs"]["hidden_states"] = tuple(
hidden_state.repeat_interleave(expand_size, dim=0) for hidden_state in
model_kwargs["encoder_outputs"]["hidden_states"]
)
return input_ids, model_kwargs
def generate(
self,
input_features: Optional[torch.Tensor] = None,
generation_config: Optional[GenerationConfig] = None,
logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
synced_gpus: bool = False,
return_timestamps: Optional[bool] = None,
task: Optional[str] = None,
language: Optional[str] = None,
is_multilingual: Optional[bool] = None,
prompt_ids: Optional[torch.Tensor] = None,
prompt_condition_type: Optional[str] = None, # first-segment, all-segments
condition_on_prev_tokens: Optional[bool] = None,
temperature: Optional[Union[float, Tuple[float, ...]]] = None,
compression_ratio_threshold: Optional[float] = None,
logprob_threshold: Optional[float] = None,
no_speech_threshold: Optional[float] = None,
num_segment_frames: Optional[int] = None,
attention_mask: Optional[torch.Tensor] = None,
time_precision: float = 0.02,
return_token_timestamps: Optional[bool] = None,
return_segments: bool = False,
return_dict_in_generate: Optional[bool] = None,
assistant_model: Optional["PreTrainedModel"] = None,
**kwargs,
):
if condition_on_prev_tokens:
raise NotImplementedError("Current version does not support conditioning")
gen_c, _ = self._prepare_generation_config(generation_config, **kwargs)
gen_mode = gen_c.get_generation_mode(assistant_model)
if gen_mode not in [GenerationMode.GREEDY_SEARCH, GenerationMode.BEAM_SEARCH]:
raise ValueError(
f"Provided generation mode {gen_mode} is not supported"
f" for WhisperForConditionalGeneration with joint CTC decoding")
if "stno_mask" in kwargs:
self.stno_mask = kwargs["stno_mask"]
if "encoder_outputs" in kwargs:
self.encoder_logits = kwargs["encoder_outputs"].logits
# pylint: disable=no-member
# 0. deprecate old inputs
if "inputs" in kwargs:
input_features = kwargs.pop("inputs")
warnings.warn(
"The input name `inputs` is deprecated. Please make sure to use `input_features` instead.",
FutureWarning,
)
# 1. prepare generation config
generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs)
# 2. set global generate variables
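        # For the standard Whisper configuration the two encoder convolutions give input_stride = 1 * 2 = 2
        # and max_source_positions = 1500, so num_segment_frames = 2 * 1500 = 3000 log-mel frames (one 30 s window).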
input_stride = self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0]
num_segment_frames = input_stride * self.config.max_source_positions
batch_size, total_input_frames = self._retrieve_total_input_frames(
input_features=input_features, input_stride=input_stride, kwargs=kwargs
)
is_shortform = total_input_frames <= num_segment_frames
if is_shortform:
# warn user of ignored inputs
self._maybe_warn_unused_inputs(
condition_on_prev_tokens=condition_on_prev_tokens,
temperature=temperature,
compression_ratio_threshold=compression_ratio_threshold,
logprob_threshold=logprob_threshold,
no_speech_threshold=no_speech_threshold,
total_input_frames=total_input_frames,
)
# 3. Make sure generation config is correctly set
# Make sure the generation config is correctly set depending on whether timestamps are to be returned or not
self._set_return_outputs(
return_dict_in_generate=return_dict_in_generate,
return_token_timestamps=return_token_timestamps,
is_shortform=is_shortform,
logprob_threshold=logprob_threshold,
generation_config=generation_config,
)
self._set_return_timestamps(
return_timestamps=return_timestamps, is_shortform=is_shortform, generation_config=generation_config
)
self._set_language_and_task(
language=language, task=task, is_multilingual=is_multilingual, generation_config=generation_config
)
self._set_num_frames(
return_token_timestamps=return_token_timestamps, generation_config=generation_config, kwargs=kwargs
)
self._set_thresholds_and_condition(
generation_config=generation_config,
logprob_threshold=logprob_threshold,
compression_ratio_threshold=compression_ratio_threshold,
no_speech_threshold=no_speech_threshold,
condition_on_prev_tokens=condition_on_prev_tokens,
)
self._set_prompt_condition_type(
generation_config=generation_config,
prompt_condition_type=prompt_condition_type,
)
# pass self.config for backward compatibility
init_tokens = self._retrieve_init_tokens(
input_features,
batch_size=batch_size,
generation_config=generation_config,
config=self.config,
num_segment_frames=num_segment_frames,
kwargs=kwargs,
)
# passing `decoder_input_ids` is deprecated - the only exception is for assisted generation
# where the input ids are handled explicitly by the generate method
self._check_decoder_input_ids(kwargs=kwargs)
        # 4. Retrieve logits processors
device = kwargs["encoder_outputs"][0].device if "encoder_outputs" in kwargs else input_features.device
begin_index = init_tokens.shape[1]
logits_processor = self._retrieve_logit_processors(
generation_config=generation_config,
logits_processor=logits_processor,
begin_index=begin_index, # begin index is index of first generated decoder token
is_shortform=is_shortform,
num_beams=kwargs.get("num_beams", 1),
device=device,
)
        # 5. If we're in shortform mode, simply generate the whole input at once and return the output
if is_shortform:
if temperature is not None:
generation_config.temperature = temperature
decoder_input_ids = kwargs.pop("decoder_input_ids", None)
if decoder_input_ids is None:
decoder_input_ids = init_tokens
if prompt_ids is not None:
decoder_input_ids = torch.cat(
[prompt_ids[None].repeat(decoder_input_ids.shape[0], 1), decoder_input_ids], dim=-1
)
max_new_tokens = generation_config.max_new_tokens if generation_config.max_new_tokens is not None else 0
if max_new_tokens + decoder_input_ids.shape[-1] > self.config.max_target_positions:
raise ValueError(
f"The length of `decoder_input_ids` equal `prompt_ids` plus special start tokens is {decoder_input_ids.shape[-1]}, and the `max_new_tokens` "
f"is {max_new_tokens}. Thus, the combined length of "
f"`decoder_input_ids` and `max_new_tokens` is: {max_new_tokens + decoder_input_ids.shape[-1]}. This exceeds the "
f"`max_target_positions` of the Whisper model: {self.config.max_target_positions}. "
"You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, "
f"so that their combined length is less than {self.config.max_target_positions}."
)
outputs = super().generate(
input_features,
generation_config=generation_config,
logits_processor=logits_processor,
stopping_criteria=stopping_criteria,
prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
synced_gpus=synced_gpus,
decoder_input_ids=decoder_input_ids,
**kwargs,
)
if generation_config.return_token_timestamps and hasattr(generation_config, "alignment_heads"):
outputs["token_timestamps"] = self._extract_token_timestamps(
outputs, generation_config.alignment_heads, num_frames=generation_config.num_frames
)
# print("\n".join(self.tokenizer.batch_decode(outputs,skip_special_tokens=True, decode_with_timestamps=True)))
return outputs
# 6. Else we're in longform mode which is more complex.
# We need to chunk the audio input depending on when the model generates timestamp tokens
# 6.1 Set and retrieve global longform generation variables
self._set_condition_on_prev_tokens(
condition_on_prev_tokens=condition_on_prev_tokens, generation_config=generation_config
)
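        # Timestamp token ids start right after `<|notimestamps|>`, so this is the id of `<|0.00|>`.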
timestamp_begin = generation_config.no_timestamps_token_id + 1
temperatures = [temperature] if not isinstance(temperature, (list, tuple)) else temperature
temperature = temperatures[0]
batch_size = input_features.shape[0]
max_frames, seek = self._retrieve_max_frames_and_seek(
batch_size=batch_size, attention_mask=attention_mask, total_input_frames=total_input_frames
)
        # 6.2 Prepare running variables and list for generation
cur_bsz = batch_size
current_segments = self._prepare_segments(
prompt_ids=prompt_ids,
batch_size=batch_size,
generation_config=generation_config,
)
batch_idx_map = list(range(batch_size))
do_condition_on_prev_tokens = [condition_on_prev_tokens for _ in range(batch_size)]
# 6.2 Transcribe audio until we reach the end of all input audios
while (seek < max_frames).any():
# 6.3 NOTE: When in longform transcription mode and batch size > 1 we need to dynamically reduce the batch size during the loop
# in case one audio finished earlier than another one. Thus, we need to keep a table of "previous-index-2-current-index" in order
# to know which original audio is being decoded
# Set updated index map, duration of previously decoded chunks and number of max frames of current decoding chunk
input_features, cur_bsz, batch_idx_map = self._maybe_reduce_batch(
input_features=input_features,
seek=seek,
max_frames=max_frames,
cur_bsz=cur_bsz,
batch_idx_map=batch_idx_map,
)
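            # With the standard 10 ms log-mel hop, seek * time_precision / input_stride converts the
            # current seek position (counted in input frames) into seconds.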
time_offset = seek * time_precision / input_stride
seek_num_frames = (max_frames - seek).clamp(max=num_segment_frames)
# 6.4 cut out next 30s segment from input features
segment_input = self._get_input_segment(
input_features=input_features,
seek=seek,
seek_num_frames=seek_num_frames,
num_segment_frames=num_segment_frames,
cur_bsz=cur_bsz,
batch_idx_map=batch_idx_map,
)
# 6.5 prepare decoder input ids
suppress_tokens = _get_attr_from_logit_processors(
logits_processor, SuppressTokensLogitsProcessor, "suppress_tokens"
)
decoder_input_ids, kwargs = self._prepare_decoder_input_ids(
cur_bsz=cur_bsz,
init_tokens=init_tokens,
current_segments=current_segments,
batch_idx_map=batch_idx_map,
do_condition_on_prev_tokens=do_condition_on_prev_tokens,
prompt_ids=prompt_ids,
generation_config=generation_config,
config=self.config,
device=segment_input.device,
suppress_tokens=suppress_tokens,
kwargs=kwargs,
)
# 6.6 set max new tokens or max length
self._set_max_new_tokens_and_length(
config=self.config,
decoder_input_ids=decoder_input_ids,
generation_config=generation_config,
)
# 6.7 Set current `begin_index` for all logit processors
for proc in logits_processor:
if hasattr(proc, "set_begin_index"):
proc.set_begin_index(decoder_input_ids.shape[-1])
# 6.8 Run generate with fallback
seek_sequences, seek_outputs, should_skip, do_condition_on_prev_tokens = self.generate_with_fallback(
segment_input=segment_input,
decoder_input_ids=decoder_input_ids,
cur_bsz=cur_bsz,
batch_idx_map=batch_idx_map,
seek=seek,
num_segment_frames=num_segment_frames,
max_frames=max_frames,
temperatures=temperatures,
generation_config=generation_config,
logits_processor=logits_processor,
stopping_criteria=stopping_criteria,
prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
synced_gpus=synced_gpus,
return_token_timestamps=return_token_timestamps,
do_condition_on_prev_tokens=do_condition_on_prev_tokens,
kwargs=kwargs,
)
# 6.9 In every generated sequence, split by timestamp tokens and extract segments
            if self.config.mt_num_speakers == 1:
for i, seek_sequence in enumerate(seek_sequences):
prev_i = batch_idx_map[i]
if should_skip[i]:
seek[prev_i] += seek_num_frames[prev_i]
continue
segments, segment_offset = self._retrieve_segment(
seek_sequence=seek_sequence,
seek_outputs=seek_outputs,
time_offset=time_offset,
timestamp_begin=timestamp_begin,
seek_num_frames=seek_num_frames,
time_precision=time_precision,
input_stride=input_stride,
prev_idx=prev_i,
idx=i,
return_token_timestamps=return_token_timestamps,
)
current_segments[prev_i] += segments
seek[prev_i] += segment_offset
else:
                # All speaker channels must stay synchronized, so the sequences are processed in groups of
                # `mt_num_speakers` and a common seek is chosen for the whole group.
                num_spk = self.config.mt_num_speakers
                speaker_groups = [seek_sequences[i * num_spk:(i + 1) * num_spk] for i in range(len(seek_sequences) // num_spk)]
                for j, seek_seqs in enumerate(speaker_groups):
                    indexes = [j * num_spk + i for i in range(num_spk)]
prev_ids = [batch_idx_map[i] for i in indexes]
if all([should_skip[i] for i in indexes]):
for i, prev_i in zip(indexes, prev_ids):
seek[prev_i] += seek_num_frames[prev_i]
continue
segments, segment_offset = self._retrieve_segment_mt(
seek_sequences=seek_seqs,
seek_outputs=seek_outputs,
time_offset=time_offset,
timestamp_begin=timestamp_begin,
seek_num_frames=seek_num_frames,
time_precision=time_precision,
input_stride=input_stride,
prev_ids=prev_ids,
ids=indexes,
return_token_timestamps=return_token_timestamps,
)
for prev_i, i in zip(prev_ids, range(self.config.mt_num_speakers)):
current_segments[prev_i] += segments[i]
seek[prev_i] += segment_offset[i]
# 7. Once all segments are added to the list of all segments, called `current_segments`, we extract the predicted
# output tokens from the list of dicts. If we use batch size > 1, we make sure to pad the output
final_segments = (
[x[1:] for x in current_segments]
if (prompt_ids is not None and generation_config.prompt_condition_type == "first-segment")
else current_segments
)
sequences = _pad_to_max_length(
final_segments, generation_config.pad_token_id, device=self.device, padding="right"
)
# 8. If we return all segments, the predicted output sequences are put under `"sequences"`.
output = {"sequences": sequences, "segments": final_segments}
self.encoder_logits = None
if isinstance(output, dict):
output = self._fix_timestamps_from_segmentation(output)
return output
@staticmethod
def _find_common_seek(sequences, seeks):
"""
Finds the minimum seek that does not overlap with other sequences,
and falls back to (segment.start - 0.2) if needed. Assumes:
- 'seeks' is a list of (seek_time_int, sequence_index),
- seek_time_int is in timestamp * 100 format (e.g., 125.5s -> 12550).
"""
def is_valid_seek(seek_time, exclude_seq_idx):
for idx, seq in enumerate(sequences):
if idx == exclude_seq_idx:
continue
for segment in seq:
start = getattr(segment, 'start', segment['start'])
end = getattr(segment, 'end', segment['end'])
if seek_time < start:
break # Segments are sorted by end
if start < seek_time < end:
return False
return True
# Step 1: Find minimum seek
# if all seek values are the same, return it immediately
seeks = [s if isinstance(s, int) else s.item() for s in seeks]
if len(set(seeks)) == 1:
return seeks[0]
min_seek_val = min(seeks)
min_seek_idx = seeks.index(min_seek_val)
min_seek_real = min_seek_val / 100
if is_valid_seek(min_seek_real, min_seek_idx):
return min_seek_val
        # Step 2: Try fallback seek candidates taken from segment boundaries of all sequences
fallback_seeks = set()
for idx, seq in enumerate(sequences):
for segment in seq:
start = getattr(segment, 'start', segment['start'])
if isinstance(start, torch.Tensor):
start = start.item()
candidate = round(start, 2)
fallback_seeks.add((candidate, idx, True))
end = getattr(segment, 'end', segment['end'])
if isinstance(end, torch.Tensor):
end = end.item()
if end < min_seek_real:
candidate = round(end, 2)
fallback_seeks.add((candidate, idx, True))
valid_fallbacks = [
(int(s * 100), idx, is_start) for s, idx, is_start in fallback_seeks
if is_valid_seek(s, min_seek_idx)
]
if valid_fallbacks:
            # Return only the seek value (in timestamp * 100 units), consistent with the other return paths.
            return max(valid_fallbacks)[0]
# Step 3: Nothing valid
return 0
@staticmethod
def remove_segments_after_seek(sequences, seek, eps=100):
"""
Keep only segments that finish before given timestamp.
Args:
sequences: List of lists, each containing segments (dict or object with 'start' and 'end').
seek: Integer seek timestamp (e.g., timestamp * 100).
Returns:
None. Modifies the sequences in-place.
"""
        return [[seg for seg in seq if getattr(seg, 'end', seg['end']) * 100 <= seek + eps] for seq in sequences]
@staticmethod
def _retrieve_segment_wo_seek(
seek_sequence,
seek_outputs,
time_offset,
timestamp_begin,
seek_num_frames,
time_precision,
input_stride,
prev_idx,
idx,
return_token_timestamps,
):
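        # Variant of `_retrieve_segment` that never seeks back to the last predicted timestamp:
        # the returned segment offset always covers the whole window (`seek_num_frames`), which is
        # needed when the speaker channels cannot be rolled back to a common seek.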
# find the predicted "end of segment" predictions of Whisper
# "end of segment" predictions occur whenever Whisper predicts a timestamp token
timestamp_tokens: torch.Tensor = seek_sequence.ge(timestamp_begin)
single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True]
timestamp_segment_indices = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0]
timestamp_segment_indices.add_(1)
token_timestamps = seek_outputs[idx]["token_timestamps"] if return_token_timestamps else []
        # If Whisper predicted an "end of segment" via a timestamp token, let's go over each
        # "end of segment" prediction and slice the decoding into segments accordingly
if len(timestamp_segment_indices) > 0:
# if the output contains two consecutive timestamp tokens
slices = timestamp_segment_indices.tolist()
segments = []
if single_timestamp_ending:
slices.append(len(seek_sequence))
last_slice = 0
# Add each segment to list of all segments
for current_slice in slices:
sliced_tokens = seek_sequence[last_slice:current_slice]
start_timestamp_pos = sliced_tokens[0].item() - timestamp_begin
end_timestamp_pos = sliced_tokens[-1].item() - timestamp_begin
segments.append(
{
"start": time_offset[prev_idx] + start_timestamp_pos * time_precision,
"end": time_offset[prev_idx] + end_timestamp_pos * time_precision,
"tokens": sliced_tokens,
"result": seek_outputs[idx],
}
)
if return_token_timestamps:
segments[-1]["token_timestamps"] = (
token_timestamps[last_slice:current_slice] + time_offset[prev_idx]
)
last_slice = current_slice
if not single_timestamp_ending:
# generate all predictions after the last predicted "end of segment" and seek by 30s
sliced_tokens = seek_sequence[last_slice:]
start_timestamp_pos = sliced_tokens[0].item() - timestamp_begin
end_timestamp_pos = seek_num_frames[prev_idx] // 2
segments.append(
{
"start": time_offset[prev_idx] + start_timestamp_pos * time_precision,
"end": time_offset[prev_idx] + end_timestamp_pos * time_precision,
"tokens": sliced_tokens,
"result": seek_outputs[idx],
}
)
segment_offset = seek_num_frames[prev_idx]
else:
# If whisper does not predict any "end of segment" token, then
# the whole decoding is considered a segment and we add it to the list of segments
timestamps = seek_sequence[timestamp_tokens.nonzero().flatten()]
start_timestamp_pos = 0.0
last_timestamp_pos = seek_num_frames[prev_idx] // 2
if timestamps.numel() > 1:
start_timestamp_pos = timestamps[-2].item() - timestamp_begin
last_timestamp_pos = timestamps[-1].item() - timestamp_begin
elif timestamps.numel() == 1:
# no consecutive timestamps but it has a timestamp; use the last one.
start_timestamp_pos = timestamps[-1].item() - timestamp_begin
segments = [
{
"start": time_offset[prev_idx] + start_timestamp_pos * time_precision,
"end": time_offset[prev_idx] + last_timestamp_pos * time_precision,
"tokens": seek_sequence,
"result": seek_outputs[idx],
}
]
segment_offset = seek_num_frames[prev_idx]
return segments, segment_offset
def _retrieve_segment_mt(
self,
seek_sequences,
seek_outputs,
time_offset,
timestamp_begin,
seek_num_frames,
time_precision,
input_stride,
prev_ids,
ids,
return_token_timestamps,
):
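        # Retrieve segments independently for every speaker channel, then advance all channels by a
        # shared seek so that their 30 s decoding windows stay aligned in the next iteration.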
sequences, seeks = [], []
for sequence, prev_id, idx in zip(seek_sequences, prev_ids, ids):
seq, seek = self._retrieve_segment(
seek_sequence=sequence,
seek_outputs=seek_outputs,
time_offset=time_offset,
timestamp_begin=timestamp_begin,
seek_num_frames=seek_num_frames,
time_precision=time_precision,
input_stride=input_stride,
prev_idx=prev_id,
idx=idx,
return_token_timestamps=return_token_timestamps,
)
sequences.append(seq)
            seeks.append(seek + int(time_offset[prev_id] * 100))
# best_seek = self._find_common_seek(sequences, seeks)
best_seek = seeks[0]
# print(f"Best seek {best_seek}")
        if best_seek - (min(time_offset[prev_ids]) * 100) < 100:
# we cannot rollback, we have to decode segments as they are
sequences, seeks = [], []
for sequence, prev_id, idx in zip(seek_sequences, prev_ids, ids):
seq, seek = self._retrieve_segment_wo_seek(
seek_sequence=sequence,
seek_outputs=seek_outputs,
time_offset=time_offset,
timestamp_begin=timestamp_begin,
seek_num_frames=seek_num_frames,
time_precision=time_precision,
input_stride=input_stride,
prev_idx=prev_id,
idx=idx,
return_token_timestamps=return_token_timestamps,
)
sequences.append(seq)
seeks.append(seek)
return sequences, seeks
seqs_new = self.remove_segments_after_seek(sequences, best_seek)
seeks = [best_seek - int(min(time_offset[prev_ids]) * 100) for _ in seeks]
return seqs_new, seeks
def _beam_search(
self,
input_ids: torch.LongTensor,
beam_scorer: BeamScorer,
logits_processor: LogitsProcessorList,
stopping_criteria: StoppingCriteriaList,
generation_config: GenerationConfig,
synced_gpus: bool,
logits_warper: Optional[LogitsProcessorList] = None,
**model_kwargs,
) -> Union[GenerateBeamOutput, torch.LongTensor]:
r"""
Generates sequences of token ids for models with a language modeling head using **beam search decoding** and
can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
Parameters:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation.
beam_scorer (`BeamScorer`):
                A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
logits_processor (`LogitsProcessorList`):
An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
used to tell if the generation loop should stop.
generation_config ([`~generation.GenerationConfig`]):
The generation configuration to be used as parametrization of the decoding method.
synced_gpus (`bool`):
Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
logits_warper (`LogitsProcessorList`, *optional*):
An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
to warp the prediction score distribution of the language modeling head applied before multinomial
sampling at each generation step. Only required with sampling strategies (i.e. `do_sample` is set in
`generation_config`)
model_kwargs:
Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
an encoder-decoder model the kwargs should include `encoder_outputs`.
Return:
[`generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
`torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
[`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
`return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
`model.config.is_encoder_decoder=True`.
"""
# init values
pad_token_id = generation_config.pad_token_id
eos_token_id = generation_config.eos_token_id
output_attentions = generation_config.output_attentions
output_hidden_states = generation_config.output_hidden_states
output_scores = generation_config.output_scores
output_logits = generation_config.output_logits
return_dict_in_generate = generation_config.return_dict_in_generate
sequential = generation_config.low_memory
do_sample = generation_config.do_sample
if do_sample is True and not isinstance(logits_warper, LogitsProcessorList):
raise ValueError(
"`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is "
f"{logits_warper})."
)
batch_size = len(beam_scorer._beam_hyps)
num_beams = beam_scorer.num_beams
batch_beam_size, cur_len = input_ids.shape
model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
if num_beams * batch_size != batch_beam_size:
raise ValueError(
f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
)
# init attention / hidden states / scores tuples
scores = () if (return_dict_in_generate and output_scores) else None
raw_logits = () if (return_dict_in_generate and output_logits) else None
beam_indices = (
tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None
)
decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
cross_attentions = () if (return_dict_in_generate and output_attentions) else None
decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
# if model is an encoder-decoder, retrieve encoder attention weights and hidden states
if return_dict_in_generate and self.config.is_encoder_decoder:
encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
encoder_hidden_states = (
model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
)
# initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens
# of the first beam are considered to avoid sampling the exact same tokens across all beams.
beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
beam_scores[:, 1:] = -1e9
beam_scores = beam_scores.view((batch_size * num_beams,))
this_peer_finished = False
decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder
while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
# if sequential is True, split the input to batches of batch_size and run sequentially
if sequential:
if any(
model_name in self.__class__.__name__.lower()
for model_name in [
"fsmt",
"reformer",
"bloom",
"ctrl",
"gpt_bigcode",
"transo_xl",
"xlnet",
"cpm",
"jamba",
]
):
raise RuntimeError(
f"Currently generation for {self.__class__.__name__} is not supported "
f"for `low_memory beam_search`. Please open an issue on GitHub if you need this feature."
)
inputs_per_sub_batches = _split_model_inputs(
model_inputs, split_size=batch_size, full_batch_size=batch_beam_size
)
outputs_per_sub_batch = [
self(
**inputs_per_sub_batch,
return_dict=True,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
for inputs_per_sub_batch in inputs_per_sub_batches
]
outputs = stack_model_outputs(outputs_per_sub_batch)
else: # Unchanged original behavior
outputs = self(
**model_inputs,
return_dict=True,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
if synced_gpus and this_peer_finished:
cur_len = cur_len + 1
continue # don't waste resources running the code we don't need
next_token_logits = outputs.logits[:, -1, :]
next_token_scores = nn.functional.log_softmax(
next_token_logits, dim=-1
) # (batch_size * num_beams, vocab_size)
next_token_scores_processed = logits_processor(input_ids, next_token_scores)
if do_sample:
next_token_scores_processed = logits_warper(input_ids, next_token_scores_processed)
next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(
next_token_scores_processed
)
# Store scores, attentions and hidden_states when required
if return_dict_in_generate:
if output_scores:
scores += (next_token_scores_processed,)
if output_logits:
raw_logits += (next_token_logits,)
if output_attentions:
decoder_attentions += (
(outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
)
if self.config.is_encoder_decoder:
cross_attentions += (outputs.cross_attentions,)
if output_hidden_states:
decoder_hidden_states += (
(outputs.decoder_hidden_states,)
if self.config.is_encoder_decoder
else (outputs.hidden_states,)
)
# reshape for beam search
vocab_size = next_token_scores.shape[-1]
next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size)
# Beam token selection: pick 1 + eos_token_id.shape[0] next tokens for each beam so we have at least 1
# non eos token per beam.
n_eos_tokens = eos_token_id.shape[0] if eos_token_id is not None else 0
n_tokens_to_keep = max(2, 1 + n_eos_tokens) * num_beams
if do_sample:
probs = nn.functional.softmax(next_token_scores, dim=-1)
next_tokens = torch.multinomial(probs, num_samples=n_tokens_to_keep)
next_token_scores = torch.gather(next_token_scores, -1, next_tokens)
next_token_scores, _indices = torch.sort(next_token_scores, descending=True, dim=1)
next_tokens = torch.gather(next_tokens, -1, _indices)
else:
next_token_scores, next_tokens = torch.topk(
next_token_scores, n_tokens_to_keep, dim=1, largest=True, sorted=True
)
next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
next_tokens = next_tokens % vocab_size
# stateless
beam_outputs = beam_scorer.process(
input_ids,
next_token_scores,
next_tokens,
next_indices,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
beam_indices=beam_indices,
decoder_prompt_len=decoder_prompt_len,
)
beam_scores = beam_outputs["next_beam_scores"]
beam_next_tokens = beam_outputs["next_beam_tokens"]
beam_idx = beam_outputs["next_beam_indices"]
# Based on the beam idx and next tokens reshuffle the ctc prev states and scores
if hasattr(self, "ctc_rescorer"):
self.ctc_rescorer.update_state(beam_next_tokens, beam_idx)
input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
model_kwargs = self._update_model_kwargs_for_generation(
outputs,
model_kwargs,
is_encoder_decoder=self.config.is_encoder_decoder,
)
if model_kwargs.get("past_key_values", None) is not None:
model_kwargs["past_key_values"] = self._temporary_reorder_cache(
model_kwargs["past_key_values"], beam_idx
)
if return_dict_in_generate and output_scores:
beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices))))
# increase cur_len
cur_len = cur_len + 1
if beam_scorer.is_done or all(stopping_criteria(input_ids, scores)):
this_peer_finished = True
sequence_outputs = beam_scorer.finalize(
input_ids,
beam_scores,
next_tokens,
next_indices,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
max_length=stopping_criteria.max_length,
beam_indices=beam_indices,
decoder_prompt_len=decoder_prompt_len,
)
if return_dict_in_generate:
if not output_scores:
sequence_outputs["sequence_scores"] = None
if self.config.is_encoder_decoder:
return GenerateBeamEncoderDecoderOutput(
sequences=sequence_outputs["sequences"],
sequences_scores=sequence_outputs["sequence_scores"],
scores=scores,
logits=raw_logits,
beam_indices=sequence_outputs["beam_indices"],
encoder_attentions=encoder_attentions,
encoder_hidden_states=encoder_hidden_states,
decoder_attentions=decoder_attentions,
cross_attentions=cross_attentions,
decoder_hidden_states=decoder_hidden_states,
past_key_values=model_kwargs.get("past_key_values"),
)
else:
return GenerateBeamDecoderOnlyOutput(
sequences=sequence_outputs["sequences"],
sequences_scores=sequence_outputs["sequence_scores"],
scores=scores,
logits=raw_logits,
beam_indices=sequence_outputs["beam_indices"],
attentions=decoder_attentions,
hidden_states=decoder_hidden_states,
past_key_values=model_kwargs.get("past_key_values"),
)
else:
return sequence_outputs["sequences"]
def _sample(
self,
input_ids: torch.LongTensor,
logits_processor: LogitsProcessorList,
stopping_criteria: StoppingCriteriaList,
generation_config: GenerationConfig,
synced_gpus: bool,
streamer: Optional["BaseStreamer"],
logits_warper: Optional[LogitsProcessorList] = None,
**model_kwargs,
) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
r"""
Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
Parameters:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation.
logits_processor (`LogitsProcessorList`):
An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
used to modify the prediction scores of the language modeling head applied at each generation step.
stopping_criteria (`StoppingCriteriaList`):
An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
used to tell if the generation loop should stop.
generation_config ([`~generation.GenerationConfig`]):
The generation configuration to be used as parametrization of the decoding method.
synced_gpus (`bool`):
Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
streamer (`BaseStreamer`, *optional*):
Streamer object that will be used to stream the generated sequences. Generated tokens are passed
through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
logits_warper (`LogitsProcessorList`, *optional*):
An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
to warp the prediction score distribution of the language modeling head applied before multinomial
sampling at each generation step. Only required with sampling strategies (i.e. `do_sample` is set in
`generation_config`)
model_kwargs:
Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
an encoder-decoder model the kwargs should include `encoder_outputs`.
Return:
[`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`:
A `torch.LongTensor` containing the generated tokens (default behaviour) or a
[`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
`return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
`model.config.is_encoder_decoder=True`.
"""
# init values
pad_token_id = generation_config.pad_token_id
output_attentions = generation_config.output_attentions
output_hidden_states = generation_config.output_hidden_states
output_scores = generation_config.output_scores
output_logits = generation_config.output_logits
return_dict_in_generate = generation_config.return_dict_in_generate
has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
do_sample = generation_config.do_sample
if do_sample is True and not isinstance(logits_warper, LogitsProcessorList):
raise ValueError(
"`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is "
f"{logits_warper})."
)
# init attention / hidden states / scores tuples
scores = () if (return_dict_in_generate and output_scores) else None
raw_logits = () if (return_dict_in_generate and output_logits) else None
decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
cross_attentions = () if (return_dict_in_generate and output_attentions) else None
decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
# if model is an encoder-decoder, retrieve encoder attention weights and hidden states
if return_dict_in_generate and self.config.is_encoder_decoder:
encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
encoder_hidden_states = (
model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
)
# keep track of which sequences are already finished
batch_size = input_ids.shape[0]
this_peer_finished = False
unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
# prepare model inputs
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
# forward pass to get next token
outputs = self(
**model_inputs,
return_dict=True,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
)
if synced_gpus and this_peer_finished:
continue # don't waste resources running the code we don't need
next_token_logits = outputs.logits[:, -1, :]
# pre-process distribution
next_token_scores = logits_processor(input_ids, next_token_logits)
if do_sample:
next_token_scores = logits_warper(input_ids, next_token_scores)
# Store scores, attentions and hidden_states when required
if return_dict_in_generate:
if output_scores:
scores += (next_token_scores,)
if output_logits:
raw_logits += (next_token_logits,)
if output_attentions:
decoder_attentions += (
(outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
)
if self.config.is_encoder_decoder:
cross_attentions += (outputs.cross_attentions,)
if output_hidden_states:
decoder_hidden_states += (
(outputs.decoder_hidden_states,)
if self.config.is_encoder_decoder
else (outputs.hidden_states,)
)
# token selection
if do_sample:
probs = nn.functional.softmax(next_token_scores, dim=-1)
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
else:
next_tokens = torch.argmax(next_token_scores, dim=-1)
# finished sentences should have their next token be a padding token
if has_eos_stopping_criteria:
next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
# Based on the next tokens select the ctc prev states and scores
if hasattr(self, "ctc_rescorer"):
self.ctc_rescorer.update_state(next_tokens, torch.arange(next_tokens.shape[0]))
# update generated ids, model inputs, and length for next step
input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
if streamer is not None:
streamer.put(next_tokens.cpu())
model_kwargs = self._update_model_kwargs_for_generation(
outputs,
model_kwargs,
is_encoder_decoder=self.config.is_encoder_decoder,
)
unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
this_peer_finished = unfinished_sequences.max() == 0
if streamer is not None:
streamer.end()
if return_dict_in_generate:
if self.config.is_encoder_decoder:
return GenerateEncoderDecoderOutput(
sequences=input_ids,
scores=scores,
logits=raw_logits,
encoder_attentions=encoder_attentions,
encoder_hidden_states=encoder_hidden_states,
decoder_attentions=decoder_attentions,
cross_attentions=cross_attentions,
decoder_hidden_states=decoder_hidden_states,
past_key_values=model_kwargs.get("past_key_values"),
)
else:
return GenerateDecoderOnlyOutput(
sequences=input_ids,
scores=scores,
logits=raw_logits,
attentions=decoder_attentions,
hidden_states=decoder_hidden_states,
past_key_values=model_kwargs.get("past_key_values"),
)
else:
return input_ids
def prepare_kwargs_for_generate(self,
segment_input,
cur_bsz,
batch_idx_map,
seek,
num_segment_frames,
max_frames,
kwargs):
kwargs["attention_mask_enc"] = torch.ones(cur_bsz, segment_input.size(-1), device=segment_input.device)
seek_vad = seek // 2
num_frames_vad = num_segment_frames // 2
max_frames_vad = max_frames // 2
seek_num_frames = (max_frames_vad - seek_vad).clamp(max=num_frames_vad)
stno_masks = []
for i in range(cur_bsz):
prev_i = batch_idx_map[i]
            segment_input_slice = kwargs["stno_mask"][
                prev_i: prev_i + 1, :, seek_vad[prev_i]: seek_vad[prev_i] + seek_num_frames[prev_i]
            ]
if segment_input_slice.shape[-1] < num_frames_vad:
orig_len = segment_input_slice.shape[-1]
# pad to 3000 if necessary
segment_input_slice = torch.nn.functional.pad(
segment_input_slice, pad=(0, num_frames_vad - orig_len)
)
# set corresponding padding tokens to 1 in vad mask representing silence
segment_input_slice[0, 0, orig_len:] = 1.0
stno_masks.append(segment_input_slice)
kwargs["stno_mask"] = torch.cat(stno_masks, dim=0)
self.stno_mask_seek = kwargs["stno_mask"]
if "per_group_sizes" in kwargs:
group_sizes = kwargs["per_group_sizes"].clone()
group_sizes[:] = 0
            cumulative_group_sizes = (
                kwargs["per_group_sizes"].max().repeat(kwargs["per_group_sizes"].shape[0])).cumsum(dim=0)
            for i in batch_idx_map:
                group_idx = (cumulative_group_sizes > i).nonzero().min()
group_sizes[group_idx] += 1
kwargs["per_group_sizes"] = group_sizes
if self.vad_seek_callback is not None:
self.vad_seek_callback(kwargs["stno_mask"])
return kwargs
def generate_with_fallback(
self,
segment_input,
decoder_input_ids,
cur_bsz,
batch_idx_map,
seek,
num_segment_frames,
max_frames,
temperatures,
generation_config,
logits_processor,
stopping_criteria,
prefix_allowed_tokens_fn,
synced_gpus,
return_token_timestamps,
do_condition_on_prev_tokens,
kwargs,
):
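        # Slice the STNO mask to the current decoding window (and adjust per-group sizes) before
        # delegating the actual decoding-with-fallback loop to the parent implementation.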
kwargs = copy.copy(kwargs)
kwargs = self.prepare_kwargs_for_generate(segment_input, cur_bsz, batch_idx_map, seek, num_segment_frames,
max_frames, kwargs)
seek_sequences, seek_outputs, should_skip, do_condition_on_prev_tokens = super().generate_with_fallback(
segment_input,
decoder_input_ids,
cur_bsz,
batch_idx_map,
seek,
num_segment_frames,
max_frames,
temperatures,
generation_config,
logits_processor,
stopping_criteria,
prefix_allowed_tokens_fn,
synced_gpus,
return_token_timestamps,
do_condition_on_prev_tokens,
kwargs,
)
        self.stno_mask_seek = None
# for i, seq in enumerate(seek_outputs):
# print(f"Sequence {i}: {self.tokenizer.decode(seq, decode_with_timestamps=True)}")
# print("-"*50)
return seek_sequences, seek_outputs, should_skip, do_condition_on_prev_tokens
def _retrieve_init_tokens(self, input_features, batch_size, generation_config, config, num_segment_frames, kwargs):
def replace_or_add(lst: List[int], num: int, itr: Iterator[int]):
"""short function to replace num with a itr in lst"""
found = any(i in lst for i in itr)
if found:
lst = [num if i in itr else i for i in lst]
else:
lst.append(num)
return lst
def language_to_id(language: str) -> int:
language = language.lower()
if language in generation_config.lang_to_id.keys():
language_token = language
elif language in TO_LANGUAGE_CODE.keys():
language_token = f"<|{TO_LANGUAGE_CODE[language]}|>"
elif language in TO_LANGUAGE_CODE.values():
language_token = f"<|{language}|>"
else:
is_language_code = len(language) == 2
raise ValueError(
f"Unsupported language: {language}. Language should be one of:"
f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}."
)
if language_token not in generation_config.lang_to_id:
raise ValueError(
f"{language_token} is not supported by this specific model as it is not in the `generation_config.lang_to_id`."
"(You should just add it to the generation config)"
)
return generation_config.lang_to_id[language_token]
task = getattr(generation_config, "task", None)
language = getattr(generation_config, "language", None)
forced_decoder_ids = generation_config.forced_decoder_ids
if forced_decoder_ids is not None:
if language is None and task is None and forced_decoder_ids[0][1] is None:
logger.warning_once(
"Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English."
"This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`."
)
elif hasattr(config, "forced_decoder_ids") and config.forced_decoder_ids is not None:
forced_decoder_ids = config.forced_decoder_ids
elif forced_decoder_ids is not None and language is not None:
logger.info(
f"You have passed language={language}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. `forced_decoder_ids` will be ignored in favor of language={language}."
)
forced_decoder_ids = None
init_tokens = [generation_config.decoder_start_token_id]
# Update init_tokens with languages
lang_ids = None
if forced_decoder_ids is not None:
return forced_decoder_ids
# from v4.39 the forced decoder ids are always None in favour of decoder input ids
generation_config.forced_decoder_ids = None
is_lang_id_undefined = len(init_tokens) <= 1 or (len(init_tokens) > 1 and init_tokens[1] is None)
# Make sure language is a list of strings of the correct length
if isinstance(language, (list, tuple)):
if any(l is None for l in language):
raise TypeError(
"Expected `language` to be `None`, a single string (e.g. `'en'`), or a list of strings with length equal to the batch size (e.g. `('en', 'fr')` for a batch size of 2). Got a list containing `None`."
)
if len(language) != batch_size:
raise ValueError(
"When passing a list of languages, the length of the list must match the batch size. "
f"Expected length of {batch_size}, but got {len(language)} languages."
)
languages = language
elif language is None:
# Language will be detected for each item in batch
languages = [None] * batch_size
else:
languages = [language] # Use a length-1 list now, broadcast later
# Separate init_tokens for each language
init_tokens = [copy.copy(init_tokens) for _ in languages]
if language is not None and lang_ids is not None:
lang_ids = [language_to_id(l) for l in languages]
elif hasattr(generation_config, "lang_to_id") and is_lang_id_undefined:
            # language is not defined or intentionally set to `None` to trigger language detection
lang_ids = self.detect_language(
input_features=input_features,
encoder_outputs=kwargs.get("encoder_outputs", None),
generation_config=generation_config,
num_segment_frames=num_segment_frames,
).tolist()
if lang_ids is not None:
# append or replace lang_ids to init_tokens
for i in range(len(init_tokens)):
if len(init_tokens[i]) > 1:
init_tokens[i][1] = lang_ids[i]
else:
init_tokens[i].append(lang_ids[i])
del languages
# Update init_tokens with task
for i in range(len(init_tokens)):
if task is not None:
if task in TASK_IDS:
init_tokens[i].append(generation_config.task_to_id[generation_config.task])
task_id = generation_config.task_to_id[generation_config.task]
# if task is defined it'll overwrite task ids that might have already been defined via the generation_config
replace_or_add(init_tokens[i], task_id, generation_config.task_to_id.values())
else:
raise ValueError(f"The `{task}`task is not supported. The task should be one of `{TASK_IDS}`")
elif language is not None and hasattr(generation_config, "task_to_id"):
# if language is defined, but no task id is in `init_tokens`, default to transcribe
if not any(ti in init_tokens[i] for ti in generation_config.task_to_id.values()):
init_tokens[i].append(generation_config.task_to_id["transcribe"])
# let's make sure we don't pass `None` tokens as prompt tokens
init_tokens[i] = [t for t in init_tokens[i] if t is not None]
return torch.as_tensor(init_tokens, dtype=torch.long, device=self.device).expand(batch_size, -1)
def detect_language(
self,
input_features: Optional[torch.FloatTensor] = None,
encoder_outputs: Optional[Union[torch.FloatTensor, BaseModelOutput]] = None,
generation_config: Optional[GenerationConfig] = None,
num_segment_frames: int = 3000,
) -> torch.Tensor:
"""
Detects language from log-mel input features or encoder_outputs
Parameters:
input_features (`torch.Tensor` of shape `(batch_size, feature_size, sequence_length)`, *optional*):
Float values of log-mel features extracted from the raw speech waveform. The raw speech waveform can be obtained by
loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
[`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] for details.
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
generation_config (`~generation.GenerationConfig`, *optional*):
The generation configuration to be used as base parametrization for the generation call. `**kwargs`
passed to generate matching the attributes of `generation_config` will override them. If
`generation_config` is not provided, the default will be used, which had the following loading
priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
default values, whose documentation should be checked to parameterize generation.
num_segment_frames (`int`, defaults to 3000):
The number of log-mel frames the model expects
Return:
A `torch.LongTensor` representing the detected language ids.
"""
if input_features is None and encoder_outputs is None:
raise ValueError("You have to specify either `input_features` or `encoder_outputs`")
elif input_features is not None and encoder_outputs is not None:
raise ValueError("Make sure to specificy only one of `input_features` or `encoder_outputs` - not both!")
elif input_features is not None:
inputs = {"input_features": input_features[:, :, :num_segment_frames]}
batch_size = input_features.shape[0]
elif encoder_outputs is not None:
inputs = {"encoder_outputs": encoder_outputs}
batch_size = (
encoder_outputs[0].shape[0] if isinstance(encoder_outputs, BaseModelOutput) else encoder_outputs[0]
)
generation_config = generation_config or self.generation_config
decoder_input_ids = (
torch.ones((batch_size, 1), device=self.device, dtype=torch.long)
* generation_config.decoder_start_token_id
)
with torch.no_grad():
            stno_mask = (
                self.stno_mask_seek
                if self.stno_mask_seek is not None
                else self.stno_mask[:, :, :num_segment_frames // 2]
            )
            logits = self(**inputs, decoder_input_ids=decoder_input_ids, stno_mask=stno_mask).logits[:, -1]
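        # Restrict the next-token distribution to language tokens so that the argmax below yields a language id.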
non_lang_mask = torch.ones_like(logits[0], dtype=torch.bool)
non_lang_mask[list(generation_config.lang_to_id.values())] = False
logits[:, non_lang_mask] = -np.inf
lang_ids = logits.argmax(-1)
return lang_ids
def _get_logits_processor(
self,
generation_config: GenerationConfig,
input_ids_seq_length: int,
encoder_input_ids: torch.LongTensor,
prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]],
logits_processor: Optional[LogitsProcessorList],
device: str = None,
model_kwargs: Optional[Dict[str, Any]] = None,
negative_prompt_ids: Optional[torch.Tensor] = None,
negative_prompt_attention_mask: Optional[torch.Tensor] = None,
) -> LogitsProcessorList:
# pylint: disable=no-member
gen_config_copy = copy.deepcopy(generation_config)
gen_config_copy.forced_decoder_ids = None
processors = super()._get_logits_processor(
gen_config_copy,
input_ids_seq_length,
encoder_input_ids,
prefix_allowed_tokens_fn,
logits_processor,
device,
model_kwargs,
negative_prompt_ids,
negative_prompt_attention_mask,
)
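        # Joint CTC/attention decoding: the encoder CTC logits cached during `generate` are wrapped in a
        # rescoring processor that combines CTC scores with the decoder's scores (weighted by `ctc_weight`).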
if hasattr(generation_config, "ctc_weight") and generation_config.ctc_weight > 0:
enc_logits = self.encoder_logits
if generation_config.num_beams <= 1:
processors.append(LogSoftmaxProcessor())
else:
enc_logits = enc_logits.repeat_interleave(generation_config.num_beams, dim=0)
self.ctc_rescorer = CTCRescorerLogitsProcessor(
enc_logits,
torch.full((enc_logits.shape[0],), fill_value=enc_logits.shape[1],
device=enc_logits.device),
enc_logits.shape[-1] - 1,
generation_config.pad_token_id.item(),
generation_config.eos_token_id.item(),
generation_config.decoder_start_token_id.item(),
self.tokenizer,
generation_config.ctc_margin,
generation_config.ctc_weight,
generation_config.num_beams,
False,
)
processors.append(self.ctc_rescorer)
return processors
def _retrieve_logit_processors(self, generation_config, logits_processor, begin_index, is_shortform, num_beams, device):
if generation_config.return_timestamps is True:
timestamp_processor = WhisperTimeStampLogitsProcessorCustom(generation_config, begin_index=begin_index)
logits_processor = (
[timestamp_processor] if logits_processor is None else [timestamp_processor] + logits_processor
)
if generation_config.suppress_tokens is not None:
suppress_tokens_processor = SuppressTokensLogitsProcessor(generation_config.suppress_tokens, device=device)
logits_processor = (
[suppress_tokens_processor]
if logits_processor is None
else [suppress_tokens_processor] + logits_processor
)
generation_config.suppress_tokens = None
if generation_config.begin_suppress_tokens is not None:
begin_suppress_processor = SuppressTokensAtBeginLogitsProcessor(
generation_config.begin_suppress_tokens, begin_index=begin_index, device=device
)
logits_processor = (
[begin_suppress_processor]
if logits_processor is None
else [begin_suppress_processor] + logits_processor
)
generation_config.begin_suppress_tokens = None
if generation_config.no_speech_threshold is not None and not is_shortform:
no_speech_detector = WhisperNoSpeechDetection(
no_speech_token=generation_config.no_timestamps_token_id - 1,
begin_index=begin_index,
scores_is_logprobs=num_beams > 1,
)
logits_processor = (
[no_speech_detector] if logits_processor is None else [no_speech_detector] + logits_processor
)
no_speech_detector.set_model(self)
return logits_processor
@staticmethod
def round_to_nearest_0_02(x):
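        # Decimal avoids float drift; e.g. round_to_nearest_0_02(1.234) -> Decimal('1.24').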
d = Decimal(str(x)) # Use str(x) to preserve input precision
step = Decimal('0.02')
# Divide, round, multiply back
rounded = (d / step).to_integral_value(rounding=ROUND_HALF_UP) * step
return rounded
def _fix_timestamps_from_segmentation(self, sequences):
"""
Adjusts token sequences with global timestamps to fit within Whisper's 0–30s timestamp token range.
This function modifies the input sequences by inserting appropriate timestamp tokens and
offset corrections to ensure the decoded token order is correct, without splitting any segment.
It aligns all timestamps to 0.02-second precision, inserts placeholder segments to bridge
time gaps between 30-second windows, and maintains segment continuity during encoding.
Args:
sequences (dict): A dictionary containing:
- 'segments': A list of segment lists, each segment being a dict with 'start', 'end', and 'tokens'.
- 'sequences': A tensor used to determine device for padding.
Returns:
torch.Tensor: A batch of padded token sequences with corrected timestamp alignment.
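
        Example (illustrative, not from the original author): a lone segment spanning
        65.00-68.00 s is preceded by two dummy 30 s windows and then emitted inside the
        third window, producing "<|0.00|><|30.00|><|0.00|><|30.00|><|5.00|>{text}<|8.00|>".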
"""
# Get the token ID for the "<|0.00|>" timestamp used to detect dummy segments
first_timestamp_token = self.tokenizer.get_vocab()["<|0.00|>"]
results = []
# Filter out segments that are either empty or consist only of the "<|0.00|>" token
for idx, sequence_segs in enumerate(sequences['segments']):
sequences['segments'][idx] = [
seg for seg in sequence_segs
if len(seg['tokens']) > 0 and (len(seg['tokens']) != 1 or seg['tokens'][0] != first_timestamp_token)
]
# Iterate over each group of segments (e.g., one per utterance)
for idx, sequence_segs in enumerate(sequences['segments']):
result = []
prev_segment_end_time = None
correction = Decimal(0.0)
for i, seg in enumerate(sequence_segs):
# Round start and end times to nearest 0.02 seconds
start_time = self.round_to_nearest_0_02(seg['start'].item())
end_time = self.round_to_nearest_0_02(seg['end'].item())
tokens = seg['tokens']
# Determine which 30s window this segment falls into
current_block = (start_time + correction) // 30
if prev_segment_end_time is not None:
# If not the first segment, calculate difference in 30s windows
prev_block = prev_segment_end_time // 30
num_dummies = current_block - prev_block - 1
# Insert (30, [], 30) marker if we're moving to a new block
if current_block > prev_block:
result.append((30, [], 30))
# Insert dummy segments to bridge skipped 30s blocks
for _ in range(int(num_dummies)):
result.append((0, [], 30))
else:
# For the first segment, add dummy blocks if it starts after 30s
for _ in range(int(start_time // 30)):
result.append((0, [], 30))
# Determine whether segment fits in one block or wraps to the next
if (start_time + correction) // 30 == (end_time + correction) // 30:
# Segment fits within a single 30s window
result.append(((start_time + correction) % 30, tokens, (end_time + correction) % 30))
else:
# Segment would wrap across a 30s boundary
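                    # Both branches below record a correction so that subsequent global
                    # timestamps are re-based onto the 30 s window grid implied by this segment.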
new_seg_start = (correction + start_time) % 30
new_seg_end = end_time - start_time
if new_seg_end >= new_seg_start:
# Seek back to the beginning of the segment window
result.append((new_seg_start, [], new_seg_start))
result.append((0, tokens, new_seg_end))
# Apply correction to align future timestamps to new 30s block
correction = self.round_to_nearest_0_02(-(start_time % 30))
else:
# Otherwise, just insert with adjusted times
result.append((new_seg_start, tokens, new_seg_end))
correction = self.round_to_nearest_0_02(30 - (start_time % 30))
# print(f'Processed segment {i}, result: {self.tokenizer.decode(self.tokenizer("".join([f"<|{seg[0]:.2f}|>{self.tokenizer.decode(seg[1])}<|{seg[2]:.2f}|>" for seg in result]))["input_ids"], decode_with_timestamps=True)[-250:]}')
# Update the previous segment's end time for next iteration
prev_segment_end_time = end_time + correction
# Convert result segments into a token sequence with proper timestamp formatting
encoded = self.tokenizer(
"".join([f"<|{seg[0]:.2f}|>{self.tokenizer.decode(seg[1])}<|{seg[2]:.2f}|>" for seg in result])
)['input_ids']
results.append(encoded)
# Pad all sequences to the same length for batching
sequences = pad_sequence(
[torch.tensor(res, device=sequences['sequences'].device) for res in results],
batch_first=True,
padding_value=self.tokenizer.pad_token_id
)
return sequences
@staticmethod
def _retrieve_segment(
seek_sequence,
seek_outputs,
time_offset,
timestamp_begin,
seek_num_frames,
time_precision,
input_stride,
prev_idx,
idx,
return_token_timestamps,
):
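        """Slice one decoded 30 s window into timestamped segments.

        Returns the list of segment dicts (global start/end times, tokens, raw generation
        result) and the number of input frames by which to advance the seek position."""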
        # find Whisper's predicted "end of segment" positions;
        # an "end of segment" prediction occurs whenever Whisper predicts a timestamp token
timestamp_tokens: torch.Tensor = seek_sequence.ge(timestamp_begin)
single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True]
timestamp_segment_indices = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0]
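        # shift the indices by one so that each slice boundary falls right after a
        # segment-closing timestamp token (i.e. between two consecutive timestamp tokens)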
timestamp_segment_indices.add_(1)
token_timestamps = seek_outputs[idx]["token_timestamps"] if return_token_timestamps else []
        # If Whisper predicted an "end of segment" via a timestamp token, go over each
        # "end of segment" prediction and slice the decoding into segments accordingly
if len(timestamp_segment_indices) > 0:
# if the output contains two consecutive timestamp tokens
slices = timestamp_segment_indices.tolist()
segments = []
if single_timestamp_ending:
slices.append(len(seek_sequence))
last_slice = 0
            # Add each segment to the list of all segments
for current_slice in slices:
sliced_tokens = seek_sequence[last_slice:current_slice]
start_timestamp_pos = sliced_tokens[0].item() - timestamp_begin
end_timestamp_pos = sliced_tokens[-1].item() - timestamp_begin
segments.append(
{
"start": time_offset[prev_idx] + start_timestamp_pos * time_precision,
"end": time_offset[prev_idx] + end_timestamp_pos * time_precision,
"tokens": sliced_tokens,
"result": seek_outputs[idx],
}
)
if return_token_timestamps:
segments[-1]["token_timestamps"] = (
token_timestamps[last_slice:current_slice] + time_offset[prev_idx]
)
last_slice = current_slice
if single_timestamp_ending:
# single timestamp at the end means no speech after the last timestamp.
segment_offset = seek_num_frames[prev_idx]
else:
                # otherwise, ignore the unfinished segment and seek to the last timestamp:
                # throw away all predictions after the last predicted "end of segment",
                # since we are cutting right in the middle of the audio window
last_timestamp_pos = seek_sequence[last_slice - 1].item() - timestamp_begin
segment_offset = last_timestamp_pos * input_stride
else:
            # If Whisper does not predict any "end of segment" token, the whole decoding
            # is treated as a single segment and added to the list of segments
timestamps = seek_sequence[timestamp_tokens.nonzero().flatten()]
start_timestamp_pos = 0.0
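            # seek_num_frames is in mel frames (100 per second); dividing by 2 (presumably the
            # encoder input stride) converts it into 0.02 s timestamp positions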
last_timestamp_pos = seek_num_frames[prev_idx] // 2
skip = False
segment_offset = seek_num_frames[prev_idx]
if timestamps.numel() > 1:
start_timestamp_pos = timestamps[-2].item() - timestamp_begin
last_timestamp_pos = timestamps[-1].item() - timestamp_begin
elif timestamps.numel() == 1:
# no consecutive timestamps but it has a timestamp; use the last one.
start_timestamp_pos = timestamps[-1].item() - timestamp_begin
if start_timestamp_pos > 200:
                    # segment does not fit into the decoding window, so we need to roll back
segment_offset = start_timestamp_pos * input_stride - 100 # timestamp might be inaccurate
skip = True
else:
# empty sequence, or sequence w/o timestamps
skip = True
if skip:
segments = []
else:
segments = [
{
"start": time_offset[prev_idx] + start_timestamp_pos * time_precision,
"end": time_offset[prev_idx] + last_timestamp_pos * time_precision,
"tokens": seek_sequence,
"result": seek_outputs[idx],
}
]
if return_token_timestamps:
segments[-1]["token_timestamps"] = token_timestamps + time_offset[prev_idx]
segment_offset = seek_num_frames[prev_idx]
if segment_offset <= 0:
msg = f"Timestamps: {timestamps}, Segments: {segments}"
raise ValueError(f"Segment offset: {segment_offset} <= 0. This should not happen!\n{msg}")
return segments, segment_offset
def _postprocess_outputs(self, seek_outputs, decoder_input_ids, return_token_timestamps, generation_config):
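        """Strip the decoder prompt tokens from the generated outputs.

        Plain tensor outputs get a trailing pad token appended and are returned as-is;
        dict-like outputs are split into one dictionary per batch element (optionally with
        token-level timestamps)."""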
# remove all previously passed decoder input ids
if isinstance(seek_outputs, torch.Tensor):
seek_outputs = seek_outputs[:, decoder_input_ids.shape[-1]:]
seek_outputs = torch.hstack((
seek_outputs,
torch.full((seek_outputs.shape[0], 1),
fill_value=generation_config.pad_token_id,
dtype=seek_outputs.dtype,
device=seek_outputs.device
)
))
# first_eos = (seek_outputs == generation_config.eos_token_id).int().argmax(dim=1)
# biggest_timestamp = generation_config.no_timestamps_token_id + 1 + 30 * 50
# empty_transcriptions = first_eos == 0
# seek_outputs[empty_transcriptions, 0] = generation_config.no_timestamps_token_id + 1 # 0.00 timestamp
# seek_outputs[empty_transcriptions, 1] = biggest_timestamp # 30.00 timestamp
# seek_outputs[empty_transcriptions, 2] = generation_config.eos_token_id # 30.00 timestamp
return seek_outputs, seek_outputs
if return_token_timestamps and hasattr(generation_config, "alignment_heads"):
num_frames = getattr(generation_config, "num_frames", None)
seek_outputs["token_timestamps"] = self._extract_token_timestamps(
seek_outputs, generation_config.alignment_heads, num_frames=num_frames
)
seek_outputs["token_timestamps"] = seek_outputs["token_timestamps"][:, decoder_input_ids.shape[-1]:]
seek_outputs["sequences"] = seek_outputs["sequences"][:, decoder_input_ids.shape[-1]:]
def split_by_batch_index(values, key, batch_idx):
if key == "scores":
return [v[batch_idx].cpu() for v in values]
elif key == "past_key_values":
# we don't save `past_key_values` as this is too costly
return None
elif isinstance(values[batch_idx], tuple) and torch.is_tensor(values[batch_idx][0]):
return tuple(tuple(w[batch_idx][None].cpu() for w in v) for v in values)
return values[batch_idx].cpu()
sequence_tokens = seek_outputs["sequences"]
seek_outputs = [
{k: split_by_batch_index(v, k, i) for k, v in seek_outputs.items()}
for i in range(sequence_tokens.shape[0])
]
return sequence_tokens, seek_outputs